/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
11f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#include "libyuv/row.h"
12f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
13f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef __cplusplus
14f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangnamespace libyuv {
15f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangextern "C" {
16f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
17f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
// This module is for GCC x86 and x64.
19f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if !defined(LIBYUV_DISABLE_X86) && \
20f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
21f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
// Signed byte coefficients: the 4-byte group {13, 65, 33, 0} repeated four
// times, i.e. one group per 4-byte pixel of a 16-byte vector.  The fourth
// coefficient of every group is 0.
// NOTE(review): presumably the group order follows the in-memory channel
// order B, G, R, A - confirm against the callers.
static const vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
                              13, 65, 33, 0, 13, 65, 33, 0};

// JPeg full range.
// Same layout as kARGBToY with the group {15, 75, 38, 0}.
static const vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
                               15, 75, 38, 0, 15, 75, 38, 0};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
32f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

// Every vec8 table below holds one 4-byte coefficient group repeated four
// times (one group per 4-byte pixel of a 16-byte vector).  The tables are
// read-only, so they are declared const.
// NOTE(review): the position of the 0 coefficient within each group appears
// to track where the alpha byte sits for the named pixel format - confirm
// against the callers.

// Constants for ARGB (U and V; group layout matches kARGBToY).
static const vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                              112, -74, -38, 0, 112, -74, -38, 0};

// Variant of kARGBToU with larger magnitudes.
// NOTE(review): the J suffix presumably means JPeg full range, as for
// kARGBToYJ - confirm.
static const vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                               127, -84, -43, 0, 127, -84, -43, 0};

static const vec8 kARGBToV = {
    -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

// Variant of kARGBToV with larger magnitudes (see the note on kARGBToUJ).
static const vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                               -20, -107, 127, 0, -20, -107, 127, 0};

// Constants for BGRA
// Same coefficient values as the ARGB tables with each group reversed, so the
// 0 coefficient comes first.
static const vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
                              0, 33, 65, 13, 0, 33, 65, 13};

static const vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                              0, -38, -74, 112, 0, -38, -74, 112};

static const vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                              0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR
// Same values as the ARGB tables with the first and third coefficient of each
// group swapped.
static const vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
                              33, 65, 13, 0, 33, 65, 13, 0};

static const vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                              -38, -74, 112, 0, -38, -74, 112, 0};

static const vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                              112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
// Same values as the ARGB tables with the 0 coefficient moved to the front of
// each group.
static const vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
                              0, 13, 65, 33, 0, 13, 65, 33};

static const vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                              0, 112, -74, -38, 0, 112, -74, -38};

static const vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                              0, -18, -94, 112, 0, -18, -94, 112};

// Sixteen unsigned bytes, each 16.
// NOTE(review): presumably the luma offset added after the Y dot product -
// confirm against the callers.
static const uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
                              16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};

// 7 bit fixed point 0.5.
// Eight signed 16-bit words, each 64 (64 / 128 == 0.5).
static const vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};

// Sixteen unsigned bytes, each 128.
// NOTE(review): presumably the chroma bias added to U and V - confirm.
static const uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                                128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Eight unsigned 16-bit words, each 0x8080.
static const uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                                  0x8080u, 0x8080u, 0x8080u, 0x8080u};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
90f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_RGB24TOARGBROW_SSSE3

// The tables below are byte-index vectors: entry i names the source byte that
// lands in destination byte i.  They are read-only, so all are declared const.
// NOTE(review): an entry of 128 has bit 7 set, which makes the x86 pshufb
// instruction write 0 to that destination byte; this assumes every table is
// consumed by pshufb/vpshufb, as the functions visible in this file do.

// Shuffle table for converting RGB24 to ARGB.
// Copies four 3-byte groups (source bytes 0..11) in order and fills the fourth
// byte of each output pixel from source bytes 12..15.  RGB24ToARGBRow_SSSE3
// ORs 0xff000000 into every output pixel afterwards, so that filler byte is
// forced to 0xff.
static const uvec8 kShuffleMaskRGB24ToARGB = {
    0u, 1u, 2u, 12u, 3u, 4u,  5u,  13u,
    6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};

// Shuffle table for converting RAW to ARGB.
// Same as kShuffleMaskRGB24ToARGB, but each 3-byte group is byte-reversed.
static const uvec8 kShuffleMaskRAWToARGB = {
    2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
    8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGB24.  First 8.
// Produces output bytes 0..7 from a 16-byte load at source offset 0; the upper
// eight entries are 128 and only the low 8 bytes of the result are stored.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Middle 8.
// Produces output bytes 8..15; indices are relative to a load at source
// offset 4.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Last 8.
// Produces output bytes 16..23; indices are relative to a load at source
// offset 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
// Keeps the first three bytes of each 4-byte pixel, packed into output bytes
// 0..11; output bytes 12..15 use index 128.
static const uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
// As kShuffleMaskARGBToRGB24, with each kept 3-byte group byte-reversed.
static const uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
// The first 8 packed bytes go to output bytes 0..7, bytes 8..11 use index 128,
// and the remaining 4 packed bytes go to output bytes 12..15.
static const uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// The lvec8 tables below have 32 entries: the same 16-entry pattern twice.

// YUY2 shuf 16 Y to 32 Y.
// Selects the even source bytes and writes each one twice.
static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
// Selects pairs of odd source bytes and writes each pair twice.
static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
// Selects the odd source bytes and writes each one twice.
static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
// Selects pairs of even source bytes and writes each pair twice.
static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
// Swaps the two bytes of each source pair and writes the swapped pair twice;
// only source bytes 0..7 are referenced.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif  // HAS_RGB24TOARGBROW_SSSE3
154f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_J400TOARGBROW_SSE2
// Expands a row of single-byte samples into 4-byte pixels.
//
// Every source byte Y is written to memory as the four bytes Y, Y, Y, 0xff.
//
//   src_y    - source row, one byte per pixel.
//   dst_argb - destination row, four bytes per pixel.
//   width    - number of pixels to convert.
//
// Each pass reads 8 source bytes and stores 32 destination bytes.  The counter
// is tested only after the stores, so the loop always runs at least once and
// effectively rounds width up to a multiple of 8; callers must size the
// buffers accordingly.
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
  asm volatile (
    // xmm5 = 0xff000000 in every dword (all ones shifted left by 24 bits).
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pslld     $0x18,%%xmm5                    \n"
    LABELALIGN
    "1:                                        \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // load 8 samples
    "lea       " MEMLEA(0x8,0) ",%0            \n"  // src_y += 8
    "punpcklbw %%xmm0,%%xmm0                   \n"  // each byte twice
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm0                   \n"  // samples 0..3, four times each
    "punpckhwd %%xmm1,%%xmm1                   \n"  // samples 4..7, four times each
    "por       %%xmm5,%%xmm0                   \n"  // top byte of each dword = 0xff
    "por       %%xmm5,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // pixels 0..3
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"  // pixels 4..7
    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst_argb += 32
    "sub       $0x8,%2                         \n"  // width -= 8
    "jg        1b                              \n"  // repeat while width > 0
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_J400TOARGBROW_SSE2
182f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
183f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_RGB24TOARGBROW_SSSE3
184f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
185f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
186f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
187f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pslld     $0x18,%%xmm5                    \n"
188f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %3,%%xmm4                       \n"
189f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
190b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
191f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
192f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
193f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
194f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x30,0) ",%0           \n"
195f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm3,%%xmm2                   \n"
196f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "palignr   $0x8,%%xmm1,%%xmm2              \n"
197f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm4,%%xmm2                   \n"
198f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm5,%%xmm2                   \n"
199f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "palignr   $0xc,%%xmm0,%%xmm1              \n"
200f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm4,%%xmm0                   \n"
201f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
202f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm5,%%xmm0                   \n"
203f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm4,%%xmm1                   \n"
204f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
205f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm5,%%xmm1                   \n"
206f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "palignr   $0x4,%%xmm3,%%xmm3              \n"
207f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm4,%%xmm3                   \n"
208f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
209f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm5,%%xmm3                   \n"
210f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
211f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,1) ",%1           \n"
212f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%2                        \n"
213f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
214f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_rgb24),  // %0
215f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_argb),  // %1
216f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)        // %2
217f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "m"(kShuffleMaskRGB24ToARGB)  // %3
218f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
219f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
220f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
221f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
222f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
223f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
224f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
225f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pslld     $0x18,%%xmm5                    \n"
226f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %3,%%xmm4                       \n"
227f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
228b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
229f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
230f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
231f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
232f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x30,0) ",%0           \n"
233f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm3,%%xmm2                   \n"
234f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "palignr   $0x8,%%xmm1,%%xmm2              \n"
235f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm4,%%xmm2                   \n"
236f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm5,%%xmm2                   \n"
237f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "palignr   $0xc,%%xmm0,%%xmm1              \n"
238f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm4,%%xmm0                   \n"
239f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
240f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm5,%%xmm0                   \n"
241f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm4,%%xmm1                   \n"
242f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
243f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm5,%%xmm1                   \n"
244f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "palignr   $0x4,%%xmm3,%%xmm3              \n"
245f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm4,%%xmm3                   \n"
246f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
247f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm5,%%xmm3                   \n"
248f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
249f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,1) ",%1           \n"
250f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%2                        \n"
251f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
252f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_raw),   // %0
253f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_argb),  // %1
254f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)        // %2
255f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "m"(kShuffleMaskRAWToARGB)  // %3
256f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
257f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
258f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
259f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
260f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
261f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
262f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang   "movdqa     %3,%%xmm3                       \n"
263f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang   "movdqa     %4,%%xmm4                       \n"
264f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang   "movdqa     %5,%%xmm5                       \n"
265f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
266b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
267f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
268f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x4,0) ",%%xmm1    \n"
269f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm2    \n"
270f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x18,0) ",%0           \n"
271f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm3,%%xmm0                   \n"
272f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm4,%%xmm1                   \n"
273f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm5,%%xmm2                   \n"
274f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      %%xmm0," MEMACCESS(1) "         \n"
275f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
276f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
277f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x18,1) ",%1           \n"
278f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x8,%2                         \n"
279f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
280f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_raw),    // %0
281f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_rgb24),  // %1
282f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)       // %2
283f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "m"(kShuffleMaskRAWToRGB24_0),  // %3
284f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kShuffleMaskRAWToRGB24_1),  // %4
285f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kShuffleMaskRAWToRGB24_2)   // %5
286f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
287f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
288f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
289f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
290f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
291f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
292f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       $0x1080108,%%eax                \n"
293f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %%eax,%%xmm5                    \n"
294f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
295f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       $0x20802080,%%eax               \n"
296f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %%eax,%%xmm6                    \n"
297f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
298f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm3,%%xmm3                   \n"
299f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psllw     $0xb,%%xmm3                     \n"
300f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm4,%%xmm4                   \n"
301f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psllw     $0xa,%%xmm4                     \n"
302f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x5,%%xmm4                     \n"
303f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm7,%%xmm7                   \n"
304f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psllw     $0x8,%%xmm7                     \n"
305f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %0,%1                           \n"
306f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %0,%1                           \n"
307f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
308b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
309f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
310f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
311f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm2                   \n"
312f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm3,%%xmm1                   \n"
313f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psllw     $0xb,%%xmm2                     \n"
314f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmulhuw   %%xmm5,%%xmm1                   \n"
315f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmulhuw   %%xmm5,%%xmm2                   \n"
316f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psllw     $0x8,%%xmm1                     \n"
317f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm2,%%xmm1                   \n"
318f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm4,%%xmm0                   \n"
319f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmulhuw   %%xmm6,%%xmm0                   \n"
320f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm7,%%xmm0                   \n"
321f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm1,%%xmm2                   \n"
322f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm0,%%xmm1                   \n"
323f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhbw %%xmm0,%%xmm2                   \n"
324f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
325f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
326f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0           \n"
327f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x8,%2                         \n"
328f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
329f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src),  // %0
330f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst),  // %1
331f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)   // %2
332f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
333f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", "eax", NACL_R14
334f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
335f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
336f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
337f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
338f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
339f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
340f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       $0x1080108,%%eax                \n"
341f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %%eax,%%xmm5                    \n"
342f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
343f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       $0x42004200,%%eax               \n"
344f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %%eax,%%xmm6                    \n"
345f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
346f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm3,%%xmm3                   \n"
347f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psllw     $0xb,%%xmm3                     \n"
348f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm3,%%xmm4                   \n"
349f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x6,%%xmm4                     \n"
350f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm7,%%xmm7                   \n"
351f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psllw     $0x8,%%xmm7                     \n"
352f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %0,%1                           \n"
353f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %0,%1                           \n"
354f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
355b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
356f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
357f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
358f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm2                   \n"
359f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psllw     $0x1,%%xmm1                     \n"
360f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psllw     $0xb,%%xmm2                     \n"
361f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm3,%%xmm1                   \n"
362f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmulhuw   %%xmm5,%%xmm2                   \n"
363f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmulhuw   %%xmm5,%%xmm1                   \n"
364f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psllw     $0x8,%%xmm1                     \n"
365f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm2,%%xmm1                   \n"
366f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm2                   \n"
367f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm4,%%xmm0                   \n"
368f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psraw     $0x8,%%xmm2                     \n"
369f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmulhuw   %%xmm6,%%xmm0                   \n"
370f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm7,%%xmm2                   \n"
371f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm2,%%xmm0                   \n"
372f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm1,%%xmm2                   \n"
373f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm0,%%xmm1                   \n"
374f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhbw %%xmm0,%%xmm2                   \n"
375f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
376f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
377f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0           \n"
378f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x8,%2                         \n"
379f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
380f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src),  // %0
381f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst),  // %1
382f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)   // %2
383f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
384f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", "eax", NACL_R14
385f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
386f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
387f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
388f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
389f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
390f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
391f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       $0xf0f0f0f,%%eax                \n"
392f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %%eax,%%xmm4                    \n"
393f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
394f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm4,%%xmm5                   \n"
395f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pslld     $0x4,%%xmm5                     \n"
396f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %0,%1                           \n"
397f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %0,%1                           \n"
398f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
399b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
400f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
401f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm2                   \n"
402f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm4,%%xmm0                   \n"
403f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm5,%%xmm2                   \n"
404f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
405f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm2,%%xmm3                   \n"
406f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psllw     $0x4,%%xmm1                     \n"
407f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x4,%%xmm3                     \n"
408f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm1,%%xmm0                   \n"
409f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm3,%%xmm2                   \n"
410f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
411f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm2,%%xmm0                   \n"
412f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhbw %%xmm2,%%xmm1                   \n"
413f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPMEM(movdqu,xmm0,0x00,1,0,2)           //  movdqu  %%xmm0,(%1,%0,2)
414f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPMEM(movdqu,xmm1,0x10,1,0,2)           //  movdqu  %%xmm1,0x10(%1,%0,2)
415f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0           \n"
416f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x8,%2                         \n"
417f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
418f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src),  // %0
419f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst),  // %1
420f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)   // %2
421f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
422f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", "eax", NACL_R14
423f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
424f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
425f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
426f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
427f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
428f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
429f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %3,%%xmm6                       \n"
430f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
431b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
432f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
433f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
434f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
435f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
436f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,0) ",%0           \n"
437f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm6,%%xmm0                   \n"
438f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm6,%%xmm1                   \n"
439f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm6,%%xmm2                   \n"
440f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm6,%%xmm3                   \n"
441f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm1,%%xmm4                   \n"
442f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrldq    $0x4,%%xmm1                     \n"
443f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pslldq    $0xc,%%xmm4                     \n"
444f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm2,%%xmm5                   \n"
445f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm4,%%xmm0                   \n"
446f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pslldq    $0x8,%%xmm5                     \n"
447f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
448f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm5,%%xmm1                   \n"
449f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrldq    $0x8,%%xmm2                     \n"
450f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pslldq    $0x4,%%xmm3                     \n"
451f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm3,%%xmm2                   \n"
452f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
453f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
454f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x30,1) ",%1           \n"
455f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%2                        \n"
456f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
457f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src),  // %0
458f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst),  // %1
459f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)   // %2
460f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "m"(kShuffleMaskARGBToRGB24)  // %3
461f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
462f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
463f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
464f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
465f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
466f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
467f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %3,%%xmm6                       \n"
468f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
469b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
470f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
471f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
472f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
473f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
474f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,0) ",%0           \n"
475f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm6,%%xmm0                   \n"
476f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm6,%%xmm1                   \n"
477f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm6,%%xmm2                   \n"
478f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm6,%%xmm3                   \n"
479f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm1,%%xmm4                   \n"
480f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrldq    $0x4,%%xmm1                     \n"
481f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pslldq    $0xc,%%xmm4                     \n"
482f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm2,%%xmm5                   \n"
483f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm4,%%xmm0                   \n"
484f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pslldq    $0x8,%%xmm5                     \n"
485f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
486f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm5,%%xmm1                   \n"
487f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrldq    $0x8,%%xmm2                     \n"
488f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pslldq    $0x4,%%xmm3                     \n"
489f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm3,%%xmm2                   \n"
490f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
491f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
492f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x30,1) ",%1           \n"
493f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%2                        \n"
494f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
495f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src),  // %0
496f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst),  // %1
497f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)   // %2
498f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "m"(kShuffleMaskARGBToRAW)  // %3
499f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
500f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
501f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
502f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
503f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
504f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
505f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm3,%%xmm3                   \n"
506f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrld     $0x1b,%%xmm3                    \n"
507f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm4,%%xmm4                   \n"
508f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrld     $0x1a,%%xmm4                    \n"
509f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pslld     $0x5,%%xmm4                     \n"
510f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm5,%%xmm5                   \n"
511f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pslld     $0xb,%%xmm5                     \n"
512f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
513b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
514f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
515f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
516f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm2                   \n"
517f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pslld     $0x8,%%xmm0                     \n"
518f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrld     $0x3,%%xmm1                     \n"
519f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrld     $0x5,%%xmm2                     \n"
520f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrad     $0x10,%%xmm0                    \n"
521f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm3,%%xmm1                   \n"
522f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm4,%%xmm2                   \n"
523f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm5,%%xmm0                   \n"
524f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm2,%%xmm1                   \n"
525f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm1,%%xmm0                   \n"
526f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packssdw  %%xmm0,%%xmm0                   \n"
527f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0           \n"
528f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      %%xmm0," MEMACCESS(1) "         \n"
529f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x8,1) ",%1            \n"
530f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%2                         \n"
531f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
532f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src),  // %0
533f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst),  // %1
534f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)   // %2
535f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
536f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
537f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
538f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
539b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ARGBToRGB565DitherRow_SSE2(const uint8* src,
540b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                uint8* dst,
541b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                const uint32 dither4,
542b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                int width) {
543b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  asm volatile(
544b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "movd       %3,%%xmm6                      \n"
545b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "punpcklbw  %%xmm6,%%xmm6                  \n"
546b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "movdqa     %%xmm6,%%xmm7                  \n"
547b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "punpcklwd  %%xmm6,%%xmm6                  \n"
548b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "punpckhwd  %%xmm7,%%xmm7                  \n"
549b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "pcmpeqb    %%xmm3,%%xmm3                  \n"
550b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "psrld      $0x1b,%%xmm3                   \n"
551b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "pcmpeqb    %%xmm4,%%xmm4                  \n"
552b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "psrld      $0x1a,%%xmm4                   \n"
553b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "pslld      $0x5,%%xmm4                    \n"
554b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "pcmpeqb    %%xmm5,%%xmm5                  \n"
555b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "pslld      $0xb,%%xmm5                    \n"
556b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
557b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      LABELALIGN
558b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "1:                                        \n"
559b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "movdqu     (%0),%%xmm0                    \n"
560b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "paddusb    %%xmm6,%%xmm0                  \n"
561b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "movdqa     %%xmm0,%%xmm1                  \n"
562b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "movdqa     %%xmm0,%%xmm2                  \n"
563b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "pslld      $0x8,%%xmm0                    \n"
564b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "psrld      $0x3,%%xmm1                    \n"
565b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "psrld      $0x5,%%xmm2                    \n"
566b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "psrad      $0x10,%%xmm0                   \n"
567b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "pand       %%xmm3,%%xmm1                  \n"
568b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "pand       %%xmm4,%%xmm2                  \n"
569b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "pand       %%xmm5,%%xmm0                  \n"
570b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "por        %%xmm2,%%xmm1                  \n"
571b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "por        %%xmm1,%%xmm0                  \n"
572b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "packssdw   %%xmm0,%%xmm0                  \n"
573b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "lea        0x10(%0),%0                    \n"
574b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "movq       %%xmm0,(%1)                    \n"
575b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "lea        0x8(%1),%1                     \n"
576b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "sub        $0x4,%2                        \n"
577b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "jg        1b                              \n"
578b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      : "+r"(src),    // %0
579b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        "+r"(dst),    // %1
580b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        "+r"(width)   // %2
581b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      : "m"(dither4)  // %3
582b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
583b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        "xmm7");
584f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
585f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
// AVX2 version: packs one row of 32-bit ARGB pixels into 16-bit RGB565 pixels
// after adding a per-pixel dither value to every source byte with unsigned
// saturation.
//
// For each dithered source dword the output word is assembled as:
//   bits  0-4  <- bits  3-7   (top 5 bits of byte 0)
//   bits  5-10 <- bits 10-15  (top 6 bits of byte 1)
//   bits 11-15 <- bits 19-23  (top 5 bits of byte 2)
//
// src:     32 bytes (8 pixels) are loaded per loop pass.
// dst:     16 bytes are stored per loop pass.
// dither4: four dither bytes; byte i (lowest address first) is added to all
//          four bytes of pixels i and i + 4 in each group of 8 pixels.
// width:   pixel count; 8 is subtracted per pass and the loop repeats while
//          the result is > 0, so the body always executes at least once.
// NOTE(review): presumably callers pass a positive multiple of 8 (or pad the
// buffers) - confirm against the call sites.
void ARGBToRGB565DitherRow_AVX2(const uint8* src,
                                uint8* dst,
                                const uint32 dither4,
                                int width) {
  asm volatile(
      // Build ymm6 = d0 x4, d1 x4, d2 x4, d3 x4 in both 128-bit lanes.
      // vpermq moves a copy of the byte-doubled pattern into the low half of
      // the upper lane so the in-lane vpunpcklwd can expand both lanes.
      "vbroadcastss %3,%%xmm6                    \n"
      "vpunpcklbw %%xmm6,%%xmm6,%%xmm6           \n"
      "vpermq     $0xd8,%%ymm6,%%ymm6            \n"
      "vpunpcklwd %%ymm6,%%ymm6,%%ymm6           \n"
      // ymm3 = 0x0000001f per dword (bits 0-4).
      "vpcmpeqb   %%ymm3,%%ymm3,%%ymm3           \n"
      "vpsrld     $0x1b,%%ymm3,%%ymm3            \n"
      // ymm4 = 0x000007e0 per dword (bits 5-10).
      "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
      "vpsrld     $0x1a,%%ymm4,%%ymm4            \n"
      "vpslld     $0x5,%%ymm4,%%ymm4             \n"
      // ymm5 = 0x0000f800 per dword (bits 11-15).
      "vpslld     $0xb,%%ymm3,%%ymm5             \n"

      LABELALIGN
      "1:                                        \n"
      "vmovdqu    (%0),%%ymm0                    \n"
      // Saturating add so a bright channel cannot wrap around to dark.
      "vpaddusb   %%ymm6,%%ymm0,%%ymm0           \n"
      "vpsrld     $0x5,%%ymm0,%%ymm2             \n"
      "vpsrld     $0x3,%%ymm0,%%ymm1             \n"
      "vpsrld     $0x8,%%ymm0,%%ymm0             \n"
      "vpand      %%ymm4,%%ymm2,%%ymm2           \n"
      "vpand      %%ymm3,%%ymm1,%%ymm1           \n"
      "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
      "vpor       %%ymm2,%%ymm1,%%ymm1           \n"
      "vpor       %%ymm1,%%ymm0,%%ymm0           \n"
      // Every dword is at most 0xffff here, so the unsigned saturating pack
      // keeps the value. The pack works per 128-bit lane; vpermq then gathers
      // the two lanes' results into the low 128 bits for the 16-byte store.
      "vpackusdw  %%ymm0,%%ymm0,%%ymm0           \n"
      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
      "lea        0x20(%0),%0                    \n"
      "vmovdqu    %%xmm0,(%1)                    \n"
      "lea        0x10(%1),%1                    \n"
      "sub        $0x8,%2                        \n"
      "jg         1b                             \n"
      // Clear the upper ymm halves before returning to non-VEX code.
      "vzeroupper                                \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      // Only registers 0-6 are written above; the former "xmm7" clobber named
      // a register this asm never touches and has been dropped.
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5",
        "xmm6");
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
631f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
632f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
633f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
634f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm4,%%xmm4                   \n"
635f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrld     $0x1b,%%xmm4                    \n"
636f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm4,%%xmm5                   \n"
637f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pslld     $0x5,%%xmm5                     \n"
638f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm4,%%xmm6                   \n"
639f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pslld     $0xa,%%xmm6                     \n"
640f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm7,%%xmm7                   \n"
641f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pslld     $0xf,%%xmm7                     \n"
642b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
643f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
644b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
645f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
646f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
647f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm2                   \n"
648f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm3                   \n"
649f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrad     $0x10,%%xmm0                    \n"
650f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrld     $0x3,%%xmm1                     \n"
651f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrld     $0x6,%%xmm2                     \n"
652f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrld     $0x9,%%xmm3                     \n"
653f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm7,%%xmm0                   \n"
654f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm4,%%xmm1                   \n"
655f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm5,%%xmm2                   \n"
656f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm6,%%xmm3                   \n"
657f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm1,%%xmm0                   \n"
658f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm3,%%xmm2                   \n"
659f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm2,%%xmm0                   \n"
660f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packssdw  %%xmm0,%%xmm0                   \n"
661f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0           \n"
662f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      %%xmm0," MEMACCESS(1) "         \n"
663f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x8,1) ",%1            \n"
664f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%2                         \n"
665f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
666f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src),  // %0
667f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst),  // %1
668f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)   // %2
669f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :: "memory", "cc",
670f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
671f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
672f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
673f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
674f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
675f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
676f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm4,%%xmm4                   \n"
677f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psllw     $0xc,%%xmm4                     \n"
678f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm4,%%xmm3                   \n"
679f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm3                     \n"
680b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
681f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
682b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
683f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
684f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
685f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm3,%%xmm0                   \n"
686f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm4,%%xmm1                   \n"
687f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlq     $0x4,%%xmm0                     \n"
688f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlq     $0x8,%%xmm1                     \n"
689f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm1,%%xmm0                   \n"
690f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm0,%%xmm0                   \n"
691f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0           \n"
692f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      %%xmm0," MEMACCESS(1) "         \n"
693f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x8,1) ",%1            \n"
694f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%2                         \n"
695f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
696f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src),  // %0
697f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst),  // %1
698f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)   // %2
699f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
700f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
701f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
702f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_RGB24TOARGBROW_SSSE3
703f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values per loop iteration.
// Each pixel's four bytes are multiplied by the signed byte coefficients in
// kARGBToY, summed into one 16-bit value, shifted right by 7, packed to a
// byte, and then kAddY16 is added bytewise.
//   src_argb - source row, read with unaligned 16-byte loads.
//   dst_y    - destination row, written 16 bytes at a time (unaligned).
//   width    - pixel count. The loop body runs before the count is tested
//              and repeats while the remaining count is > 0, so work is
//              always done in whole groups of 16 pixels.
// kARGBToY and kAddY16 are loaded with movdqa, which requires them to be
// 16-byte aligned.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // coefficients
    "movdqa    %4,%%xmm5                       \n"  // byte offset added last

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    // Unsigned pixel bytes * signed coefficients, adjacent pairs summed.
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_argb += 64
    // Sum the two partial words of each pixel: one word per pixel.
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 words -> 16 bytes
    "paddb     %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_y += 16
    "sub       $0x10,%2                        \n"  // width -= 16
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBTOYROW_SSSE3
741f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values per loop iteration.
// Same structure as ARGBToYRow_SSSE3 but with the kARGBToYJ coefficients:
// kAddYJ64 is added to the 16-bit sums before the shift right by 7 (rounding)
// and no byte offset is added after packing.
//   src_argb - source row, read with unaligned 16-byte loads.
//   dst_y    - destination row, written 16 bytes at a time (unaligned).
//   width    - pixel count. The loop body runs before the count is tested
//              and repeats while the remaining count is > 0, so work is
//              always done in whole groups of 16 pixels.
// kARGBToYJ and kAddYJ64 are loaded with movdqa, which requires them to be
// 16-byte aligned.
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // coefficients
    "movdqa    %4,%%xmm5                       \n"  // word rounding term

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    // Unsigned pixel bytes * signed coefficients, adjacent pairs summed.
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_argb += 64
    // Sum the two partial words of each pixel: one word per pixel.
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"  // round before the shift
    "paddw     %%xmm5,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 words -> 16 bytes
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_y += 16
    "sub       $0x10,%2                        \n"  // width -= 16
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_ARGBTOYJROW_SSSE3
781f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#if defined(HAS_ARGBTOYROW_AVX2) || defined(HAS_ARGBTOYJROW_AVX2)
// vpermd index table that restores pixel order after vphaddw + vpackuswb.
// Those instructions work on each 128-bit lane separately, which leaves the
// eight result dwords in the order 0,2,4,6,1,3,5,7; selecting dwords
// 0,4,1,5,2,6,3,7 puts them back in sequence.
// Guarded by both macros because ARGBToYRow_AVX2 and ARGBToYJRow_AVX2 both
// reference it.
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
#endif  // HAS_ARGBTOYROW_AVX2 || HAS_ARGBTOYJROW_AVX2

#ifdef HAS_ARGBTOYROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values per loop iteration.
// Each pixel's four bytes are multiplied by the signed byte coefficients in
// kARGBToY, summed into one 16-bit value, shifted right by 7, packed to a
// byte, reordered with kPermdARGBToY_AVX, and then kAddY16 is added bytewise.
//   src_argb - source row, read with unaligned 32-byte loads.
//   dst_y    - destination row, written 32 bytes at a time (unaligned).
//   width    - pixel count. The loop body runs before the count is tested
//              and repeats while the remaining count is > 0, so work is
//              always done in whole groups of 32 pixels.
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    // The 128-bit constants are replicated into both lanes.
    "vbroadcastf128 %3,%%ymm4                  \n"  // coefficients
    "vbroadcastf128 %4,%%ymm5                  \n"  // byte offset added last
    "vmovdqu    %5,%%ymm6                      \n"  // vpermd indices

    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    // Unsigned pixel bytes * signed coefficients, adjacent pairs summed.
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "lea       " MEMLEA(0x80,0) ",%0           \n"  // src_argb += 128
    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add 16 for Y
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst_y += 32
    "sub       $0x20,%2                        \n"  // width -= 32
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16),    // %4
    "m"(kPermdARGBToY_AVX)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBTOYROW_AVX2
826f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 YJ values per loop iteration.
// Same structure as ARGBToYRow_AVX2 but with the kARGBToYJ coefficients:
// kAddYJ64 is added to the 16-bit sums before the shift right by 7 (rounding)
// and no byte offset is added after packing.
//   src_argb - source row, read with unaligned 32-byte loads.
//   dst_y    - destination row, written 32 bytes at a time (unaligned).
//   width    - pixel count. The loop body runs before the count is tested
//              and repeats while the remaining count is > 0, so work is
//              always done in whole groups of 32 pixels.
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    // The 128-bit constants are replicated into both lanes.
    "vbroadcastf128 %3,%%ymm4                  \n"  // coefficients
    "vbroadcastf128 %4,%%ymm5                  \n"  // word rounding term
    "vmovdqu    %5,%%ymm6                      \n"  // vpermd indices

    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    // Unsigned pixel bytes * signed coefficients, adjacent pairs summed.
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "lea       " MEMLEA(0x80,0) ",%0           \n"  // src_argb += 128
    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // Add .5 for rounding.
    "vpaddw     %%ymm5,%%ymm2,%%ymm2           \n"
    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst_y += 32
    "sub       $0x20,%2                        \n"  // width -= 32
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kARGBToYJ),   // %3
    "m"(kAddYJ64),    // %4
    "m"(kPermdARGBToY_AVX)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBTOYJROW_AVX2
869f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_ARGBTOUVROW_SSSE3
// Produces 8 U and 8 V bytes from a 16x2 block of ARGB pixels per loop
// iteration.
// Each iteration averages 16 pixels of the row at src_argb0 with the 16
// pixels src_stride_argb bytes further on, then averages horizontally
// adjacent pixels, leaving 8 averaged pixels. Those are multiplied by the
// signed byte coefficients in kARGBToU / kARGBToV, summed per pixel, shifted
// right arithmetically by 8, packed to signed bytes and offset bytewise by
// kAddUV128.
//   src_argb0       - first source row, read with unaligned 16-byte loads.
//   src_stride_argb - byte offset from src_argb0 to the second source row.
//   dst_u, dst_v    - destination rows, each written 8 bytes at a time.
//   width           - source pixel count. The loop body runs before the
//                     count is tested and repeats while the remaining count
//                     is > 0, so work is always done in groups of 16 pixels.
// The three constants are loaded with movdqa, which requires them to be
// 16-byte aligned.
void ARGBToUVRow_SSSE3(const uint8* src_argb0,
                       int src_stride_argb,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  asm volatile (
    "movdqa    %5,%%xmm3                       \n"  // kARGBToV
    "movdqa    %6,%%xmm4                       \n"  // kARGBToU
    "movdqa    %7,%%xmm5                       \n"  // kAddUV128
    // %2 becomes (dst_v - dst_u) so V is addressed as (%1,%2) and only the
    // U pointer has to be advanced.
    "sub       %1,%2                           \n"

    LABELALIGN
    "1:                                        \n"
    // Vertical average of the two source rows (pavgb rounds up).
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"

    "lea       " MEMLEA(0x40,0) ",%0           \n"  // src_argb0 += 64
    // Horizontal average: 0x88 gathers the even dwords (pixels), 0xdd the
    // odd ones; pavgb then averages each even/odd neighbour pair.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    // xmm0/xmm2 are weighted with kARGBToU, their copies xmm1/xmm6 with
    // kARGBToV.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"  // 8 U sums (words)
    "phaddw    %%xmm6,%%xmm1                   \n"  // 8 V sums (words)
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 = U, high 8 = V
    "paddb     %%xmm5,%%xmm0                   \n"
    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // 8 bytes -> dst_u
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"  // dst_u += 8
    "sub       $0x10,%3                        \n"  // width -= 16
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kARGBToV),  // %5
    "m"(kARGBToU),  // %6
    "m"(kAddUV128)  // %7
  // xmm3-xmm5 hold the constants loaded above and must be declared too.
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBTOUVROW_SSSE3
936f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
937f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBTOUVROW_AVX2
938f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// vpshufb for vphaddw + vpackuswb packed to shorts.
939f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangstatic const lvec8 kShufARGBToUV_AVX = {
940b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
941b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
942b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ARGBToUVRow_AVX2(const uint8* src_argb0,
943b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                      int src_stride_argb,
944b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                      uint8* dst_u,
945b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                      uint8* dst_v,
946b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                      int width) {
947f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
948f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vbroadcastf128 %5,%%ymm5                  \n"
949f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vbroadcastf128 %6,%%ymm6                  \n"
950f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vbroadcastf128 %7,%%ymm7                  \n"
951b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "sub        %1,%2                          \n"
952b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
953f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
954b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
955f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
956f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
957f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
958f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
959f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
960f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
961f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
962f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
963b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "lea        " MEMLEA(0x80,0) ",%0          \n"
964f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
965f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
966f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
967f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
968f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
969f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
970f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
971f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
972f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
973f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
974f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
975f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
976f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
977f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
978f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
979f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
980f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
981f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpshufb    %8,%%ymm0,%%ymm0               \n"
982f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"
983f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
984f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
985f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
986b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "lea        " MEMLEA(0x10,1) ",%1          \n"
987b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "sub        $0x20,%3                       \n"
988b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "jg         1b                             \n"
989f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vzeroupper                                \n"
990f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb0),       // %0
991f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_u),           // %1
992f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_v),           // %2
993f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+rm"(width)           // %3
994f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"((intptr_t)(src_stride_argb)), // %4
995f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kAddUV128),  // %5
996f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kARGBToV),   // %6
997f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kARGBToU),   // %7
998f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kShufARGBToUV_AVX)  // %8
999f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
1000f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
1001f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
1002f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
1003f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBTOUVROW_AVX2
1004f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1005f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBTOUVJROW_AVX2
// ARGBToUVJRow_AVX2: produces one row of U bytes and one row of V bytes from
// two input rows, using the kARGBToUJ / kARGBToVJ coefficient tables and the
// kAddUVJ128 bias (presumably the full-range "J" variants -- confirm at their
// definitions, which are outside this section).
//
//   src_argb0        first input row; the second row is read at
//                    src_argb0 + src_stride_argb.
//   src_stride_argb  byte distance between the two input rows.
//   dst_u, dst_v     output rows; 16 bytes are stored to each per iteration.
//   width            reduced by 32 per iteration; the loop repeats while the
//                    remainder is > 0, so the body always runs at least once.
//
// Each iteration reads 128 bytes from both input rows, averages them
// vertically and then horizontally (adjacent dwords), and converts the result.
// NOTE(review): assumes 4-byte pixels and a width the caller has already
// rounded to the 32-pixel step -- confirm against the callers.
1006b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ARGBToUVJRow_AVX2(const uint8* src_argb0,
1007b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       int src_stride_argb,
1008b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       uint8* dst_u,
1009b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       uint8* dst_v,
1010b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       int width) {
1011f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // Replicate each 128-bit constant into both halves of a ymm register:
    // ymm5 = bias (%5), ymm6 = %6 coefficients, ymm7 = %7 coefficients.
1012f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vbroadcastf128 %5,%%ymm5                  \n"
1013f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vbroadcastf128 %6,%%ymm6                  \n"
1014f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vbroadcastf128 %7,%%ymm7                  \n"
    // %2 becomes dst_v - dst_u, so (%1,%2,1) addresses dst_v while only %1
    // has to be advanced inside the loop.
1015b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "sub        %1,%2                          \n"
1016b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
1017f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
1018b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // Load 128 bytes of the first row.
1019f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
1020f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
1021f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
1022f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    // Vertical average: rounding byte average with the row %4 bytes further on.
1023f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
1024f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
1025f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
1026f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
1027f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x80,0) ",%0           \n"
    // Horizontal average: 0x88 gathers the even-indexed dwords and 0xdd the
    // odd-indexed dwords of a register pair; averaging the two merges each
    // pair of neighbouring dwords into one.
1028f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
1029f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
1030f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
1031f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
1032f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
1033f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"
1034f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
    // Weighted sums: multiply bytes by the %7 table (into ymm1/ymm3) and by
    // the %6 table (into ymm0/ymm2), then add adjacent words so each averaged
    // dword yields one 16-bit sum per table.
1035f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
1036f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
1037f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
1038f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
1039f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
1040f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
    // The bias is added as 16-bit words before the arithmetic shift by 8.
1041f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"
1042f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpaddw     %%ymm5,%%ymm1,%%ymm1           \n"
1043f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"
1044f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
    // Pack to bytes with signed saturation, then reorder (qword permute plus
    // the %8 byte shuffle) so the low 128 bits hold the %7-table results and
    // the high 128 bits hold the %6-table results.
1045f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
1046f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
1047f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpshufb    %8,%%ymm0,%%ymm0               \n"
1048f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
    // Low 128 bits -> dst_u, high 128 bits -> dst_v.
1049f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"
1050f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
1051f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,1) ",%1           \n"
    // Explicit 'l' suffix: %3 is "+rm" and may be a memory operand, where a
    // bare `sub` has no register operand to take its size from.
1052f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subl      $0x20,%3                        \n"
1053f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
    // Clear the upper ymm halves before returning to non-AVX code.
1054f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vzeroupper                                \n"
1055f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb0),       // %0
1056f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_u),           // %1
1057f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_v),           // %2
1058f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+rm"(width)           // %3
1059f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"((intptr_t)(src_stride_argb)), // %4
1060f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kAddUVJ128),  // %5
1061f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kARGBToVJ),  // %6
1062f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kARGBToUJ),  // %7
1063f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kShufARGBToUV_AVX)  // %8
1064f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
1065f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
1066f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
1067f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
1068f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBTOUVJROW_AVX2
1069f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1070f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBTOUVJROW_SSSE3
// ARGBToUVJRow_SSSE3: produces 8 U bytes and 8 V bytes per iteration from two
// input rows, using the kARGBToUJ / kARGBToVJ coefficient tables and the
// kAddUVJ128 bias (presumably the full-range "J" variants -- confirm at their
// definitions, which are outside this section).
//
//   src_argb0        first input row; the second row is read at
//                    src_argb0 + src_stride_argb.
//   src_stride_argb  byte distance between the two input rows.
//   dst_u, dst_v     output rows; 8 bytes are stored to each per iteration.
//   width            reduced by 16 per iteration; the loop repeats while the
//                    remainder is > 0, so the body always runs at least once.
//
// Each iteration reads 64 bytes from both input rows, averages them
// vertically and then horizontally (adjacent dwords), and converts the result.
// NOTE(review): assumes 4-byte pixels and a width the caller has already
// rounded to the 16-pixel step -- confirm against the callers. The movdqa
// loads below also require the three constants to be 16-byte aligned.
1071b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ARGBToUVJRow_SSSE3(const uint8* src_argb0,
1072b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        int src_stride_argb,
1073b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        uint8* dst_u,
1074b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        uint8* dst_v,
1075b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        int width) {
1076f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // xmm3 = V coefficients (%5), xmm4 = U coefficients (%6), xmm5 = bias (%7).
1077f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %5,%%xmm3                       \n"
1078f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %6,%%xmm4                       \n"
1079f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %7,%%xmm5                       \n"
    // %2 becomes dst_v - dst_u, so (%1,%2,1) addresses dst_v while only %1
    // has to be advanced inside the loop.
1080f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %1,%2                           \n"
1081b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
1082f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
1083b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // Load 4 x 16 bytes of the first row; each is averaged (rounding byte
    // average) with the bytes %4 further on, staged through xmm7.
1084f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1085f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1086f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm0                   \n"
1087f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1088f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1089f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm1                   \n"
1090f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1091f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1092f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm2                   \n"
1093f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1094f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1095f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm6                   \n"
1096f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1097f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Horizontal average: 0x88 gathers the even-indexed dwords and 0xdd the
    // odd-indexed dwords of a register pair; averaging the two merges each
    // pair of neighbouring dwords into one.
1098f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm7                   \n"
1099f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shufps    $0x88,%%xmm1,%%xmm0             \n"
1100f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1101f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm0                   \n"
1102f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm2,%%xmm7                   \n"
1103f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shufps    $0x88,%%xmm6,%%xmm2             \n"
1104f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1105f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm2                   \n"
    // Duplicate the averaged data: xmm0/xmm2 get the U weights (xmm4),
    // xmm1/xmm6 get the V weights (xmm3). phaddw then adds adjacent words so
    // each averaged dword yields one 16-bit sum.
1106f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
1107f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm2,%%xmm6                   \n"
1108f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm0                   \n"
1109f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm2                   \n"
1110f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm3,%%xmm1                   \n"
1111f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm3,%%xmm6                   \n"
1112f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm2,%%xmm0                   \n"
1113f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm6,%%xmm1                   \n"
    // The bias is added as 16-bit words before the arithmetic shift by 8.
1114f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddw     %%xmm5,%%xmm0                   \n"
1115f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddw     %%xmm5,%%xmm1                   \n"
1116f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psraw     $0x8,%%xmm0                     \n"
1117f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psraw     $0x8,%%xmm1                     \n"
    // Pack with signed saturation: low 8 bytes = U, high 8 bytes = V.
1118f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packsswb  %%xmm1,%%xmm0                   \n"
1119f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movlps    %%xmm0," MEMACCESS(1) "         \n"
1120f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1121f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x8,1) ",%1            \n"
    // Explicit 'l' suffix: %3 is "+rm" and may be a memory operand, where a
    // bare `sub` has no register operand to take its size from.
1122f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subl      $0x10,%3                        \n"
1123f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
1124f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb0),       // %0
1125f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_u),           // %1
1126f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_v),           // %2
1127f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+rm"(width)           // %3
1128f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"((intptr_t)(src_stride_argb)), // %4
1129f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kARGBToVJ),  // %5
1130f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kARGBToUJ),  // %6
1131f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kAddUVJ128)  // %7
    // xmm3-xmm5 are written by the constant loads at the top of the asm, so
    // they must be declared as clobbered along with the scratch registers.
1132f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
1133f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
1134f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
1135f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
1136f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBTOUVJROW_SSSE3
1137f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1138f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBTOUV444ROW_SSSE3
// ARGBToUV444Row_SSSE3: produces 16 U bytes and 16 V bytes per iteration from
// 64 bytes of a single input row (no averaging between neighbours), using the
// kARGBToU / kARGBToV coefficient tables and the kAddUV128 byte bias.
//
//   src_argb      input row, advanced by 64 bytes per iteration.
//   dst_u, dst_v  output rows; 16 bytes are stored to each per iteration.
//   width         reduced by 16 per iteration; the loop repeats while the
//                 remainder is > 0, so the body always runs at least once.
//
// NOTE(review): assumes 4-byte pixels and a width the caller has already
// rounded to the 16-pixel step -- confirm against the callers. The movdqa
// loads below also require the three constants to be 16-byte aligned.
1139b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ARGBToUV444Row_SSSE3(const uint8* src_argb,
1140b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                          uint8* dst_u,
1141b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                          uint8* dst_v,
1142f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                          int width) {
1143f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // xmm3 = V coefficients (%4), xmm4 = U coefficients (%5), xmm5 = bias (%6).
1144f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %4,%%xmm3                       \n"
1145f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %5,%%xmm4                       \n"
1146f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %6,%%xmm5                       \n"
    // %2 becomes dst_v - dst_u, so (%1,%2,1) addresses dst_v while only %1
    // has to be advanced inside the loop.
1147f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %1,%2                           \n"
1148b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
1149f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
1150b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // First pass (U): weight the 64 source bytes with xmm4, add adjacent
    // words, shift right arithmetically by 8, pack with signed saturation,
    // add the byte bias and store 16 bytes to dst_u.
1151f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1152f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1153f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1154f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1155f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm0                   \n"
1156f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm1                   \n"
1157f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm2                   \n"
1158f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm6                   \n"
1159f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm1,%%xmm0                   \n"
1160f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm6,%%xmm2                   \n"
1161f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psraw     $0x8,%%xmm0                     \n"
1162f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psraw     $0x8,%%xmm2                     \n"
1163f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packsswb  %%xmm2,%%xmm0                   \n"
1164f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddb     %%xmm5,%%xmm0                   \n"
1165f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    // Second pass (V): the same 64 bytes are loaded again because the first
    // pass overwrote xmm0-xmm2/xmm6; this time they are weighted with xmm3.
1166f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1167f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1168f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1169f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1170f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm3,%%xmm0                   \n"
1171f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm3,%%xmm1                   \n"
1172f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm3,%%xmm2                   \n"
1173f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm3,%%xmm6                   \n"
1174f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm1,%%xmm0                   \n"
1175f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm6,%%xmm2                   \n"
1176f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psraw     $0x8,%%xmm0                     \n"
1177f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psraw     $0x8,%%xmm2                     \n"
1178f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packsswb  %%xmm2,%%xmm0                   \n"
1179f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddb     %%xmm5,%%xmm0                   \n"
1180f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,0) ",%0           \n"
1181f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
1182f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,1) ",%1           \n"
    // Explicit 'l' suffix: %3 is "+rm" and may be a memory operand, where a
    // bare `sub` has no register operand to take its size from.
1183f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subl      $0x10,%3                        \n"
1184f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
1185f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb),        // %0
1186f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_u),           // %1
1187f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_v),           // %2
1188f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+rm"(width)           // %3
1189f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "m"(kARGBToV),  // %4
1190f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kARGBToU),  // %5
1191f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kAddUV128)  // %6
    // xmm3-xmm5 are written by the constant loads at the top of the asm, so
    // they must be declared as clobbered along with the scratch registers.
1192f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
1193f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1194f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
1195f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
1196f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBTOUV444ROW_SSSE3
1197f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
// BGRAToYRow_SSSE3: produces 16 bytes in dst_y from every 64 bytes of
// src_bgra, using the kBGRAToY coefficient table and the kAddY16 byte offset.
//
//   src_bgra  input row, advanced by 64 bytes per iteration.
//   dst_y     output row, advanced by 16 bytes per iteration.
//   width     reduced by 16 per iteration; the loop repeats while the
//             remainder is > 0, so the body always runs at least once.
//
// NOTE(review): assumes 4-byte pixels and a width the caller has already
// rounded to the 16-pixel step -- confirm against the callers. The movdqa
// loads below also require both constants to be 16-byte aligned.
1198f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
1199f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // xmm5 = byte offset added after packing (%4), xmm4 = coefficients (%3).
1200f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %4,%%xmm5                       \n"
1201f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %3,%%xmm4                       \n"
1202b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
1203f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
1204b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // Load 64 source bytes.
1205f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1206f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1207f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1208f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    // Multiply bytes by the coefficients and sum adjacent pairs into words.
1209f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm0                   \n"
1210f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm1                   \n"
1211f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm2                   \n"
1212f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm3                   \n"
1213f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Add adjacent words so every 4 source bytes yield one 16-bit sum, then
    // shift right logically by 7 and pack to bytes with unsigned saturation.
1214f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm1,%%xmm0                   \n"
1215f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm3,%%xmm2                   \n"
1216f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x7,%%xmm0                     \n"
1217f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x7,%%xmm2                     \n"
1218f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm2,%%xmm0                   \n"
    // Add the byte offset and store 16 output bytes.
1219f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddb     %%xmm5,%%xmm0                   \n"
1220f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
1221f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,1) ",%1           \n"
1222f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%2                        \n"
1223f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
1224f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_bgra),  // %0
1225f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_y),     // %1
1226f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)        // %2
1227f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "m"(kBGRAToY),   // %3
1228f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kAddY16)     // %4
1229f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1230f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
1231f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
1232f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
// BGRAToUVRow_SSSE3: produces 8 U bytes and 8 V bytes per iteration from two
// input rows, using the kBGRAToU / kBGRAToV coefficient tables and the
// kAddUV128 byte bias.
//
//   src_bgra0        first input row; the second row is read at
//                    src_bgra0 + src_stride_bgra.
//   src_stride_bgra  byte distance between the two input rows.
//   dst_u, dst_v     output rows; 8 bytes are stored to each per iteration.
//   width            reduced by 16 per iteration; the loop repeats while the
//                    remainder is > 0, so the body always runs at least once.
//
// Each iteration reads 64 bytes from both input rows, averages them
// vertically and then horizontally (adjacent dwords), and converts the result.
// NOTE(review): assumes 4-byte pixels and a width the caller has already
// rounded to the 16-pixel step -- confirm against the callers. The movdqa
// loads below also require the three constants to be 16-byte aligned.
1233b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid BGRAToUVRow_SSSE3(const uint8* src_bgra0,
1234b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       int src_stride_bgra,
1235b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       uint8* dst_u,
1236b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       uint8* dst_v,
1237b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       int width) {
1238f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // xmm3 = V coefficients (%5), xmm4 = U coefficients (%6), xmm5 = bias (%7).
1239f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %5,%%xmm3                       \n"
1240f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %6,%%xmm4                       \n"
1241f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %7,%%xmm5                       \n"
    // %2 becomes dst_v - dst_u, so (%1,%2,1) addresses dst_v while only %1
    // has to be advanced inside the loop.
1242f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %1,%2                           \n"
1243b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
1244f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
1245b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // Load 4 x 16 bytes of the first row; each is averaged (rounding byte
    // average) with the bytes %4 further on, staged through xmm7.
1246f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1247f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1248f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm0                   \n"
1249f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1250f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1251f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm1                   \n"
1252f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1253f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1254f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm2                   \n"
1255f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1256f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1257f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm6                   \n"
1258f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1259f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Horizontal average: 0x88 gathers the even-indexed dwords and 0xdd the
    // odd-indexed dwords of a register pair; averaging the two merges each
    // pair of neighbouring dwords into one.
1260f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm7                   \n"
1261f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shufps    $0x88,%%xmm1,%%xmm0             \n"
1262f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1263f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm0                   \n"
1264f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm2,%%xmm7                   \n"
1265f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shufps    $0x88,%%xmm6,%%xmm2             \n"
1266f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1267f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm2                   \n"
    // Duplicate the averaged data: xmm0/xmm2 get the U weights (xmm4),
    // xmm1/xmm6 get the V weights (xmm3). phaddw then adds adjacent words so
    // each averaged dword yields one 16-bit sum.
1268f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
1269f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm2,%%xmm6                   \n"
1270f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm0                   \n"
1271f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm2                   \n"
1272f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm3,%%xmm1                   \n"
1273f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm3,%%xmm6                   \n"
1274f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm2,%%xmm0                   \n"
1275f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm6,%%xmm1                   \n"
    // Arithmetic shift by 8, pack with signed saturation (low 8 bytes = U,
    // high 8 bytes = V), then add the byte bias.
1276f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psraw     $0x8,%%xmm0                     \n"
1277f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psraw     $0x8,%%xmm1                     \n"
1278f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packsswb  %%xmm1,%%xmm0                   \n"
1279f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddb     %%xmm5,%%xmm0                   \n"
1280f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movlps    %%xmm0," MEMACCESS(1) "         \n"
1281f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1282f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x8,1) ",%1            \n"
    // Explicit 'l' suffix: %3 is "+rm" and may be a memory operand, where a
    // bare `sub` has no register operand to take its size from.
1283f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subl      $0x10,%3                        \n"
1284f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
1285f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_bgra0),       // %0
1286f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_u),           // %1
1287f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_v),           // %2
1288f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+rm"(width)           // %3
1289f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"((intptr_t)(src_stride_bgra)), // %4
1290f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kBGRAToV),  // %5
1291f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kBGRAToU),  // %6
1292f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kAddUV128)  // %7
    // xmm3-xmm5 are written by the constant loads at the top of the asm, so
    // they must be declared as clobbered along with the scratch registers.
1293f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
1294f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
1295f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
1296f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
1297f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1298f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
1299f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
1300f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %4,%%xmm5                       \n"
1301f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %3,%%xmm4                       \n"
1302b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
1303f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
1304b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
1305f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1306f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1307f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1308f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
1309f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm0                   \n"
1310f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm1                   \n"
1311f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm2                   \n"
1312f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm3                   \n"
1313f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,0) ",%0           \n"
1314f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm1,%%xmm0                   \n"
1315f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm3,%%xmm2                   \n"
1316f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x7,%%xmm0                     \n"
1317f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x7,%%xmm2                     \n"
1318f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm2,%%xmm0                   \n"
1319f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddb     %%xmm5,%%xmm0                   \n"
1320f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
1321f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,1) ",%1           \n"
1322f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%2                        \n"
1323f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
1324f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_abgr),  // %0
1325f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_y),     // %1
1326f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)        // %2
1327f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "m"(kABGRToY),   // %3
1328f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kAddY16)     // %4
1329f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1330f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
1331f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
1332f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1333f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
1334f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
1335f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %4,%%xmm5                       \n"
1336f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %3,%%xmm4                       \n"
1337b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
1338f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
1339b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
1340f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1341f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1342f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1343f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
1344f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm0                   \n"
1345f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm1                   \n"
1346f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm2                   \n"
1347f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm3                   \n"
1348f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,0) ",%0           \n"
1349f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm1,%%xmm0                   \n"
1350f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm3,%%xmm2                   \n"
1351f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x7,%%xmm0                     \n"
1352f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x7,%%xmm2                     \n"
1353f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm2,%%xmm0                   \n"
1354f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddb     %%xmm5,%%xmm0                   \n"
1355f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
1356f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,1) ",%1           \n"
1357f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%2                        \n"
1358f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
1359f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_rgba),  // %0
1360f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_y),     // %1
1361f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)        // %2
1362f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "m"(kRGBAToY),   // %3
1363f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kAddY16)     // %4
1364f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1365f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
1366f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
1367f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1368b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ABGRToUVRow_SSSE3(const uint8* src_abgr0,
1369b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       int src_stride_abgr,
1370b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       uint8* dst_u,
1371b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       uint8* dst_v,
1372b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       int width) {
1373f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
1374f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %5,%%xmm3                       \n"
1375f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %6,%%xmm4                       \n"
1376f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %7,%%xmm5                       \n"
1377f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %1,%2                           \n"
1378b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
1379f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
1380b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
1381f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1382f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1383f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm0                   \n"
1384f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1385f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1386f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm1                   \n"
1387f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1388f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1389f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm2                   \n"
1390f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1391f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1392f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm6                   \n"
1393f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1394f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,0) ",%0           \n"
1395f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm7                   \n"
1396f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shufps    $0x88,%%xmm1,%%xmm0             \n"
1397f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1398f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm0                   \n"
1399f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm2,%%xmm7                   \n"
1400f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shufps    $0x88,%%xmm6,%%xmm2             \n"
1401f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1402f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm2                   \n"
1403f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
1404f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm2,%%xmm6                   \n"
1405f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm0                   \n"
1406f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm2                   \n"
1407f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm3,%%xmm1                   \n"
1408f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm3,%%xmm6                   \n"
1409f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm2,%%xmm0                   \n"
1410f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm6,%%xmm1                   \n"
1411f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psraw     $0x8,%%xmm0                     \n"
1412f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psraw     $0x8,%%xmm1                     \n"
1413f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packsswb  %%xmm1,%%xmm0                   \n"
1414f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddb     %%xmm5,%%xmm0                   \n"
1415f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movlps    %%xmm0," MEMACCESS(1) "         \n"
1416f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1417f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x8,1) ",%1            \n"
1418f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%3                        \n"
1419f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
1420f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_abgr0),       // %0
1421f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_u),           // %1
1422f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_v),           // %2
1423f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+rm"(width)           // %3
1424f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"((intptr_t)(src_stride_abgr)), // %4
1425f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kABGRToV),  // %5
1426f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kABGRToU),  // %6
1427f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kAddUV128)  // %7
1428f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
1429f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1430f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
1431f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
1432f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1433b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid RGBAToUVRow_SSSE3(const uint8* src_rgba0,
1434b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       int src_stride_rgba,
1435b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       uint8* dst_u,
1436b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       uint8* dst_v,
1437b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       int width) {
1438f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
1439f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %5,%%xmm3                       \n"
1440f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %6,%%xmm4                       \n"
1441f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %7,%%xmm5                       \n"
1442f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %1,%2                           \n"
1443b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
1444f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
1445b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
1446f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1447f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1448f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm0                   \n"
1449f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1450f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1451f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm1                   \n"
1452f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1453f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1454f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm2                   \n"
1455f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1456f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1457f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm6                   \n"
1458f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1459f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,0) ",%0           \n"
1460f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm7                   \n"
1461f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shufps    $0x88,%%xmm1,%%xmm0             \n"
1462f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1463f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm0                   \n"
1464f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm2,%%xmm7                   \n"
1465f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shufps    $0x88,%%xmm6,%%xmm2             \n"
1466f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1467f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm7,%%xmm2                   \n"
1468f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
1469f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm2,%%xmm6                   \n"
1470f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm0                   \n"
1471f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm2                   \n"
1472f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm3,%%xmm1                   \n"
1473f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm3,%%xmm6                   \n"
1474f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm2,%%xmm0                   \n"
1475f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm6,%%xmm1                   \n"
1476f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psraw     $0x8,%%xmm0                     \n"
1477f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psraw     $0x8,%%xmm1                     \n"
1478f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packsswb  %%xmm1,%%xmm0                   \n"
1479f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddb     %%xmm5,%%xmm0                   \n"
1480f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movlps    %%xmm0," MEMACCESS(1) "         \n"
1481f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1482f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x8,1) ",%1            \n"
1483f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%3                        \n"
1484f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
1485f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_rgba0),       // %0
1486f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_u),           // %1
1487f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_v),           // %2
1488f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+rm"(width)           // %3
1489f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"((intptr_t)(src_stride_rgba)), // %4
1490f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kRGBAToV),  // %5
1491f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kRGBAToU),  // %6
1492f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kAddUV128)  // %7
1493f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
1494f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1495f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
1496f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
1497f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1498f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)
1499f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
// READYUV444: asm fragment that loads the inputs for 8 pixels of 4:4:4 YUV.
//   xmm0 <- 8 bytes from [u_buf] interleaved byte-wise with 8 bytes from
//           [u_buf] + [v_buf] ([v_buf] is used as an index relative to
//           [u_buf], not as an absolute address).
//   xmm4 <- 8 bytes from [y_buf], each duplicated into both bytes of a word.
// Advances [u_buf] and [y_buf] by 8 bytes. Also overwrites xmm1.
#define READYUV444 \
  "movq       " MEMACCESS([u_buf]) ",%%xmm0                     \n" \
  MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1) \
  "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n" \
  "punpcklbw  %%xmm1,%%xmm0                                   \n" \
  "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
1509f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
// READYUV422: asm fragment that loads the inputs for 8 pixels of 4:2:2 YUV.
//   xmm0 <- 4 bytes from [u_buf] interleaved byte-wise with 4 bytes from
//           [u_buf] + [v_buf] ([v_buf] is used as an index relative to
//           [u_buf]); each resulting 2-byte pair is then duplicated, giving
//           8 pairs (one per pixel).
//   xmm4 <- 8 bytes from [y_buf], each duplicated into both bytes of a word.
// Advances [u_buf] by 4 and [y_buf] by 8 bytes. Also overwrites xmm1.
#define READYUV422 \
  "movd       " MEMACCESS([u_buf]) ",%%xmm0                     \n" \
  MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
  "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n" \
  "punpcklbw  %%xmm1,%%xmm0                                   \n" \
  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
  "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
1520f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
// READYUVA422: asm fragment that loads the inputs for 8 pixels of 4:2:2 YUV
// plus 8 alpha bytes.
//   xmm0 <- 4 bytes from [u_buf] interleaved byte-wise with 4 bytes from
//           [u_buf] + [v_buf] ([v_buf] is used as an index relative to
//           [u_buf]); each resulting 2-byte pair is then duplicated, giving
//           8 pairs (one per pixel).
//   xmm4 <- 8 bytes from [y_buf], each duplicated into both bytes of a word.
//   xmm5 <- 8 bytes from [a_buf], unmodified.
// Advances [u_buf] by 4, [y_buf] by 8 and [a_buf] by 8 bytes. Also
// overwrites xmm1.
#define READYUVA422 \
  "movd       " MEMACCESS([u_buf]) ",%%xmm0                     \n" \
  MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1) \
  "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n" \
  "punpcklbw  %%xmm1,%%xmm0                                   \n" \
  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
  "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n" \
  "movq       " MEMACCESS([a_buf]) ",%%xmm5                   \n" \
  "lea        " MEMLEA(0x8, [a_buf]) ",%[a_buf]               \n"
1533f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
// READNV12: asm fragment that loads the inputs for 8 pixels of NV12.
//   xmm0 <- 8 bytes (4 two-byte chroma pairs) from [uv_buf], each pair
//           duplicated to give 8 pairs (one per pixel).
//   xmm4 <- 8 bytes from [y_buf], each duplicated into both bytes of a word.
// Advances [uv_buf] and [y_buf] by 8 bytes.
#define READNV12 \
  "movq       " MEMACCESS([uv_buf]) ",%%xmm0                    \n" \
  "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n" \
  "punpcklwd  %%xmm0,%%xmm0                                   \n" \
  "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
1542f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
// READNV21: asm fragment that loads the inputs for 8 pixels of NV21.
//   xmm0 <- 8 bytes (4 two-byte chroma pairs) from [vu_buf], rearranged by
//           pshufb with the [kShuffleNV21] operand (presumably swapping each
//           pair to U,V order and duplicating it - confirm against the table).
//   xmm4 <- 8 bytes from [y_buf], each duplicated into both bytes of a word.
// Advances [vu_buf] and [y_buf] by 8 bytes. The enclosing asm statement must
// supply a [kShuffleNV21] operand.
#define READNV21 \
  "movq       " MEMACCESS([vu_buf]) ",%%xmm0                    \n" \
  "lea        " MEMLEA(0x8, [vu_buf]) ",%[vu_buf]             \n" \
  "pshufb     %[kShuffleNV21], %%xmm0                         \n" \
  "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n" \
  "punpcklbw  %%xmm4,%%xmm4                                   \n" \
  "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"
1551f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
// READYUY2: asm fragment that loads 16 bytes of packed YUY2 (8 pixels).
//   xmm4 <- the 16 bytes rearranged by pshufb with [kShuffleYUY2Y]
//           (the Y part, per the operand name).
//   xmm0 <- the same 16 bytes rearranged by pshufb with [kShuffleYUY2UV]
//           (the chroma part, per the operand name).
// Advances [yuy2_buf] by 16 bytes. The enclosing asm statement must supply
// the [kShuffleYUY2Y] and [kShuffleYUY2UV] operands.
#define READYUY2 \
  "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm4                  \n" \
  "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n" \
  "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm0                \n" \
  "pshufb     %[kShuffleYUY2UV], %%xmm0                       \n" \
  "lea        " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf]        \n"
1559f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
// READUYVY: asm fragment that loads 16 bytes of packed UYVY (8 pixels).
//   xmm4 <- the 16 bytes rearranged by pshufb with [kShuffleUYVYY]
//           (the Y part, per the operand name).
//   xmm0 <- the same 16 bytes rearranged by pshufb with [kShuffleUYVYUV]
//           (the chroma part, per the operand name).
// Advances [uyvy_buf] by 16 bytes. The enclosing asm statement must supply
// the [kShuffleUYVYY] and [kShuffleUYVYUV] operands.
#define READUYVY \
  "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm4                  \n" \
  "pshufb     %[kShuffleUYVYY], %%xmm4                        \n" \
  "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm0                \n" \
  "pshufb     %[kShuffleUYVYUV], %%xmm0                       \n" \
  "lea        " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf]        \n"
1567f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1568f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if defined(__x86_64__)
// YUVTORGB_SETUP (x86_64): asm fragment that preloads seven 16-byte vectors
// from the memory addressed by the [yuvconstants] operand, at byte offsets
// 0, 32, 64, 96, 128, 160 and 192, into xmm8..xmm14. The loads use movdqa,
// so that memory must be 16-byte aligned. These registers must be listed as
// clobbers by the enclosing asm statement (see YUVTORGB_REGS).
#define YUVTORGB_SETUP(yuvconstants) \
  "movdqa     " MEMACCESS([yuvconstants]) ",%%xmm8              \n" \
  "movdqa     " MEMACCESS2(32, [yuvconstants]) ",%%xmm9       \n" \
  "movdqa     " MEMACCESS2(64, [yuvconstants]) ",%%xmm10      \n" \
  "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm11      \n" \
  "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm12     \n" \
  "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm13     \n" \
  "movdqa     " MEMACCESS2(192, [yuvconstants]) ",%%xmm14     \n"
// YUVTORGB (x86_64): asm fragment that converts 8 pixels.
//   In:  xmm0 = 8 chroma byte pairs, xmm4 = 8 luma words (as produced by the
//        READ* fragments), xmm8..xmm14 = constants from YUVTORGB_SETUP.
//   Out: xmm0, xmm1, xmm2 = three channels of 8 unsigned-saturated bytes each
//        (presumably B, G, R - confirm against the store macros).
//   Also overwrites xmm3 and xmm4.
// For each channel c: result = sat8((bias_c - pmaddubsw(uv, coeff_c)
//                                   + pmulhuw(y, xmm14)) >> 6),
// with coeff = xmm8/xmm9/xmm10 and bias = xmm11/xmm12/xmm13.
// The yuvconstants argument is not referenced in this variant; the constants
// come from the preloaded registers.
#define YUVTORGB(yuvconstants) \
  /* Three working copies of the chroma pairs. */ \
  "movdqa     %%xmm0,%%xmm1                                   \n" \
  "movdqa     %%xmm0,%%xmm2                                   \n" \
  "movdqa     %%xmm0,%%xmm3                                   \n" \
  /* Channel in xmm0: xmm11 - chroma * xmm8. */ \
  "movdqa     %%xmm11,%%xmm0                                  \n" \
  "pmaddubsw  %%xmm8,%%xmm1                                   \n" \
  "psubw      %%xmm1,%%xmm0                                   \n" \
  /* Channel in xmm1: xmm12 - chroma * xmm9. */ \
  "movdqa     %%xmm12,%%xmm1                                  \n" \
  "pmaddubsw  %%xmm9,%%xmm2                                   \n" \
  "psubw      %%xmm2,%%xmm1                                   \n" \
  /* Channel in xmm2: xmm13 - chroma * xmm10. */ \
  "movdqa     %%xmm13,%%xmm2                                  \n" \
  "pmaddubsw  %%xmm10,%%xmm3                                  \n" \
  "psubw      %%xmm3,%%xmm2                                   \n" \
  /* Scale luma and add it to every channel with signed saturation. */ \
  "pmulhuw    %%xmm14,%%xmm4                                  \n" \
  "paddsw     %%xmm4,%%xmm0                                   \n" \
  "paddsw     %%xmm4,%%xmm1                                   \n" \
  "paddsw     %%xmm4,%%xmm2                                   \n" \
  /* Drop 6 fraction bits, then narrow to unsigned bytes. */ \
  "psraw      $0x6,%%xmm0                                     \n" \
  "psraw      $0x6,%%xmm1                                     \n" \
  "psraw      $0x6,%%xmm2                                     \n" \
  "packuswb   %%xmm0,%%xmm0                                   \n" \
  "packuswb   %%xmm1,%%xmm1                                   \n" \
  "packuswb   %%xmm2,%%xmm2                                   \n"
// YUVTORGB_REGS (x86_64): clobber-list fragment naming the registers that
// YUVTORGB_SETUP loads (xmm8..xmm14). It ends with a comma so that further
// clobber entries can follow it directly.
#define YUVTORGB_REGS \
  "xmm8", "xmm9", "xmm10", "xmm11", \
  "xmm12", "xmm13", "xmm14",
1603f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1604f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#else
// Variant that does not cache any constants in registers: YUVTORGB_SETUP
// expands to nothing and every constant is read from memory through the
// [yuvconstants] operand each time YUVTORGB is expanded.
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y
// In:   xmm0 = UV input, xmm4 = Y input (presumably loaded by the READ* macro
//       expanded just before this one - confirm against those macros).
// Out:  xmm0, xmm1, xmm2 = three colour channels, each shifted right by 6 and
//       packed to unsigned bytes.
// Uses: xmm3 as scratch; xmm4 is overwritten by the scaled Y value.
// Constants at byte offsets 0/32/64 are multiplied with UV (pmaddubsw), the
// products are subtracted from the constants at 96/128/160, and the constant
// at 192 scales Y (pmulhuw) before it is added to all three channels.
// NOTE(review): the offsets presumably mirror the field layout of
// struct YuvConstants - confirm against its declaration.
#define YUVTORGB(yuvconstants)                                                 \
  "movdqa     %%xmm0,%%xmm1                                     \n"            \
  "movdqa     %%xmm0,%%xmm2                                   \n"              \
  "movdqa     %%xmm0,%%xmm3                                   \n"              \
  "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm0       \n"              \
  "pmaddubsw  " MEMACCESS([yuvconstants]) ",%%xmm1            \n"              \
  "psubw      %%xmm1,%%xmm0                                   \n"              \
  "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm1      \n"              \
  "pmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%xmm2       \n"              \
  "psubw      %%xmm2,%%xmm1                                   \n"              \
  "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm2      \n"              \
  "pmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%xmm3       \n"              \
  "psubw      %%xmm3,%%xmm2                                   \n"              \
  "pmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%xmm4      \n"              \
  "paddsw     %%xmm4,%%xmm0                                   \n"              \
  "paddsw     %%xmm4,%%xmm1                                   \n"              \
  "paddsw     %%xmm4,%%xmm2                                   \n"              \
  "psraw      $0x6,%%xmm0                                     \n"              \
  "psraw      $0x6,%%xmm1                                     \n"              \
  "psraw      $0x6,%%xmm2                                     \n"              \
  "packuswb   %%xmm0,%%xmm0                                   \n"              \
  "packuswb   %%xmm1,%%xmm1                                   \n"              \
  "packuswb   %%xmm2,%%xmm2                                   \n"
// This variant touches no registers beyond the ones callers already list, so
// the clobber-list fragment is empty.
#define YUVTORGB_REGS
1631f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
1632f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
// Store 8 ARGB values.
// In:   xmm0, xmm1, xmm2 = packed channel bytes left by YUVTORGB;
//       xmm5 = byte that becomes the fourth byte of every pixel. Most callers
//       in this file preset it to all ones with pcmpeqb; a caller that does
//       not presumably relies on its READ* macro to fill it - confirm.
// Each pixel is written as the byte sequence xmm0, xmm1, xmm2, xmm5.
// Writes 32 bytes with unaligned stores and advances [dst_argb] by 0x20.
// Overwrites xmm0, xmm1 and xmm2.
#define STOREARGB                                                              \
  "punpcklbw  %%xmm1,%%xmm0                                      \n"           \
  "punpcklbw  %%xmm5,%%xmm2                                    \n"             \
  "movdqa     %%xmm0,%%xmm1                                    \n"             \
  "punpcklwd  %%xmm2,%%xmm0                                    \n"             \
  "punpckhwd  %%xmm2,%%xmm1                                    \n"             \
  "movdqu     %%xmm0," MEMACCESS([dst_argb]) "                 \n"             \
  "movdqu     %%xmm1," MEMACCESS2(0x10, [dst_argb]) "          \n"             \
  "lea        " MEMLEA(0x20, [dst_argb]) ", %[dst_argb]        \n"
1643f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
// Store 8 RGBA values.
// In:   xmm0, xmm1, xmm2 = packed channel bytes left by YUVTORGB.
// xmm5 is set to all ones here, so callers need not initialise it.
// Each pixel is written as the byte sequence 0xff, xmm0, xmm1, xmm2.
// Writes 32 bytes with unaligned stores and advances [dst_rgba] by 0x20.
// Overwrites xmm0, xmm1 and xmm5.
#define STORERGBA                                                              \
  "pcmpeqb   %%xmm5,%%xmm5                                       \n"           \
  "punpcklbw %%xmm2,%%xmm1                                     \n"             \
  "punpcklbw %%xmm0,%%xmm5                                     \n"             \
  "movdqa    %%xmm5,%%xmm0                                     \n"             \
  "punpcklwd %%xmm1,%%xmm5                                     \n"             \
  "punpckhwd %%xmm1,%%xmm0                                     \n"             \
  "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "                  \n"             \
  "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_rgba]) "           \n"             \
  "lea       " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba]          \n"
1655f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1656f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
1657f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                const uint8* u_buf,
1658f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                const uint8* v_buf,
1659f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                uint8* dst_argb,
1660f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                const struct YuvConstants* yuvconstants,
1661f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                int width) {
1662f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
1663f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB_SETUP(yuvconstants)
1664f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %[u_buf],%[v_buf]               \n"
1665f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1666b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
1667f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
1668b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
1669f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    READYUV444
1670f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB(yuvconstants)
1671f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    STOREARGB
1672f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x8,%[width]                   \n"
1673f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
1674f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [y_buf]"+r"(y_buf),    // %[y_buf]
1675f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [u_buf]"+r"(u_buf),    // %[u_buf]
1676f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [v_buf]"+r"(v_buf),    // %[v_buf]
1677f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1678f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [width]"+rm"(width)    // %[width]
1679f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
1680f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14 YUVTORGB_REGS
1681f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1682f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
1683f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
1684f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1685f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
1686f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                 const uint8* u_buf,
1687f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                 const uint8* v_buf,
1688f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                 uint8* dst_rgb24,
1689f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                 const struct YuvConstants* yuvconstants,
1690f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                 int width) {
1691f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
1692f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB_SETUP(yuvconstants)
1693f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
1694f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
1695f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %[u_buf],%[v_buf]               \n"
1696b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
1697f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
1698b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
1699f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    READYUV422
1700f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB(yuvconstants)
1701f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm1,%%xmm0                   \n"
1702f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm2,%%xmm2                   \n"
1703f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
1704f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklwd %%xmm2,%%xmm0                   \n"
1705f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhwd %%xmm2,%%xmm1                   \n"
1706f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm5,%%xmm0                   \n"
1707f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm6,%%xmm1                   \n"
1708f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "palignr   $0xc,%%xmm0,%%xmm1              \n"
1709f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
1710f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
1711f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
1712f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subl      $0x8,%[width]                   \n"
1713f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
1714f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [y_buf]"+r"(y_buf),    // %[y_buf]
1715f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [u_buf]"+r"(u_buf),    // %[u_buf]
1716f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [v_buf]"+r"(v_buf),    // %[v_buf]
1717f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
1718b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#if defined(__i386__)
1719f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [width]"+m"(width)     // %[width]
1720f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#else
1721f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [width]"+rm"(width)    // %[width]
1722f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
1723f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
1724f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
1725f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
1726f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14 YUVTORGB_REGS
1727f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
1728f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
1729f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
1730f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1731f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
1732f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                const uint8* u_buf,
1733f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                const uint8* v_buf,
1734f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                uint8* dst_argb,
1735f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                const struct YuvConstants* yuvconstants,
1736f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                int width) {
1737f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
1738f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB_SETUP(yuvconstants)
1739f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %[u_buf],%[v_buf]               \n"
1740f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1741b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
1742f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
1743b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
1744f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    READYUV422
1745f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB(yuvconstants)
1746f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    STOREARGB
1747f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x8,%[width]                   \n"
1748f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
1749f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [y_buf]"+r"(y_buf),    // %[y_buf]
1750f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [u_buf]"+r"(u_buf),    // %[u_buf]
1751f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [v_buf]"+r"(v_buf),    // %[v_buf]
1752f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1753f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [width]"+rm"(width)    // %[width]
1754f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
1755f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14 YUVTORGB_REGS
1756f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1757f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
1758f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
1759f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_I422ALPHATOARGBROW_SSSE3
// Converts one row of planar YUV plus a separate a_buf plane, read with
// READYUVA422, into 4-byte pixels written with STOREARGB to dst_argb.
//
// Unlike the other ARGB writers here, xmm5 is not preset to all ones;
// READYUVA422 presumably loads it from a_buf on every iteration - confirm
// against that macro.
// width is the pixel count. The loop handles 8 pixels per iteration, always
// runs at least once, and repeats while the remaining count is positive, so a
// width that is not a multiple of 8 is effectively rounded up.
// NOTE(review): callers presumably guarantee suitably sized buffers - confirm.
void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                                     const uint8* u_buf,
                                     const uint8* v_buf,
                                     const uint8* a_buf,
                                     uint8* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    // Turn v_buf into an offset relative to u_buf.
    "sub       %[u_buf],%[v_buf]               \n"

    LABELALIGN
    "1:                                        \n"
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
    // 'l' suffix is required because [width] can be a memory operand.
    "subl      $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    // Memory only on i386, presumably to avoid running out of general-purpose
    // registers.
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I422ALPHATOARGBROW_SSSE3
1797f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1798f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1799f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                const uint8* uv_buf,
1800f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                uint8* dst_argb,
1801f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                const struct YuvConstants* yuvconstants,
1802f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                int width) {
1803b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  // clang-format off
1804f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
1805f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB_SETUP(yuvconstants)
1806f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1807b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
1808f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
1809b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
1810f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    READNV12
1811f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB(yuvconstants)
1812f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    STOREARGB
1813f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x8,%[width]                   \n"
1814f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
1815f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [y_buf]"+r"(y_buf),    // %[y_buf]
1816f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
1817f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1818f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [width]"+rm"(width)    // %[width]
1819f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
1820f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
1821f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1822f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
1823b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  // clang-format on
1824f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
1825f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1826f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1827f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                const uint8* vu_buf,
1828f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                uint8* dst_argb,
1829f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                const struct YuvConstants* yuvconstants,
1830f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                int width) {
1831b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  // clang-format off
1832f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
1833f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB_SETUP(yuvconstants)
1834f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1835b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
1836f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
1837b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
1838f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    READNV21
1839f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB(yuvconstants)
1840f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    STOREARGB
1841f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x8,%[width]                   \n"
1842f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
1843f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [y_buf]"+r"(y_buf),    // %[y_buf]
1844f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
1845f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1846f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [width]"+rm"(width)    // %[width]
1847f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
1848f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [kShuffleNV21]"m"(kShuffleNV21)
1849f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
1850f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1851f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
1852b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  // clang-format on
1853f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
1854f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1855f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
1856f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                uint8* dst_argb,
1857f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                const struct YuvConstants* yuvconstants,
1858f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                int width) {
1859b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  // clang-format off
1860f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
1861f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB_SETUP(yuvconstants)
1862f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1863b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
1864f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
1865b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
1866f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    READYUY2
1867f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB(yuvconstants)
1868f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    STOREARGB
1869f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x8,%[width]                   \n"
1870f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
1871f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
1872f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1873f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [width]"+rm"(width)    // %[width]
1874f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
1875f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
1876f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
1877f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
1878f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1879f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
1880b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  // clang-format on
1881f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
1882f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1883f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
1884f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                uint8* dst_argb,
1885f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                const struct YuvConstants* yuvconstants,
1886f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                int width) {
1887b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  // clang-format off
1888f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
1889f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB_SETUP(yuvconstants)
1890f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1891b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
1892f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
1893b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
1894f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    READUYVY
1895f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB(yuvconstants)
1896f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    STOREARGB
1897f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x8,%[width]                   \n"
1898f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
1899f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
1900f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
1901f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [width]"+rm"(width)    // %[width]
1902f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
1903f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [kShuffleUYVYY]"m"(kShuffleUYVYY),
1904f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
1905f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
1906f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1907f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
1908b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  // clang-format on
1909f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
1910f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1911f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
1912f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                const uint8* u_buf,
1913f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                const uint8* v_buf,
1914f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                uint8* dst_rgba,
1915f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                const struct YuvConstants* yuvconstants,
1916f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                int width) {
1917f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
1918f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB_SETUP(yuvconstants)
1919f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %[u_buf],%[v_buf]               \n"
1920f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1921b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
1922f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
1923b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
1924f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    READYUV422
1925f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB(yuvconstants)
1926f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    STORERGBA
1927f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x8,%[width]                   \n"
1928f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
1929f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [y_buf]"+r"(y_buf),    // %[y_buf]
1930f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [u_buf]"+r"(u_buf),    // %[u_buf]
1931f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [v_buf]"+r"(v_buf),    // %[v_buf]
1932f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
1933f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [width]"+rm"(width)    // %[width]
1934f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
1935f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14 YUVTORGB_REGS
1936f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1937f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
1938f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
1939f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1940f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_I422TOARGBROW_SSSE3
1941f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1942f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Read 16 UV from 444
// Inline-asm fragment; needs the named operands %[u_buf], %[v_buf], %[y_buf].
// - Loads 16 bytes at u_buf into xmm0 and 16 bytes through MEMOPREG into xmm1.
//   MEMOPREG is defined elsewhere; presumably it emits a base+index access
//   (u_buf + v_buf*1), which is why I444ToARGBRow_AVX2 first replaces v_buf
//   with (v_buf - u_buf) -- confirm against the MEMOPREG definition.
// - vpermq $0xd8 reorders the four 64-bit quarters 0,1,2,3 -> 0,2,1,3, which
//   moves bytes 8..15 of a 128-bit load into the low half of the upper lane so
//   the in-lane vpunpckl* that follows consumes all 16 loaded bytes.
// - Result: ymm0 = 16 interleaved byte pairs (xmm0 byte first, xmm1 byte
//   second); ymm4 = 16 Y values, each byte duplicated into both halves of a
//   16-bit word.
// - Advances u_buf and y_buf by 16 bytes. Also overwrites ymm1.
1943b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define READYUV444_AVX2 \
1944b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vmovdqu    " MEMACCESS([u_buf]) ",%%xmm0                         \n"        \
1945f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1)                         \
1946f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea        " MEMLEA(0x10, [u_buf]) ",%[u_buf]                  \n"        \
1947f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
1948f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n"        \
1949f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
1950f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
1951f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
1952f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
1953f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
1954f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1955f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Read 8 UV from 422, upsample to 16 UV.
// Inline-asm fragment; needs the named operands %[u_buf], %[v_buf], %[y_buf].
// - Loads 8 bytes at u_buf into xmm0 and 8 bytes through MEMOPREG into xmm1
//   (presumably a u_buf + v_buf indexed access, with v_buf already holding
//   the plane offset -- confirm against the MEMOPREG definition).
// - vpunpcklbw interleaves them into 8 byte pairs; vpermq $0xd8 spreads those
//   pairs across both 128-bit lanes; vpunpcklwd of the register with itself
//   duplicates every 16-bit pair, giving 16 pairs in ymm0.
// - ymm4 = 16 Y values, each byte duplicated into both halves of a 16-bit word.
// - Advances u_buf by 8 bytes and y_buf by 16 bytes. Also overwrites ymm1.
1956b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define READYUV422_AVX2 \
1957b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                         \n"        \
1958f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
1959f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
1960f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
1961f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
1962f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
1963f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
1964f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
1965f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
1966f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
1967f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1968f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
// Inline-asm fragment; needs the named operands %[u_buf], %[v_buf], %[y_buf]
// and %[a_buf].
// - The U/V/Y part is the same instruction sequence as READYUV422_AVX2:
//   ymm0 = 16 duplicated byte pairs, ymm4 = 16 Y values widened to words by
//   byte duplication.
// - Additionally loads 16 bytes at a_buf into xmm5 and applies vpermq $0xd8,
//   so bytes 0..7 sit in the low half of the lower lane and bytes 8..15 in the
//   low half of the upper lane (the layout STOREARGB_AVX2 reads from ymm5).
// - Advances u_buf by 8 bytes and y_buf / a_buf by 16 bytes each.
//   Also overwrites ymm1.
1969b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define READYUVA422_AVX2 \
1970b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                         \n"        \
1971f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
1972f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
1973f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
1974f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
1975f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
1976f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
1977f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
1978f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
1979f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"        \
1980f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    " MEMACCESS([a_buf]) ",%%xmm5                       \n"        \
1981f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n"        \
1982f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea        " MEMLEA(0x10, [a_buf]) ",%[a_buf]                  \n"
1983f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1984f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Read 8 UV from NV12, upsample to 16 UV.
// Inline-asm fragment; needs the named operands %[uv_buf] and %[y_buf].
// - Loads 16 bytes (8 two-byte pairs) at uv_buf into xmm0; vpermq $0xd8 puts
//   pairs 0..3 in the lower lane and pairs 4..7 in the upper lane; vpunpcklwd
//   of the register with itself then duplicates each pair -> 16 pairs in ymm0.
// - ymm4 = 16 Y values, each byte duplicated into both halves of a 16-bit word.
// - Advances uv_buf and y_buf by 16 bytes each.
1985b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define READNV12_AVX2 \
1986b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vmovdqu    " MEMACCESS([uv_buf]) ",%%xmm0                        \n"        \
1987f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea        " MEMLEA(0x10, [uv_buf]) ",%[uv_buf]                \n"        \
1988f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
1989f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
1990f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
1991f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
1992f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
1993f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
1994f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
1995f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Read 8 VU from NV21, upsample to 16 UV.
// Inline-asm fragment; needs the named operands %[vu_buf], %[y_buf] and
// %[kShuffleNV21] (a vpshufb control vector the enclosing asm must supply).
// - Loads 16 bytes at vu_buf into xmm0, spreads the two 8-byte halves across
//   the lanes with vpermq $0xd8, then lets vpshufb rearrange them into ymm0.
//   The exact byte order produced depends on kShuffleNV21, which is defined
//   outside this section; per the comment above it presumably swaps V/U and
//   duplicates each pair -- confirm against the table.
// - ymm4 = 16 Y values, each byte duplicated into both halves of a 16-bit word.
// - Advances vu_buf and y_buf by 16 bytes each.
1996b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define READNV21_AVX2 \
1997b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vmovdqu    " MEMACCESS([vu_buf]) ",%%xmm0                        \n"        \
1998f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea        " MEMLEA(0x10, [vu_buf]) ",%[vu_buf]                \n"        \
1999f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
2000f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpshufb     %[kShuffleNV21], %%ymm0, %%ymm0                    \n"        \
2001f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
2002f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
2003f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
2004f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
2005f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2006f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Read 8 YUY2 with 16 Y and upsample 8 UV to 16 UV.
// Inline-asm fragment; needs the named operands %[yuy2_buf],
// %[kShuffleYUY2Y] and %[kShuffleYUY2UV] (vpshufb control vectors supplied by
// the enclosing asm; their contents are defined outside this section).
// - Loads the same 32 bytes at yuy2_buf twice: once into ymm4, shuffled with
//   kShuffleYUY2Y, and once into ymm0, shuffled with kShuffleYUY2UV.
// - Advances yuy2_buf by 32 bytes.
2007b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define READYUY2_AVX2 \
2008b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm4                      \n"        \
2009f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n"        \
2010f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm0                    \n"        \
2011f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0                   \n"        \
2012f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea        " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf]            \n"
2013f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2014f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Read 8 UYVY with 16 Y and upsample 8 UV to 16 UV.
// Inline-asm fragment; needs the named operands %[uyvy_buf],
// %[kShuffleUYVYY] and %[kShuffleUYVYUV] (vpshufb control vectors supplied by
// the enclosing asm; their contents are defined outside this section).
// - Loads the same 32 bytes at uyvy_buf twice: once into ymm4, shuffled with
//   kShuffleUYVYY, and once into ymm0, shuffled with kShuffleUYVYUV.
// - Advances uyvy_buf by 32 bytes.
2015b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define READUYVY_AVX2 \
2016b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm4                     \n"        \
2017f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpshufb     %[kShuffleUYVYY], %%ymm4, %%ymm4                   \n"        \
2018f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm0                   \n"        \
2019f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpshufb     %[kShuffleUYVYUV], %%ymm0, %%ymm0                  \n"        \
2020f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea        " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf]            \n"
2021f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2022f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if defined(__x86_64__)
// x86_64 variant: the seven 32-byte vectors found at byte offsets
// 0, 32, 64, 96, 128, 160 and 192 of *yuvconstants are loaded once, before the
// pixel loop, into ymm8..ymm14. vmovdqa is the aligned-load form, so the
// yuvconstants pointer must be 32-byte aligned.
2023b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define YUVTORGB_SETUP_AVX2(yuvconstants) \
2024b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vmovdqa     " MEMACCESS([yuvconstants]) ",%%ymm8              \n"           \
2025f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqa     " MEMACCESS2(32, [yuvconstants]) ",%%ymm9       \n"           \
2026f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqa     " MEMACCESS2(64, [yuvconstants]) ",%%ymm10      \n"           \
2027f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqa     " MEMACCESS2(96, [yuvconstants]) ",%%ymm11      \n"           \
2028f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqa     " MEMACCESS2(128, [yuvconstants]) ",%%ymm12     \n"           \
2029f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqa     " MEMACCESS2(160, [yuvconstants]) ",%%ymm13     \n"           \
2030f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqa     " MEMACCESS2(192, [yuvconstants]) ",%%ymm14     \n"
2031b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
// Convert 16 pixels held in registers.
// In:  ymm0 = 16 byte pairs from a READ*_AVX2 macro, ymm4 = 16 Y words.
// Out: ymm0, ymm1, ymm2 = three 8-bit channels; within each 128-bit lane the
//      8 result bytes appear in both the low and the high 8 bytes (vpackuswb
//      of a register with itself).
// Per channel c (constants k0/k32/k64 = multipliers, k96/k128/k160 = biases,
// k192 = Y multiplier, named here by byte offset in *yuvconstants):
//   t  = vpmaddubsw(pairs, k_mul[c])   // pair bytes unsigned x coeff signed
//   c  = k_bias[c] - t                 // vpsubw: second operand minus first
//   y  = high16(Y_word * k192)         // vpmulhuw, unsigned
//   c  = sat_u8((c +sat y) >> 6)       // vpaddsw, vpsraw, vpackuswb
// ymm0 uses k0/k96, ymm1 uses k32/k128, ymm2 uses k64/k160.
// Overwrites ymm4 with the scaled Y term.
2032b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define YUVTORGB_AVX2(yuvconstants)                                   \
2033b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                              \n" \
2034b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                               \n" \
2035b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                               \n" \
2036b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vpsubw      %%ymm2,%%ymm13,%%ymm2                              \n" \
2037b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vpsubw      %%ymm1,%%ymm12,%%ymm1                              \n" \
2038b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vpsubw      %%ymm0,%%ymm11,%%ymm0                              \n" \
2039b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                              \n" \
2040b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
2041b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n" \
2042b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n" \
2043b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n" \
2044b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n" \
2045b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n" \
2046b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n" \
2047b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n" \
2048b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
2049b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
// Extra clobber-list entries for the constant registers loaded by
// YUVTORGB_SETUP_AVX2. The trailing comma is intentional: users paste this
// directly in front of further clobber strings.
2050f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#define YUVTORGB_REGS_AVX2 \
2051b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",
2052b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
2053f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#else  // Convert 16 pixels: 16 UV and 16 Y.
2054b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
// Non-x86_64 variant: ymm8..ymm14 are not used, so there is no setup step and
// no extra clobbers. The same arithmetic as the x86_64 YUVTORGB_AVX2 is done
// with the constants read from memory on every invocation; the three bias
// vectors (offsets 160, 128, 96) are staged through ymm3 with unaligned loads.
// In: ymm0 = byte pairs, ymm4 = Y words. Out: ymm0, ymm1, ymm2 packed bytes.
// Overwrites ymm3 and ymm4.
2055f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#define YUVTORGB_SETUP_AVX2(yuvconstants)
2056b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define YUVTORGB_AVX2(yuvconstants) \
2057b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vpmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2     \n"        \
2058f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1   \n"        \
2059f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpmaddubsw  " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0        \n"        \
2060f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu     " MEMACCESS2(160, [yuvconstants]) ",%%ymm3         \n"        \
2061f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n"        \
2062f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu     " MEMACCESS2(128, [yuvconstants]) ",%%ymm3         \n"        \
2063f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n"        \
2064f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu     " MEMACCESS2(96, [yuvconstants]) ",%%ymm3          \n"        \
2065f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n"        \
2066f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4  \n"        \
2067f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \
2068f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \
2069f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \
2070f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \
2071f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \
2072f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \
2073f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \
2074f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \
2075f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
2076f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#define YUVTORGB_REGS_AVX2
2077f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
2078f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2079f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Store 16 ARGB values.
// Inline-asm fragment; needs the named operand %[dst_argb].
// In:  ymm0, ymm1, ymm2 = channel bytes as left by YUVTORGB_AVX2; ymm5 = the
//      fourth channel, read from the low 8 bytes of each 128-bit lane.
// - The two vpunpcklbw steps build (ymm0,ymm1) and (ymm2,ymm5) byte pairs;
//   vpermq $0xd8 regroups the 64-bit quarters so that the word unpacks below
//   emit pixels in ascending order.
// - vpunpcklwd yields pixels 0..7 (stored at dst_argb), vpunpckhwd yields
//   pixels 8..15 (stored at dst_argb + 0x20).
// - Per-pixel byte order in memory: ymm0, ymm1, ymm2, ymm5.
// - Writes 64 bytes with unaligned stores and advances dst_argb by 0x40.
//   Overwrites ymm0, ymm1 and ymm2; ymm5 is only read.
2080b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#define STOREARGB_AVX2 \
2081b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                  \n"        \
2082f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
2083f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n"        \
2084f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n"        \
2085f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n"        \
2086f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n"        \
2087f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "                    \n"        \
2088f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    %%ymm0," MEMACCESS2(0x20, [dst_argb]) "             \n"        \
2089f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40, [dst_argb]) ", %[dst_argb]            \n"
2090f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2091f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_I444TOARGBROW_AVX2
2092f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// 16 pixels
2093f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
//
// Converts one row from three separate planes to 4-byte-per-pixel output.
//   y_buf, u_buf, v_buf  source planes; 16 bytes of each are read per pass.
//   dst_argb             destination; 64 bytes are written per pass.
//   yuvconstants         conversion constants consumed by YUVTORGB_*_AVX2.
//   width                pixel count; decremented by 16 per pass.
// The loop body runs before width is tested, so at least one 16-pixel pass
// always executes and the last pass may read/write beyond `width` pixels when
// width is not a multiple of 16. Callers are presumably expected to pass a
// multiple of 16 or provide padded buffers -- confirm at the call sites.
// NOTE(review): "sub $0x10,%[width]" has no size suffix while the constraint
// is "+rm"; if the compiler ever picks a memory operand the assembler may
// treat the operand size as ambiguous. I422AlphaToARGBRow_AVX2 uses "subl".
2094f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
2095f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                               const uint8* u_buf,
2096f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                               const uint8* v_buf,
2097f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                               uint8* dst_argb,
2098f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                               const struct YuvConstants* yuvconstants,
2099f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                               int width) {
2100f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
2101f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf now holds the byte distance (v_buf - u_buf) used by the read macro.
2102f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %[u_buf],%[v_buf]               \n"
    // ymm5 = all 0xff bytes; STOREARGB_AVX2 writes it as the fourth channel.
2103f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
2104b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
2105f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
2106b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
2107f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    READYUV444_AVX2
2108f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB_AVX2(yuvconstants)
2109f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    STOREARGB_AVX2
    // Signed test: keep looping while more than zero pixels remain.
2110f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%[width]                  \n"
2111f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
    // Zero the upper ymm halves before returning to compiler-generated code.
2112f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vzeroupper                                \n"
2113f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [y_buf]"+r"(y_buf),    // %[y_buf]
2114f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [u_buf]"+r"(u_buf),    // %[u_buf]
2115f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [v_buf]"+r"(v_buf),    // %[v_buf]
2116f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2117f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [width]"+rm"(width)    // %[width]
2118f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2119f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2120f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2121f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
2122f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
2123f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_I444TOARGBROW_AVX2
2124f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2125f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if defined(HAS_I422TOARGBROW_AVX2)
2126f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// 16 pixels
2127f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
//
// Converts one row from three separate planes to 4-byte-per-pixel output.
//   y_buf          source plane; 16 bytes are read per pass.
//   u_buf, v_buf   source planes; 8 bytes of each are read per pass and every
//                  pair is used for two adjacent pixels.
//   dst_argb       destination; 64 bytes are written per pass.
//   yuvconstants   conversion constants consumed by YUVTORGB_*_AVX2.
//   width          pixel count; decremented by 16 per pass.
// The loop body runs before width is tested, so at least one 16-pixel pass
// always executes and the last pass may read/write beyond `width` pixels when
// width is not a multiple of 16. Callers are presumably expected to pass a
// multiple of 16 or provide padded buffers -- confirm at the call sites.
// NOTE(review): "sub $0x10,%[width]" has no size suffix while the constraint
// is "+rm"; if the compiler ever picks a memory operand the assembler may
// treat the operand size as ambiguous. I422AlphaToARGBRow_AVX2 uses "subl".
2128f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
2129f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                               const uint8* u_buf,
2130f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                               const uint8* v_buf,
2131f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                               uint8* dst_argb,
2132f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                               const struct YuvConstants* yuvconstants,
2133f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                               int width) {
2134f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
2135f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf now holds the byte distance (v_buf - u_buf) used by the read macro.
2136f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %[u_buf],%[v_buf]               \n"
    // ymm5 = all 0xff bytes; STOREARGB_AVX2 writes it as the fourth channel.
2137f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
2138b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
2139f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
2140b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
2141f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    READYUV422_AVX2
2142f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB_AVX2(yuvconstants)
2143f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    STOREARGB_AVX2
    // Signed test: keep looping while more than zero pixels remain.
2144f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%[width]                  \n"
2145f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
2146b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
    // Zero the upper ymm halves before returning to compiler-generated code.
2147f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vzeroupper                                \n"
2148f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [y_buf]"+r"(y_buf),    // %[y_buf]
2149f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [u_buf]"+r"(u_buf),    // %[u_buf]
2150f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [v_buf]"+r"(v_buf),    // %[v_buf]
2151f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2152f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [width]"+rm"(width)    // %[width]
2153f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2154f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2155f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2156f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
2157f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
2158f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_I422TOARGBROW_AVX2
2159f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2160f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if defined(HAS_I422ALPHATOARGBROW_AVX2)
2161f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// 16 pixels
2162f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
//
// Converts one row from four separate planes to 4-byte-per-pixel output.
//   y_buf, a_buf   source planes; 16 bytes of each are read per pass.
//   u_buf, v_buf   source planes; 8 bytes of each are read per pass and every
//                  pair is used for two adjacent pixels.
//   dst_argb       destination; 64 bytes are written per pass.
//   yuvconstants   conversion constants consumed by YUVTORGB_*_AVX2.
//   width          pixel count; decremented by 16 per pass.
// Unlike the other row functions here, ymm5 is not preset to 0xff: the read
// macro reloads it from a_buf on every pass.
// The loop body runs before width is tested, so at least one 16-pixel pass
// always executes and the last pass may read/write beyond `width` pixels when
// width is not a multiple of 16. Callers are presumably expected to pass a
// multiple of 16 or provide padded buffers -- confirm at the call sites.
2163f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
2164b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                    const uint8* u_buf,
2165b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                    const uint8* v_buf,
2166b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                    const uint8* a_buf,
2167b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                    uint8* dst_argb,
2168b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                    const struct YuvConstants* yuvconstants,
2169b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                    int width) {
2170b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  // clang-format off
2171f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
2172f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf now holds the byte distance (v_buf - u_buf) used by the read macro.
2173f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %[u_buf],%[v_buf]               \n"
2174b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
2175f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
2176b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
2177f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    READYUVA422_AVX2
2178f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB_AVX2(yuvconstants)
2179f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    STOREARGB_AVX2
    // Explicit 32-bit suffix: on i386 width is a memory-only operand (below),
    // so no register is available to imply the operand size.
2180f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "subl      $0x10,%[width]                  \n"
2181f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
    // Zero the upper ymm halves before returning to compiler-generated code.
2182f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vzeroupper                                \n"
2183f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [y_buf]"+r"(y_buf),    // %[y_buf]
2184f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [u_buf]"+r"(u_buf),    // %[u_buf]
2185f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [v_buf]"+r"(v_buf),    // %[v_buf]
2186f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [a_buf]"+r"(a_buf),    // %[a_buf]
2187f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    // width is forced into memory on i386, presumably because the six other
    // operands already need general-purpose registers -- TODO confirm.
2188b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#if defined(__i386__)
2189f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [width]"+m"(width)     // %[width]
2190f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#else
2191f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [width]"+rm"(width)    // %[width]
2192f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
2193f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2194f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2195f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2196f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
2197b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  // clang-format on
2198f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
2199f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_I422ALPHATOARGBROW_AVX2
2200f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2201f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#if defined(HAS_I422TORGBAROW_AVX2)
2202f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// 16 pixels
2203f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
//
// Same read and convert steps as I422ToARGBRow_AVX2, but the store sequence is
// written out inline so the constant 0xff byte comes first in each pixel.
//   y_buf          source plane; 16 bytes are read per pass.
//   u_buf, v_buf   source planes; 8 bytes of each are read per pass.
//   dst_argb       destination (RGBA layout despite the parameter name);
//                  64 bytes are written per pass.
//   yuvconstants   conversion constants consumed by YUVTORGB_*_AVX2.
//   width          pixel count; decremented by 16 per pass.
// The loop body runs before width is tested, so at least one 16-pixel pass
// always executes and the last pass may read/write beyond `width` pixels when
// width is not a multiple of 16. Callers are presumably expected to pass a
// multiple of 16 or provide padded buffers -- confirm at the call sites.
// NOTE(review): "sub $0x10,%[width]" has no size suffix while the constraint
// is "+rm"; if the compiler ever picks a memory operand the assembler may
// treat the operand size as ambiguous. I422AlphaToARGBRow_AVX2 uses "subl".
2204f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
2205f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                               const uint8* u_buf,
2206f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                               const uint8* v_buf,
2207f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                               uint8* dst_argb,
2208f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                               const struct YuvConstants* yuvconstants,
2209f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                               int width) {
2210f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
2211f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf now holds the byte distance (v_buf - u_buf) used by the read macro.
2212f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %[u_buf],%[v_buf]               \n"
    // ymm5 = all 0xff bytes, used as the leading byte of every output pixel.
2213f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2214b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
2215f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
2216b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
2217f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    READYUV422_AVX2
2218f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    YUVTORGB_AVX2(yuvconstants)
2219f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2220f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // Step 3: Weave into RGBA
    // ymm1 <- (ymm1,ymm2) byte pairs, ymm2 <- (ymm5,ymm0) byte pairs; the word
    // unpacks then place the (ymm5,ymm0) pair first, so the per-pixel byte
    // order in memory is ymm5 (0xff), ymm0, ymm1, ymm2.
2221f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
2222f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
2223f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"
2224f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
2225f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
2226f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
2227f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
2228f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
2229f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
    // Signed test: keep looping while more than zero pixels remain.
2230f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%[width]                  \n"
2231f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
    // Zero the upper ymm halves before returning to compiler-generated code.
2232f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vzeroupper                                \n"
2233f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [y_buf]"+r"(y_buf),    // %[y_buf]
2234f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [u_buf]"+r"(u_buf),    // %[u_buf]
2235f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [v_buf]"+r"(v_buf),    // %[v_buf]
2236f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2237f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    [width]"+rm"(width)    // %[width]
2238f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2239f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
2240f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2241f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
2242f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
2243f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_I422TORGBAROW_AVX2
2244f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#if defined(HAS_NV12TOARGBROW_AVX2)
// Converts one row of NV12 (a Y row plus an interleaved UV row) to ARGB.
// 16 pixels per loop pass:
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
//
//   y_buf         source luma row.
//   uv_buf        source interleaved chroma row.
//   dst_argb      destination row.
//   yuvconstants  coefficient table handed to the YUVTORGB_*_AVX2 macros.
//   width         pixel count. The loop subtracts 16 per pass and repeats
//                 while the remainder is positive, so work is always done in
//                 whole groups of 16 pixels.
void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* uv_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // ymm5 = all ones (presumably the 0xff alpha bytes for STOREARGB_AVX2).
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
    "1:                                        \n"
    READNV12_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"
    // Zero the upper ymm halves before handing control back to C code.
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_NV12TOARGBROW_AVX2
2277f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#if defined(HAS_NV21TOARGBROW_AVX2)
// Converts one row of NV21 (a Y row plus an interleaved VU row) to ARGB.
// 16 pixels per loop pass:
// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
//
//   y_buf         source luma row.
//   vu_buf        source interleaved chroma row (V first, then U).
//   dst_argb      destination row.
//   yuvconstants  coefficient table handed to the YUVTORGB_*_AVX2 macros.
//   width         pixel count. The loop subtracts 16 per pass and repeats
//                 while the remainder is positive, so work is always done in
//                 whole groups of 16 pixels.
void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* vu_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // ymm5 = all ones (presumably the 0xff alpha bytes for STOREARGB_AVX2).
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
    "1:                                        \n"
    READNV21_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"
    // Zero the upper ymm halves before handing control back to C code.
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleNV21]"m"(kShuffleNV21)  // shuffle table referenced by name
  : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_NV21TOARGBROW_AVX2
2311f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#if defined(HAS_YUY2TOARGBROW_AVX2)
// Converts one row of packed YUY2 to ARGB.
// 16 pixels per loop pass:
// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
//
//   yuy2_buf      source packed row.
//   dst_argb      destination row.
//   yuvconstants  coefficient table handed to the YUVTORGB_*_AVX2 macros.
//   width         pixel count. The loop subtracts 16 per pass and repeats
//                 while the remainder is positive, so work is always done in
//                 whole groups of 16 pixels.
void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // ymm5 = all ones (presumably the 0xff alpha bytes for STOREARGB_AVX2).
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
    "1:                                        \n"
    READYUY2_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"
    // Zero the upper ymm halves before handing control back to C code.
    "vzeroupper                                \n"
  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),    // shuffle tables referenced by name
    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
  : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_YUY2TOARGBROW_AVX2
2344f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#if defined(HAS_UYVYTOARGBROW_AVX2)
// Packed UYVY row -> ARGB row, handled in groups of 16 pixels.
// Per pass: 8 UYVY values supply 16 Y and 8 UV, yielding 16 ARGB (64 bytes).
// The pixel counter drops by 16 each pass and the loop continues while it is
// still positive, so every pass processes a complete group of 16.
void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // Fill ymm5 with ones (presumably the alpha source for STOREARGB_AVX2).
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"

    LABELALIGN
    "1:                                        \n"
    READUYVY_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"
    // Leave the upper ymm halves zeroed for the C code that follows.
    "vzeroupper                                \n"
  // Outputs (all read-write).
  : [uyvy_buf]"+r"(uyvy_buf),
    [dst_argb]"+r"(dst_argb),
    [width]"+rm"(width)
  // Inputs.
  : [yuvconstants]"r"(yuvconstants),
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
  // Clobbers.  Does not use r14.
  : "memory", "cc", YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_UYVYTOARGBROW_AVX2
2377f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_I400TOARGBROW_SSE2
// Expands a row of 8-bit luma into ARGB, 8 pixels (32 output bytes) per pass.
// Each Y is scaled as G = (y - 16) * 1.164; the result fills the low three
// bytes of the pixel and the top byte is forced to 0xff.
// The loop repeats while the remaining width is positive, so every pass reads
// 8 source bytes and writes 32 destination bytes.
void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
  asm volatile (
    // xmm2 = 0x4a35 in every word: 4a35 = 18997 = 1.164
    "mov       $0x4a354a35,%%eax               \n"
    "movd      %%eax,%%xmm2                    \n"
    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
    // xmm3 = 0x0488 in every word: 0488 = 1160 = 1.164 * 16
    "mov       $0x04880488,%%eax               \n"
    "movd      %%eax,%%xmm3                    \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    // xmm4 = 0xff000000 in every dword: mask that sets the top byte.
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"

    LABELALIGN
    "1:                                        \n"
    // Scale: load 8 Y, duplicate each byte into a word, multiply (high half),
    // subtract the bias with unsigned saturation, shift, pack back to bytes.
    "movq      " MEMACCESS([y_buf]) ",%%xmm0   \n"
    "lea       " MEMLEA(0x8,[y_buf]) ",%[y_buf] \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "psubusw   %%xmm3,%%xmm0                   \n"
    "psrlw     $6, %%xmm0                      \n"
    "packuswb  %%xmm0,%%xmm0                   \n"

    // Weave: replicate each G byte four times (xmm0 = pixels 0-3,
    // xmm1 = pixels 4-7), then OR in the 0xff top byte.
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm0                   \n"
    "punpckhwd %%xmm1,%%xmm1                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "por       %%xmm4,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"

    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)        // %[width]
  :
  : "memory", "cc", "eax",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
#endif  // HAS_I400TOARGBROW_SSE2
2423f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_I400TOARGBROW_AVX2
// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
// Each Y is scaled as G = (y - 16) * 1.164; the result fills the low three
// bytes of the pixel and the top byte is forced to 0xff.
// note: the byte/word unpack and pack instructions operate within each
// 128-bit lane, so vpermq $0xd8 is applied beforehand to swap the two middle
// 64-bit quarters and keep the pixels in order.
// The loop repeats while the remaining width is positive, so every pass reads
// 16 source bytes and writes 64 destination bytes.
void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
  asm volatile (
    "mov        $0x4a354a35,%%eax              \n" // 4a35 = 18997 = 1.164
    "vmovd      %%eax,%%xmm2                   \n"
    "vbroadcastss %%xmm2,%%ymm2                \n"
    "mov        $0x4880488,%%eax               \n" // 0488 = 1160 = 1.164 * 16
    "vmovd      %%eax,%%xmm3                   \n"
    "vbroadcastss %%xmm3,%%ymm3                \n"
    // ymm4 = 0xff000000 in every dword: mask that sets the top byte.
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
    "vpslld     $0x18,%%ymm4,%%ymm4            \n"

    LABELALIGN
    "1:                                        \n"
    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm0  \n"
    "lea        " MEMLEA(0x10,[y_buf]) ",%[y_buf] \n"
    // Put Y0-7 in the low lane and Y8-15 in the high lane before unpacking.
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
    "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"
    "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"

    // Step 2: Weave into ARGB. ymm0 = pixels 0-7, ymm1 = pixels 8-15.
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
    "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
    "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
    "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
    "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
    "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
    "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
    "sub        $0x10,%[width]                 \n"
    "jg        1b                              \n"
    // Zero the upper ymm halves before handing control back to C code.
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),        // %[y_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)        // %[width]
  :
  : "memory", "cc", "eax"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
#endif  // HAS_I400TOARGBROW_AVX2
2470f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_MIRRORROW_SSSE3
// pshufb control vector: output byte i takes input byte 15 - i.
static uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                               7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};

// Copies src to dst with the byte order reversed, 16 bytes per pass.
// Each pass reads the 16 bytes ending at src + remaining, reverses them and
// stores them at dst; dst then advances by 16 and remaining drops by 16. The
// loop repeats while remaining is positive.
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  // Pointer-sized copy of width so it can serve as an index register.
  intptr_t remaining = (intptr_t)(width);
  asm volatile (
    // Aligned load of the shuffle table (presumably uvec8 guarantees 16-byte
    // alignment; confirm against its declaration).
    "movdqa    %[shuffle],%%xmm5               \n"

    LABELALIGN
    "1:                                        \n"
    MEMOPREG(movdqu,-0x10,[src],[remaining],1,xmm0)  //  movdqu -0x10(src,remaining),%%xmm0
    "pshufb    %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS([dst]) "     \n"
    "lea       " MEMLEA(0x10,[dst]) ",%[dst]   \n"
    "sub       $0x10,%[remaining]              \n"
    "jg        1b                              \n"
  : [src]"+r"(src),              // %[src]
    [dst]"+r"(dst),              // %[dst]
    [remaining]"+r"(remaining)   // %[remaining]
  : [shuffle]"m"(kShuffleMirror) // %[shuffle]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
#endif  // HAS_MIRRORROW_SSSE3
2498f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_MIRRORROW_AVX2
// Copies src to dst with the byte order reversed, 32 bytes per pass.
// vpshufb reverses the bytes inside each 128-bit lane and vpermq $0x4e then
// swaps the two lanes, giving a full 32-byte reversal.
// Each pass reads the 32 bytes ending at src + remaining; dst advances by 32
// and remaining drops by 32 while it stays positive.
// NOTE(review): kShuffleMirror is declared under HAS_MIRRORROW_SSSE3, so this
// block presumably relies on that macro being defined as well.
void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  // Pointer-sized copy of width so it can serve as an index register.
  intptr_t remaining = (intptr_t)(width);
  asm volatile (
    // Replicate the 16-byte reversal table into both lanes of ymm5.
    "vbroadcastf128 %[shuffle],%%ymm5          \n"

    LABELALIGN
    "1:                                        \n"
    MEMOPREG(vmovdqu,-0x20,[src],[remaining],1,ymm0)  //  vmovdqu -0x20(src,remaining),%%ymm0
    "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"
    "vpermq     $0x4e,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS([dst]) "    \n"
    "lea       " MEMLEA(0x20,[dst]) ",%[dst]   \n"
    "sub       $0x20,%[remaining]              \n"
    "jg        1b                              \n"
    // Zero the upper ymm halves before handing control back to C code.
    "vzeroupper                                \n"
  : [src]"+r"(src),              // %[src]
    [dst]"+r"(dst),              // %[dst]
    [remaining]"+r"(remaining)   // %[remaining]
  : [shuffle]"m"(kShuffleMirror) // %[shuffle]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
#endif  // HAS_MIRRORROW_AVX2
2524f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_MIRRORUVROW_SSSE3
// pshufb control vector: the even input bytes, in reverse order, go to the
// low 8 output bytes and the odd input bytes, in reverse order, go to the
// high 8 output bytes.
static uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
                                 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};

// Reverses a row of interleaved byte pairs and splits it into two rows.
// Each pass consumes 16 source bytes (8 pairs), walking backwards from the
// end of src, and writes 8 bytes to dst_u and 8 bytes to dst_v. The counter
// drops by 8 per pass and the loop repeats while it is positive.
void MirrorUVRow_SSSE3(const uint8* src,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  // Pointer-sized copy of width so it can serve as an index register.
  intptr_t remaining = (intptr_t)(width);
  asm volatile (
    // Aligned load of the shuffle table (presumably uvec8 guarantees 16-byte
    // alignment; confirm against its declaration).
    "movdqa    %[shuffle],%%xmm1               \n"
    // Point src at its last 16 bytes: src + remaining * 2 - 16.
    "lea       " MEMLEA4(-0x10,[src],[remaining],2) ",%[src] \n"
    // Turn dst_v into an offset from dst_u so one pointer walks both rows.
    "sub       %[dst_u],%[dst_v]               \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS([src]) ",%%xmm0     \n"
    "lea       " MEMLEA(-0x10,[src]) ",%[src]  \n"
    "pshufb    %%xmm1,%%xmm0                   \n"
    // Low 8 bytes -> dst_u, high 8 bytes -> dst_u + offset (the dst_v row).
    "movlpd    %%xmm0," MEMACCESS([dst_u]) "   \n"
    MEMOPMEM(movhpd,xmm0,0x00,[dst_u],[dst_v],1)  //  movhpd    %%xmm0,(dst_u,dst_v)
    "lea       " MEMLEA(0x8,[dst_u]) ",%[dst_u] \n"
    "sub       $8,%[remaining]                 \n"
    "jg        1b                              \n"
  : [src]"+r"(src),                // %[src]
    [dst_u]"+r"(dst_u),            // %[dst_u]
    [dst_v]"+r"(dst_v),            // %[dst_v]
    [remaining]"+r"(remaining)     // %[remaining]
  : [shuffle]"m"(kShuffleMirrorUV) // %[shuffle]
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}
#endif  // HAS_MIRRORUVROW_SSSE3
2559f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_ARGBMIRRORROW_SSE2

// Copies a row of 4-byte pixels to dst in reverse pixel order, 4 pixels
// (16 bytes) per pass. The bytes inside each pixel keep their order.
// src is first moved to its last 16 bytes and then walks backwards while dst
// walks forwards; the counter drops by 4 per pass and the loop repeats while
// it is positive.
void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  // Pointer-sized copy of width so it can serve as an index register.
  intptr_t remaining = (intptr_t)(width);
  asm volatile (
    // src = src + remaining * 4 - 16
    "lea       " MEMLEA4(-0x10,[src],[remaining],4) ",%[src] \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS([src]) ",%%xmm0     \n"
    // $0x1b selects dwords 3,2,1,0: reverses the four pixels in the register.
    "pshufd    $0x1b,%%xmm0,%%xmm0             \n"
    "lea       " MEMLEA(-0x10,[src]) ",%[src]  \n"
    "movdqu    %%xmm0," MEMACCESS([dst]) "     \n"
    "lea       " MEMLEA(0x10,[dst]) ",%[dst]   \n"
    "sub       $0x4,%[remaining]               \n"
    "jg        1b                              \n"
  : [src]"+r"(src),              // %[src]
    [dst]"+r"(dst),              // %[dst]
    [remaining]"+r"(remaining)   // %[remaining]
  :
  : "memory", "cc",
    "xmm0"
  );
}
#endif  // HAS_ARGBMIRRORROW_SSE2
2585f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2586f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBMIRRORROW_AVX2
2587f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Permute table for vpermd: indices 7..0 reverse the order of the eight 32-bit elements (pixels) of a ymm register.
2588b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardstatic const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
// Mirrors a row of 4-byte pixels using AVX2: the pixels of src are written to
// dst in reverse order, 8 pixels (32 bytes) per loop pass. The loop repeats
// while the remaining count is > 0, so a width that is not a multiple of 8 is
// effectively rounded up to the next multiple.
//   src   - first byte of the source row (not advanced; indexed by the count).
//   dst   - first byte of the destination row.
//   width - number of 4-byte pixels.
2589f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  // Pointer-sized copy of width so it can act as the index register below.
2590f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  intptr_t temp_width = (intptr_t)(width);
2591f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // ymm5 = reversal permute table.
2592f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    %3,%%ymm5                      \n"
2593b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
2594f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
2595b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // Reads the 8 pixels that end at src + remaining * 4 and reverses them;
    // the shrinking count is what moves the read position backwards.
2596f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
2597f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
2598f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea        " MEMLEA(0x20,1) ",%1          \n"
    // 8 pixels done; loop while the remaining count is still positive.
2599f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub        $0x8,%2                        \n"
2600f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg         1b                             \n"
    // Clear the upper ymm halves before returning to non-AVX code.
2601f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vzeroupper                                \n"
2602f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src),  // %0
2603f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst),  // %1
2604f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(temp_width)  // %2
2605f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "m"(kARGBShuffleMirror_AVX2) // %3
2606f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
2607f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm5"
2608f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
2609f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
2610f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBMIRRORROW_AVX2
2611f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2612f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_SPLITUVROW_AVX2
// De-interleaves a row of byte pairs with AVX2: even-indexed bytes of src_uv
// go to dst_u, odd-indexed bytes go to dst_v. Each loop pass consumes 64
// source bytes and writes 32 bytes to each destination; the loop repeats
// while the remaining width is > 0.
//   src_uv - interleaved source row.
//   dst_u  - receives the even-indexed bytes.
//   dst_v  - receives the odd-indexed bytes.
//   width  - number of byte pairs.
2613b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid SplitUVRow_AVX2(const uint8* src_uv,
2614b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                     uint8* dst_u,
2615b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                     uint8* dst_v,
2616f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                     int width) {
2617f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // ymm5 = 0x00ff in every 16-bit word (mask that keeps the even bytes).
2618b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2619b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
    // Turn dst_v into an offset from dst_u so one pointer bump advances both.
2620b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "sub        %1,%2                          \n"
2621b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
2622f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
2623b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
2624b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
2625b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
2626b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "lea        " MEMLEA(0x40,0) ",%0          \n"
    // ymm2/ymm3 = odd bytes moved down into the low byte of each word.
2627b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpsrlw     $0x8,%%ymm0,%%ymm2             \n"
2628b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpsrlw     $0x8,%%ymm1,%%ymm3             \n"
    // ymm0/ymm1 = even bytes only.
2629b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
2630b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
    // Narrow words to bytes. The pack works per 128-bit lane, so the 64-bit
    // quarters come out in the order 0,2,1,3 ...
2631b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
2632b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpackuswb  %%ymm3,%%ymm2,%%ymm2           \n"
    // ... and vpermq 0xd8 swaps the two middle quarters back into linear order.
2633b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
2634b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
2635b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
2636b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)           //  vmovdqu %%ymm2,(%1,%2)
2637b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "lea        " MEMLEA(0x20,1) ",%1          \n"
    // 32 pairs done; loop while the remaining width is still positive.
2638b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "sub        $0x20,%3                       \n"
2639b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "jg         1b                             \n"
    // Clear the upper ymm halves before returning to non-AVX code.
2640b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vzeroupper                                \n"
2641f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_uv),     // %0
2642f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_u),      // %1
2643f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_v),      // %2
2644f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)         // %3
2645f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
2646f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
2647f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2648f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
2649f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
2650f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_SPLITUVROW_AVX2
2651f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2652f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_SPLITUVROW_SSE2
// De-interleaves a row of byte pairs with SSE2: even-indexed bytes of src_uv
// go to dst_u, odd-indexed bytes go to dst_v. Each loop pass consumes 32
// source bytes and writes 16 bytes to each destination; the loop repeats
// while the remaining width is > 0.
//   src_uv - interleaved source row.
//   dst_u  - receives the even-indexed bytes.
//   dst_v  - receives the odd-indexed bytes.
//   width  - number of byte pairs.
2653b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid SplitUVRow_SSE2(const uint8* src_uv,
2654b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                     uint8* dst_u,
2655b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                     uint8* dst_v,
2656f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                     int width) {
2657f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // xmm5 = 0x00ff in every 16-bit word (mask that keeps the even bytes).
2658b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "pcmpeqb    %%xmm5,%%xmm5                  \n"
2659b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "psrlw      $0x8,%%xmm5                    \n"
    // Turn dst_v into an offset from dst_u so one pointer bump advances both.
2660b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "sub        %1,%2                          \n"
2661b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
2662f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
2663b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
2664b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "movdqu     " MEMACCESS(0) ",%%xmm0        \n"
2665b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1  \n"
2666b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "lea        " MEMLEA(0x20,0) ",%0          \n"
    // Keep copies of the raw data for the odd-byte extraction below.
2667b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "movdqa     %%xmm0,%%xmm2                  \n"
2668b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "movdqa     %%xmm1,%%xmm3                  \n"
    // Even bytes: mask each word, then narrow 16 words to 16 bytes.
2669b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "pand       %%xmm5,%%xmm0                  \n"
2670b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "pand       %%xmm5,%%xmm1                  \n"
2671b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "packuswb   %%xmm1,%%xmm0                  \n"
    // Odd bytes: shift each word down by 8, then narrow the same way.
2672b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "psrlw      $0x8,%%xmm2                    \n"
2673b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "psrlw      $0x8,%%xmm3                    \n"
2674b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "packuswb   %%xmm3,%%xmm2                  \n"
2675b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "movdqu     %%xmm0," MEMACCESS(1) "        \n"
2676b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)           //  movdqu     %%xmm2,(%1,%2)
2677b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "lea        " MEMLEA(0x10,1) ",%1          \n"
    // 16 pairs done; loop while the remaining width is still positive.
2678b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "sub        $0x10,%3                       \n"
2679b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "jg         1b                             \n"
2680f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_uv),     // %0
2681f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_u),      // %1
2682f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_v),      // %2
2683f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)         // %3
2684f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
2685f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
2686f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2687f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
2688f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
2689f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_SPLITUVROW_SSE2
2690f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2691f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_MERGEUVROW_AVX2
// Interleaves two byte rows with AVX2: dst_uv receives src_u[0], src_v[0],
// src_u[1], src_v[1], ... Each loop pass reads 32 bytes from each source and
// writes 64 bytes; the loop repeats while the remaining width is > 0.
//   src_u  - bytes written to the even positions of dst_uv.
//   src_v  - bytes written to the odd positions of dst_uv.
//   dst_uv - interleaved destination row.
//   width  - number of byte pairs to produce.
2692b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid MergeUVRow_AVX2(const uint8* src_u,
2693b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                     const uint8* src_v,
2694b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                     uint8* dst_uv,
2695f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                     int width) {
2696f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // Turn src_v into an offset from src_u so one pointer bump advances both.
2697b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "sub       %0,%1                           \n"
2698b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
2699b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    LABELALIGN
2700b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
2701b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
2702b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)           //  vmovdqu (%0,%1,1),%%ymm1
2703b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Byte interleave works per 128-bit lane: ymm2 gets the low 8 bytes of
    // each lane, ymm0 the high 8 bytes of each lane.
2704b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpunpcklbw %%ymm1,%%ymm0,%%ymm2           \n"
2705b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpunpckhbw %%ymm1,%%ymm0,%%ymm0           \n"
    // Store the four 16-byte pieces in linear order: low lanes of ymm2 and
    // ymm0 first, then their high lanes.
2706b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
2707f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
2708f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
2709f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
2710b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "lea       " MEMLEA(0x40,2) ",%2           \n"
    // 32 pairs done; loop while the remaining width is still positive.
2711b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "sub       $0x20,%3                        \n"
2712b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "jg        1b                              \n"
    // Clear the upper ymm halves before returning to non-AVX code.
2713b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vzeroupper                                \n"
2714f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_u),     // %0
2715f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_v),     // %1
2716f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_uv),    // %2
2717f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)      // %3
2718f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
2719f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
2720f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2"
2721f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
2722f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
2723f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_MERGEUVROW_AVX2
2724f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2725f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_MERGEUVROW_SSE2
// Interleaves two byte rows with SSE2: dst_uv receives src_u[0], src_v[0],
// src_u[1], src_v[1], ... Each loop pass reads 16 bytes from each source and
// writes 32 bytes; the loop repeats while the remaining width is > 0.
//   src_u  - bytes written to the even positions of dst_uv.
//   src_v  - bytes written to the odd positions of dst_uv.
//   dst_uv - interleaved destination row.
//   width  - number of byte pairs to produce.
2726b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid MergeUVRow_SSE2(const uint8* src_u,
2727b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                     const uint8* src_v,
2728b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                     uint8* dst_uv,
2729f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                     int width) {
2730f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // Turn src_v into an offset from src_u so one pointer bump advances both.
2731b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "sub       %0,%1                           \n"
2732b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
2733f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
2734b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
2735b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2736b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
2737b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "lea       " MEMLEA(0x10,0) ",%0           \n"
    // Copy kept because punpcklbw overwrites xmm0 before the high half is used.
2738b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "movdqa    %%xmm0,%%xmm2                   \n"
    // xmm0 = interleave of the low 8 bytes, xmm2 = interleave of the high 8.
2739b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "punpcklbw %%xmm1,%%xmm0                   \n"
2740b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "punpckhbw %%xmm1,%%xmm2                   \n"
2741b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
2742b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
2743b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "lea       " MEMLEA(0x20,2) ",%2           \n"
    // 16 pairs done; loop while the remaining width is still positive.
2744b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "sub       $0x10,%3                        \n"
2745b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "jg        1b                              \n"
2746f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_u),     // %0
2747f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_v),     // %1
2748f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_uv),    // %2
2749f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)      // %3
2750f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
2751f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
2752f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2"
2753f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
2754f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
2755f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_MERGEUVROW_SSE2
2756f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2757f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_COPYROW_SSE2
// Copies a row of bytes from src to dst with SSE2, 32 bytes per loop pass.
// Uses aligned moves (movdqa) when both pointers are 16-byte aligned and
// unaligned moves (movdqu) otherwise. Either loop repeats while the remaining
// count is > 0, so a count that is not a multiple of 32 is effectively
// rounded up to the next multiple.
//   src   - source row.
//   dst   - destination row.
//   count - number of bytes to copy.
2758f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
2759f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // If either pointer has any of its low 4 bits set, take the unaligned loop.
2760f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "test       $0xf,%0                        \n"
2761f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jne        2f                             \n"
2762f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "test       $0xf,%1                        \n"
2763f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jne        2f                             \n"
2764b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
    // Aligned loop.
2765f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
2766b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
2767f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
2768f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
2769f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,0) ",%0           \n"
2770f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
2771f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
2772f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,1) ",%1           \n"
2773f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x20,%2                        \n"
2774f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
    // Aligned copy finished: skip over the unaligned loop.
2775f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jmp       9f                              \n"
2776b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
    // Unaligned loop: same copy using movdqu.
2777f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
2778f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "2:                                          \n"
2779f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
2780f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
2781f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,0) ",%0           \n"
2782f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
2783f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
2784f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,1) ",%1           \n"
2785f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x20,%2                        \n"
2786f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        2b                              \n"
    // Common exit.
2787f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "9:                                          \n"
2788f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src),   // %0
2789f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst),   // %1
2790f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(count)  // %2
2791f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
2792f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
2793f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1"
2794f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
2795f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
2796f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_COPYROW_SSE2
2797f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2798f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_COPYROW_AVX
2799f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid CopyRow_AVX(const uint8* src, uint8* dst, int count) {
2800f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
2801f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
2802b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
2803f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
2804f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
2805f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,0) ",%0           \n"
2806f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
2807f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
2808f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,1) ",%1           \n"
2809f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x40,%2                        \n"
2810f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
2811f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src),   // %0
2812f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst),   // %1
2813f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(count)  // %2
2814f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
2815f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
2816f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1"
2817f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
2818f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
2819f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_COPYROW_AVX
2820f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2821f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_COPYROW_ERMS
2822f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Multiple of 1.
// Copies width bytes from src to dst with a single "rep movsb", so any byte
// count works (no per-iteration block size). The constraints pin the operands
// to the registers the string instruction uses implicitly: "S" = source index,
// "D" = destination index, "c" = count.
2823f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
  // The count register is pointer sized, so widen the int first.
  // NOTE(review): a negative width would convert to a huge count -- callers
  // presumably never pass one; confirm.
2824f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  size_t width_tmp = (size_t)(width);
2825b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  asm volatile("rep movsb " MEMMOVESTRING(0, 1) "          \n"
2826b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard               : "+S"(src),       // %0
2827b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                 "+D"(dst),       // %1
2828b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                 "+c"(width_tmp)  // %2
2829b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard               :
2830b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard               : "memory", "cc");
2831f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
2832f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_COPYROW_ERMS
2833f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2834f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBCOPYALPHAROW_SSE2
2835f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// width in pixels
// Copies only the alpha byte (the highest byte of each 4-byte pixel) from src
// into dst, leaving the other three bytes of every dst pixel unchanged.
// Processes 8 pixels (32 bytes) per loop pass; the loop repeats while the
// remaining width is > 0.
2836f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2837f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // xmm0 = 0xff000000 per pixel: selects the alpha byte.
2838f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm0,%%xmm0                   \n"
2839f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pslld     $0x18,%%xmm0                    \n"
    // xmm1 = 0x00ffffff per pixel: selects the three non-alpha bytes.
2840f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm1,%%xmm1                   \n"
2841f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrld     $0x8,%%xmm1                     \n"
2842b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
2843f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
2844b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // xmm2/xmm3 = 8 source pixels, xmm4/xmm5 = the 8 current dst pixels.
2845f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
2846f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
2847f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,0) ",%0           \n"
2848f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
2849f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
    // Keep alpha from the source and the other bytes from the destination ...
2850f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm0,%%xmm2                   \n"
2851f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm0,%%xmm3                   \n"
2852f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm1,%%xmm4                   \n"
2853f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm1,%%xmm5                   \n"
    // ... then combine the two and write the result back to dst.
2854f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm4,%%xmm2                   \n"
2855f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm5,%%xmm3                   \n"
2856f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
2857f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
2858f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,1) ",%1           \n"
    // 8 pixels done; loop while the remaining width is still positive.
2859f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x8,%2                         \n"
2860f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
2861f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src),   // %0
2862f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst),   // %1
2863f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)  // %2
2864f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
2865f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
2866f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2867f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
2868f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
2869f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBCOPYALPHAROW_SSE2
2870f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2871f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBCOPYALPHAROW_AVX2
2872f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// width in pixels
// Copies only the alpha byte (the highest byte of each 4-byte pixel) from src
// into dst, leaving the other three bytes of every dst pixel unchanged.
// Processes 16 pixels (64 bytes) per loop pass; the loop repeats while the
// remaining width is > 0.
2873f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
2874f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // ymm0 = 0x00ffffff per pixel: blend mask whose set bytes mark the three
    // non-alpha bytes.
2875f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
2876f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
2877b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
2878f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
2879b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
2880f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
2881f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
2882f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Where the mask byte is set take the byte from dst memory, elsewhere
    // (the alpha byte) keep the source byte already in ymm1/ymm2.
2883f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
2884f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
2885f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
2886f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
2887f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,1) ",%1           \n"
    // 16 pixels done; loop while the remaining width is still positive.
2888f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%2                        \n"
2889f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
    // Clear the upper ymm halves before returning to non-AVX code.
2890f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vzeroupper                                \n"
2891f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src),   // %0
2892f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst),   // %1
2893f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)  // %2
2894f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
2895f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
2896f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1", "xmm2"
2897f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
2898f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
2899f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBCOPYALPHAROW_AVX2
2900f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2901f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
2902f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// width in pixels
// Extracts the alpha byte (the highest byte of each 4-byte pixel) of src_argb
// into a packed row of single bytes at dst_a. Processes 8 pixels per loop
// pass (32 bytes read, 8 bytes written); the loop repeats while the remaining
// width is > 0.
2903f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
2904b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  asm volatile (
2905f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
2906b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
2907f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ", %%xmm0        \n"
2908f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
2909f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20, 0) ", %0         \n"
    // Shift each pixel right by 24 bits: alpha lands in the low byte and the
    // rest of the dword becomes zero.
2910f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrld     $0x18, %%xmm0                   \n"
2911f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrld     $0x18, %%xmm1                   \n"
    // Narrow dwords -> words -> bytes. Values are 0..255 here, so neither
    // saturating pack alters them.
2912f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packssdw  %%xmm1, %%xmm0                  \n"
2913f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm0, %%xmm0                  \n"
    // Store the 8 alpha bytes held in the low half of xmm0.
2914f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      %%xmm0," MEMACCESS(1) "         \n"
2915f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x8, 1) ", %1          \n"
    // 8 pixels done; loop while the remaining width is still positive.
2916f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x8, %2                        \n"
2917f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
2918f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb),  // %0
2919f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_a),     // %1
    // NOTE(review): "+rm" lets the compiler keep width in memory, whereas the
    // neighbouring row functions use "+r". With a memory operand the
    // suffix-less "sub" above has no register to infer its operand size from
    // -- confirm this constraint is intended.
2920f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+rm"(width)     // %2
2921f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
2922f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
2923f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1"
2924f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
2925f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
2926f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
2927f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
2928b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
2929b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardstatic const uvec8 kShuffleAlphaShort_AVX2 = {
2930b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    3u,  128u, 128u, 128u, 7u,  128u, 128u, 128u,
2931b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
2932b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
2933b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) {
2934b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  asm volatile (
2935b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vmovdqa    %3,%%ymm4                      \n"
2936b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vbroadcastf128 %4,%%ymm5                  \n"
2937b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
2938b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    LABELALIGN
2939b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
2940b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vmovdqu   " MEMACCESS(0) ", %%ymm0        \n"
2941b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vmovdqu   " MEMACCESS2(0x20, 0) ", %%ymm1 \n"
2942b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n" // vpsrld $0x18, %%ymm0
2943b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpshufb    %%ymm5,%%ymm1,%%ymm1           \n"
2944b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vmovdqu   " MEMACCESS2(0x40, 0) ", %%ymm2 \n"
2945b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vmovdqu   " MEMACCESS2(0x60, 0) ", %%ymm3 \n"
2946b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "lea       " MEMLEA(0x80, 0) ", %0         \n"
2947b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpackssdw  %%ymm1, %%ymm0, %%ymm0         \n"  // mutates
2948b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
2949b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
2950b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"  // mutates
2951b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
2952b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpermd     %%ymm0,%%ymm4,%%ymm0           \n"  // unmutate.
2953b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
2954b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "lea       " MEMLEA(0x20,1) ",%1           \n"
2955b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "sub        $0x20, %2                      \n"
2956b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "jg         1b                             \n"
2957b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vzeroupper                                \n"
2958b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  : "+r"(src_argb),  // %0
2959b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "+r"(dst_a),     // %1
2960b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "+rm"(width)     // %2
2961b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  : "m"(kPermdARGBToY_AVX),  // %3
2962b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "m"(kShuffleAlphaShort_AVX2)  // %4
2963b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  : "memory", "cc"
2964b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2965b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  );
2966b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard}
2967b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
2968b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
2969f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
2970f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// width in pixels
2971f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
2972f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
2973f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm0,%%xmm0                   \n"
2974f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pslld     $0x18,%%xmm0                    \n"
2975f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm1,%%xmm1                   \n"
2976f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrld     $0x8,%%xmm1                     \n"
2977b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
2978f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
2979b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
2980f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      " MEMACCESS(0) ",%%xmm2         \n"
2981f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x8,0) ",%0            \n"
2982f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm2,%%xmm2                   \n"
2983f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhwd %%xmm2,%%xmm3                   \n"
2984f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklwd %%xmm2,%%xmm2                   \n"
2985f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
2986f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
2987f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm0,%%xmm2                   \n"
2988f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm0,%%xmm3                   \n"
2989f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm1,%%xmm4                   \n"
2990f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm1,%%xmm5                   \n"
2991f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm4,%%xmm2                   \n"
2992f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm5,%%xmm3                   \n"
2993f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
2994f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
2995f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,1) ",%1           \n"
2996f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x8,%2                         \n"
2997f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
2998f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src),   // %0
2999f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst),   // %1
3000f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)  // %2
3001f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
3002f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
3003f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3004f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
3005f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3006f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
3007f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3008f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3009f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// width in pixels
3010f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
3011f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
3012f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
3013f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
3014b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
3015f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
3016b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
3017f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"
3018f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
3019f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0           \n"
3020f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpslld    $0x18,%%ymm1,%%ymm1             \n"
3021f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpslld    $0x18,%%ymm2,%%ymm2             \n"
3022f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
3023f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
3024f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
3025f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
3026f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,1) ",%1           \n"
3027f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%2                        \n"
3028f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
3029f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vzeroupper                                \n"
3030f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src),   // %0
3031f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst),   // %1
3032f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)  // %2
3033f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
3034f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
3035f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1", "xmm2"
3036f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
3037f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3038f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
3039f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3040f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_SETROW_X86
3041f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid SetRow_X86(uint8* dst, uint8 v8, int width) {
3042f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  size_t width_tmp = (size_t)(width >> 2);
3043f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  const uint32 v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
3044b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  asm volatile("rep stosl " MEMSTORESTRING(eax, 0) "       \n"
3045b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard               : "+D"(dst),       // %0
3046b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                 "+c"(width_tmp)  // %1
3047b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard               : "a"(v32)         // %2
3048b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard               : "memory", "cc");
3049f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3050f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3051f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid SetRow_ERMS(uint8* dst, uint8 v8, int width) {
3052f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  size_t width_tmp = (size_t)(width);
3053b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  asm volatile("rep stosb " MEMSTORESTRING(al, 0) "        \n"
3054b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard               : "+D"(dst),       // %0
3055b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                 "+c"(width_tmp)  // %1
3056b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard               : "a"(v8)          // %2
3057b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard               : "memory", "cc");
3058f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3059f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3060f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
3061f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  size_t width_tmp = (size_t)(width);
3062b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  asm volatile("rep stosl " MEMSTORESTRING(eax, 0) "       \n"
3063b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard               : "+D"(dst_argb),  // %0
3064b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                 "+c"(width_tmp)  // %1
3065b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard               : "a"(v32)         // %2
3066b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard               : "memory", "cc");
3067f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3068f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_SETROW_X86
3069f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3070f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_YUY2TOYROW_SSE2
3071f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
3072f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
3073f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm5,%%xmm5                   \n"
3074f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm5                     \n"
3075b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
3076f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
3077b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
3078f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3079f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3080f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,0) ",%0           \n"
3081f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm5,%%xmm0                   \n"
3082f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm5,%%xmm1                   \n"
3083f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm0                   \n"
3084f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3085f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,1) ",%1           \n"
3086f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%2                        \n"
3087f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
3088f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_yuy2),  // %0
3089f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_y),     // %1
3090f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)        // %2
3091f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
3092f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
3093f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1", "xmm5"
3094f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
3095f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3096f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3097b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid YUY2ToUVRow_SSE2(const uint8* src_yuy2,
3098b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                      int stride_yuy2,
3099b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                      uint8* dst_u,
3100b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                      uint8* dst_v,
3101b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                      int width) {
3102f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
3103f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm5,%%xmm5                   \n"
3104f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm5                     \n"
3105f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %1,%2                           \n"
3106b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
3107f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
3108b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
3109f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3110f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3111f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
3112f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
3113f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,0) ",%0           \n"
3114f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm2,%%xmm0                   \n"
3115f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm3,%%xmm1                   \n"
3116f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm0                     \n"
3117f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm1                     \n"
3118f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm0                   \n"
3119f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
3120f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm5,%%xmm0                   \n"
3121f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm0,%%xmm0                   \n"
3122f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm1                     \n"
3123f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm1                   \n"
3124f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      %%xmm0," MEMACCESS(1) "         \n"
3125f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
3126f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x8,1) ",%1            \n"
3127f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%3                        \n"
3128f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
3129f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_yuy2),    // %0
3130f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_u),       // %1
3131f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_v),       // %2
3132f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)          // %3
3133f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"((intptr_t)(stride_yuy2))  // %4
3134f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
3135f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3136f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
3137f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3138f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3139f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
3140b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         uint8* dst_u,
3141b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         uint8* dst_v,
3142b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         int width) {
3143f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
3144f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm5,%%xmm5                   \n"
3145f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm5                     \n"
3146f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %1,%2                           \n"
3147b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
3148f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
3149b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
3150f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3151f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3152f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,0) ",%0           \n"
3153f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm0                     \n"
3154f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm1                     \n"
3155f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm0                   \n"
3156f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
3157f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm5,%%xmm0                   \n"
3158f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm0,%%xmm0                   \n"
3159f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm1                     \n"
3160f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm1                   \n"
3161f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      %%xmm0," MEMACCESS(1) "         \n"
3162f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
3163f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x8,1) ",%1            \n"
3164f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%3                        \n"
3165f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
3166f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_yuy2),    // %0
3167f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_u),       // %1
3168f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_v),       // %2
3169f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)          // %3
3170f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
3171f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
3172f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm5"
3173f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
3174f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3175f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3176f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
3177f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
3178f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
3179b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
3180f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3181f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3182f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,0) ",%0           \n"
3183f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm0                     \n"
3184f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm1                     \n"
3185f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm0                   \n"
3186f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3187f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,1) ",%1           \n"
3188f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%2                        \n"
3189f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
3190f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_uyvy),  // %0
3191f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_y),     // %1
3192f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)        // %2
3193f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
3194f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
3195f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1"
3196f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
3197f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3198f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3199b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid UYVYToUVRow_SSE2(const uint8* src_uyvy,
3200b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                      int stride_uyvy,
3201b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                      uint8* dst_u,
3202b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                      uint8* dst_v,
3203b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                      int width) {
3204f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
3205f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm5,%%xmm5                   \n"
3206f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm5                     \n"
3207f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %1,%2                           \n"
3208b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
3209f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
3210b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
3211f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3212f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3213f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
3214f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
3215f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,0) ",%0           \n"
3216f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm2,%%xmm0                   \n"
3217f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pavgb     %%xmm3,%%xmm1                   \n"
3218f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm5,%%xmm0                   \n"
3219f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm5,%%xmm1                   \n"
3220f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm0                   \n"
3221f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
3222f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm5,%%xmm0                   \n"
3223f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm0,%%xmm0                   \n"
3224f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm1                     \n"
3225f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm1                   \n"
3226f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      %%xmm0," MEMACCESS(1) "         \n"
3227f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
3228f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x8,1) ",%1            \n"
3229f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%3                        \n"
3230f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
3231f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_uyvy),    // %0
3232f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_u),       // %1
3233f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_v),       // %2
3234f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)          // %3
3235f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"((intptr_t)(stride_uyvy))  // %4
3236f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
3237f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
3238f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
3239f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3240f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
// Splits the chroma samples of one packed UYVY row into separate U and V rows
// (single source row, no vertical averaging).
//   src_uyvy: packed source row; the even bytes are the ones kept here.
//   dst_u:    receives every other kept byte, starting with the first.
//   dst_v:    receives the remaining kept bytes.
//   width:    loop counter; 16 is subtracted per pass.
// Each pass reads 32 source bytes and stores 8 bytes to dst_u and 8 bytes to
// dst_v. The loop repeats while width is still positive after the subtract,
// so a width that is not a multiple of 16 still gets a full final pass.
3241f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid UYVYToUV422Row_SSE2(const uint8* src_uyvy,
3242b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         uint8* dst_u,
3243b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         uint8* dst_v,
3244b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         int width) {
3245f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // xmm5 = 0x00ff in every 16-bit word: mask that keeps the even bytes.
3246f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm5,%%xmm5                   \n"
3247f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm5                     \n"
    // %2 = dst_v - dst_u, so only %1 has to be advanced inside the loop.
3248f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %1,%2                           \n"
3249b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
3250f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
3251b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // Load 32 source bytes and step the source pointer past them.
3252f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3253f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3254f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Keep the even bytes and pack them: xmm0 = 16 interleaved U,V bytes.
3255f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm5,%%xmm0                   \n"
3256f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm5,%%xmm1                   \n"
3257f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm0                   \n"
    // De-interleave: even bytes of the pair stream -> xmm0 (U),
    // odd bytes -> xmm1 (V); each ends up as 8 bytes in the low quadword.
3258f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
3259f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm5,%%xmm0                   \n"
3260f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm0,%%xmm0                   \n"
3261f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm1                     \n"
3262f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm1                   \n"
    // Store 8 U bytes at dst_u and 8 V bytes at dst_u + (dst_v - dst_u).
3263f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      %%xmm0," MEMACCESS(1) "         \n"
3264f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
3265f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x8,1) ",%1            \n"
    // lea leaves the flags alone, so jg tests the result of this sub.
3266f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%3                        \n"
3267f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
3268f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_uyvy),    // %0
3269f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_u),       // %1
3270f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_v),       // %2
3271f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)          // %3
3272f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
  // NOTE(review): NACL_R14 is defined outside this view; presumably it adds a
  // scratch-register clobber used by the MEMOPMEM expansion -- confirm.
3273f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
3274f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm5"
3275f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
3276f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3277f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_YUY2TOYROW_SSE2
3278f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3279f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_YUY2TOYROW_AVX2
// Extracts the luma bytes from one packed YUY2 row.
//   src_yuy2: packed source row; the even bytes are copied out.
//   dst_y:    destination row, 32 bytes written per pass.
//   width:    loop counter; 32 is subtracted per pass.
// Each pass reads 64 source bytes. The loop repeats while width is still
// positive after the subtract, so the final pass is always a full one.
3280f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
3281f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // ymm5 = 0x00ff in every 16-bit word: mask that keeps the even bytes.
3282f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
3283f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
3284b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
3285f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
3286b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // Load 64 source bytes and step the source pointer past them.
3287f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
3288f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
3289f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Keep the even bytes of each word, then narrow words to bytes.
3290f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
3291f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
3292f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    // vpackuswb packs within each 128-bit lane; swapping the two middle
    // quadwords (0xd8) restores source order across the lanes.
3293f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3294f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
3295f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea      " MEMLEA(0x20,1) ",%1            \n"
    // lea leaves the flags alone, so jg tests the result of this sub.
3296f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x20,%2                        \n"
3297f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
    // Clear the upper ymm halves before returning to non-AVX code.
3298f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vzeroupper                                \n"
3299f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_yuy2),  // %0
3300f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_y),     // %1
3301f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)        // %2
3302f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
  // The ymm registers used above are declared through their xmm names.
3303f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
3304f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1", "xmm5"
3305f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
3306f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3307f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
// Produces separate U and V rows from two packed YUY2 rows, averaging the two
// rows byte-wise before extracting chroma.
//   src_yuy2:    first packed source row.
//   stride_yuy2: byte offset from src_yuy2 to the second source row.
//   dst_u:       receives 16 bytes per pass.
//   dst_v:       receives 16 bytes per pass.
//   width:       loop counter; 32 is subtracted per pass.
// Each pass reads 64 bytes from each of the two rows. The loop repeats while
// width is still positive after the subtract.
3308b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid YUY2ToUVRow_AVX2(const uint8* src_yuy2,
3309b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                      int stride_yuy2,
3310b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                      uint8* dst_u,
3311b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                      uint8* dst_v,
3312b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                      int width) {
3313f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // ymm5 = 0x00ff in every 16-bit word (used later to split U from V).
3314f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
3315f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
    // %2 = dst_v - dst_u, so only %1 has to be advanced inside the loop.
3316f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %1,%2                           \n"
3317b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
3318f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
3319b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // Load 64 bytes of the first row, then take the rounded byte average
    // with the same 64 bytes of the row stride_yuy2 bytes further on.
3320f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
3321f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
3322f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3323f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
3324f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Keep the odd bytes (the chroma bytes) and pack them:
    // ymm0 = 32 interleaved U,V bytes once vpermq undoes the per-lane packing.
3325f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3326f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
3327f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
3328f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    // De-interleave: even bytes -> ymm1 (U), odd bytes -> ymm0 (V). After the
    // self-pack and vpermq the low 128 bits of each hold 16 bytes in order.
3329f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
3330f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3331f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
3332f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
3333f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
3334f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    // Store 16 U bytes at dst_u and 16 V bytes at dst_u + (dst_v - dst_u).
3335f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3336f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3337f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea      " MEMLEA(0x10,1) ",%1            \n"
    // lea leaves the flags alone, so jg tests the result of this sub.
3338f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x20,%3                        \n"
3339f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
    // Clear the upper ymm halves before returning to non-AVX code.
3340f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vzeroupper                                \n"
3341f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_yuy2),    // %0
3342f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_u),       // %1
3343f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_v),       // %2
3344f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)          // %3
  // The stride is widened to pointer size so it can serve as an index register.
3345f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"((intptr_t)(stride_yuy2))  // %4
  // NOTE(review): NACL_R14 is defined outside this view; presumably it adds a
  // scratch-register clobber used by the indexed-access macros -- confirm.
3346f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
3347f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm5"
3348f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
3349f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3350f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
// Splits the chroma samples of one packed YUY2 row into separate U and V rows
// (single source row, no vertical averaging).
//   src_yuy2: packed source row; the odd bytes are the ones kept here.
//   dst_u:    receives 16 bytes per pass.
//   dst_v:    receives 16 bytes per pass.
//   width:    loop counter; 32 is subtracted per pass.
// Each pass reads 64 source bytes. The loop repeats while width is still
// positive after the subtract.
3351f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
3352b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         uint8* dst_u,
3353b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         uint8* dst_v,
3354b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         int width) {
3355f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // ymm5 = 0x00ff in every 16-bit word (used later to split U from V).
3356f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
3357f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
    // %2 = dst_v - dst_u, so only %1 has to be advanced inside the loop.
3358f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %1,%2                           \n"
3359b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
3360f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
3361b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // Load 64 source bytes and step the source pointer past them.
3362f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
3363f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
3364f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Keep the odd bytes (the chroma bytes) and pack them:
    // ymm0 = 32 interleaved U,V bytes once vpermq undoes the per-lane packing.
3365f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3366f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
3367f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
3368f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    // De-interleave: even bytes -> ymm1 (U), odd bytes -> ymm0 (V). After the
    // self-pack and vpermq the low 128 bits of each hold 16 bytes in order.
3369f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
3370f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3371f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
3372f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
3373f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
3374f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    // Store 16 U bytes at dst_u and 16 V bytes at dst_u + (dst_v - dst_u).
3375f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3376f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3377f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea      " MEMLEA(0x10,1) ",%1            \n"
    // lea leaves the flags alone, so jg tests the result of this sub.
3378f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x20,%3                        \n"
3379f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
    // Clear the upper ymm halves before returning to non-AVX code.
3380f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vzeroupper                                \n"
3381f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_yuy2),    // %0
3382f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_u),       // %1
3383f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_v),       // %2
3384f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)          // %3
3385f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
  // NOTE(review): NACL_R14 is defined outside this view; presumably it adds a
  // scratch-register clobber used by the VEXTOPMEM expansion -- confirm.
3386f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
3387f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm5"
3388f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
3389f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3390f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
// Extracts the luma bytes from one packed UYVY row.
//   src_uyvy: packed source row; the odd bytes are copied out.
//   dst_y:    destination row, 32 bytes written per pass.
//   width:    loop counter; 32 is subtracted per pass.
// Each pass reads 64 source bytes. The loop repeats while width is still
// positive after the subtract, so the final pass is always a full one.
3391f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
3392f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
3393f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
3394b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // Load 64 source bytes and step the source pointer past them.
3395f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
3396f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
3397f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Shifting each word right by 8 keeps the odd byte with a zero high byte,
    // so no separate mask register is needed before packing.
3398f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3399f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
3400f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    // vpackuswb packs within each 128-bit lane; swapping the two middle
    // quadwords (0xd8) restores source order across the lanes.
3401f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
3402f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
3403f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea      " MEMLEA(0x20,1) ",%1            \n"
    // lea leaves the flags alone, so jg tests the result of this sub.
3404f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x20,%2                        \n"
3405f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
    // Clear the upper ymm halves before returning to non-AVX code.
3406f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vzeroupper                                \n"
3407f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_uyvy),  // %0
3408f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_y),     // %1
3409f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)        // %2
3410f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
  // Only ymm0 and ymm1 are used above; xmm5 is listed as clobbered as well,
  // which is harmless but not required by this routine.
3411f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
3412f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1", "xmm5"
3413f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
3414f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
// Produces separate U and V rows from two packed UYVY rows, averaging the two
// rows byte-wise before extracting chroma.
//   src_uyvy:    first packed source row.
//   stride_uyvy: byte offset from src_uyvy to the second source row.
//   dst_u:       receives 16 bytes per pass.
//   dst_v:       receives 16 bytes per pass.
//   width:       loop counter; 32 is subtracted per pass.
// Each pass reads 64 bytes from each of the two rows. The loop repeats while
// width is still positive after the subtract.
3415b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid UYVYToUVRow_AVX2(const uint8* src_uyvy,
3416b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                      int stride_uyvy,
3417b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                      uint8* dst_u,
3418b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                      uint8* dst_v,
3419b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                      int width) {
3420f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // ymm5 = 0x00ff in every 16-bit word: mask that keeps the even bytes.
3421f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
3422f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
    // %2 = dst_v - dst_u, so only %1 has to be advanced inside the loop.
3423f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %1,%2                           \n"
3424f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3425f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
3426b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // Load 64 bytes of the first row, then take the rounded byte average
    // with the same 64 bytes of the row stride_uyvy bytes further on.
3427f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
3428f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
3429f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
3430f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
3431f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Keep the even bytes (the chroma bytes) and pack them:
    // ymm0 = 32 interleaved U,V bytes once vpermq undoes the per-lane packing.
3432f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
3433f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
3434f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
3435f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    // De-interleave: even bytes -> ymm1 (U), odd bytes -> ymm0 (V). After the
    // self-pack and vpermq the low 128 bits of each hold 16 bytes in order.
3436f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
3437f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3438f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
3439f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
3440f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
3441f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    // Store 16 U bytes at dst_u and 16 V bytes at dst_u + (dst_v - dst_u).
3442f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3443f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3444f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea      " MEMLEA(0x10,1) ",%1            \n"
    // lea leaves the flags alone, so jg tests the result of this sub.
3445f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x20,%3                        \n"
3446f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
    // Clear the upper ymm halves before returning to non-AVX code.
3447f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vzeroupper                                \n"
3448f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_uyvy),    // %0
3449f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_u),       // %1
3450f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_v),       // %2
3451f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)          // %3
  // The stride is widened to pointer size so it can serve as an index register.
3452f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"((intptr_t)(stride_uyvy))  // %4
  // NOTE(review): NACL_R14 is defined outside this view; presumably it adds a
  // scratch-register clobber used by the indexed-access macros -- confirm.
3453f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
3454f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm5"
3455f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
3456f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3457f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
// Splits the chroma samples of one packed UYVY row into separate U and V rows
// (single source row, no vertical averaging).
//   src_uyvy: packed source row; the even bytes are the ones kept here.
//   dst_u:    receives 16 bytes per pass.
//   dst_v:    receives 16 bytes per pass.
//   width:    loop counter; 32 is subtracted per pass.
// Each pass reads 64 source bytes. The loop repeats while width is still
// positive after the subtract.
3458f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid UYVYToUV422Row_AVX2(const uint8* src_uyvy,
3459b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         uint8* dst_u,
3460b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         uint8* dst_v,
3461b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         int width) {
3462f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // ymm5 = 0x00ff in every 16-bit word: mask that keeps the even bytes.
3463f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
3464f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
    // %2 = dst_v - dst_u, so only %1 has to be advanced inside the loop.
3465f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %1,%2                           \n"
3466b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
3467f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
3468b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // Load 64 source bytes and step the source pointer past them.
3469f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
3470f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
3471f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Keep the even bytes (the chroma bytes) and pack them:
    // ymm0 = 32 interleaved U,V bytes once vpermq undoes the per-lane packing.
3472f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
3473f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
3474f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
3475f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    // De-interleave: even bytes -> ymm1 (U), odd bytes -> ymm0 (V). After the
    // self-pack and vpermq the low 128 bits of each hold 16 bytes in order.
3476f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
3477f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
3478f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
3479f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
3480f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
3481f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    // Store 16 U bytes at dst_u and 16 V bytes at dst_u + (dst_v - dst_u).
3482f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
3483f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
3484f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea      " MEMLEA(0x10,1) ",%1            \n"
    // lea leaves the flags alone, so jg tests the result of this sub.
3485f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x20,%3                        \n"
3486f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
    // Clear the upper ymm halves before returning to non-AVX code.
3487f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vzeroupper                                \n"
3488f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_uyvy),    // %0
3489f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_u),       // %1
3490f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_v),       // %2
3491f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)          // %3
3492f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
  // NOTE(review): NACL_R14 is defined outside this view; presumably it adds a
  // scratch-register clobber used by the VEXTOPMEM expansion -- confirm.
3493f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
3494f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm5"
3495f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
3496f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3497f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_YUY2TOYROW_AVX2
3498f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3499f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBBLENDROW_SSSE3
3500f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Shuffle table for isolating alpha.
// Used as a pshufb control: source byte 3, 7, 11 or 15 (byte 3 of each 4-byte
// pixel) is copied into the low byte of two consecutive 16-bit words, and the
// 0x80 entries zero the high byte of each word. The result is eight words
// a0,a0,a1,a1,a2,a2,a3,a3 ready to be used as per-channel multipliers.
3501b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardstatic uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
3502b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                              11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};
3503f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3504f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Blend 4 pixels at a time, then any remaining pixels 1 at a time.
// For every 4-byte pixel, with a = byte 3 of the src_argb0 pixel:
//   dst = (src_argb0 with byte 3 forced to 0xff)
//         + ((src_argb1 * (256 - a)) >> 8)        per byte, saturating add.
//   src_argb0: row supplying the alpha and the unscaled term.
//   src_argb1: row that is scaled by (256 - a).
//   dst_argb:  destination row.
//   width:     number of pixels; values below 1 write nothing.
3505b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ARGBBlendRow_SSSE3(const uint8* src_argb0,
3506b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        const uint8* src_argb1,
3507b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        uint8* dst_argb,
3508b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        int width) {
3509f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // Constants built in registers:
    //   xmm7 = 0x0001 per word       (turns 255 - a into 256 - a)
    //   xmm6 = 0x00ff per word       (selects the even bytes)
    //   xmm5 = 0xff00 per word       (selects the odd bytes)
    //   xmm4 = 0xff000000 per dword  (byte 3 of every pixel)
3510f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm7,%%xmm7                   \n"
3511f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0xf,%%xmm7                     \n"
3512f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm6,%%xmm6                   \n"
3513f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm6                     \n"
3514f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm5,%%xmm5                   \n"
3515f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psllw     $0x8,%%xmm5                     \n"
3516f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm4,%%xmm4                   \n"
3517f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pslld     $0x18,%%xmm4                    \n"
    // Bias width by -4; skip the 4 pixel loop when fewer than 4 pixels exist.
3518f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%3                         \n"
3519f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jl        49f                             \n"
3520f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3521f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 4 pixel loop.
3522f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
3523f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "40:                                         \n"
    // xmm3 = 4 pixels of src_argb0; xmm0 keeps an unmodified copy.
3524f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
3525f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0           \n"
3526f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm3,%%xmm0                   \n"
    // Invert byte 3 (a -> 255 - a), spread it to words, add 1 -> 256 - a.
3527f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pxor      %%xmm4,%%xmm3                   \n"
3528f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
3529f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %4,%%xmm3                       \n"
    // xmm2 = even bytes of src_argb1 as words, times (256 - a).
3530f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm6,%%xmm2                   \n"
3531f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddw     %%xmm7,%%xmm3                   \n"
3532f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmullw    %%xmm3,%%xmm2                   \n"
    // xmm1 = odd bytes of src_argb1 as words, times (256 - a).
3533f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
3534f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,1) ",%1           \n"
3535f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm1                     \n"
    // Force byte 3 of the src_argb0 copy to 0xff.
3536f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm4,%%xmm0                   \n"
3537f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmullw    %%xmm3,%%xmm1                   \n"
    // Take the high byte of each product (>> 8 for the even bytes, mask in
    // place for the odd bytes) and add both to xmm0 with byte saturation.
3538f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm2                     \n"
3539f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddusb   %%xmm2,%%xmm0                   \n"
3540f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm5,%%xmm1                   \n"
3541f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddusb   %%xmm1,%%xmm0                   \n"
3542f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
3543f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,2) ",%2           \n"
    // Continue while at least 4 more pixels remain (biased count >= 0).
3544f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%3                         \n"
3545f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jge       40b                             \n"
3546f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3547f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "49:                                         \n"
    // Re-bias to (remaining pixels - 1); negative means nothing is left.
3548f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       $0x3,%3                         \n"
3549f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jl        99f                             \n"
3550f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3551f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 1 pixel loop.
    // Same arithmetic as the 4 pixel loop, using 4-byte movd loads and stores.
3552f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "91:                                         \n"
3553f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      " MEMACCESS(0) ",%%xmm3         \n"
3554f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x4,0) ",%0            \n"
3555f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm3,%%xmm0                   \n"
3556f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pxor      %%xmm4,%%xmm3                   \n"
3557f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      " MEMACCESS(1) ",%%xmm2         \n"
3558f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %4,%%xmm3                       \n"
3559f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm6,%%xmm2                   \n"
3560f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddw     %%xmm7,%%xmm3                   \n"
3561f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmullw    %%xmm3,%%xmm2                   \n"
3562f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      " MEMACCESS(1) ",%%xmm1         \n"
3563f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x4,1) ",%1            \n"
3564f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm1                     \n"
3565f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm4,%%xmm0                   \n"
3566f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmullw    %%xmm3,%%xmm1                   \n"
3567f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm2                     \n"
3568f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddusb   %%xmm2,%%xmm0                   \n"
3569f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm5,%%xmm1                   \n"
3570f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddusb   %%xmm1,%%xmm0                   \n"
3571f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %%xmm0," MEMACCESS(2) "         \n"
3572f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x4,2) ",%2            \n"
3573f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x1,%3                         \n"
3574f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jge       91b                             \n"
3575f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "99:                                         \n"
3576f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb0),    // %0
3577f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_argb1),    // %1
3578f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_argb),     // %2
3579f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)         // %3
  // The shuffle table is passed as a memory operand and read directly by pshufb.
3580f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "m"(kShuffleAlpha)  // %4
3581f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
3582f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3583f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
3584f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3585f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBBLENDROW_SSSE3
3586f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3587f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_BLENDPLANEROW_SSSE3
3588f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Blend 8 pixels at a time.
3589f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// unsigned version of math
3590f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// =((A2*C2)+(B2*(255-C2))+255)/256
3591f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// signed version of math
3592f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
//
// The code below implements the signed form with A2 = src0 byte, B2 = src1
// byte and C2 = alpha byte:
//   dst = ((src0-128)*alpha + (src1-128)*(255-alpha) + 32768 + 127) >> 8
//
// src0, src1 - the two input byte planes.
// alpha      - per-pixel weight given to src0; src1 is weighted by 255-alpha.
// dst        - output byte plane.
// width      - pixel count. It is reduced by 8 per pass and the loop repeats
//              while the remainder is positive, so the body always runs at
//              least once and every pass loads and stores a full 8 bytes.
//              NOTE(review): presumably callers pass a positive multiple of
//              8 - confirm against the dispatching code.
3593b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid BlendPlaneRow_SSSE3(const uint8* src0,
3594b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         const uint8* src1,
3595b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         const uint8* alpha,
3596b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         uint8* dst,
3597b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         int width) {
3598b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  asm volatile(
      // xmm5 = 0xff00 in every 16-bit word: XOR mask that inverts only the
      // high byte of each word.
3599b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "pcmpeqb    %%xmm5,%%xmm5                  \n"
3600b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "psllw      $0x8,%%xmm5                    \n"
      // xmm6 = 0x80 in every byte: the 128 subtracted from each pixel byte.
3601b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "mov        $0x80808080,%%eax              \n"
3602b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "movd       %%eax,%%xmm6                   \n"
3603b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "pshufd     $0x0,%%xmm6,%%xmm6             \n"
      // xmm7 = 0x807f in every word = 32768 + 127, the constant term of the
      // signed formula.
3604b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "mov        $0x807f807f,%%eax              \n"
3605b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "movd       %%eax,%%xmm7                   \n"
3606b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "pshufd     $0x0,%%xmm7,%%xmm7             \n"
      // Turn src0, src1 and dst into offsets relative to alpha, so advancing
      // the single register %2 steps all four streams together.
3607b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "sub        %2,%0                          \n"
3608b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "sub        %2,%1                          \n"
3609b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "sub        %2,%3                          \n"
3610b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
3611b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      // 8 pixel loop.
3612b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      LABELALIGN
3613b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "1:                                        \n"
      // xmm0 = 8 alpha bytes duplicated into (a, a) pairs, then the second
      // byte of each pair inverted -> weight pairs (a, 255 - a).
3614b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "movq       (%2),%%xmm0                    \n"
3615b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "punpcklbw  %%xmm0,%%xmm0                  \n"
3616b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "pxor       %%xmm5,%%xmm0                  \n"
      // xmm1 = interleaved (src0, src1) byte pairs, each byte minus 128.
3617b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "movq       (%0,%2,1),%%xmm1               \n"
3618b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "movq       (%1,%2,1),%%xmm2               \n"
3619b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "punpcklbw  %%xmm2,%%xmm1                  \n"
3620b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "psubb      %%xmm6,%%xmm1                  \n"
      // Unsigned weights (xmm0) times signed pixels (xmm1); the two products
      // of each pair are summed into one 16-bit word per pixel.
3621b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "pmaddubsw  %%xmm1,%%xmm0                  \n"
      // Add 32768 + 127, keep the high byte of each word, narrow the 8 words
      // to bytes and store them to dst.
3622b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "paddw      %%xmm7,%%xmm0                  \n"
3623b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "psrlw      $0x8,%%xmm0                    \n"
3624b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "packuswb   %%xmm0,%%xmm0                  \n"
3625b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "movq       %%xmm0,(%3,%2,1)               \n"
      // Advance alpha (and with it the three rebased streams) by 8 bytes and
      // loop while pixels remain.
3626b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "lea        0x8(%2),%2                     \n"
3627b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "sub        $0x8,%4                        \n"
3628b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "jg        1b                              \n"
3629b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      : "+r"(src0),   // %0
3630b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        "+r"(src1),   // %1
3631b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        "+r"(alpha),  // %2
3632b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        "+r"(dst),    // %3
3633b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        "+rm"(width)  // %4
      // eax is clobbered because it is used to build the xmm6/xmm7 constants.
3634b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        ::"memory",
3635b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
3636f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3637f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_BLENDPLANEROW_SSSE3
3638f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3639f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_BLENDPLANEROW_AVX2
3640f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Blend 32 pixels at a time.
3641f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// unsigned version of math
3642f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// =((A2*C2)+(B2*(255-C2))+255)/256
3643f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// signed version of math
3644f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
//
// The code below implements the signed form with A2 = src0 byte, B2 = src1
// byte and C2 = alpha byte:
//   dst = ((src0-128)*alpha + (src1-128)*(255-alpha) + 32768 + 127) >> 8
//
// src0, src1 - the two input byte planes.
// alpha      - per-pixel weight given to src0; src1 is weighted by 255-alpha.
// dst        - output byte plane.
// width      - pixel count. It is reduced by 32 per pass and the loop repeats
//              while the remainder is positive, so the body always runs at
//              least once and every pass loads and stores a full 32 bytes.
//              NOTE(review): presumably callers pass a positive multiple of
//              32 - confirm against the dispatching code.
3645b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid BlendPlaneRow_AVX2(const uint8* src0,
3646b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        const uint8* src1,
3647b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        const uint8* alpha,
3648b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        uint8* dst,
3649b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        int width) {
3650b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  asm volatile(
      // ymm5 = 0xff00 in every 16-bit word: XOR mask that inverts only the
      // high byte of each word.
3651b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
3652b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vpsllw     $0x8,%%ymm5,%%ymm5             \n"
      // ymm6 = 0x80 in every byte: the 128 subtracted from each pixel byte.
3653b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "mov        $0x80808080,%%eax              \n"
3654b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vmovd      %%eax,%%xmm6                   \n"
3655b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vbroadcastss %%xmm6,%%ymm6                \n"
      // ymm7 = 0x807f in every word = 32768 + 127, the constant term of the
      // signed formula.
3656b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "mov        $0x807f807f,%%eax              \n"
3657b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vmovd      %%eax,%%xmm7                   \n"
3658b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vbroadcastss %%xmm7,%%ymm7                \n"
      // Turn src0, src1 and dst into offsets relative to alpha, so advancing
      // the single register %2 steps all four streams together.
3659b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "sub        %2,%0                          \n"
3660b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "sub        %2,%1                          \n"
3661b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "sub        %2,%3                          \n"
3662b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
3663b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      // 32 pixel loop.
3664b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      LABELALIGN
3665b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "1:                                        \n"
      // Load 32 alpha bytes and build weight pairs (a, 255 - a): ymm3 takes
      // the high 8 bytes of each 128-bit lane, ymm0 the low 8 bytes.
3666b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vmovdqu    (%2),%%ymm0                    \n"
3667b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n"
3668b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
3669b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vpxor      %%ymm5,%%ymm3,%%ymm3           \n"
3670b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vpxor      %%ymm5,%%ymm0,%%ymm0           \n"
      // Interleave src0/src1 into (src0, src1) byte pairs with the same
      // high (ymm4) / low (ymm1) split, then subtract 128 from every byte.
3671b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vmovdqu    (%0,%2,1),%%ymm1               \n"
3672b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vmovdqu    (%1,%2,1),%%ymm2               \n"
3673b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n"
3674b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
3675b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n"
3676b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n"
      // Unsigned weights times signed pixels, pair products summed into one
      // 16-bit word per pixel.
3677b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
3678b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n"
      // Add 32768 + 127 and keep the high byte of each word.
3679b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n"
3680b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n"
3681b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vpsrlw     $0x8,%%ymm3,%%ymm3             \n"
3682b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
      // Narrow back to bytes. The unpacks and this pack all work within
      // 128-bit lanes, so the low/high halves land back in source order.
3683b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n"
3684b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vmovdqu    %%ymm0,(%3,%2,1)               \n"
      // Advance alpha (and with it the three rebased streams) by 32 bytes and
      // loop while pixels remain.
3685b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "lea        0x20(%2),%2                    \n"
3686b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "sub        $0x20,%4                       \n"
3687b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "jg        1b                              \n"
      // Clear the upper ymm halves before returning to non-AVX code.
3688b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      "vzeroupper                                \n"
3689b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard      : "+r"(src0),   // %0
3690b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        "+r"(src1),   // %1
3691b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        "+r"(alpha),  // %2
3692b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        "+r"(dst),    // %3
3693b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        "+rm"(width)  // %4
      // eax is clobbered because it is used to build the ymm6/ymm7 constants.
3694b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        ::"memory",
3695b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
3696b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard        "xmm7");
3697f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3698f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_BLENDPLANEROW_AVX2
3699f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3700f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBATTENUATEROW_SSSE3
3701f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Shuffle table duplicating alpha
// pshufb controls applied to the raw 16 source bytes (4 pixels). For each of
// two pixels the alpha byte (index 3/7 or 11/15) is copied into six bytes,
// i.e. three 16-bit words holding alpha * 0x101, and the fourth word is zeroed
// (index 128 has the high bit set). kShuffleAlpha0 covers pixels 0-1,
// kShuffleAlpha1 pixels 2-3.
// The tables are only ever read, so they are const like the other shuffle
// tables in this file.
3702b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardstatic const uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
3703b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                     7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
3704b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardstatic const uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
3705b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                     15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
3706f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Attenuate 4 pixels at a time.
//
// For every 4-byte pixel, bytes 0-2 are replaced by
//   ((c * 0x101) * (a * 0x101)) >> 24
// where c is the original byte and a is the pixel's byte 3 (alpha); byte 3
// itself is copied through unchanged.
//
// src_argb - input pixels, 4 bytes each, alpha in byte 3.
// dst_argb - output pixels.
// width    - pixel count. It is reduced by 4 per pass and the loop repeats
//            while the remainder is positive, so the body always runs at
//            least once and every pass loads and stores a full 16 bytes.
//            NOTE(review): presumably callers pass a positive multiple of 4 -
//            confirm against the dispatching code.
3707f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3708f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // xmm3 = 0xff000000 in every dword: mask selecting the alpha byte.
3709f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm3,%%xmm3                   \n"
3710f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pslld     $0x18,%%xmm3                    \n"
    // xmm4 / xmm5 = shuffle controls for pixels 0-1 / 2-3. movdqa needs a
    // 16-byte aligned operand; this relies on the alignment of uvec8, which
    // is declared elsewhere.
3711f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %3,%%xmm4                       \n"
3712f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %4,%%xmm5                       \n"
3713f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3714f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 4 pixel loop.
3715f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
3716b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // Pixels 0-1: xmm0 = alpha words (alpha slot zero), xmm1 = every source
    // byte widened to c * 0x101; keep the high 16 bits of each product.
3717f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3718f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm4,%%xmm0                   \n"
3719f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3720f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm1,%%xmm1                   \n"
3721f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmulhuw   %%xmm1,%%xmm0                   \n"
    // Pixels 2-3: same computation on the upper 8 source bytes, into xmm1.
3722f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3723f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufb    %%xmm5,%%xmm1                   \n"
3724f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
3725f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhbw %%xmm2,%%xmm2                   \n"
3726f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmulhuw   %%xmm2,%%xmm1                   \n"
    // xmm2 = original alpha bytes only (everything else masked to zero).
3727f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
3728f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0           \n"
3729f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm3,%%xmm2                   \n"
    // Drop 8 more bits, narrow the words to bytes (alpha slots are zero
    // here), then OR the untouched alpha back in and store 4 pixels.
3730f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm0                     \n"
3731f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm1                     \n"
3732f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm0                   \n"
3733f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm2,%%xmm0                   \n"
3734f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3735f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,1) ",%1           \n"
3736f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%2                         \n"
3737f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
3738f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb),    // %0
3739f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_argb),    // %1
3740f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)        // %2
3741f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "m"(kShuffleAlpha0),  // %3
3742f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kShuffleAlpha1)  // %4
3743f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
3744f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3745f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
3746f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3747f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBATTENUATEROW_SSSE3
3748f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3749f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBATTENUATEROW_AVX2
3750f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Shuffle table duplicating alpha.
// vpshufb control applied AFTER the pixel bytes have been widened to 16-bit
// words: for each pixel the alpha word (bytes 6-7 / 14-15) is copied over the
// three colour words and the alpha word itself is zeroed (index 128).
3751b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardstatic const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
3752b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                         128u, 128u, 14u,  15u, 14u, 15u,
3753b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                         14u,  15u,  128u, 128u};
3754f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Attenuate 8 pixels at a time.
//
// For every 4-byte pixel, bytes 0-2 are replaced by
//   ((c * 0x101) * (a * 0x101)) >> 24
// where c is the original byte and a is the pixel's byte 3 (alpha); byte 3
// itself is copied through unchanged.
//
// src_argb - input pixels, 4 bytes each, alpha in byte 3.
// dst_argb - output pixels.
// width    - pixel count. It is reduced by 8 per pass and the loop repeats
//            while the remainder is positive, so the body always runs at
//            least once and every pass loads and stores a full 32 bytes.
//            NOTE(review): presumably callers pass a positive multiple of 8 -
//            confirm against the dispatching code.
3755f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
3756f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // ymm4 = shuffle control repeated in both 128-bit lanes.
3757f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vbroadcastf128 %3,%%ymm4                  \n"
    // ymm5 = 0xff000000 in every dword: mask selecting the alpha byte.
3758f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
3759f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpslld     $0x18,%%ymm5,%%ymm5            \n"
    // dst becomes an offset from src so only %0 has to be advanced.
3760f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub        %0,%1                          \n"
3761f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3762f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 8 pixel loop.
3763f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
3764b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // ymm6 = 8 source pixels; ymm0 / ymm1 = low / high 8 bytes of each lane
    // widened to words holding c * 0x101.
3765f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
3766f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
3767f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
    // ymm2 / ymm3 = per-pixel alpha words (zero in the alpha slot).
3768f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpshufb    %%ymm4,%%ymm0,%%ymm2           \n"
3769f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpshufb    %%ymm4,%%ymm1,%%ymm3           \n"
    // High 16 bits of colour word times alpha word.
3770f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
3771f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
    // ymm6 = original alpha bytes only.
3772f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpand      %%ymm5,%%ymm6,%%ymm6           \n"
    // Drop 8 more bits, narrow to bytes, OR the untouched alpha back in.
3773f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
3774f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
3775f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
3776f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpor       %%ymm6,%%ymm0,%%ymm0           \n"
    // Store to src + (dst - src), then advance src by 32 bytes.
3777f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
3778f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,0) ",%0           \n"
3779f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub        $0x8,%2                        \n"
3780f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
    // Clear the upper ymm halves before returning to non-AVX code.
3781f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vzeroupper                                \n"
3782f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb),    // %0
3783f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_argb),    // %1
3784f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)        // %2
3785f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "m"(kShuffleAlpha_AVX2)  // %3
3786f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
3787f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3788f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
3789f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3790f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBATTENUATEROW_AVX2
3791f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3792f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBUNATTENUATEROW_SSE2
3793f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Unattenuate 4 pixels at a time.
//
// For each 4-byte pixel, byte 3 (alpha) indexes a table of 4-byte entries at
// fixed_invtbl8. Bytes 0-2 are widened to c * 0x101 and multiplied by the low
// 16 bits of the entry; byte 3 is widened the same way and multiplied by the
// high 16 bits. The high 16 bits of each product are packed back to a byte
// with unsigned saturation.
// NOTE(review): fixed_invtbl8 is defined elsewhere; presumably it holds
// fixed-point reciprocals of alpha - confirm against its definition.
//
// src_argb - input pixels, 4 bytes each, alpha in byte 3.
// dst_argb - output pixels.
// width    - pixel count. It is reduced by 4 per pass and the loop repeats
//            while the remainder is positive, so the body always runs at
//            least once and every pass loads and stores a full 16 bytes.
3794b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ARGBUnattenuateRow_SSE2(const uint8* src_argb,
3795b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                             uint8* dst_argb,
3796f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                             int width) {
  // Scratch register for the asm (operand %3): holds one zero-extended alpha
  // byte at a time, used as the table index.
3797f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  uintptr_t alpha;
3798f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
3799f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 4 pixel loop.
3800f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
3801b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // Pixels 0-1: xmm0 = low 8 source bytes widened to c * 0x101;
    // xmm2 / xmm3 = table entries for the alphas at byte offsets 3 / 7.
3802f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
3803f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
3804f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm0,%%xmm0                   \n"
3805f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
3806f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
3807f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
    // pshuflw 0x40 -> words (lo, lo, lo, hi) of the entry: the low half
    // scales bytes 0-2, the high half scales byte 3.
3808f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
3809f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    // Combine the two pixels' multipliers into one register and multiply,
    // keeping the high 16 bits of each product.
3810f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movlhps   %%xmm3,%%xmm2                   \n"
3811f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmulhuw   %%xmm2,%%xmm0                   \n"
    // Pixels 2-3: same steps on the high 8 source bytes (alphas at byte
    // offsets 0x0b / 0x0f), result in xmm1.
3812f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
3813f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
3814f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhbw %%xmm1,%%xmm1                   \n"
3815f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
3816f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
3817f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
3818f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
3819f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
3820f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movlhps   %%xmm3,%%xmm2                   \n"
3821f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmulhuw   %%xmm2,%%xmm1                   \n"
    // Narrow both halves to bytes with unsigned saturation and store 4
    // pixels; advance both pointers by 16 bytes.
3822f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0           \n"
3823f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm0                   \n"
3824f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3825f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,1) ",%1           \n"
3826f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%2                         \n"
3827f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
3828f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb),     // %0
3829f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_argb),     // %1
3830f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width),        // %2
3831f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "=&r"(alpha)        // %3
3832f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"(fixed_invtbl8)  // %4
  // NACL_R14 is a macro defined elsewhere; it is followed by no comma, so it
  // must expand either to nothing or to a clobber string plus a comma.
  // xmm4 and xmm5 are listed although the instructions above only touch
  // xmm0-xmm3.
3833f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
3834f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3835f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
3836f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3837f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBUNATTENUATEROW_SSE2
3838f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3839f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBUNATTENUATEROW_AVX2
3840f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Shuffle table duplicating alpha.
// vpshufb control applied to table entries whose 16-bit halves have each been
// doubled into word pairs (lo, lo, hi, hi): it selects words (0, 0, 0, 3) and
// (4, 4, 4, 7), i.e. (lo, lo, lo, hi) per pixel - the low half for bytes 0-2
// and the high half for byte 3.
3841f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangstatic const uvec8 kUnattenShuffleAlpha_AVX2 = {
3842b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
3843f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Unattenuate 8 pixels at a time.
//
// For each 4-byte pixel, byte 3 (alpha) indexes a table of 4-byte entries at
// fixed_invtbl8. Bytes 0-2 are widened to c * 0x101 and multiplied by the low
// 16 bits of the entry; byte 3 is widened the same way and multiplied by the
// high 16 bits. The high 16 bits of each product are packed back to a byte
// with unsigned saturation.
// NOTE(review): fixed_invtbl8 is defined elsewhere; presumably it holds
// fixed-point reciprocals of alpha - confirm against its definition.
//
// src_argb - input pixels, 4 bytes each, alpha in byte 3.
// dst_argb - output pixels.
// width    - pixel count. It is reduced by 8 per pass and the loop repeats
//            while the remainder is positive, so the body always runs at
//            least once and every pass loads and stores a full 32 bytes.
3844b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ARGBUnattenuateRow_AVX2(const uint8* src_argb,
3845b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                             uint8* dst_argb,
3846f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                             int width) {
  // Scratch register for the asm (operand %3): holds one zero-extended alpha
  // byte at a time, used as the table index.
3847f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  uintptr_t alpha;
3848f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // dst becomes an offset from src so only %0 has to be advanced.
3849f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub        %0,%1                          \n"
    // ymm5 = shuffle control repeated in both 128-bit lanes.
3850f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vbroadcastf128 %5,%%ymm5                  \n"
3851f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3852f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 8 pixel loop.
3853f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
3854b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // Manual gather: read the 8 alpha bytes (offsets 0x03, 0x07, ... 0x1f),
    // load the 4-byte table entry for each, and merge them pairwise so that
    // ymm3 ends up holding the 8 entries in pixel order.
3855f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // replace VPGATHER
3856f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
3857f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
3858f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
3859f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
3860f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
    // xmm6 = entries for pixels 0-1.
3861f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"
3862f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
3863f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
3864f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
3865f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x13,0) ",%3       \n"
    // xmm7 = entries for pixels 2-3.
3866f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"
3867f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
3868f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x17,0) ",%3       \n"
3869f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
3870f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x1b,0) ",%3       \n"
    // xmm0 = entries for pixels 4-5.
3871f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"
3872f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
3873f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x1f,0) ",%3       \n"
3874f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
    // xmm2 = entries for pixels 6-7; then xmm3 = pixels 0-3, xmm0 = pixels
    // 4-7, and ymm3 = all 8 entries (xmm0 inserted as the upper lane).
3875f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"
3876f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
3877f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
3878f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
3879f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // end of VPGATHER
3880f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
    // ymm0 / ymm1 = low / high 8 source bytes of each lane widened to words
    // holding c * 0x101.
3881f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
3882f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
3883f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
    // Double each 16-bit half of the entries, then shuffle to
    // (lo, lo, lo, hi) per pixel so ymm2 / ymm3 line up with ymm0 / ymm1.
3884f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"
3885f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
3886f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
3887f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
    // Keep the high 16 bits of each product and narrow to bytes with
    // unsigned saturation.
3888f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
3889f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
3890f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    // Store to src + (dst - src), then advance src by 32 bytes.
3891f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
3892f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,0) ",%0           \n"
3893f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub        $0x8,%2                        \n"
3894f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
    // Clear the upper ymm halves before returning to non-AVX code.
3895f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vzeroupper                                \n"
3896f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb),      // %0
3897f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_argb),      // %1
3898f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width),         // %2
3899f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "=&r"(alpha)         // %3
3900f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"(fixed_invtbl8),  // %4
3901f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kUnattenShuffleAlpha_AVX2)  // %5
  // NACL_R14 is a macro defined elsewhere; it is followed by no comma, so it
  // must expand either to nothing or to a clobber string plus a comma.
  // xmm4 is listed although the instructions above never touch it.
3902f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
3903f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3904f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
3905f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3906f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBUNATTENUATEROW_AVX2
3907f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3908f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBGRAYROW_SSSE3
3909f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels per loop pass.
// For every pixel the 4 bytes are multiplied by kARGBToYJ and summed, kAddYJ64
// is added and the result is shifted right by 7. That gray value is written to
// bytes 0-2 of the output pixel; byte 3 (alpha) is copied from the source.
// The loop always processes 8 pixels and repeats while the remaining count is
// positive, so a width that is not a multiple of 8 is effectively rounded up.
// NOTE(review): kARGBToYJ and kAddYJ64 are defined outside this excerpt; their
// values are not visible here. movdqa needs them 16-byte aligned.
3910f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3911f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
3912f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToYJ (byte multipliers)
3913f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddYJ64 (word addend)
3914f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3915f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 8 pixel loop.
3916f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
3917b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
3918f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // xmm0 = pixels 0-3
3919f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"  // xmm1 = pixels 4-7
3920f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm0                   \n"  // byte * multiplier, adjacent pairs summed to words
3921f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm1                   \n"
3922f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm1,%%xmm0                   \n"  // one word per pixel: sum over its 4 bytes
3923f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddw     %%xmm5,%%xmm0                   \n"  // + kAddYJ64
3924f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x7,%%xmm0                     \n"  // >> 7
3925f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm0,%%xmm0                   \n"  // low 8 bytes = gray value of pixels 0-7
3926f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"  // reload the same 8 source pixels for alpha
3927f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
3928f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_argb += 32
3929f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrld     $0x18,%%xmm2                    \n"  // keep only byte 3 (alpha) of each pixel
3930f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrld     $0x18,%%xmm3                    \n"
3931f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm3,%%xmm2                   \n"  // dwords -> words
3932f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm2,%%xmm2                   \n"  // words -> bytes: low 8 bytes = alpha of pixels 0-7
3933f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm3                   \n"  // xmm3 = copy of the gray bytes
3934f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm0,%%xmm0                   \n"  // xmm0 = gray,gray per pixel (bytes 0-1)
3935f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm2,%%xmm3                   \n"  // xmm3 = gray,alpha per pixel (bytes 2-3)
3936f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
3937f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklwd %%xmm3,%%xmm0                   \n"  // pixels 0-3: gray,gray,gray,alpha
3938f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhwd %%xmm3,%%xmm1                   \n"  // pixels 4-7: gray,gray,gray,alpha
3939f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
3940f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
3941f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst_argb += 32
3942f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x8,%2                         \n"  // width -= 8
3943f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"  // loop while width > 0
3944f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb),   // %0
3945f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_argb),   // %1
3946f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)       // %2
3947f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "m"(kARGBToYJ),   // %3
3948f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kAddYJ64)     // %4
3949f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
3950f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3951f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
3952f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
3953f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBGRAYROW_SSSE3
3954f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3955f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBSEPIAROW_SSSE3
3956f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang//    b = (r * 35 + g * 68 + b * 17) >> 7
3957f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang//    g = (r * 45 + g * 88 + b * 22) >> 7
3958f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang//    r = (r * 50 + g * 98 + b * 24) >> 7
3959f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Constants for ARGB color to sepia tone.
// Each table repeats one 4-byte coefficient group for 4 pixels. The group is
// laid out in pixel byte order b, g, r, a; the 4th (alpha) coefficient is 0,
// so alpha does not contribute to the sums above.
3960b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardstatic vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
3961b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                             17, 68, 35, 0, 17, 68, 35, 0};
3962f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3963b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardstatic vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
3964b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                             22, 88, 45, 0, 22, 88, 45, 0};
3965f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3966b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardstatic vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
3967b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                             24, 98, 50, 0, 24, 98, 50, 0};
3968f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3969f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Works in place: pixels are read from and written back to dst_argb.
// The new b, g and r bytes follow the formulas above (results saturate to 255
// when packed); byte 3 (alpha) is carried over unchanged.
// The loop always processes 8 pixels and repeats while the remaining count is
// positive, so a width that is not a multiple of 8 is effectively rounded up.
// NOTE(review): movdqa needs the three tables 16-byte aligned; vec8's
// alignment is declared outside this excerpt.
3970f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3971f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
3972f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %2,%%xmm2                       \n"  // xmm2 = kARGBToSepiaB
3973f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %3,%%xmm3                       \n"  // xmm3 = kARGBToSepiaG
3974f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %4,%%xmm4                       \n"  // xmm4 = kARGBToSepiaR
3975f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
3976f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 8 pixel loop.
3977f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
3978b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
3979f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // pixels 0-3
3980f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"  // pixels 4-7
3981f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm2,%%xmm0                   \n"  // byte * B coefficient, pairs summed to words
3982f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm2,%%xmm6                   \n"
3983f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm6,%%xmm0                   \n"  // one word per pixel
3984f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x7,%%xmm0                     \n"  // >> 7
3985f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm0,%%xmm0                   \n"  // low 8 bytes = new b of pixels 0-7
3986f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"  // reload the same pixels for g
3987f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3988f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm3,%%xmm5                   \n"
3989f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm3,%%xmm1                   \n"
3990f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm1,%%xmm5                   \n"
3991f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x7,%%xmm5                     \n"
3992f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm5,%%xmm5                   \n"  // low 8 bytes = new g of pixels 0-7
3993f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm5,%%xmm0                   \n"  // xmm0 = b,g per pixel
3994f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"  // reload the same pixels for r
3995f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
3996f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm5                   \n"
3997f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm1                   \n"
3998f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm1,%%xmm5                   \n"
3999f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x7,%%xmm5                     \n"
4000f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm5,%%xmm5                   \n"  // low 8 bytes = new r of pixels 0-7
4001f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"  // reload the same pixels for alpha
4002f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
4003f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrld     $0x18,%%xmm6                    \n"  // keep only byte 3 (alpha) of each pixel
4004f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrld     $0x18,%%xmm1                    \n"
4005f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm6                   \n"  // dwords -> words
4006f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm6,%%xmm6                   \n"  // words -> bytes: low 8 bytes = alpha of pixels 0-7
4007f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm6,%%xmm5                   \n"  // xmm5 = r,a per pixel
4008f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
4009f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklwd %%xmm5,%%xmm0                   \n"  // pixels 0-3: b,g,r,a
4010f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhwd %%xmm5,%%xmm1                   \n"  // pixels 4-7: b,g,r,a
4011f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(0) "         \n"  // write back in place
4012f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
4013f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,0) ",%0           \n"  // dst_argb += 32
4014f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x8,%1                         \n"  // width -= 8
4015f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"  // loop while width > 0
4016f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(dst_argb),      // %0
4017f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)          // %1
4018f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "m"(kARGBToSepiaB),  // %2
4019f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kARGBToSepiaG),  // %3
4020f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "m"(kARGBToSepiaR)   // %4
4021f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
4022f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
4023f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
4024f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
4025f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBSEPIAROW_SSSE3
4026f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4027f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
4028f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Transform 8 ARGB pixels (32 bytes) with color matrix.
4029f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Same as Sepia except matrix is provided.
// matrix_argb: 16 bytes read as 4 rows of 4 coefficients. Row n produces output
// byte n of every pixel; coefficient k of a row multiplies input byte k.
// pmaddubsw treats the coefficients as signed and the pixel bytes as unsigned.
// Differences from the Sepia row visible in this code: sums are added with
// signed saturation (phaddsw), shifted arithmetically by 6 instead of 7, and
// byte 3 (alpha) is computed from row 3 rather than copied from the source.
// Reads src_argb and writes dst_argb. The loop always processes 8 pixels and
// repeats while the remaining count is positive.
4030b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
4031b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                              uint8* dst_argb,
4032b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                              const int8* matrix_argb,
4033b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                              int width) {
4034f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
4035f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"  // load all 16 matrix bytes
4036f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x00,%%xmm5,%%xmm2             \n"  // xmm2 = row 0 repeated 4 times
4037f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x55,%%xmm5,%%xmm3             \n"  // xmm3 = row 1 repeated 4 times
4038f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0xaa,%%xmm5,%%xmm4             \n"  // xmm4 = row 2 repeated 4 times
4039f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0xff,%%xmm5,%%xmm5             \n"  // xmm5 = row 3 repeated 4 times
4040f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4041f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 8 pixel loop.
4042f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
4043b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
4044f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // pixels 0-3
4045f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"  // pixels 4-7
4046f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm2,%%xmm0                   \n"  // row 0 products, pairs summed to words
4047f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm2,%%xmm7                   \n"
4048f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"  // reload the same pixels for row 1
4049f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
4050f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm3,%%xmm6                   \n"
4051f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm3,%%xmm1                   \n"
4052f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddsw   %%xmm7,%%xmm0                   \n"  // row 0: one saturated word per pixel
4053f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddsw   %%xmm1,%%xmm6                   \n"  // row 1: one saturated word per pixel
4054f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psraw     $0x6,%%xmm0                     \n"  // arithmetic >> 6
4055f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psraw     $0x6,%%xmm6                     \n"
4056f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm0,%%xmm0                   \n"  // clamp to 0..255: output byte 0 of pixels 0-7
4057f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm6,%%xmm6                   \n"  // output byte 1 of pixels 0-7
4058f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm6,%%xmm0                   \n"  // xmm0 = byte0,byte1 per pixel
4059f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"  // reload the same pixels for row 2
4060f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
4061f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm1                   \n"
4062f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm4,%%xmm7                   \n"
4063f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddsw   %%xmm7,%%xmm1                   \n"
4064f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"  // reload the same pixels for row 3
4065f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
4066f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm5,%%xmm6                   \n"
4067f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm5,%%xmm7                   \n"
4068f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddsw   %%xmm7,%%xmm6                   \n"
4069f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psraw     $0x6,%%xmm1                     \n"
4070f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psraw     $0x6,%%xmm6                     \n"
4071f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm1                   \n"  // output byte 2 of pixels 0-7
4072f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm6,%%xmm6                   \n"  // output byte 3 of pixels 0-7
4073f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm6,%%xmm1                   \n"  // xmm1 = byte2,byte3 per pixel
4074f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm6                   \n"
4075f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklwd %%xmm1,%%xmm0                   \n"  // pixels 0-3 fully assembled
4076f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhwd %%xmm1,%%xmm6                   \n"  // pixels 4-7 fully assembled
4077f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
4078f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm6," MEMACCESS2(0x10,1) "   \n"
4079f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,0) ",%0           \n"  // src_argb += 32
4080f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,1) ",%1           \n"  // dst_argb += 32
4081f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x8,%2                         \n"  // width -= 8
4082f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"  // loop while width > 0
4083f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb),      // %0
4084f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_argb),      // %1
4085f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)          // %2
4086f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"(matrix_argb)     // %3
4087f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
4088f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4089f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
4090f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
4091f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
4092f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4093f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBQUANTIZEROW_SSE2
4094f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Quantize 4 ARGB pixels (16 bytes).
// Works in place on dst_argb. For bytes 0-2 of every pixel:
//   v = ((v * scale) >> 16) * interval_size + interval_offset
// using the low 16 bits of each argument, with the result clamped to 0..255.
// The byte 3 (alpha) lane runs the same arithmetic with the upper 16 bits of
// each argument and is then OR-ed with the source alpha, so the source alpha is
// returned unchanged whenever all three arguments fit in 16 bits.
// The loop always processes 4 pixels and repeats while the remaining count is
// positive, so a width that is not a multiple of 4 is effectively rounded up.
4095b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ARGBQuantizeRow_SSE2(uint8* dst_argb,
4096b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                          int scale,
4097b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                          int interval_size,
4098b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                          int interval_offset,
4099b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                          int width) {
4100f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
4101f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %2,%%xmm2                       \n"  // xmm2 = scale
4102f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %3,%%xmm3                       \n"  // xmm3 = interval_size
4103f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %4,%%xmm4                       \n"  // xmm4 = interval_offset
4104f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"  // words 0-2 = low 16 bits, word 3 = upper 16 bits
4105f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x44,%%xmm2,%%xmm2             \n"  // repeat those 4 words for the second pixel
4106f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
4107f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
4108f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
4109f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
4110f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = 0, used to zero-extend bytes to words
4111f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm6,%%xmm6                   \n"  // xmm6 = all ones
4112f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pslld     $0x18,%%xmm6                    \n"  // xmm6 = 0xff000000 per pixel: byte 3 (alpha) mask
4113f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4114f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 4 pixel loop.
4115f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
4116b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
4117f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 4 pixels
4118f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm5,%%xmm0                   \n"  // pixels 0-1: bytes -> words
4119f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmulhuw   %%xmm2,%%xmm0                   \n"  // (v * scale) >> 16
4120f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"  // reload the same 4 pixels
4121f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhbw %%xmm5,%%xmm1                   \n"  // pixels 2-3: bytes -> words
4122f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmulhuw   %%xmm2,%%xmm1                   \n"
4123f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmullw    %%xmm3,%%xmm0                   \n"  // * interval_size (low 16 bits of product)
4124f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm7         \n"  // reload the same 4 pixels for alpha
4125f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmullw    %%xmm3,%%xmm1                   \n"
4126f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm6,%%xmm7                   \n"  // xmm7 = source alpha bytes only
4127f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddw     %%xmm4,%%xmm0                   \n"  // + interval_offset
4128f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddw     %%xmm4,%%xmm1                   \n"
4129f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm0                   \n"  // words -> bytes, clamped to 0..255
4130f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "por       %%xmm7,%%xmm0                   \n"  // OR the source alpha back in
4131f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(0) "         \n"  // write back in place
4132f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0           \n"  // dst_argb += 16
4133f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%1                         \n"  // width -= 4
4134f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"  // loop while width > 0
4135f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(dst_argb),       // %0
4136f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)           // %1
4137f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"(scale),           // %2
4138f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "r"(interval_size),   // %3
4139f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "r"(interval_offset)  // %4
4140f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
4141f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4142f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
4143f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
4144f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBQUANTIZEROW_SSE2
4145f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4146f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBSHADEROW_SSE2
4147f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Shade 4 pixels at a time by specified value.
// value holds one scale byte per pixel byte: byte k of value scales byte k of
// every pixel. Per byte the result is
//   dst = ((src * 0x101) * (value_byte * 0x101)) >> 24
// (the byte duplication below is what produces the * 0x101 factors).
// Reads src_argb and writes dst_argb. The loop always processes 4 pixels and
// repeats while the remaining count is positive, so a width that is not a
// multiple of 4 is effectively rounded up.
4148b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ARGBShadeRow_SSE2(const uint8* src_argb,
4149b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       uint8* dst_argb,
4150b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                       int width,
4151f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                       uint32 value) {
4152f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
4153f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %3,%%xmm2                       \n"  // xmm2 = value
4154f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm2,%%xmm2                   \n"  // each value byte doubled into a word (byte * 0x101)
4155f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklqdq %%xmm2,%%xmm2                  \n"  // repeat those 4 words for the second pixel
4156f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4157f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 4 pixel loop.
4158f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
4159b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
4160f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 4 pixels
4161f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_argb += 16
4162f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
4163f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm0,%%xmm0                   \n"  // pixels 0-1: each byte doubled into a word
4164f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhbw %%xmm1,%%xmm1                   \n"  // pixels 2-3: each byte doubled into a word
4165f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmulhuw   %%xmm2,%%xmm0                   \n"  // high 16 bits of the unsigned product
4166f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmulhuw   %%xmm2,%%xmm1                   \n"
4167f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm0                     \n"  // further >> 8, leaving a value in 0..255
4168f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrlw     $0x8,%%xmm1                     \n"
4169f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm0                   \n"  // words -> bytes
4170f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
4171f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,1) ",%1           \n"  // dst_argb += 16
4172f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%2                         \n"  // width -= 4
4173f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"  // loop while width > 0
4174f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb),  // %0
4175f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_argb),  // %1
4176f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)      // %2
4177f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"(value)       // %3
4178f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
4179f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1", "xmm2"
4180f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
4181f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
4182f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBSHADEROW_SSE2
4183f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4184f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBMULTIPLYROW_SSE2
4185f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// Per byte (all four bytes of each pixel are treated the same way):
//   dst = ((src0 * 0x101) * src1) >> 16
// src_argb0 bytes are doubled into words (the * 0x101 factor) while src_argb1
// bytes are zero-extended, so the high half of the 16-bit product is in 0..255.
// The loop always processes 4 pixels and repeats while the remaining count is
// positive, so a width that is not a multiple of 4 is effectively rounded up.
4186b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ARGBMultiplyRow_SSE2(const uint8* src_argb0,
4187b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                          const uint8* src_argb1,
4188b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                          uint8* dst_argb,
4189b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                          int width) {
4190f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
4191b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = 0, used to zero-extend src_argb1 bytes
4192f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4193f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 4 pixel loop.
4194f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
4195b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
4196f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 4 pixels from src_argb0
4197f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0           \n"  // src_argb0 += 16
4198f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"  // 4 pixels from src_argb1
4199f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,1) ",%1           \n"  // src_argb1 += 16
4200f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0,%%xmm1                   \n"  // register-to-register copy; no memory access
4201f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm2,%%xmm3                   \n"
4202f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm0,%%xmm0                   \n"  // src0 pixels 0-1: each byte doubled into a word
4203f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhbw %%xmm1,%%xmm1                   \n"  // src0 pixels 2-3: each byte doubled into a word
4204f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm5,%%xmm2                   \n"  // src1 pixels 0-1: bytes zero-extended to words
4205f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhbw %%xmm5,%%xmm3                   \n"  // src1 pixels 2-3: bytes zero-extended to words
4206f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmulhuw   %%xmm2,%%xmm0                   \n"  // high 16 bits of the unsigned product
4207f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmulhuw   %%xmm3,%%xmm1                   \n"
4208f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm0                   \n"  // words -> bytes
4209f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4210f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,2) ",%2           \n"  // dst_argb += 16
4211f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%3                         \n"  // width -= 4
4212f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"  // loop while width > 0
4213f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb0),  // %0
4214f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_argb1),  // %1
4215f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_argb),   // %2
4216f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)       // %3
4217f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
4218f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
4219f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4220f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
4221f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
4222f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBMULTIPLYROW_SSE2
4223f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
//
// Per byte channel the result is (a * 0x0101 * b) >> 16, where a comes from
// src_argb0 and b from src_argb1: a is widened to 16 bits by duplicating the
// byte, b is zero-extended, and vpmulhuw keeps the high half of the product.
// This approximates a * b / 255.
//
// Every iteration reads 32 bytes from each source and writes 32 bytes, and the
// loop body runs before width is tested.
// NOTE(review): width is presumably a positive multiple of 8 - confirm against
// callers.
void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    // ymm5 = 0; interleaved with src_argb1 bytes to zero-extend them.
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    // ymm0/ymm1 = src_argb0 bytes duplicated into 16 bit lanes (a * 0x0101).
    "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"
    "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"
    // ymm2/ymm3 = src_argb1 bytes zero-extended to 16 bit lanes.
    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"
    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"
    // High 16 bits of each unsigned product, then pack back to bytes.
    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
    "lea       " MEMLEA(0x20,2) ",%2           \n"
    "sub        $0x8,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  // Writing a ymm register overwrites the matching xmm register, so the
  // clobbers are declared unconditionally: this code is emitted whenever
  // HAS_ARGBMULTIPLYROW_AVX2 is defined, whether or not the translation unit
  // is compiled with __AVX2__.
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_ARGBMULTIPLYROW_AVX2
4264f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_ARGBADDROW_SSE2
// Saturating byte-wise sum of two ARGB rows:
//   dst_argb[i] = min(255, src_argb0[i] + src_argb1[i])
// 4 pixels (16 bytes) are handled per iteration with unaligned accesses.
// NOTE(review): the loop body runs before width is tested and always moves a
// full 16 bytes, so width is presumably a positive multiple of 4 - confirm
// against callers.
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    // Load 4 pixels from each source and advance both source pointers.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    // Unsigned saturating add of all 16 bytes.
    "paddusb   %%xmm1,%%xmm0                   \n"
    // Store 4 pixels and advance the destination pointer.
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    // 4 pixels done; loop while any remain.
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0 - first source row
    "+r"(src_argb1),  // %1 - second source row
    "+r"(dst_argb),   // %2 - destination row
    "+r"(width)       // %3 - pixels remaining
  :
  : "memory", "cc", "xmm0", "xmm1"
  );
}
#endif  // HAS_ARGBADDROW_SSE2
4294f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_ARGBADDROW_AVX2
// Saturating byte-wise sum of two ARGB rows:
//   dst_argb[i] = min(255, src_argb0[i] + src_argb1[i])
// 8 pixels (32 bytes) are handled per iteration with unaligned accesses.
// NOTE(review): the loop body runs before width is tested and always moves a
// full 32 bytes, so width is presumably a positive multiple of 8 - confirm
// against callers.
void ARGBAddRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    // Load 8 pixels of src_argb0, then add src_argb1 straight from memory
    // with unsigned saturation; both source pointers advance by 32 bytes.
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "vpaddusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    // Store 8 pixels and advance the destination pointer.
    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
    "lea        " MEMLEA(0x20,2) ",%2          \n"
    // 8 pixels done; loop while any remain.
    "sub        $0x8,%3                        \n"
    "jg        1b                              \n"
    // Clear the upper ymm halves before returning to non-AVX code.
    "vzeroupper                                \n"
  : "+r"(src_argb0),  // %0 - first source row
    "+r"(src_argb1),  // %1 - second source row
    "+r"(dst_argb),   // %2 - destination row
    "+r"(width)       // %3 - pixels remaining
  :
  : "memory", "cc", "xmm0"
  );
}
#endif  // HAS_ARGBADDROW_AVX2
4324f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Saturating byte-wise difference of two ARGB rows:
//   dst_argb[i] = max(0, src_argb0[i] - src_argb1[i])
// 4 pixels (16 bytes) are handled per iteration with unaligned accesses.
// NOTE(review): the loop body runs before width is tested and always moves a
// full 16 bytes, so width is presumably a positive multiple of 4 - confirm
// against callers.
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    // Load 4 pixels from each source and advance both source pointers.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    // xmm0 -= xmm1 per byte, clamped at zero.
    "psubusb   %%xmm1,%%xmm0                   \n"
    // Store 4 pixels and advance the destination pointer.
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    // 4 pixels done; loop while any remain.
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0 - minuend row
    "+r"(src_argb1),  // %1 - subtrahend row
    "+r"(dst_argb),   // %2 - destination row
    "+r"(width)       // %3 - pixels remaining
  :
  : "memory", "cc", "xmm0", "xmm1"
  );
}
#endif  // HAS_ARGBSUBTRACTROW_SSE2
4354f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Saturating byte-wise difference of two ARGB rows:
//   dst_argb[i] = max(0, src_argb0[i] - src_argb1[i])
// 8 pixels (32 bytes) are handled per iteration with unaligned accesses.
// NOTE(review): the loop body runs before width is tested and always moves a
// full 32 bytes, so width is presumably a positive multiple of 8 - confirm
// against callers.
void ARGBSubtractRow_AVX2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    // Load 8 pixels of src_argb0, then subtract src_argb1 straight from
    // memory, clamping at zero; both source pointers advance by 32 bytes.
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "vpsubusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    // Store 8 pixels and advance the destination pointer.
    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
    "lea        " MEMLEA(0x20,2) ",%2          \n"
    // 8 pixels done; loop while any remain.
    "sub        $0x8,%3                        \n"
    "jg         1b                             \n"
    // Clear the upper ymm halves before returning to non-AVX code.
    "vzeroupper                                \n"
  : "+r"(src_argb0),  // %0 - minuend row
    "+r"(src_argb1),  // %1 - subtrahend row
    "+r"(dst_argb),   // %2 - destination row
    "+r"(width)       // %3 - pixels remaining
  :
  : "memory", "cc", "xmm0"
  );
}
#endif  // HAS_ARGBSUBTRACTROW_AVX2
4384f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_SOBELXROW_SSE2
// Horizontal Sobel filter over three rows. The kernel is
//   -1  0  1
//   -2  0  2
//   -1  0  1
// and the stored value is its absolute response, saturated to 8 bits:
//   d0 = src_y0[i] - src_y0[i + 2]
//   d1 = src_y1[i] - src_y1[i + 2]
//   d2 = src_y2[i] - src_y2[i + 2]
//   dst_sobelx[i] = min(255, |d0 + 2 * d1 + d2|)
// 8 pixels are produced per iteration; each iteration reads 10 bytes from
// every source row (an 8 byte load at offset 0 and another at offset 2).
// NOTE(review): the loop body runs before width is tested, so width is
// presumably a positive multiple of 8 - confirm against callers.
void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  asm volatile (
    // Turn %1, %2 and %3 into offsets from %0 so one pointer walks all rows.
    "sub       %0,%1                           \n"
    "sub       %0,%2                           \n"
    "sub       %0,%3                           \n"
    // xmm5 = 0, used to widen bytes to 16 bit lanes.
    "pxor      %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    // xmm0 = d0 (row 0: column i minus column i + 2).
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "psubw     %%xmm1,%%xmm0                   \n"
    // xmm1 = d1 (row 1), loaded from (%0,%1,1) and 0x2(%0,%1,1).
    MEMOPREG(movq,0x00,0,1,1,xmm1)
    MEMOPREG(movq,0x02,0,1,1,xmm2)
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "psubw     %%xmm2,%%xmm1                   \n"
    // xmm2 = d2 (row 2), loaded from (%0,%2,1) and 0x2(%0,%2,1).
    MEMOPREG(movq,0x00,0,2,1,xmm2)
    MEMOPREG(movq,0x02,0,2,1,xmm3)
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "psubw     %%xmm3,%%xmm2                   \n"
    // xmm0 = d0 + d2 + 2 * d1 (the middle row is added twice).
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    // Absolute value as max(x, 0 - x), then saturate to unsigned bytes.
    "pxor      %%xmm1,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm1                   \n"
    "pmaxsw    %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // Store 8 results to (%0,%3,1), i.e. the destination row.
    MEMOPMEM(movq,xmm0,0x00,0,3,1)
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "sub       $0x8,%4                         \n"
    "jg        1b                              \n"
  : "+r"(src_y0),      // %0 - row 0, also the walking base pointer
    "+r"(src_y1),      // %1 - row 1, becomes an offset from %0
    "+r"(src_y2),      // %2 - row 2, becomes an offset from %0
    "+r"(dst_sobelx),  // %3 - destination, becomes an offset from %0
    "+r"(width)        // %4 - pixels remaining
  :
  : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SOBELXROW_SSE2
4441f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_SOBELYROW_SSE2
// Vertical Sobel filter over two rows. The kernel is
//   -1 -2 -1
//    0  0  0
//    1  2  1
// and the stored value is its absolute response, saturated to 8 bits:
//   d0 = src_y0[i]     - src_y1[i]
//   d1 = src_y0[i + 1] - src_y1[i + 1]
//   d2 = src_y0[i + 2] - src_y1[i + 2]
//   dst_sobely[i] = min(255, |d0 + 2 * d1 + d2|)
// 8 pixels are produced per iteration; each iteration reads 10 bytes from
// both source rows (8 byte loads at offsets 0, 1 and 2).
// NOTE(review): the loop body runs before width is tested, so width is
// presumably a positive multiple of 8 - confirm against callers.
void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  asm volatile (
    // Turn %1 and %2 into offsets from %0 so one pointer walks all rows.
    "sub       %0,%1                           \n"
    "sub       %0,%2                           \n"
    // xmm5 = 0, used to widen bytes to 16 bit lanes.
    "pxor      %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    // xmm0 = d0: column i, row 0 minus row 1 (row 1 read from (%0,%1,1)).
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movq,0x00,0,1,1,xmm1)
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "psubw     %%xmm1,%%xmm0                   \n"
    // xmm1 = d1: column i + 1 (row 1 read from 0x1(%0,%1,1)).
    "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
    MEMOPREG(movq,0x01,0,1,1,xmm2)
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "psubw     %%xmm2,%%xmm1                   \n"
    // xmm2 = d2: column i + 2 (row 1 read from 0x2(%0,%1,1)).
    "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
    MEMOPREG(movq,0x02,0,1,1,xmm3)
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "psubw     %%xmm3,%%xmm2                   \n"
    // xmm0 = d0 + d2 + 2 * d1 (the middle column is added twice).
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    // Absolute value as max(x, 0 - x), then saturate to unsigned bytes.
    "pxor      %%xmm1,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm1                   \n"
    "pmaxsw    %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // Store 8 results to (%0,%2,1), i.e. the destination row.
    MEMOPMEM(movq,xmm0,0x00,0,2,1)
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "sub       $0x8,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_y0),      // %0 - row 0, also the walking base pointer
    "+r"(src_y1),      // %1 - row 1, becomes an offset from %0
    "+r"(dst_sobely),  // %2 - destination, becomes an offset from %0
    "+r"(width)        // %3 - pixels remaining
  :
  : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SOBELYROW_SSE2
4495f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_SOBELROW_SSE2
// Combines Sobel X and Sobel Y into grey ARGB pixels.
//   s = min(255, src_sobelx[i] + src_sobely[i])
//   A = 255
//   R = s
//   G = s
//   B = s
// 16 pixels are handled per iteration: 16 bytes are read from each source
// and 64 bytes are written to dst_argb.
// NOTE(review): the loop body runs before width is tested, so width is
// presumably a positive multiple of 16 - confirm against callers.
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  asm volatile (
    // Turn %1 into an offset from %0 so one pointer walks both sources.
    "sub       %0,%1                           \n"
    // xmm5 = 0xff000000 in every dword: the opaque alpha mask.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pslld     $0x18,%%xmm5                    \n"

    // 16 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // xmm0 = saturating sum of 16 Sobel X and 16 Sobel Y bytes; the Sobel Y
    // bytes are read from (%0,%1,1).
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    // Duplicate every byte into a word: xmm2 = low 8 values, xmm0 = high 8.
    "movdqa    %%xmm0,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm2                   \n"
    "punpckhbw %%xmm0,%%xmm0                   \n"
    // Duplicate words into dwords and force alpha: xmm1 = pixels 0..3,
    // xmm2 = pixels 4..7.
    "movdqa    %%xmm2,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm1                   \n"
    "punpckhwd %%xmm2,%%xmm2                   \n"
    "por       %%xmm5,%%xmm1                   \n"
    "por       %%xmm5,%%xmm2                   \n"
    // Same for the high half: xmm3 = pixels 8..11, xmm0 = pixels 12..15.
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklwd %%xmm0,%%xmm3                   \n"
    "punpckhwd %%xmm0,%%xmm0                   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "por       %%xmm5,%%xmm0                   \n"
    // Write the 16 pixels in order and advance the destination by 64 bytes.
    "movdqu    %%xmm1," MEMACCESS(2) "         \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
    "movdqu    %%xmm3," MEMACCESS2(0x20,2) "   \n"
    "movdqu    %%xmm0," MEMACCESS2(0x30,2) "   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_sobelx),  // %0 - Sobel X row, also the walking base pointer
    "+r"(src_sobely),  // %1 - Sobel Y row, becomes an offset from %0
    "+r"(dst_argb),    // %2 - destination ARGB row
    "+r"(width)        // %3 - pixels remaining
  :
  : "memory", "cc", NACL_R14 "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SOBELROW_SSE2
4548f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane.
//   dst_y[i] = min(255, src_sobelx[i] + src_sobely[i])
// 16 pixels (16 bytes) are handled per iteration with unaligned accesses.
// NOTE(review): the loop body runs before width is tested and always moves a
// full 16 bytes, so width is presumably a positive multiple of 16 - confirm
// against callers.
void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
                          const uint8* src_sobely,
                          uint8* dst_y,
                          int width) {
  asm volatile (
    // Turn %1 into an offset from %0 so one pointer walks both sources.
    // No alpha mask is built here: the output is a single plane, and xmm5 is
    // not in the clobber list, so it must not be written.
    "sub       %0,%1                           \n"

    // 16 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    // Unsigned saturating add of all 16 bytes.
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_y),       // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}
#endif  // HAS_SOBELTOPLANEROW_SSE2
4581f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4582f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_SOBELXYROW_SSE2
4583f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Mixes Sobel X, Sobel Y and Sobel into ARGB.
4584f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// A = 255
4585f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// R = Sobel X
4586f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// G = Sobel
4587f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// B = Sobel Y
// "Sobel" (the G channel) is the unsigned saturating byte sum of the X and Y
// inputs (paddusb). Output bytes are stored in memory order B, G, R, A.
// Each loop iteration reads 16 bytes from each input and stores 64 bytes
// (16 ARGB pixels). The loop repeats while the remaining width is > 0, so the
// final iteration always processes a full group of 16 pixels.
// NOTE(review): presumably callers round width up or pad the buffers so the
// last full-group access stays in bounds - confirm against the callers.
4588b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid SobelXYRow_SSE2(const uint8* src_sobelx,
4589b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                     const uint8* src_sobely,
4590b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                     uint8* dst_argb,
4591b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                     int width) {
4592f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // Turn src_sobely (%1) into an offset from src_sobelx (%0) so that a
    // single advancing pointer addresses both inputs via (%0,%1,1).
4593f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %0,%1                           \n"
    // xmm5 = all 0xff bytes; supplies the alpha channel.
4594f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm5,%%xmm5                   \n"
4595f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4596f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 16 pixel loop (16 input bytes per plane in, 64 ARGB bytes out).
4597f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
4598b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // xmm0 = 16 Sobel X bytes, xmm1 = 16 Sobel Y bytes.
4599f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4600f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
4601f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0           \n"
    // xmm2 = saturating X + Y (the G channel).
4602f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm2                   \n"
4603f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddusb   %%xmm1,%%xmm2                   \n"
    // xmm3 / xmm0 = (X, 0xff) byte pairs: the R,A halves of pixels 0-7 / 8-15.
4604f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm3                   \n"
4605f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm5,%%xmm3                   \n"
4606f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhbw %%xmm5,%%xmm0                   \n"
    // xmm4 / xmm1 = (Y, sum) byte pairs: the B,G halves of pixels 0-7 / 8-15.
4607f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm1,%%xmm4                   \n"
4608f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm2,%%xmm4                   \n"
4609f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhbw %%xmm2,%%xmm1                   \n"
    // Interleave the 16-bit halves into B,G,R,A pixels:
    // xmm6 = pixels 0-3, xmm4 = pixels 4-7.
4610f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm4,%%xmm6                   \n"
4611f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklwd %%xmm3,%%xmm6                   \n"
4612f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhwd %%xmm3,%%xmm4                   \n"
    // xmm7 = pixels 8-11, xmm1 = pixels 12-15.
4613f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm1,%%xmm7                   \n"
4614f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklwd %%xmm0,%%xmm7                   \n"
4615f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhwd %%xmm0,%%xmm1                   \n"
    // Store the 16 pixels with unaligned moves and advance dst by 64 bytes.
4616f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm6," MEMACCESS(2) "         \n"
4617f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm4," MEMACCESS2(0x10,2) "   \n"
4618f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm7," MEMACCESS2(0x20,2) "   \n"
4619f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm1," MEMACCESS2(0x30,2) "   \n"
4620f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,2) ",%2           \n"
    // width -= 16; keep looping while pixels remain.
4621f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%3                        \n"
4622f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
4623f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_sobelx),  // %0
4624f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_sobely),  // %1
4625f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_argb),    // %2
4626f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)        // %3
4627f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
4628f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
4629f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4630f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
4631f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
4632f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_SOBELXYROW_SSE2
4633f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4634f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
4635f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Creates a table of cumulative sums where each value is a sum of all values
4636f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// above and to the left of the value, inclusive of the value.
// row supplies 4 bytes per pixel. Each byte is widened to int32 and added to
// a per-channel running sum for this row; the value stored for a pixel is
// that running sum plus the matching 4 int32 values from previous_cumsum.
// Both cumsum and previous_cumsum advance 16 bytes (4 x int32) per pixel.
// NOTE(review): previous_cumsum is presumably the output of this function for
// the row above - confirm against the callers.
4637b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ComputeCumulativeSumRow_SSE2(const uint8* row,
4638b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                  int32* cumsum,
4639b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                  const int32* previous_cumsum,
4640b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                  int width) {
4641f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // xmm0 = running per-channel sum (4 x int32), xmm1 = zero for unpacking.
4642f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pxor      %%xmm0,%%xmm0                   \n"
4643f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pxor      %%xmm1,%%xmm1                   \n"
    // Bias width by -4; fewer than 4 pixels goes straight to the 1 pixel loop.
4644f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%3                         \n"
4645f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jl        49f                             \n"
    // If cumsum is not 16-byte aligned, the whole row is handled by the
    // 1 pixel loop (note the 4 pixel loop itself only uses unaligned moves).
4646f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "test      $0xf,%1                         \n"
4647f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jne       49f                             \n"
4648f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4649b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    // 4 pixel loop.
4650f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
4651b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "40:                                       \n"
    // Load 16 bytes (4 pixels) and widen bytes -> int32:
    // xmm2, xmm3, xmm4, xmm5 = pixels 0, 1, 2, 3 as 4 x int32 each.
4652f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
4653f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0           \n"
4654f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm2,%%xmm4                   \n"
4655f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm1,%%xmm2                   \n"
4656f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm2,%%xmm3                   \n"
4657f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklwd %%xmm1,%%xmm2                   \n"
4658f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhwd %%xmm1,%%xmm3                   \n"
4659f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhbw %%xmm1,%%xmm4                   \n"
4660f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm4,%%xmm5                   \n"
4661f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklwd %%xmm1,%%xmm4                   \n"
4662f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhwd %%xmm1,%%xmm5                   \n"
    // For each pixel in turn: add it to the running sum (xmm0), then reuse
    // its register to hold previous_cumsum[pixel] + running sum.
4663f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddd     %%xmm2,%%xmm0                   \n"
4664f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
4665f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddd     %%xmm0,%%xmm2                   \n"
4666f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddd     %%xmm3,%%xmm0                   \n"
4667f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
4668f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddd     %%xmm0,%%xmm3                   \n"
4669f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddd     %%xmm4,%%xmm0                   \n"
4670f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
4671f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddd     %%xmm0,%%xmm4                   \n"
4672f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddd     %%xmm5,%%xmm0                   \n"
4673f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
4674f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,2) ",%2           \n"
4675f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddd     %%xmm0,%%xmm5                   \n"
    // Store the 4 results (64 bytes) to cumsum and advance it.
4676f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
4677f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
4678f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm4," MEMACCESS2(0x20,1) "   \n"
4679f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm5," MEMACCESS2(0x30,1) "   \n"
4680f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,1) ",%1           \n"
4681f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%3                         \n"
4682f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jge       40b                             \n"
4683f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
    // %3 is biased by -4 here; adding 3 leaves (remaining pixels - 1), which
    // is negative exactly when nothing is left.
4684b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "49:                                       \n"
4685f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       $0x3,%3                         \n"
4686f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jl        19f                             \n"
4687f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4688b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    // 1 pixel loop.
4689f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
4690b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "10:                                       \n"
    // Load one 4-byte pixel, widen to 4 x int32 and accumulate.
4691f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      " MEMACCESS(0) ",%%xmm2         \n"
4692f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x4,0) ",%0            \n"
4693f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm1,%%xmm2                   \n"
4694f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklwd %%xmm1,%%xmm2                   \n"
4695f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddd     %%xmm2,%%xmm0                   \n"
    // Output = previous_cumsum value + running sum.
4696f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
4697f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,2) ",%2           \n"
4698f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "paddd     %%xmm0,%%xmm2                   \n"
4699f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
4700f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,1) ",%1           \n"
4701f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x1,%3                         \n"
4702f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jge       10b                             \n"
4703f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4704b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "19:                                       \n"
4705f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(row),  // %0
4706f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(cumsum),  // %1
4707f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(previous_cumsum),  // %2
4708f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)  // %3
4709f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
4710f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
4711f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
4712f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
4713f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
4714f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
4715f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4716f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Converts cumulative sums into averaged output bytes.
// For each output pixel (4 x int32 channels in, 4 x uint8 out) it computes
//   sum = topleft[i] - topleft[i + width] - botleft[i] + botleft[i + width]
// where width is an offset counted in int32 elements, scales the sum by an
// approximate 1 / area (rcpss) and packs to bytes with saturation.
// count is the number of pixels to write to dst.
// Two scaling paths exist for groups of 4 pixels: 16-bit fixed point when
// area <= 128, float otherwise. Leftover pixels always use the float path.
// NOTE(review): psubd/paddd read memory operands directly; non-VEX SSE2
// integer ops normally require 16-byte aligned 128-bit memory operands -
// confirm callers guarantee alignment of topleft/botleft (+ width offset).
// NOTE(review): %3 (count) is constrained "+rm" but is used by suffix-less
// "sub"/"add"; if the compiler ever picks a memory operand the operand size
// is not stated - confirm the assemblers in use accept this.
4717b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid CumulativeSumToAverageRow_SSE2(const int32* topleft,
4718b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                    const int32* botleft,
4719b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                    int width,
4720b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                    int area,
4721b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                    uint8* dst,
4722f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                    int count) {
4723f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // xmm5 = (float)area in lane 0; xmm4 = approximate 1 / area in all lanes.
4724f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %5,%%xmm5                       \n"
4725f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cvtdq2ps  %%xmm5,%%xmm5                   \n"
4726f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "rcpss     %%xmm5,%%xmm4                   \n"
4727f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    // Bias count by -4; fewer than 4 pixels goes straight to the 1 pixel loop.
4728f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%3                         \n"
4729f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jl        49f                             \n"
    // area > 128 (unsigned compare) takes the float loop at 40.
    // NOTE(review): presumably 128 is the limit because a box sum of at most
    // 128 * 255 = 32640 still fits a signed 16-bit lane after packssdw -
    // confirm.
4730f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cmpl      $0x80,%5                        \n"
4731f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "ja        40f                             \n"
4732f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
    // Small-area setup: xmm5 = 16-bit fixed point scale in every word,
    // computed as (area + 65535) * (1 / area), converted to int and packed.
    // NOTE(review): for area == 1 this value is about 65536, which packssdw
    // saturates to 32767 - confirm callers never reach this path with such
    // an area.
4733f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4734f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm6,%%xmm6                   \n"
4735f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psrld     $0x10,%%xmm6                    \n"
4736f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cvtdq2ps  %%xmm6,%%xmm6                   \n"
4737f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "addps     %%xmm6,%%xmm5                   \n"
4738f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mulps     %%xmm4,%%xmm5                   \n"
4739f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cvtps2dq  %%xmm5,%%xmm5                   \n"
4740f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packssdw  %%xmm5,%%xmm5                   \n"
4741f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4742b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    // 4 pixel small loop (area <= 128): scale in 16-bit fixed point.
4743f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
4744f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "4:                                         \n"
    // xmm0..xmm3 = box sums for 4 pixels:
    // topleft[i] - topleft[i + width] - botleft[i] + botleft[i + width].
4745f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4746f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
4747f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
4748f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
4749f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
4750f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
4751f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
4752f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
4753f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,0) ",%0           \n"
4754f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
4755f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
4756f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
4757f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
4758f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
4759f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
4760f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
4761f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
4762f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,1) ",%1           \n"
    // Pack sums to 16 bits, take the high half of sum * scale, pack to bytes.
4763f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packssdw  %%xmm1,%%xmm0                   \n"
4764f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packssdw  %%xmm3,%%xmm2                   \n"
4765f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmulhuw   %%xmm5,%%xmm0                   \n"
4766f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmulhuw   %%xmm5,%%xmm2                   \n"
4767f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm2,%%xmm0                   \n"
4768f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4769f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,2) ",%2           \n"
4770f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%3                         \n"
4771f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jge       4b                              \n"
4772f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jmp       49f                             \n"
4773f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4774f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  // 4 pixel loop (area > 128): scale in float.
4775f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
4776f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "40:                                         \n"
    // Same 4-corner box sum as the small loop, kept as int32 in xmm0..xmm3.
4777f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4778f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
4779f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
4780f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
4781f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
4782f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
4783f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
4784f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
4785f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,0) ",%0           \n"
4786f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
4787f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
4788f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
4789f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
4790f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
4791f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
4792f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
4793f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
4794f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x40,1) ",%1           \n"
    // int32 -> float, multiply by 1 / area, float -> int32, then pack
    // int32 -> int16 -> uint8 with saturation.
4795f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
4796f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
4797f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mulps     %%xmm4,%%xmm0                   \n"
4798f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mulps     %%xmm4,%%xmm1                   \n"
4799f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
4800f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
4801f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mulps     %%xmm4,%%xmm2                   \n"
4802f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mulps     %%xmm4,%%xmm3                   \n"
4803f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cvtps2dq  %%xmm0,%%xmm0                   \n"
4804f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cvtps2dq  %%xmm1,%%xmm1                   \n"
4805f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cvtps2dq  %%xmm2,%%xmm2                   \n"
4806f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cvtps2dq  %%xmm3,%%xmm3                   \n"
4807f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packssdw  %%xmm1,%%xmm0                   \n"
4808f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packssdw  %%xmm3,%%xmm2                   \n"
4809f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm2,%%xmm0                   \n"
4810f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4811f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,2) ",%2           \n"
4812f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%3                         \n"
4813f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jge       40b                             \n"
4814f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
    // %3 is biased by -4 here; adding 3 leaves (remaining pixels - 1), which
    // is negative exactly when nothing is left.
4815f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "49:                                         \n"
4816f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       $0x3,%3                         \n"
4817f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jl        19f                             \n"
4818f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4819f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  // 1 pixel loop (always uses the float scale, whatever the area).
4820f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
4821f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "10:                                         \n"
4822f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4823f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
4824f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0           \n"
4825f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
4826f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
4827f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,1) ",%1           \n"
4828f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
4829f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mulps     %%xmm4,%%xmm0                   \n"
4830f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cvtps2dq  %%xmm0,%%xmm0                   \n"
4831f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packssdw  %%xmm0,%%xmm0                   \n"
4832f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm0,%%xmm0                   \n"
    // Write the single 4-byte pixel and advance dst by 4.
4833f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %%xmm0," MEMACCESS(2) "         \n"
4834f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x4,2) ",%2            \n"
4835f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x1,%3                         \n"
4836f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jge       10b                             \n"
4837f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "19:                                         \n"
4838f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(topleft),  // %0
4839f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(botleft),  // %1
4840f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst),      // %2
4841f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+rm"(count)    // %3
4842f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"((intptr_t)(width)),  // %4
4843f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "rm"(area)     // %5
4844f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
4845f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
4846f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
4847f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
4848f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
4849f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4850f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBAFFINEROW_SSE2
4851f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Copy ARGB pixels from source image with slope to a row of destination.
4852f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu KuangLIBYUV_API
4853b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ARGBAffineRow_SSE2(const uint8* src_argb,
4854b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        int src_argb_stride,
4855b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        uint8* dst_argb,
4856b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        const float* src_dudv,
4857b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        int width) {
4858f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  intptr_t src_argb_stride_temp = src_argb_stride;
4859f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  intptr_t temp;
4860f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
4861f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      " MEMACCESS(3) ",%%xmm2         \n"
4862f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"
4863f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "shl       $0x10,%1                        \n"
4864f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       $0x4,%1                         \n"
4865f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %1,%%xmm5                       \n"
4866f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%4                         \n"
4867f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jl        49f                             \n"
4868f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4869f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x44,%%xmm7,%%xmm7             \n"
4870f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
4871f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm2,%%xmm0                   \n"
4872f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "addps     %%xmm7,%%xmm0                   \n"
4873f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movlhps   %%xmm0,%%xmm2                   \n"
4874f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm7,%%xmm4                   \n"
4875f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "addps     %%xmm4,%%xmm4                   \n"
4876f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm2,%%xmm3                   \n"
4877f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "addps     %%xmm4,%%xmm3                   \n"
4878f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "addps     %%xmm4,%%xmm4                   \n"
4879f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4880f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  // 4 pixel loop                              \n"
4881f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
4882f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "40:                                         \n"
4883f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
4884f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
4885f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
4886f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
4887f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %%xmm0,%k1                      \n"
4888f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
4889f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %%xmm0,%k5                      \n"
4890f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
4891f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
4892f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
4893f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckldq %%xmm6,%%xmm1                   \n"
4894f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "addps     %%xmm4,%%xmm2                   \n"
4895f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      %%xmm1," MEMACCESS(2) "         \n"
4896f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %%xmm0,%k1                      \n"
4897f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
4898f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %%xmm0,%k5                      \n"
4899f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
4900f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
4901f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckldq %%xmm6,%%xmm0                   \n"
4902f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "addps     %%xmm4,%%xmm3                   \n"
4903f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
4904f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,2) ",%2           \n"
4905f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%4                         \n"
4906f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jge       40b                             \n"
4907f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4908f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "49:                                         \n"
4909f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       $0x3,%4                         \n"
4910f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jl        19f                             \n"
4911f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
4912f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  // 1 pixel loop                              \n"
4913f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
4914f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "10:                                         \n"
4915f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cvttps2dq %%xmm2,%%xmm0                   \n"
4916f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packssdw  %%xmm0,%%xmm0                   \n"
4917f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddwd   %%xmm5,%%xmm0                   \n"
4918f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "addps     %%xmm7,%%xmm2                   \n"
4919f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %%xmm0,%k1                      \n"
4920f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
4921f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %%xmm0," MEMACCESS(2) "         \n"
4922f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x04,2) ",%2           \n"
4923f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x1,%4                         \n"
4924f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jge       10b                             \n"
4925f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "19:                                         \n"
4926f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb),  // %0
4927f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_argb_stride_temp),  // %1
4928f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_argb),  // %2
4929f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_dudv),  // %3
4930f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+rm"(width),    // %4
4931f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "=&r"(temp)      // %5
4932f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  :
4933f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
4934f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
4935f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
4936f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
4937f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBAFFINEROW_SSE2
4938f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
//
// Blends the row at src_ptr with the row at src_ptr + src_stride and writes
// the result to dst_ptr, 16 bytes per loop iteration.
//
//   dst_ptr            destination row.
//   src_ptr            first source row; the second row is src_ptr+src_stride.
//   src_stride         byte offset from the first to the second source row.
//   dst_width          number of bytes to produce.  Every loop is a do/while
//                      that handles 16 bytes before testing the count, so at
//                      least 16 bytes are processed and the count is
//                      effectively rounded up to a multiple of 16.
//   source_y_fraction  weight of the second row.  Three paths:
//                        0     -> copy of the first row,
//                        0x80  -> pavgb of both rows, i.e. (a + b + 1) >> 1,
//                        other -> ((256 - f) * a + f * b + 128) >> 8.
//                      The weights are packed as bytes, so the general path
//                      presumably expects f in 1..255 - TODO confirm callers.
//
// All loads and stores are unaligned (movdqu).
void InterpolateRow_SSSE3(uint8* dst_ptr,
                          const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  asm volatile (
    // %0 becomes dst - src, so (%1,%0,1) addresses the destination and only
    // the source pointer has to be advanced inside the loops.
    "sub       %1,%0                           \n"
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"
    "cmp       $0x80,%3                        \n"
    "je        50f                             \n"

    // xmm5 = byte pairs (256 - f, f) repeated across the whole register.
    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
    "add       $0x100,%3                       \n"
    "movd      %3,%%xmm5                       \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    // xmm4 = 0x80 in every byte: pixel bias for psubb, and as 0x8080 words
    // the value that undoes that bias and rounds (see paddw below).
    "mov       $0x80808080,%%eax               \n"
    "movd      %%eax,%%xmm4                    \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"

    // General purpose row blend.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm2)           //  movdqu    (%1,%4,1),%%xmm2
    "movdqa     %%xmm0,%%xmm1                  \n"
    // Interleave the two rows so each 16 bit lane holds (row0, row1) bytes.
    "punpcklbw  %%xmm2,%%xmm0                  \n"
    "punpckhbw  %%xmm2,%%xmm1                  \n"
    // pmaddubsw treats its source operand as signed bytes, so bias the
    // pixels by -128; the unsigned operand is the weight pair in xmm5.
    "psubb      %%xmm4,%%xmm0                  \n"
    "psubb      %%xmm4,%%xmm1                  \n"
    "movdqa     %%xmm5,%%xmm2                  \n"
    "movdqa     %%xmm5,%%xmm3                  \n"
    "pmaddubsw  %%xmm0,%%xmm2                  \n"
    "pmaddubsw  %%xmm1,%%xmm3                  \n"
    // + 0x8080: 0x8000 removes the bias (128 * 256, as the weights sum to
    // 256) and 0x0080 rounds before the shift by 8.
    "paddw      %%xmm4,%%xmm2                  \n"
    "paddw      %%xmm4,%%xmm3                  \n"
    "psrlw      $0x8,%%xmm2                    \n"
    "psrlw      $0x8,%%xmm3                    \n"
    "packuswb   %%xmm3,%%xmm2                  \n"
    MEMOPMEM(movdqu,xmm2,0x00,1,0,1)           //  movdqu    %%xmm2,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
    "pavgb     %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        100b                            \n"

  "99:                                         \n"
  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+rm"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_INTERPOLATEROW_SSSE3
5021f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
//
// Blends the row at src_ptr with the row at src_ptr + src_stride and writes
// the result to dst_ptr.
//
//   dst_ptr            destination row.
//   src_ptr            first source row; the second row is src_ptr+src_stride.
//   src_stride         byte offset from the first to the second source row.
//   dst_width          number of bytes to produce.  The two blend loops are
//                      do/while loops that handle 32 bytes before testing the
//                      count, so they process at least 32 bytes and round the
//                      count up to a multiple of 32.  The copy path moves
//                      exactly dst_width bytes.
//   source_y_fraction  weight of the second row.  Three paths:
//                        0     -> rep movsb copy of the first row,
//                        0x80  -> vpavgb of both rows, i.e. (a + b + 1) >> 1,
//                        other -> ((256 - f) * a + f * b + 128) >> 8.
//                      The weights are packed as bytes, so the general path
//                      presumably expects f in 1..255 - TODO confirm callers.
//
// All vector loads and stores are unaligned (vmovdqu).
void InterpolateRow_AVX2(uint8* dst_ptr,
                         const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         int dst_width,
                         int source_y_fraction) {
  // rep movsb takes its count implicitly from the full-width count register.
  // Bind a pointer-sized copy of the width to that register ("+c" only, no
  // memory alternative) so the count is always where the instruction expects
  // it, the register is known to be modified, and its upper half is defined
  // on x86-64.
  intptr_t width_tmp = (intptr_t)(dst_width);
  asm volatile (
    // The copy path needs the real destination pointer in %0, so test for it
    // before %0 is turned into an offset.
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"
    // %0 becomes dst - src, so (%1,%0,1) addresses the destination and only
    // the source pointer has to be advanced inside the loops.
    "sub       %1,%0                           \n"
    "cmp       $0x80,%3                        \n"
    "je        50f                             \n"

    // ymm5 = byte pairs (256 - f, f) repeated across the whole register.
    "vmovd      %3,%%xmm0                      \n"
    "neg        %3                             \n"
    "add        $0x100,%3                      \n"
    "vmovd      %3,%%xmm5                      \n"
    "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"
    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
    "vbroadcastss %%xmm5,%%ymm5                \n"
    // ymm4 = 0x80 in every byte: pixel bias for vpsubb, and as 0x8080 words
    // the value that undoes that bias and rounds (see vpaddw below).
    "mov        $0x80808080,%%eax              \n"
    "vmovd      %%eax,%%xmm4                   \n"
    "vbroadcastss %%xmm4,%%ymm4                \n"

    // General purpose row blend.
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
    MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)          //  vmovdqu   (%1,%4,1),%%ymm2
    // Interleave the two rows so each 16 bit lane holds (row0, row1) bytes.
    // The unpacks and the final pack all work within 128 bit lanes, so the
    // original byte order is restored by vpackuswb.
    "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
    // vpmaddubsw treats the biased pixels as signed bytes and the weight
    // pair in ymm5 as unsigned bytes.
    "vpsubb     %%ymm4,%%ymm1,%%ymm1           \n"
    "vpsubb     %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm1,%%ymm5,%%ymm1           \n"
    "vpmaddubsw %%ymm0,%%ymm5,%%ymm0           \n"
    // + 0x8080: 0x8000 removes the bias (128 * 256, as the weights sum to
    // 256) and 0x0080 rounds before the shift by 8.
    "vpaddw     %%ymm4,%%ymm1,%%ymm1           \n"
    "vpaddw     %%ymm4,%%ymm0,%%ymm0           \n"
    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)          //  vmovdqu   %%ymm0,(%1,%0,1)
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
    VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)     // vpavgb (%1,%4,1),%%ymm0,%%ymm0
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)          //  vmovdqu   %%ymm0,(%1,%0,1)
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"

    // Blend 100 / 0 - Copy row unchanged.
    // No ymm register has been written on this path, so it exits through
    // 999 and skips vzeroupper.
    LABELALIGN
  "100:                                        \n"
    "rep movsb " MEMMOVESTRING(1,0) "          \n"
    "jmp       999f                            \n"

  "99:                                         \n"
    "vzeroupper                                \n"
  "999:                                        \n"
  : "+D"(dst_ptr),    // %0
    "+S"(src_ptr),    // %1
    "+c"(width_tmp),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
  );
}
#endif  // HAS_INTERPOLATEROW_AVX2
5099f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
//
// Reorders the bytes of a row with pshufb.
//
//   src_argb  source row.
//   dst_argb  destination row.
//   shuffler  16 byte pshufb control mask, loaded once before the loop and
//             applied to every 16 byte group.
//   width     pixel count.  Each iteration moves 32 bytes and subtracts 8
//             from width (4 bytes per pixel).  The loop is a do/while, so at
//             least 8 pixels are processed and the count is effectively
//             rounded up to a multiple of 8.
//
// All loads and stores are unaligned (movdqu).
void ARGBShuffleRow_SSSE3(const uint8* src_argb,
                          uint8* dst_argb,
                          const uint8* shuffler,
                          int width) {
  asm volatile (
    // xmm5 = shuffle mask, kept live for the whole loop.
    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)        // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_SSSE3
5129f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
#ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
//
// Reorders the bytes of a row with vpshufb.
//
//   src_argb  source row.
//   dst_argb  destination row.
//   shuffler  16 byte shuffle control mask.  It is broadcast to both 128 bit
//             lanes of ymm5, and vpshufb shuffles within each lane, so every
//             16 byte group gets the same reordering.
//   width     pixel count.  Each iteration moves 64 bytes and subtracts 16
//             from width (4 bytes per pixel).  The loop is a do/while, so at
//             least 16 pixels are processed and the count is effectively
//             rounded up to a multiple of 16.
//
// All loads and stores are unaligned (vmovdqu).
void ARGBShuffleRow_AVX2(const uint8* src_argb,
                         uint8* dst_argb,
                         const uint8* shuffler,
                         int width) {
  asm volatile (
    // ymm5 = shuffle mask in both lanes, kept live for the whole loop.
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
    "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    // Clear the upper ymm halves before returning to non-AVX code.
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)        // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2
5160f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5161f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBSHUFFLEROW_SSE2
5162f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
5163b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ARGBShuffleRow_SSE2(const uint8* src_argb,
5164b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         uint8* dst_argb,
5165b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         const uint8* shuffler,
5166b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                         int width) {
5167f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  uintptr_t pixel_temp;
5168f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
5169f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pxor      %%xmm5,%%xmm5                   \n"
5170f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       " MEMACCESS(4) ",%k2            \n"
5171f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cmp       $0x3000102,%k2                  \n"
5172f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "je        3012f                           \n"
5173f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cmp       $0x10203,%k2                    \n"
5174f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "je        123f                            \n"
5175f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cmp       $0x30201,%k2                    \n"
5176f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "je        321f                            \n"
5177f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cmp       $0x2010003,%k2                  \n"
5178f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "je        2103f                           \n"
5179f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5180f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
5181b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
5182f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS(4) ",%2             \n"
5183f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5184f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b2," MEMACCESS(1) "            \n"
5185f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
5186f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5187f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b2," MEMACCESS2(0x1,1) "       \n"
5188f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
5189f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5190f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b2," MEMACCESS2(0x2,1) "       \n"
5191f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
5192f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
5193f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b2," MEMACCESS2(0x3,1) "       \n"
5194f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x4,0) ",%0            \n"
5195f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x4,1) ",%1            \n"
5196f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x1,%3                         \n"
5197f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
5198f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jmp       99f                             \n"
5199f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5200f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
5201f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "123:                                        \n"
5202f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5203f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0           \n"
5204f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
5205f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm5,%%xmm0                   \n"
5206f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhbw %%xmm5,%%xmm1                   \n"
5207f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
5208f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
5209f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
5210f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
5211f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm0                   \n"
5212f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5213f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,1) ",%1           \n"
5214f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%3                         \n"
5215f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        123b                            \n"
5216f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jmp       99f                             \n"
5217f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5218f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
5219f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "321:                                        \n"
5220f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5221f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0           \n"
5222f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
5223f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm5,%%xmm0                   \n"
5224f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhbw %%xmm5,%%xmm1                   \n"
5225f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
5226f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
5227f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
5228f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
5229f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm0                   \n"
5230f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5231f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,1) ",%1           \n"
5232f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%3                         \n"
5233f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        321b                            \n"
5234f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jmp       99f                             \n"
5235f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5236f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
5237f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "2103:                                       \n"
5238f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5239f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0           \n"
5240f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
5241f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm5,%%xmm0                   \n"
5242f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhbw %%xmm5,%%xmm1                   \n"
5243f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
5244f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
5245f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
5246f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
5247f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm0                   \n"
5248f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5249f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,1) ",%1           \n"
5250f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%3                         \n"
5251f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        2103b                           \n"
5252f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jmp       99f                             \n"
5253f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5254f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
5255f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "3012:                                       \n"
5256f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
5257f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0           \n"
5258f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
5259f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm5,%%xmm0                   \n"
5260f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhbw %%xmm5,%%xmm1                   \n"
5261f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
5262f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
5263f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
5264f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
5265f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm1,%%xmm0                   \n"
5266f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
5267f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,1) ",%1           \n"
5268f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%3                         \n"
5269f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        3012b                           \n"
5270f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5271f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  "99:                                         \n"
5272f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb),     // %0
5273f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_argb),     // %1
5274f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "=&d"(pixel_temp),  // %2
5275f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)         // %3
5276f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"(shuffler)       // %4
5277f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", NACL_R14
5278f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm5"
5279f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
5280f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
5281f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBSHUFFLEROW_SSE2
5282f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5283f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_I422TOYUY2ROW_SSE2
// Interleaves one row of separate Y, U and V bytes into a single packed
// stream in "Y0 U0 Y1 V0" byte order (the order the punpck sequence below
// produces).
//   src_y     - luma bytes; 16 are consumed per loop pass.
//   src_u     - first chroma plane; 8 bytes are consumed per loop pass.
//   src_v     - second chroma plane; 8 bytes are consumed per loop pass.
//   dst_frame - output; 32 bytes are written per loop pass.
//   width     - pixel count. The loop subtracts 16 per pass and repeats while
//               the remainder is > 0, so a width that is not a multiple of 16
//               still reads and writes a full 16-pixel group.
// All loads and stores use unaligned forms (movq / movdqu).
5284f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid I422ToYUY2Row_SSE2(const uint8* src_y,
5285f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                        const uint8* src_u,
5286f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                        const uint8* src_v,
5287b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        uint8* dst_frame,
5288b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        int width) {
5289b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  asm volatile (
    // %2 becomes (src_v - src_u) so that V can be addressed as (%1,%2,1) and
    // only the U pointer has to be advanced inside the loop.
5290f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       %1,%2                             \n"
5291f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
5292b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // xmm2 = 8 U bytes, xmm3 = 8 V bytes, then xmm2 = U0 V0 U1 V1 ...
5293f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      " MEMACCESS(1) ",%%xmm2           \n"
5294f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
5295f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x8,1) ",%1              \n"
5296f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm3,%%xmm2                     \n"
    // xmm0 / xmm1 = 16 Y bytes; Y is the destination operand of the
    // interleave, so each Y byte lands in front of its chroma byte.
5297f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
5298f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0             \n"
5299f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                     \n"
5300f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm2,%%xmm0                     \n"
5301f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhbw %%xmm2,%%xmm1                     \n"
    // Store the two 16-byte halves and advance the output by 32 bytes.
5302f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm0," MEMACCESS(3) "           \n"
5303f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"
5304f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,3) ",%3             \n"
5305f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%4                          \n"
5306f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg         1b                               \n"
5307f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    : "+r"(src_y),  // %0
5308f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      "+r"(src_u),  // %1
5309f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      "+r"(src_v),  // %2
5310f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      "+r"(dst_frame),  // %3
5311f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      "+rm"(width)  // %4
5312f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    :
5313f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    : "memory", "cc", NACL_R14
5314f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3"
5315f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
5316f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
5317f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_I422TOYUY2ROW_SSE2
5318f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5319f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_I422TOUYVYROW_SSE2
// Interleaves one row of separate Y, U and V bytes into a single packed
// stream in "U0 Y0 V0 Y1" byte order (chroma first; the order the punpck
// sequence below produces).
//   src_y     - luma bytes; 16 are consumed per loop pass.
//   src_u     - first chroma plane; 8 bytes are consumed per loop pass.
//   src_v     - second chroma plane; 8 bytes are consumed per loop pass.
//   dst_frame - output; 32 bytes are written per loop pass.
//   width     - pixel count. The loop subtracts 16 per pass and repeats while
//               the remainder is > 0, so a width that is not a multiple of 16
//               still reads and writes a full 16-pixel group.
// All loads and stores use unaligned forms (movq / movdqu).
5320f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid I422ToUYVYRow_SSE2(const uint8* src_y,
5321f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                        const uint8* src_u,
5322f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                        const uint8* src_v,
5323b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        uint8* dst_frame,
5324b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                        int width) {
5325b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  asm volatile (
    // %2 becomes (src_v - src_u) so that V can be addressed as (%1,%2,1) and
    // only the U pointer has to be advanced inside the loop.
5326f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub        %1,%2                            \n"
5327f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
5328b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // xmm2 = 8 U bytes, xmm3 = 8 V bytes, then xmm2 = U0 V0 U1 V1 ...
5329f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      " MEMACCESS(1) ",%%xmm2           \n"
5330f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
5331f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x8,1) ",%1              \n"
5332f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm3,%%xmm2                     \n"
    // xmm0 = 16 Y bytes. Here the UV register is the destination operand of
    // the interleave, so each chroma byte lands in front of its Y byte.
5333f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
5334f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm2,%%xmm1                     \n"
5335f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,0) ",%0             \n"
5336f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm0,%%xmm1                     \n"
5337f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhbw %%xmm0,%%xmm2                     \n"
    // Store the two 16-byte halves and advance the output by 32 bytes.
5338f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm1," MEMACCESS(3) "           \n"
5339f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"
5340f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x20,3) ",%3             \n"
5341f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x10,%4                          \n"
5342f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg         1b                               \n"
5343f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    : "+r"(src_y),  // %0
5344f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      "+r"(src_u),  // %1
5345f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      "+r"(src_v),  // %2
5346f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      "+r"(dst_frame),  // %3
5347f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang      "+rm"(width)  // %4
5348f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    :
5349f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    : "memory", "cc", NACL_R14
5350f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3"
5351f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
5352f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
5353f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_I422TOUYVYROW_SSE2
5354f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5355f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Applies a cubic polynomial to every byte of a row, 8 bytes (2 four-byte
// pixels) per loop pass. For each input byte X the stored value is derived from
//   poly[0..3] + poly[4..7] * X + poly[8..11] * X^2 + poly[12..15] * X^3
// where the four floats of each group apply to the four bytes of a pixel in
// order.
//   src_argb - input bytes; 8 are read per pass (movq).
//   dst_argb - output bytes; 8 are written per pass (movq).
//   poly     - 16 floats, read at byte offsets 0x00, 0x10, 0x20 and 0x30.
//              NOTE(review): they are used as memory operands of mulps/addps,
//              which in the legacy SSE encoding need a 16-byte aligned
//              address -- confirm callers guarantee that.
//   width    - pixel count; decremented by 2 per pass, looping while > 0, so
//              an odd width still processes a full pair.
5356f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ARGBPolynomialRow_SSE2(const uint8* src_argb,
5357b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                            uint8* dst_argb,
5358b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                            const float* poly,
5359f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                            int width) {
5360f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // xmm3 = 0, used as the zero source when widening bytes.
5361f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pxor      %%xmm3,%%xmm3                   \n"
5362f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5363f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 2 pixel loop.
5364f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
5365b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // Widen 8 bytes -> 8 words -> 2 x 4 dwords, then convert to float.
    // xmm0 holds X for the first pixel, xmm4 holds X for the second.
5366f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      " MEMACCESS(0) ",%%xmm0         \n"
5367f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x8,0) ",%0            \n"
5368f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklbw %%xmm3,%%xmm0                   \n"
5369f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm4                   \n"
5370f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklwd %%xmm3,%%xmm0                   \n"
5371f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpckhwd %%xmm3,%%xmm4                   \n"
5372f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
5373f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
    // Keep copies of X in xmm1 / xmm5; xmm0 / xmm4 = poly[0..3] + poly[4..7] * X.
5374f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm0,%%xmm1                   \n"
5375f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm4,%%xmm5                   \n"
5376f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"
5377f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
5378f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "addps     " MEMACCESS(3) ",%%xmm0         \n"
5379f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "addps     " MEMACCESS(3) ",%%xmm4         \n"
    // xmm2 / xmm6 = X^2, then xmm1 / xmm5 = X^3.
5380f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm1,%%xmm2                   \n"
5381f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqa    %%xmm5,%%xmm6                   \n"
5382f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mulps     %%xmm1,%%xmm2                   \n"
5383f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mulps     %%xmm5,%%xmm6                   \n"
5384f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mulps     %%xmm2,%%xmm1                   \n"
5385f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mulps     %%xmm6,%%xmm5                   \n"
    // Scale the square by poly[8..11] and the cube by poly[12..15], then
    // accumulate both into the running result.
5386f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"
5387f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
5388f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"
5389f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
5390f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "addps     %%xmm2,%%xmm0                   \n"
5391f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "addps     %%xmm6,%%xmm4                   \n"
5392f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "addps     %%xmm1,%%xmm0                   \n"
5393f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "addps     %%xmm5,%%xmm4                   \n"
    // Truncate to int32 and narrow to bytes. packuswb works on 16-bit words,
    // so the first pack sees each 32-bit result as two words and the second
    // pack squeezes the resulting words down to 8 bytes.
    // NOTE(review): confirm the clamping is as intended for results outside
    // 0..32767, since the dwords are not narrowed with a dword pack first.
5394f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cvttps2dq %%xmm0,%%xmm0                   \n"
5395f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "cvttps2dq %%xmm4,%%xmm4                   \n"
5396f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm4,%%xmm0                   \n"
5397f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "packuswb  %%xmm0,%%xmm0                   \n"
5398f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movq      %%xmm0," MEMACCESS(1) "         \n"
5399f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x8,1) ",%1            \n"
5400f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x2,%2                         \n"
5401f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
5402f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb),  // %0
5403f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_argb),  // %1
5404f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)      // %2
5405f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"(poly)        // %3
5406f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc"
5407f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
5408f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
5409f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
5410f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
5411f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5412f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// 256-bit variant of the cubic polynomial row function: for each input byte X
// it evaluates C0 + C1 * X + C2 * X^2 + C3 * X^3 in float, truncates, and
// narrows back to a byte. 8 bytes (2 four-byte pixels) are handled per pass.
//   src_argb - input bytes; 8 are read per pass (vpmovzxbd).
//   dst_argb - output bytes; 8 are written per pass (vmovq).
//   poly     - 16 floats; the groups at byte offsets 0x00/0x10/0x20/0x30 are
//              C0/C1/C2/C3, each broadcast to both 128-bit lanes up front.
//   width    - pixel count; decremented by 2 per pass, looping while > 0.
// The loop uses vfmadd* instructions, so the CPU must support FMA as well as
// AVX2. NOTE(review): "xmm1" is listed as clobbered but ymm1/xmm1 is never
// touched by the asm below; harmless, but could be dropped.
5413f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid ARGBPolynomialRow_AVX2(const uint8* src_argb,
5414b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                            uint8* dst_argb,
5415b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                            const float* poly,
5416f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                            int width) {
5417f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // ymm4..ymm7 = C0..C3, each 4-float group duplicated in both lanes.
5418f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"
5419f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"
5420f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"
5421f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"
5422f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5423f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 2 pixel loop.
5424f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
5425b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
5426f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
5427f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea         " MEMLEA(0x8,0) ",%0          \n"
5428f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
5429f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
5430f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
5431f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
5432f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
5433f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
    // Truncate to int32, pack dwords -> words (per lane), gather the useful
    // quadwords of both lanes into the low 128 bits, then pack words -> bytes.
5434f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vcvttps2dq  %%ymm0,%%ymm0                 \n"
5435f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
5436f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
5437f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
5438f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vmovq       %%xmm0," MEMACCESS(1) "       \n"
5439f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea         " MEMLEA(0x8,1) ",%1          \n"
5440f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub         $0x2,%2                       \n"
5441f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg          1b                            \n"
    // Clear the upper ymm halves before returning to non-VEX code.
5442f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "vzeroupper                                \n"
5443f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(src_argb),  // %0
5444f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_argb),  // %1
5445f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)      // %2
5446f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"(poly)        // %3
5447f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc",
5448f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
5449f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
5450f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
5451f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
5452f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5453b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#ifdef HAS_HALFFLOATROW_SSE2
// Extra factor folded into the caller's scale before the float multiply.
// 1.9259299444e-34f is approximately 2^-112. Presumably this moves the float
// exponent into the range of a 16-bit half float so that the ">> 13" below
// leaves half-float bits in the low word -- confirm against the C reference.
5454b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardstatic float kScaleBias = 1.9259299444e-34f;
// Converts a row of 16-bit unsigned values to 16-bit results without F16C:
// each value is widened to int32, converted to float, multiplied by
// (scale * kScaleBias), and the float's bit pattern is shifted right by 13
// and packed to 16 bits with signed saturation (packssdw).
//   src   - input uint16 values; 8 are read per loop pass.
//   dst   - output uint16 values; 8 are written per loop pass.
//   scale - multiplier applied to every value (combined with kScaleBias).
//   width - element count; decremented by 8 per pass, looping while > 0, so a
//           width that is not a multiple of 8 still processes a full group.
5455b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
5456b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  asm volatile (
    // xmm4 = scaled bias in all 4 lanes; xmm5 = 0 for zero-extension.
    // %1 becomes (dst - src) so one advancing pointer (%0) addresses both rows.
5457b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "pshufd      $0x0,%3,%%xmm4                \n"
5458b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "pxor        %%xmm5,%%xmm5                 \n"
5459b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "sub         %0,%1                         \n"
5460b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
5461b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    // 8 values per pass (see "sub $0x8" below).
5462b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    LABELALIGN
5463b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
5464b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "movdqu      " MEMACCESS(0) ",%%xmm2       \n"  // 8 shorts
5465b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "add         $0x10,%0                      \n"
5466b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "movdqa      %%xmm2,%%xmm3                 \n"
5467b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "punpcklwd   %%xmm5,%%xmm2                 \n"  // low 4 shorts -> 4 ints (xmm2)
5468b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "cvtdq2ps    %%xmm2,%%xmm2                 \n"  // 4 ints -> 4 floats
5469b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "punpckhwd   %%xmm5,%%xmm3                 \n"
5470b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "cvtdq2ps    %%xmm3,%%xmm3                 \n"
5471b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "mulps       %%xmm4,%%xmm2                 \n"
5472b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "mulps       %%xmm4,%%xmm3                 \n"
    // Integer shift of the float bit patterns, then narrow 8 dwords to words.
5473b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "psrld       $0xd,%%xmm2                   \n"
5474b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "psrld       $0xd,%%xmm3                   \n"
5475b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "packssdw    %%xmm3,%%xmm2                 \n"
    // Presumably expands to: movdqu %%xmm2,-0x10(%0,%1,1). The -0x10
    // compensates for %0 having already been advanced above.
5476b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    MEMOPMEM(movdqu,xmm2,-0x10,0,1,1)
5477b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "sub         $0x8,%2                       \n"
5478b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "jg          1b                            \n"
5479b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  : "+r"(src),    // %0
5480b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "+r"(dst),    // %1
5481b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "+r"(width)   // %2
5482b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  : "x"(scale * kScaleBias)   // %3
5483b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  : "memory", "cc",
5484b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "xmm2", "xmm3", "xmm4", "xmm5"
5485b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  );
5486b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard}
5487b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#endif  // HAS_HALFFLOATROW_SSE2
5488b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
5489b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#ifdef HAS_HALFFLOATROW_AVX2
// 256-bit variant of the shift-based 16-bit conversion: each uint16 is
// widened to int32, converted to float, multiplied by (scale * kScaleBias),
// and the float's bit pattern is shifted right by 13 and packed to 16 bits
// with signed saturation.
//   src   - input uint16 values; 16 are read per loop pass.
//   dst   - output uint16 values; 16 are written per loop pass.
//   scale - multiplier applied to every value (combined with kScaleBias).
//   width - element count; decremented by 16 per pass, looping while > 0.
// NOTE(review): kScaleBias is defined inside the HAS_HALFFLOATROW_SSE2 guard,
// not this function's HAS_HALFFLOATROW_AVX2 guard -- confirm the two macros
// are always defined together.
5490b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
5491b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  asm volatile (
    // ymm4 = scaled bias in all 8 lanes; ymm5 = 0 for zero-extension.
    // %1 becomes (dst - src) so one advancing pointer (%0) addresses both rows.
5492b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vbroadcastss  %3, %%ymm4                  \n"
5493b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
5494b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "sub        %0,%1                          \n"
5495b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
5496b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    // 16 pixel loop.
5497b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    LABELALIGN
5498b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
5499b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vmovdqu    " MEMACCESS(0) ",%%ymm2        \n"  // 16 shorts
5500b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "add        $0x20,%0                       \n"
    // The unpacks work within each 128-bit lane, so element order is
    // lane-permuted here ("mutates"); the per-lane pack below restores it.
5501b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpunpckhwd %%ymm5,%%ymm2,%%ymm3           \n"  // mutates
5502b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpunpcklwd %%ymm5,%%ymm2,%%ymm2           \n"
5503b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vcvtdq2ps  %%ymm3,%%ymm3                  \n"
5504b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vcvtdq2ps  %%ymm2,%%ymm2                  \n"
5505b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vmulps     %%ymm3,%%ymm4,%%ymm3           \n"
5506b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vmulps     %%ymm2,%%ymm4,%%ymm2           \n"
    // Integer shift of the float bit patterns, then narrow dwords to words.
5507b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpsrld     $0xd,%%ymm3,%%ymm3             \n"
5508b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpsrld     $0xd,%%ymm2,%%ymm2             \n"
5509b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"  // unmutates
    // Presumably expands to: vmovdqu %%ymm2,-0x20(%0,%1,1). The -0x20
    // compensates for %0 having already been advanced above.
5510b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    MEMOPMEM(vmovdqu,ymm2,-0x20,0,1,1)
5511b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "sub        $0x10,%2                       \n"
5512b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "jg         1b                             \n"
5513b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
    // Clear the upper ymm halves before returning to non-VEX code.
5514b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vzeroupper                                \n"
5515b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  : "+r"(src),    // %0
5516b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "+r"(dst),    // %1
5517b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "+r"(width)   // %2
5518b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  : "x"(scale * kScaleBias)   // %3
5519b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  : "memory", "cc",
5520b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "xmm2", "xmm3", "xmm4", "xmm5"
5521b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  );
5522b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard}
5523b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#endif  // HAS_HALFFLOATROW_AVX2
5524b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
5525b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#ifdef HAS_HALFFLOATROW_F16C
// Converts a row of 16-bit unsigned values to half floats using the hardware
// vcvtps2ph instruction: each value is widened to int32, converted to float,
// multiplied by scale, and converted to half precision with rounding-control
// immediate 3 (round toward zero).
//   src   - input uint16 values; 16 are read per loop pass (two 8-element loads).
//   dst   - output 16-bit values; 16 are written per loop pass.
//   scale - multiplier applied to every value; used as-is, with no extra bias
//           factor.
//   width - element count; decremented by 16 per pass, looping while > 0, so a
//           width that is not a multiple of 16 still processes a full group.
5526b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
5527b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  asm volatile (
    // ymm4 = scale in all 8 lanes.
    // %1 becomes (dst - src) so one advancing pointer (%0) addresses both rows.
5528b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vbroadcastss  %3, %%ymm4                  \n"
5529b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "sub        %0,%1                          \n"
5530b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
5531b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    // 16 pixel loop.
5532b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    LABELALIGN
5533b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
5534b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpmovzxwd   " MEMACCESS(0) ",%%ymm2       \n"  // first 8 shorts -> 8 ints
5535b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpmovzxwd   " MEMACCESS2(0x10,0) ",%%ymm3 \n"  // next 8 shorts -> 8 ints
5536b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
5537b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
5538b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vmulps      %%ymm2,%%ymm4,%%ymm2          \n"
5539b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vmulps      %%ymm3,%%ymm4,%%ymm3          \n"
    // 8 floats -> 8 half floats in the low 128 bits; imm 3 = truncate.
5540b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
5541b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
    // Presumably expand to stores at 0x00(%0,%1,1) and 0x10(%0,%1,1), i.e.
    // the dst position matching the current src position. %0 is advanced only
    // after both stores.
5542b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
5543b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
5544b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "add         $0x20,%0                      \n"
5545b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "sub         $0x10,%2                      \n"
5546b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "jg          1b                            \n"
    // Clear the upper ymm halves before returning to non-VEX code.
5547b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vzeroupper                                \n"
5548b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  : "+r"(src),   // %0
5549b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "+r"(dst),   // %1
5550b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "+r"(width)  // %2
5551b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  : "x"(scale)   // %3
5552b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  : "memory", "cc",
5553b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "xmm2", "xmm3", "xmm4"
5554b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  );
5555b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard}
5556b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#endif  // HAS_HALFFLOATROW_F16C
5557b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
5558b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#ifdef HAS_HALFFLOATROW_F16C
// Converts a row of unsigned 16-bit integers at src into 16-bit half floats
// at dst using AVX2/F16C instructions, without applying any scale factor:
// the float parameter is unnamed and never read.
// Each loop pass converts 16 values and the loop body always runs at least
// once, so a full group of 16 values is read and written even when fewer
// than 16 remain in width.
5559b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) {
5560b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  asm volatile (
    // Replace dst with the distance (dst - src); src + distance then
    // addresses the destination, so only src has to be advanced in the loop.
5561b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "sub        %0,%1                          \n"
5562b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    // 16 pixel loop.
5563b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    LABELALIGN
5564b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // Two loads of 8 uint16 values each, zero-extended to 32-bit lanes.
5565b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpmovzxwd   " MEMACCESS(0) ",%%ymm2       \n"  // 16 shorts -> 16 ints
5566b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vpmovzxwd   " MEMACCESS2(0x10,0) ",%%ymm3 \n"
    // 32-bit integers -> single precision floats.
5567b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
5568b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
    // Floats -> half floats; immediate 3 selects truncation (round toward
    // zero) as the rounding mode.
5569b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"
5570b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
    // Store the two 16-byte results. NOTE(review): MEMOPMEM is defined outside
    // this view; presumably it emits "vmovdqu %%xmmN,disp(%0,%1,1)", i.e. a
    // store to src + (dst - src) + disp -- confirm against the macro.
5571b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
5572b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
    // Advance src by 32 bytes (16 uint16 values) and count 16 values as done.
5573b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "add         $0x20,%0                      \n"
5574b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "sub         $0x10,%2                      \n"
    // Keep looping while the remaining width is still positive.
5575b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "jg          1b                            \n"
    // Zero the upper halves of the ymm registers before leaving AVX code.
5576b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "vzeroupper                                \n"
5577b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  : "+r"(src),   // %0
5578b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "+r"(dst),   // %1
5579b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "+r"(width)  // %2
5580b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  :
5581b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  : "memory", "cc",
5582b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "xmm2", "xmm3"
5583b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard  );
5584b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard}
5585b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard#endif  // HAS_HALFFLOATROW_F16C
5586b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard
5587f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBCOLORTABLEROW_X86
5588f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Transform ARGB pixels with color table.
// Operates in place on dst_argb. For every 4-byte pixel, the byte at position
// c (0..3) with value v is replaced by table_argb[v * 4 + c], as spelled out
// by the expanded addressing forms noted beside each lookup below.
// One pixel is handled per loop pass and the loop body runs at least once.
5589b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ARGBColorTableRow_X86(uint8* dst_argb,
5590b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                           const uint8* table_argb,
5591f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                           int width) {
  // Scratch register holding first the byte read from the pixel and then the
  // table entry. The "d" constraint pins it to edx/rdx. NOTE(review):
  // presumably so that %b1 names a byte-addressable register on 32-bit x86
  // -- confirm.
5592f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  uintptr_t pixel_temp;
5593f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
5594f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 1 pixel loop.
5595f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
5596b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // Byte 0: read it, then step dst_argb to the next pixel right away; every
    // later access to the current pixel therefore uses a negative offset.
5597f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS(0) ",%1             \n"
5598f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x4,0) ",%0            \n"
5599f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
5600f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    // Byte 1: same read / lookup / write-back sequence with table offset 1.
5601f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
5602f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
5603f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    // Byte 2, table offset 2.
5604f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
5605f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
5606f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    // Byte 3, table offset 3.
5607f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
5608f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
5609f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
    // One pixel done; repeat while the remaining width is still positive.
5610f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "dec       %2                              \n"
5611f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
5612f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(dst_argb),     // %0
5613f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "=&d"(pixel_temp),  // %1
5614f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)         // %2
5615f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"(table_argb)     // %3
5616f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc");
5617f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
5618f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBCOLORTABLEROW_X86
5619f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5620f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_RGBCOLORTABLEROW_X86
5621f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Transform RGB pixels with color table.
// Operates in place on dst_argb. Pixels are 4 bytes apart; the bytes at
// positions c = 0..2 with value v are replaced by table_argb[v * 4 + c],
// while the byte at position 3 is neither read nor written.
// One pixel is handled per loop pass and the loop body runs at least once.
5622f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuangvoid RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  // Scratch register for the byte value and the looked-up result; the "d"
  // constraint pins it to edx/rdx. NOTE(review): presumably so that %b1
  // names a byte-addressable register on 32-bit x86 -- confirm.
5623f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  uintptr_t pixel_temp;
5624f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
5625f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 1 pixel loop.
5626f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
5627b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // Byte 0: read it, then move dst_argb on to the next pixel; the remaining
    // accesses to the current pixel use negative offsets from the new pointer.
5628f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS(0) ",%1             \n"
5629f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x4,0) ",%0            \n"
5630f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
5631f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    // Byte 1, table offset 1.
5632f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
5633f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
5634f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    // Byte 2, table offset 2. Byte 3 is skipped.
5635f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
5636f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
5637f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    // One pixel done; repeat while the remaining width is still positive.
5638f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "dec       %2                              \n"
5639f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
5640f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "+r"(dst_argb),     // %0
5641f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "=&d"(pixel_temp),  // %1
5642f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(width)         // %2
5643f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"(table_argb)     // %3
5644f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc");
5645f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
5646f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_RGBCOLORTABLEROW_X86
5647f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5648f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
5649f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang// Transform RGB pixels with luma table.
// Reads 4-byte pixels from src_argb and writes 4-byte pixels to dst_argb.
// For each pixel a weighted sum of its four bytes is computed using the four
// byte coefficients packed in lumacoeff; that 16-bit sum with its low 8 bits
// cleared is added to luma to form the base of the lookup table used for
// the pixel. Bytes 0..2 are mapped through that table (indexed by the byte
// value) and byte 3 is copied through unchanged.
// Four pixels (16 bytes) are handled per loop pass and the loop body runs at
// least once.
5650b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchardvoid ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
5651b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                 uint8* dst_argb,
5652f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang                                 int width,
5653b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                 const uint8* luma,
5654b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard                                 uint32 lumacoeff) {
  // pixel_temp (%0, pinned to edx/rdx) holds the byte being translated;
  // table_temp (%1, pinned to eax/rax) holds the per-pixel table base address.
5655f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  uintptr_t pixel_temp;
5656f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  uintptr_t table_temp;
5657f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  asm volatile (
    // xmm3 = lumacoeff replicated into all four 32-bit lanes.
5658f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %6,%%xmm3                       \n"
5659f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    // xmm4 = 0xff00 in every 16-bit lane (all ones shifted left by 8).
5660f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pcmpeqb   %%xmm4,%%xmm4                   \n"
5661f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "psllw     $0x8,%%xmm4                     \n"
    // xmm5 = 0, used below to widen 16-bit offsets to 32 bits.
5662f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pxor      %%xmm5,%%xmm5                   \n"
5663f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5664f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    // 4 pixel loop.
5665f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    LABELALIGN
5666b83bb38f0a92bedeb52baa31e515220927ef53bbFrank Barchard    "1:                                        \n"
    // Load 4 pixels and compute one table offset per pixel:
    //   pmaddubsw: unsigned pixel bytes times signed coefficient bytes,
    //              adjacent products summed into 16-bit lanes (2 per pixel);
    //   phaddw:    adds the two partial sums of each pixel;
    //   pand:      clears the low 8 bits of each sum;
    //   punpcklwd: zero-extends the four sums to 32-bit lanes.
5667f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"
5668f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pmaddubsw %%xmm3,%%xmm0                   \n"
5669f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "phaddw    %%xmm0,%%xmm0                   \n"
5670f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pand      %%xmm4,%%xmm0                   \n"
5671f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "punpcklwd %%xmm5,%%xmm0                   \n"
    // Pixel 0: table base = luma + offset of pixel 0, then rotate the lanes
    // so the next pixel's offset sits in the low lane.
5672f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
5673f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       %5,%1                           \n"
5674f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
5675f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
    // Bytes 0..2 of the pixel go through the table; byte 3 (offset 0x3) is
    // copied from source to destination without a lookup.
5676f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS(2) ",%0             \n"
5677f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5678f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b0," MEMACCESS(3) "            \n"
5679f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
5680f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5681f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
5682f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
5683f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5684f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
5685f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
5686f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b0," MEMACCESS2(0x3,3) "       \n"
5687f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
    // Pixel 1 (bytes 0x4..0x7): same sequence with its own table base.
5688f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
5689f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       %5,%1                           \n"
5690f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
5691f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5692f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
5693f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5694f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
5695f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
5696f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5697f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
5698f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
5699f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5700f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
5701f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
5702f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b0," MEMACCESS2(0x7,3) "       \n"
5703f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
    // Pixel 2 (bytes 0x8..0xb).
5704f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
5705f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       %5,%1                           \n"
5706f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
5707f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5708f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
5709f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5710f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
5711f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
5712f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5713f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
5714f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
5715f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5716f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
5717f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
5718f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b0," MEMACCESS2(0xb,3) "       \n"
5719f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
    // Pixel 3 (bytes 0xc..0xf); no lane rotation is needed after the last
    // offset has been extracted.
5720f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
5721f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "add       %5,%1                           \n"
5722f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5723f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
5724f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5725f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
5726f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
5727f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5728f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
5729f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
5730f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
5731f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
5732f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
5733f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
    // Advance both pointers by 16 bytes (4 pixels) and loop while the
    // remaining width is still positive.
5734f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,2) ",%2           \n"
5735f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "lea       " MEMLEA(0x10,3) ",%3           \n"
5736f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "sub       $0x4,%4                         \n"
5737f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "jg        1b                              \n"
5738f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "=&d"(pixel_temp),  // %0
5739f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "=&a"(table_temp),  // %1
5740f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(src_argb),     // %2
5741f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+r"(dst_argb),     // %3
5742f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "+rm"(width)        // %4
5743f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "r"(luma),          // %5
5744f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang    "rm"(lumacoeff)     // %6
5745f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
5746f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang  );
5747f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}
5748f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5749f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5750f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif  // defined(__x86_64__) || defined(__i386__)
5751f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang
5752f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#ifdef __cplusplus
5753f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}  // extern "C"
5754f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang}  // namespace libyuv
5755f047e7ca6983218eed7703c7afd51fed7bd3b5c9Hangyu Kuang#endif
5756