1/*
2 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/row.h"
12
13#ifdef __cplusplus
14namespace libyuv {
15extern "C" {
16#endif
17
18// This module is for GCC x86 and x64.
19#if !defined(LIBYUV_DISABLE_X86) && \
20    (defined(__x86_64__) || (defined(__i386__) && !defined(_MSC_VER)))
21
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)

// Constants for ARGB
// Luma coefficients as 7-bit fixed point, byte order (B,G,R,A), repeated 4x
// to fill a 16-byte vector. Used with pmaddubsw + psrlw $7 (see
// ARGBToYRow_SSSE3 below); the +16 studio-range bias is added separately
// via kAddY16.
static vec8 kARGBToY = {13, 65, 33, 0, 13, 65, 33, 0,
                        13, 65, 33, 0, 13, 65, 33, 0};

// JPeg full range.
// Full-range (JPEG) luma coefficients; same layout as kARGBToY, no +16 bias.
static vec8 kARGBToYJ = {15, 75, 38, 0, 15, 75, 38, 0,
                         15, 75, 38, 0, 15, 75, 38, 0};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
32
#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)

// Chroma (U/V) coefficients, 7-bit fixed point, byte order (B,G,R,A)
// repeated 4x per 16-byte vector. The *J variants are the full-range
// (JPEG) versions. The BGRA/ABGR/RGBA tables below are the same weights
// permuted to match each format's in-memory channel order.
static vec8 kARGBToU = {112, -74, -38, 0, 112, -74, -38, 0,
                        112, -74, -38, 0, 112, -74, -38, 0};

static vec8 kARGBToUJ = {127, -84, -43, 0, 127, -84, -43, 0,
                         127, -84, -43, 0, 127, -84, -43, 0};

static vec8 kARGBToV = {
    -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

static vec8 kARGBToVJ = {-20, -107, 127, 0, -20, -107, 127, 0,
                         -20, -107, 127, 0, -20, -107, 127, 0};

// Constants for BGRA
static vec8 kBGRAToY = {0, 33, 65, 13, 0, 33, 65, 13,
                        0, 33, 65, 13, 0, 33, 65, 13};

static vec8 kBGRAToU = {0, -38, -74, 112, 0, -38, -74, 112,
                        0, -38, -74, 112, 0, -38, -74, 112};

static vec8 kBGRAToV = {0, 112, -94, -18, 0, 112, -94, -18,
                        0, 112, -94, -18, 0, 112, -94, -18};

// Constants for ABGR
static vec8 kABGRToY = {33, 65, 13, 0, 33, 65, 13, 0,
                        33, 65, 13, 0, 33, 65, 13, 0};

static vec8 kABGRToU = {-38, -74, 112, 0, -38, -74, 112, 0,
                        -38, -74, 112, 0, -38, -74, 112, 0};

static vec8 kABGRToV = {112, -94, -18, 0, 112, -94, -18, 0,
                        112, -94, -18, 0, 112, -94, -18, 0};

// Constants for RGBA.
static vec8 kRGBAToY = {0, 13, 65, 33, 0, 13, 65, 33,
                        0, 13, 65, 33, 0, 13, 65, 33};

static vec8 kRGBAToU = {0, 112, -74, -38, 0, 112, -74, -38,
                        0, 112, -74, -38, 0, 112, -74, -38};

static vec8 kRGBAToV = {0, -18, -94, 112, 0, -18, -94, 112,
                        0, -18, -94, 112, 0, -18, -94, 112};

// +16 bias added to studio-range Y after the >>7 (see ARGBToYRow_SSSE3).
static uvec8 kAddY16 = {16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
                        16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u};

// 7 bit fixed point 0.5.
// Word-sized rounding constant used by the full-range (J) luma path.
static vec16 kAddYJ64 = {64, 64, 64, 64, 64, 64, 64, 64};

// Byte bias to recenter signed chroma onto the unsigned 128-centered range.
static uvec8 kAddUV128 = {128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
                          128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Word-sized 128-per-byte bias (0x8080) for the full-range (J) chroma path.
static uvec16 kAddUVJ128 = {0x8080u, 0x8080u, 0x8080u, 0x8080u,
                            0x8080u, 0x8080u, 0x8080u, 0x8080u};
#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
90
#ifdef HAS_RGB24TOARGBROW_SSSE3

// pshufb shuffle masks: each byte selects a source byte index; an index
// with the high bit set (128u) makes pshufb write zero to that lane.
// The 32-byte lvec8 tables are per-128-bit-lane masks for AVX2 vpshufb.

// Shuffle table for converting RGB24 to ARGB.
static uvec8 kShuffleMaskRGB24ToARGB = {0u, 1u, 2u, 12u, 3u, 4u,  5u,  13u,
                                        6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u};

// Shuffle table for converting RAW to ARGB.
static uvec8 kShuffleMaskRAWToARGB = {2u, 1u, 0u, 12u, 5u,  4u,  3u, 13u,
                                      8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u};

// Shuffle table for converting RAW to RGB24.  First 8.
static const uvec8 kShuffleMaskRAWToRGB24_0 = {
    2u,   1u,   0u,   5u,   4u,   3u,   8u,   7u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Middle 8.
static const uvec8 kShuffleMaskRAWToRGB24_1 = {
    2u,   7u,   6u,   5u,   10u,  9u,   8u,   13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting RAW to RGB24.  Last 8.
static const uvec8 kShuffleMaskRAWToRGB24_2 = {
    8u,   7u,   12u,  11u,  10u,  15u,  14u,  13u,
    128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RGB24.
static uvec8 kShuffleMaskARGBToRGB24 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGB to RAW.
static uvec8 kShuffleMaskARGBToRAW = {
    2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
static uvec8 kShuffleMaskARGBToRGB24_0 = {
    0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u};

// YUY2 shuf 16 Y to 32 Y.
static const lvec8 kShuffleYUY2Y = {0,  0,  2,  2,  4,  4,  6,  6,  8,  8, 10,
                                    10, 12, 12, 14, 14, 0,  0,  2,  2,  4, 4,
                                    6,  6,  8,  8,  10, 10, 12, 12, 14, 14};

// YUY2 shuf 8 UV to 16 UV.
static const lvec8 kShuffleYUY2UV = {1,  3,  1,  3,  5,  7,  5,  7,  9,  11, 9,
                                     11, 13, 15, 13, 15, 1,  3,  1,  3,  5,  7,
                                     5,  7,  9,  11, 9,  11, 13, 15, 13, 15};

// UYVY shuf 16 Y to 32 Y.
static const lvec8 kShuffleUYVYY = {1,  1,  3,  3,  5,  5,  7,  7,  9,  9, 11,
                                    11, 13, 13, 15, 15, 1,  1,  3,  3,  5, 5,
                                    7,  7,  9,  9,  11, 11, 13, 13, 15, 15};

// UYVY shuf 8 UV to 16 UV.
static const lvec8 kShuffleUYVYUV = {0,  2,  0,  2,  4,  6,  4,  6,  8,  10, 8,
                                     10, 12, 14, 12, 14, 0,  2,  0,  2,  4,  6,
                                     4,  6,  8,  10, 8,  10, 12, 14, 12, 14};

// NV21 shuf 8 VU to 16 UV.
static const lvec8 kShuffleNV21 = {
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
    1, 0, 1, 0, 3, 2, 3, 2, 5, 4, 5, 4, 7, 6, 7, 6,
};
#endif  // HAS_RGB24TOARGBROW_SSSE3
154
#ifdef HAS_J400TOARGBROW_SSE2
// Convert J400 (full-range gray, 1 byte/pixel) to ARGB: each Y byte is
// replicated into B, G and R, and alpha is forced to 0xff.
// Processes 8 pixels (8 src bytes -> 32 dst bytes) per loop iteration;
// widths that are not a multiple of 8 are handled by _Any wrappers
// elsewhere -- TODO(review): confirm caller convention.
void J400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones
    "pslld     $0x18,%%xmm5                    \n"  // xmm5 = 0xff000000 alpha mask
    LABELALIGN
    "1:                                        \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // load 8 Y bytes
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // Y -> YY per pixel
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm0                   \n"  // low 4 pixels: YYYY
    "punpckhwd %%xmm1,%%xmm1                   \n"  // high 4 pixels: YYYY
    "por       %%xmm5,%%xmm0                   \n"  // set alpha byte to 0xff
    "por       %%xmm5,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"  // 8 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(width)        // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_J400TOARGBROW_SSE2
182
183#ifdef HAS_RGB24TOARGBROW_SSSE3
// Convert RGB24 (3 bytes/pixel) to ARGB (4 bytes/pixel), setting alpha to
// 0xff. Processes 16 pixels (48 src bytes -> 64 dst bytes) per iteration:
// three 16-byte loads are realigned with palignr so each pshufb sees four
// whole 3-byte pixels, then kShuffleMaskRGB24ToARGB expands them to 4 bytes.
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = shuffle mask
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // pixels 0-4 (+2 bytes)
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x30,0) ",%0           \n"
    "movdqa    %%xmm3,%%xmm2                   \n"
    "palignr   $0x8,%%xmm1,%%xmm2              \n"  // xmm2 = pixels 8-11
    "pshufb    %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm2                   \n"  // alpha = 0xff
    "palignr   $0xc,%%xmm0,%%xmm1              \n"  // xmm1 = pixels 4-7
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "palignr   $0x4,%%xmm3,%%xmm3              \n"  // xmm3 = pixels 12-15
    "pshufb    %%xmm4,%%xmm3                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"  // 16 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)        // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc" , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
221
// Convert RAW (3 bytes/pixel, byte order reversed relative to RGB24) to
// ARGB, setting alpha to 0xff. Identical structure to RGB24ToARGBRow_SSSE3;
// only the shuffle mask (kShuffleMaskRAWToARGB) differs, swapping the
// channel order during the 3->4 byte expansion. 16 pixels per iteration.
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = shuffle mask
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x30,0) ",%0           \n"
    "movdqa    %%xmm3,%%xmm2                   \n"
    "palignr   $0x8,%%xmm1,%%xmm2              \n"  // xmm2 = pixels 8-11
    "pshufb    %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm2                   \n"  // alpha = 0xff
    "palignr   $0xc,%%xmm0,%%xmm1              \n"  // xmm1 = pixels 4-7
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "palignr   $0x4,%%xmm3,%%xmm3              \n"  // xmm3 = pixels 12-15
    "pshufb    %%xmm4,%%xmm3                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqu    %%xmm3," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"  // 16 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(width)        // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
259
// Convert RAW to RGB24 (swap R and B within each 3-byte pixel).
// Processes 8 pixels (24 bytes in/out) per iteration using three
// overlapping 16-byte loads at offsets 0, 4 and 8; each shuffle mask
// rearranges 8 output bytes and zeroes the rest (indices >= 128).
// NOTE(review): the loads at +0x8 and the store at +0x10 read/write up to
// 16 bytes, so the row buffers must tolerate a small overread/overwrite
// past 24 bytes -- standard for these row functions, confirm with callers.
void RAWToRGB24Row_SSSE3(const uint8* src_raw, uint8* dst_rgb24, int width) {
  asm volatile (
   "movdqa     %3,%%xmm3                       \n"  // first-8 mask
   "movdqa     %4,%%xmm4                       \n"  // middle-8 mask
   "movdqa     %5,%%xmm5                       \n"  // last-8 mask
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x4,0) ",%%xmm1    \n"
    "movdqu    " MEMACCESS2(0x8,0) ",%%xmm2    \n"
    "lea       " MEMLEA(0x18,0) ",%0           \n"
    "pshufb    %%xmm3,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm2                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "movq      %%xmm1," MEMACCESS2(0x8,1) "    \n"
    "movq      %%xmm2," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x18,1) ",%1           \n"
    "sub       $0x8,%2                         \n"  // 8 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src_raw),    // %0
    "+r"(dst_rgb24),  // %1
    "+r"(width)       // %2
  : "m"(kShuffleMaskRAWToRGB24_0),  // %3
    "m"(kShuffleMaskRAWToRGB24_1),  // %4
    "m"(kShuffleMaskRAWToRGB24_2)   // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
289
// Convert RGB565 (16 bits/pixel) to ARGB, alpha = 0xff.
// Bit expansion uses pmulhuw tricks: a 5-bit channel placed in the top bits
// of a word, multiplied by 0x0108 and taking the high half, yields
// (v<<3)|(v>>2); the 6-bit green uses multiplier 0x2080 for (v<<2)|(v>>4).
// 8 pixels (16 src bytes -> 32 dst bytes) per iteration.
void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov       $0x1080108,%%eax                \n"  // 5->8 bit multiplier
    "movd      %%eax,%%xmm5                    \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "mov       $0x20802080,%%eax               \n"  // 6->8 bit multiplier
    "movd      %%eax,%%xmm6                    \n"
    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "psllw     $0xb,%%xmm3                     \n"  // xmm3 = 0xf800 red mask
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psllw     $0xa,%%xmm4                     \n"
    "psrlw     $0x5,%%xmm4                     \n"  // xmm4 = 0x07e0 green mask
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psllw     $0x8,%%xmm7                     \n"  // xmm7 = 0xff00 alpha
    "sub       %0,%1                           \n"  // dst -= 2*src so that
    "sub       %0,%1                           \n"  // (%1,%0,2) addresses dst
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 8 RGB565 pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm3,%%xmm1                   \n"  // isolate red
    "psllw     $0xb,%%xmm2                     \n"  // blue to top bits
    "pmulhuw   %%xmm5,%%xmm1                   \n"  // expand red to 8 bits
    "pmulhuw   %%xmm5,%%xmm2                   \n"  // expand blue to 8 bits
    "psllw     $0x8,%%xmm1                     \n"
    "por       %%xmm2,%%xmm1                   \n"  // words = R:B
    "pand      %%xmm4,%%xmm0                   \n"  // isolate green
    "pmulhuw   %%xmm6,%%xmm0                   \n"  // expand green to 8 bits
    "por       %%xmm7,%%xmm0                   \n"  // words = A:G
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"  // interleave to BGRA
    "punpckhbw %%xmm0,%%xmm2                   \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"  // 8 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
337
// Convert ARGB1555 (1-bit alpha, 5 bits per color) to ARGB8888.
// Color channels expand 5->8 bits via the pmulhuw trick (see
// RGB565ToARGBRow_SSE2); the 1-bit alpha is replicated to 0x00/0xff by
// arithmetic right shift of the sign bit. 8 pixels per iteration.
void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov       $0x1080108,%%eax                \n"  // 5->8 bit multiplier
    "movd      %%eax,%%xmm5                    \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "mov       $0x42004200,%%eax               \n"  // green expand multiplier
    "movd      %%eax,%%xmm6                    \n"
    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "psllw     $0xb,%%xmm3                     \n"  // xmm3 = 0xf800
    "movdqa    %%xmm3,%%xmm4                   \n"
    "psrlw     $0x6,%%xmm4                     \n"  // xmm4 = 0x03e0 green mask
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psllw     $0x8,%%xmm7                     \n"  // xmm7 = 0xff00
    "sub       %0,%1                           \n"  // dst -= 2*src so that
    "sub       %0,%1                           \n"  // (%1,%0,2) addresses dst
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 8 ARGB1555 pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psllw     $0x1,%%xmm1                     \n"  // red to bits 11-15
    "psllw     $0xb,%%xmm2                     \n"  // blue to top bits
    "pand      %%xmm3,%%xmm1                   \n"
    "pmulhuw   %%xmm5,%%xmm2                   \n"  // expand blue
    "pmulhuw   %%xmm5,%%xmm1                   \n"  // expand red
    "psllw     $0x8,%%xmm1                     \n"
    "por       %%xmm2,%%xmm1                   \n"  // words = R:B
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm4,%%xmm0                   \n"  // isolate green
    "psraw     $0x8,%%xmm2                     \n"  // replicate alpha bit
    "pmulhuw   %%xmm6,%%xmm0                   \n"  // expand green
    "pand      %%xmm7,%%xmm2                   \n"  // alpha -> 0x00 or 0xff
    "por       %%xmm2,%%xmm0                   \n"  // words = A:G
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"  // interleave to BGRA
    "punpckhbw %%xmm0,%%xmm2                   \n"
    MEMOPMEM(movdqu,xmm1,0x00,1,0,2)           //  movdqu  %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqu,xmm2,0x10,1,0,2)           //  movdqu  %%xmm2,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"  // 8 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
388
// Convert ARGB4444 (4 bits per channel) to ARGB8888.
// Each 4-bit nibble n expands to (n<<4)|n, i.e. 0xA -> 0xAA.
// 8 pixels (16 src bytes -> 32 dst bytes) per iteration.
void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "mov       $0xf0f0f0f,%%eax                \n"  // low-nibble mask
    "movd      %%eax,%%xmm4                    \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "pslld     $0x4,%%xmm5                     \n"  // high-nibble mask 0xf0..
    "sub       %0,%1                           \n"  // dst -= 2*src so that
    "sub       %0,%1                           \n"  // (%1,%0,2) addresses dst
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 8 ARGB4444 pixels
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm4,%%xmm0                   \n"  // low nibbles (G,B)
    "pand      %%xmm5,%%xmm2                   \n"  // high nibbles (A,R)
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "psllw     $0x4,%%xmm1                     \n"
    "psrlw     $0x4,%%xmm3                     \n"
    "por       %%xmm1,%%xmm0                   \n"  // n -> (n<<4)|n
    "por       %%xmm3,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm0                   \n"  // interleave to BGRA
    "punpckhbw %%xmm2,%%xmm1                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,2)           //  movdqu  %%xmm0,(%1,%0,2)
    MEMOPMEM(movdqu,xmm1,0x10,1,0,2)           //  movdqu  %%xmm1,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"  // 8 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  :
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
426
// Convert ARGB to RGB24 by dropping the alpha byte.
// 16 pixels (64 src bytes -> 48 dst bytes) per iteration: each 16-byte
// group is shuffled down to 12 valid bytes (kShuffleMaskARGBToRGB24 zeroes
// the top 4), then byte-shifts/ORs repack four 12-byte pieces into three
// contiguous 16-byte stores.
void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "movdqa    %3,%%xmm6                       \n"  // xmm6 = shuffle mask
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "pshufb    %%xmm6,%%xmm0                   \n"  // 16 -> 12 bytes each
    "pshufb    %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm6,%%xmm2                   \n"
    "pshufb    %%xmm6,%%xmm3                   \n"
    "movdqa    %%xmm1,%%xmm4                   \n"
    "psrldq    $0x4,%%xmm1                     \n"
    "pslldq    $0xc,%%xmm4                     \n"  // fold piece 1 into 0
    "movdqa    %%xmm2,%%xmm5                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pslldq    $0x8,%%xmm5                     \n"  // fold piece 2 into 1
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "psrldq    $0x8,%%xmm2                     \n"
    "pslldq    $0x4,%%xmm3                     \n"  // fold piece 3 into 2
    "por       %%xmm3,%%xmm2                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x30,1) ",%1           \n"
    "sub       $0x10,%2                        \n"  // 16 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
464
// Convert ARGB to RAW (3 bytes/pixel, R and B swapped relative to RGB24).
// Identical packing structure to ARGBToRGB24Row_SSSE3; only the shuffle
// mask (kShuffleMaskARGBToRAW) differs. 16 pixels per iteration.
void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "movdqa    %3,%%xmm6                       \n"  // xmm6 = shuffle mask
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "pshufb    %%xmm6,%%xmm0                   \n"  // 16 -> 12 bytes each
    "pshufb    %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm6,%%xmm2                   \n"
    "pshufb    %%xmm6,%%xmm3                   \n"
    "movdqa    %%xmm1,%%xmm4                   \n"
    "psrldq    $0x4,%%xmm1                     \n"
    "pslldq    $0xc,%%xmm4                     \n"  // fold piece 1 into 0
    "movdqa    %%xmm2,%%xmm5                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pslldq    $0x8,%%xmm5                     \n"  // fold piece 2 into 1
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "psrldq    $0x8,%%xmm2                     \n"
    "pslldq    $0x4,%%xmm3                     \n"  // fold piece 3 into 2
    "por       %%xmm3,%%xmm2                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x30,1) ",%1           \n"
    "sub       $0x10,%2                        \n"  // 16 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
502
// Convert ARGB to RGB565 by truncating each channel to 5/6/5 bits.
// Per dword (A<<24|R<<16|G<<8|B): blue>>3 & 0x1f, green>>5 & 0x7e0,
// (src<<8)>>16 & 0xf800 positions red; packssdw narrows dwords to words.
// 4 pixels (16 src bytes -> 8 dst bytes) per iteration.
void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "psrld     $0x1b,%%xmm3                    \n"  // xmm3 = 0x1f blue mask
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psrld     $0x1a,%%xmm4                    \n"
    "pslld     $0x5,%%xmm4                     \n"  // xmm4 = 0x7e0 green mask
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pslld     $0xb,%%xmm5                     \n"  // xmm5 = 0xf800 red mask
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 4 ARGB pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pslld     $0x8,%%xmm0                     \n"
    "psrld     $0x3,%%xmm1                     \n"  // blue into place
    "psrld     $0x5,%%xmm2                     \n"  // green into place
    "psrad     $0x10,%%xmm0                    \n"  // red into place
    "pand      %%xmm3,%%xmm1                   \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "por       %%xmm2,%%xmm1                   \n"
    "por       %%xmm1,%%xmm0                   \n"  // combine r|g|b
    "packssdw  %%xmm0,%%xmm0                   \n"  // dwords -> words
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"  // 4 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
538
// Convert ARGB to RGB565 with ordered dithering.
// dither4 packs 4 dither bytes (one per pixel column); each byte is
// replicated across all 4 channels of its pixel and added with unsigned
// saturation (paddusb) before the 5/6/5 truncation, which matches
// ARGBToRGB565Row_SSE2. 4 pixels per iteration.
void ARGBToRGB565DitherRow_SSE2(const uint8* src,
                                uint8* dst,
                                const uint32 dither4,
                                int width) {
  asm volatile(
      "movd       %3,%%xmm6                      \n"  // 4 dither bytes
      "punpcklbw  %%xmm6,%%xmm6                  \n"  // replicate each byte x2
      "movdqa     %%xmm6,%%xmm7                  \n"
      "punpcklwd  %%xmm6,%%xmm6                  \n"  // ...x4 = per-channel
      "punpckhwd  %%xmm7,%%xmm7                  \n"
      "pcmpeqb    %%xmm3,%%xmm3                  \n"
      "psrld      $0x1b,%%xmm3                   \n"  // 0x1f blue mask
      "pcmpeqb    %%xmm4,%%xmm4                  \n"
      "psrld      $0x1a,%%xmm4                   \n"
      "pslld      $0x5,%%xmm4                    \n"  // 0x7e0 green mask
      "pcmpeqb    %%xmm5,%%xmm5                  \n"
      "pslld      $0xb,%%xmm5                    \n"  // 0xf800 red mask

      LABELALIGN
      "1:                                        \n"
      "movdqu     (%0),%%xmm0                    \n"  // 4 ARGB pixels
      "paddusb    %%xmm6,%%xmm0                  \n"  // saturating add dither
      "movdqa     %%xmm0,%%xmm1                  \n"
      "movdqa     %%xmm0,%%xmm2                  \n"
      "pslld      $0x8,%%xmm0                    \n"
      "psrld      $0x3,%%xmm1                    \n"  // blue into place
      "psrld      $0x5,%%xmm2                    \n"  // green into place
      "psrad      $0x10,%%xmm0                   \n"  // red into place
      "pand       %%xmm3,%%xmm1                  \n"
      "pand       %%xmm4,%%xmm2                  \n"
      "pand       %%xmm5,%%xmm0                  \n"
      "por        %%xmm2,%%xmm1                  \n"
      "por        %%xmm1,%%xmm0                  \n"  // combine r|g|b
      "packssdw   %%xmm0,%%xmm0                  \n"  // dwords -> words
      "lea        0x10(%0),%0                    \n"
      "movq       %%xmm0,(%1)                    \n"
      "lea        0x8(%1),%1                     \n"
      "sub        $0x4,%2                        \n"  // 4 pixels per iteration
      "jg        1b                              \n"
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
585
#ifdef HAS_ARGBTORGB565DITHERROW_AVX2
// AVX2 version of ARGBToRGB565DitherRow: 8 pixels (32 src bytes -> 16 dst
// bytes) per iteration. vpermq reorders 128-bit lanes after the in-lane
// unpack/pack operations so bytes land in row order. Clobber names use the
// xmm aliases of the ymm registers actually touched.
void ARGBToRGB565DitherRow_AVX2(const uint8* src,
                                uint8* dst,
                                const uint32 dither4,
                                int width) {
  asm volatile(
      "vbroadcastss %3,%%xmm6                    \n"  // dither4 in every dword
      "vpunpcklbw %%xmm6,%%xmm6,%%xmm6           \n"  // replicate bytes x2
      "vpermq     $0xd8,%%ymm6,%%ymm6            \n"
      "vpunpcklwd %%ymm6,%%ymm6,%%ymm6           \n"  // ...x4 = per-channel
      "vpcmpeqb   %%ymm3,%%ymm3,%%ymm3           \n"
      "vpsrld     $0x1b,%%ymm3,%%ymm3            \n"  // 0x1f blue mask
      "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n"
      "vpsrld     $0x1a,%%ymm4,%%ymm4            \n"
      "vpslld     $0x5,%%ymm4,%%ymm4             \n"  // 0x7e0 green mask
      "vpslld     $0xb,%%ymm3,%%ymm5             \n"  // 0xf800 red mask

      LABELALIGN
      "1:                                        \n"
      "vmovdqu    (%0),%%ymm0                    \n"  // 8 ARGB pixels
      "vpaddusb   %%ymm6,%%ymm0,%%ymm0           \n"  // saturating add dither
      "vpsrld     $0x5,%%ymm0,%%ymm2             \n"  // green into place
      "vpsrld     $0x3,%%ymm0,%%ymm1             \n"  // blue into place
      "vpsrld     $0x8,%%ymm0,%%ymm0             \n"  // red into place
      "vpand      %%ymm4,%%ymm2,%%ymm2           \n"
      "vpand      %%ymm3,%%ymm1,%%ymm1           \n"
      "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
      "vpor       %%ymm2,%%ymm1,%%ymm1           \n"
      "vpor       %%ymm1,%%ymm0,%%ymm0           \n"  // combine r|g|b
      "vpackusdw  %%ymm0,%%ymm0,%%ymm0           \n"  // dwords -> words
      "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // fix lane order
      "lea        0x20(%0),%0                    \n"
      "vmovdqu    %%xmm0,(%1)                    \n"
      "lea        0x10(%1),%1                    \n"
      "sub        $0x8,%2                        \n"  // 8 pixels per iteration
      "jg         1b                             \n"
      "vzeroupper                                \n"  // avoid AVX-SSE penalty
      : "+r"(src),    // %0
        "+r"(dst),    // %1
        "+r"(width)   // %2
      : "m"(dither4)  // %3
      : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
#endif  // HAS_ARGBTORGB565DITHERROW_AVX2
631
// Convert ARGB to ARGB1555 (1-bit alpha, 5 bits per color).
// Per dword: blue>>3 & 0x1f, green>>6 & 0x3e0, red>>9 & 0x7c00, and the
// alpha sign bit via arithmetic >>16 & 0x8000; packssdw narrows to words.
// 4 pixels (16 src bytes -> 8 dst bytes) per iteration.
void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psrld     $0x1b,%%xmm4                    \n"  // xmm4 = 0x1f blue mask
    "movdqa    %%xmm4,%%xmm5                   \n"
    "pslld     $0x5,%%xmm5                     \n"  // xmm5 = 0x3e0 green mask
    "movdqa    %%xmm4,%%xmm6                   \n"
    "pslld     $0xa,%%xmm6                     \n"  // xmm6 = 0x7c00 red mask
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "pslld     $0xf,%%xmm7                     \n"  // xmm7 = 0x8000 alpha mask

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 4 ARGB pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    "psrad     $0x10,%%xmm0                    \n"  // alpha bit into place
    "psrld     $0x3,%%xmm1                     \n"  // blue into place
    "psrld     $0x6,%%xmm2                     \n"  // green into place
    "psrld     $0x9,%%xmm3                     \n"  // red into place
    "pand      %%xmm7,%%xmm0                   \n"
    "pand      %%xmm4,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm2                   \n"
    "pand      %%xmm6,%%xmm3                   \n"
    "por       %%xmm1,%%xmm0                   \n"
    "por       %%xmm3,%%xmm2                   \n"
    "por       %%xmm2,%%xmm0                   \n"  // combine a|r|g|b
    "packssdw  %%xmm0,%%xmm0                   \n"  // dwords -> words
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"  // 4 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  :: "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
673
// Convert ARGB to ARGB4444 by keeping the high nibble of each channel.
// Word masks 0xf000 (xmm4) and 0x0f00... wait -- masks are built as
// 0xf000 and 0x00f0 per word; the psrlq shifts align alternate nibbles so
// packuswb emits one packed 4444 byte pair per pixel.
// 4 pixels (16 src bytes -> 8 dst bytes) per iteration.
// NOTE(review): clobber list names xmm2, which the body never uses --
// harmless, but could be trimmed.
void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psllw     $0xc,%%xmm4                     \n"  // xmm4 = 0xf000 per word
    "movdqa    %%xmm4,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm3                     \n"  // xmm3 = 0x00f0 per word

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 4 ARGB pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm3,%%xmm0                   \n"  // keep low-byte nibble
    "pand      %%xmm4,%%xmm1                   \n"  // keep high-byte nibble
    "psrlq     $0x4,%%xmm0                     \n"
    "psrlq     $0x8,%%xmm1                     \n"
    "por       %%xmm1,%%xmm0                   \n"  // merge nibbles per word
    "packuswb  %%xmm0,%%xmm0                   \n"  // words -> bytes
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"  // 4 pixels per iteration
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(width)   // %2
  :: "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
702#endif  // HAS_RGB24TOARGBROW_SSSE3
703
704#ifdef HAS_ARGBTOYROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
// Per pixel: weight the 4 bytes with kARGBToY (pmaddubsw), sum pairs
// (phaddw), >>7 to descale, then add the kAddY16 bias.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToY
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 ARGB pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // apply luma coefficients
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // 1 word per pixel
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // descale by 128
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"  // add 16 bias
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y values
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
740#endif  // HAS_ARGBTOYROW_SSSE3
741
742#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels (64 bytes) to 16 YJ values.
// Same as ARGBToYRow but different coefficients, no add 16, but do rounding.
// kAddYJ64 is added before the >>7 so the descale rounds to nearest.
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToYJ
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddYJ64

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 ARGB pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // apply luma coefficients
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // 1 word per pixel
    "phaddw    %%xmm3,%%xmm2                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"  // add 64 for rounding
    "paddw     %%xmm5,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // descale by 128
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y values
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
780#endif  // HAS_ARGBTOYJROW_SSSE3
781
782#ifdef HAS_ARGBTOYROW_AVX2
// vpermd for vphaddw + vpackuswb vpermd.
// Dword permutation used to restore pixel order after the per-128-bit-lane
// mutation introduced by vphaddw and vpackuswb (see usage sites below).
static const lvec32 kPermdARGBToY_AVX = {0, 4, 1, 5, 2, 6, 3, 7};
785
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// AVX2 version of ARGBToYRow_SSSE3: weight bytes with kARGBToY, sum pairs,
// >>7 to descale, add kAddY16; vpermd fixes the lane order afterwards.
void ARGBToYRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4                  \n"  // ymm4 = kARGBToY
    "vbroadcastf128 %4,%%ymm5                  \n"  // ymm5 = kAddY16
    "vmovdqu    %5,%%ymm6                      \n"  // ymm6 = kPermdARGBToY_AVX

    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // load 32 ARGB pixels
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"  // apply luma coefficients
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "lea       " MEMLEA(0x80,0) ",%0           \n"
    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"  // descale by 128
    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // add 16 for Y
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"  // store 32 Y values
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16),    // %4
    "m"(kPermdARGBToY_AVX)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
825#endif  // HAS_ARGBTOYROW_AVX2
826
827#ifdef HAS_ARGBTOYJROW_AVX2
// Convert 32 ARGB pixels (128 bytes) to 32 Y values.
// JPeg variant: kARGBToYJ coefficients, no +16 bias, and kAddYJ64 is added
// before the >>7 so the descale rounds to nearest.
void ARGBToYJRow_AVX2(const uint8* src_argb, uint8* dst_y, int width) {
  asm volatile (
    "vbroadcastf128 %3,%%ymm4                  \n"  // ymm4 = kARGBToYJ
    "vbroadcastf128 %4,%%ymm5                  \n"  // ymm5 = kAddYJ64
    "vmovdqu    %5,%%ymm6                      \n"  // ymm6 = kPermdARGBToY_AVX

    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"  // load 32 ARGB pixels
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    "vpmaddubsw %%ymm4,%%ymm0,%%ymm0           \n"  // apply luma coefficients
    "vpmaddubsw %%ymm4,%%ymm1,%%ymm1           \n"
    "vpmaddubsw %%ymm4,%%ymm2,%%ymm2           \n"
    "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
    "lea       " MEMLEA(0x80,0) ",%0           \n"
    "vphaddw    %%ymm1,%%ymm0,%%ymm0           \n"  // mutates.
    "vphaddw    %%ymm3,%%ymm2,%%ymm2           \n"
    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // Add .5 for rounding.
    "vpaddw     %%ymm5,%%ymm2,%%ymm2           \n"
    "vpsrlw     $0x7,%%ymm0,%%ymm0             \n"  // descale by 128
    "vpsrlw     $0x7,%%ymm2,%%ymm2             \n"
    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
    "vpermd     %%ymm0,%%ymm6,%%ymm0           \n"  // unmutate.
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"  // store 32 Y values
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kARGBToYJ),   // %3
    "m"(kAddYJ64),    // %4
    "m"(kPermdARGBToY_AVX)  // %5
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
868#endif  // HAS_ARGBTOYJROW_AVX2
869
870#ifdef HAS_ARGBTOUVROW_SSSE3
871void ARGBToUVRow_SSSE3(const uint8* src_argb0,
872                       int src_stride_argb,
873                       uint8* dst_u,
874                       uint8* dst_v,
875                       int width) {
876  asm volatile (
877    "movdqa    %5,%%xmm3                       \n"
878    "movdqa    %6,%%xmm4                       \n"
879    "movdqa    %7,%%xmm5                       \n"
880    "sub       %1,%2                           \n"
881
882    LABELALIGN
883    "1:                                        \n"
884    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
885    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
886    "pavgb     %%xmm7,%%xmm0                   \n"
887    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
888    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
889    "pavgb     %%xmm7,%%xmm1                   \n"
890    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
891    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
892    "pavgb     %%xmm7,%%xmm2                   \n"
893    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
894    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
895    "pavgb     %%xmm7,%%xmm6                   \n"
896
897    "lea       " MEMLEA(0x40,0) ",%0           \n"
898    "movdqa    %%xmm0,%%xmm7                   \n"
899    "shufps    $0x88,%%xmm1,%%xmm0             \n"
900    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
901    "pavgb     %%xmm7,%%xmm0                   \n"
902    "movdqa    %%xmm2,%%xmm7                   \n"
903    "shufps    $0x88,%%xmm6,%%xmm2             \n"
904    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
905    "pavgb     %%xmm7,%%xmm2                   \n"
906    "movdqa    %%xmm0,%%xmm1                   \n"
907    "movdqa    %%xmm2,%%xmm6                   \n"
908    "pmaddubsw %%xmm4,%%xmm0                   \n"
909    "pmaddubsw %%xmm4,%%xmm2                   \n"
910    "pmaddubsw %%xmm3,%%xmm1                   \n"
911    "pmaddubsw %%xmm3,%%xmm6                   \n"
912    "phaddw    %%xmm2,%%xmm0                   \n"
913    "phaddw    %%xmm6,%%xmm1                   \n"
914    "psraw     $0x8,%%xmm0                     \n"
915    "psraw     $0x8,%%xmm1                     \n"
916    "packsswb  %%xmm1,%%xmm0                   \n"
917    "paddb     %%xmm5,%%xmm0                   \n"
918    "movlps    %%xmm0," MEMACCESS(1) "         \n"
919    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
920    "lea       " MEMLEA(0x8,1) ",%1            \n"
921    "sub       $0x10,%3                        \n"
922    "jg        1b                              \n"
923  : "+r"(src_argb0),       // %0
924    "+r"(dst_u),           // %1
925    "+r"(dst_v),           // %2
926    "+rm"(width)           // %3
927  : "r"((intptr_t)(src_stride_argb)), // %4
928    "m"(kARGBToV),  // %5
929    "m"(kARGBToU),  // %6
930    "m"(kAddUV128)  // %7
931  : "memory", "cc", NACL_R14
932    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
933  );
934}
935#endif  // HAS_ARGBTOUVROW_SSSE3
936
937#ifdef HAS_ARGBTOUVROW_AVX2
// vpshufb for vphaddw + vpackuswb packed to shorts.
// The same 16 byte shuffle is repeated in both 128 bit lanes.
static const lvec8 kShufARGBToUV_AVX = {
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15,
    0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15};
// Convert 32 ARGB pixels from two adjacent rows to 16 U and 16 V values
// (2x2 subsampled).  AVX2 version of ARGBToUVRow_SSSE3: vertical average
// (vpavgb with the row at src_stride_argb), horizontal average
// (vshufps/vpavgb), then the kARGBToU / kARGBToV matrices and the
// kAddUV128 bias.  dst_v is addressed relative to dst_u.
void ARGBToUVRow_AVX2(const uint8* src_argb0,
                      int src_stride_argb,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5                  \n"  // ymm5 = kAddUV128
    "vbroadcastf128 %6,%%ymm6                  \n"  // ymm6 = kARGBToV
    "vbroadcastf128 %7,%%ymm7                  \n"  // ymm7 = kARGBToU
    "sub        %1,%2                          \n"  // %2 = dst_v - dst_u

    LABELALIGN
    "1:                                        \n"
    // Load 32 pixels from each of the two rows; average vertically.
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    "lea        " MEMLEA(0x80,0) ",%0          \n"
    // Average horizontally adjacent pixel pairs.
    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"

    // Apply the U (ymm7) and V (ymm6) matrices to the averaged pixels.
    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"  // descale by 256
    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // fix lane order
    "vpshufb    %8,%%ymm0,%%ymm0               \n"  // kShufARGBToUV_AVX
    "vpaddb     %%ymm5,%%ymm0,%%ymm0           \n"  // bias to unsigned 128

    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"  // store 16 U
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x20,%3                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kAddUV128),  // %5
    "m"(kARGBToV),   // %6
    "m"(kARGBToU),   // %7
    "m"(kShufARGBToUV_AVX)  // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
1003#endif  // HAS_ARGBTOUVROW_AVX2
1004
1005#ifdef HAS_ARGBTOUVJROW_AVX2
// Convert 32 ARGB pixels from two adjacent rows to 16 U and 16 V values
// (2x2 subsampled), JPeg variant: kARGBToUJ / kARGBToVJ coefficients and
// kAddUVJ128 added before the shift for round-to-nearest; no +128 output
// bias is applied after packing.
void ARGBToUVJRow_AVX2(const uint8* src_argb0,
                       int src_stride_argb,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  asm volatile (
    "vbroadcastf128 %5,%%ymm5                  \n"  // ymm5 = kAddUVJ128
    "vbroadcastf128 %6,%%ymm6                  \n"  // ymm6 = kARGBToVJ
    "vbroadcastf128 %7,%%ymm7                  \n"  // ymm7 = kARGBToUJ
    "sub        %1,%2                          \n"  // %2 = dst_v - dst_u

    LABELALIGN
    "1:                                        \n"
    // Load 32 pixels from each of the two rows; average vertically.
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "vmovdqu    " MEMACCESS2(0x40,0) ",%%ymm2  \n"
    "vmovdqu    " MEMACCESS2(0x60,0) ",%%ymm3  \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    VMEMOPREG(vpavgb,0x40,0,4,1,ymm2,ymm2)
    VMEMOPREG(vpavgb,0x60,0,4,1,ymm3,ymm3)
    "lea       " MEMLEA(0x80,0) ",%0           \n"
    // Average horizontally adjacent pixel pairs.
    "vshufps    $0x88,%%ymm1,%%ymm0,%%ymm4     \n"
    "vshufps    $0xdd,%%ymm1,%%ymm0,%%ymm0     \n"
    "vpavgb     %%ymm4,%%ymm0,%%ymm0           \n"
    "vshufps    $0x88,%%ymm3,%%ymm2,%%ymm4     \n"
    "vshufps    $0xdd,%%ymm3,%%ymm2,%%ymm2     \n"
    "vpavgb     %%ymm4,%%ymm2,%%ymm2           \n"

    // Apply the UJ (ymm7) and VJ (ymm6) matrices to the averaged pixels.
    "vpmaddubsw %%ymm7,%%ymm0,%%ymm1           \n"
    "vpmaddubsw %%ymm7,%%ymm2,%%ymm3           \n"
    "vpmaddubsw %%ymm6,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm6,%%ymm2,%%ymm2           \n"
    "vphaddw    %%ymm3,%%ymm1,%%ymm1           \n"
    "vphaddw    %%ymm2,%%ymm0,%%ymm0           \n"
    "vpaddw     %%ymm5,%%ymm0,%%ymm0           \n"  // round before shift
    "vpaddw     %%ymm5,%%ymm1,%%ymm1           \n"
    "vpsraw     $0x8,%%ymm1,%%ymm1             \n"  // descale by 256
    "vpsraw     $0x8,%%ymm0,%%ymm0             \n"
    "vpacksswb  %%ymm0,%%ymm1,%%ymm0           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"  // fix lane order
    "vpshufb    %8,%%ymm0,%%ymm0               \n"  // kShufARGBToUV_AVX

    "vextractf128 $0x0,%%ymm0," MEMACCESS(1) " \n"  // store 16 U
    VEXTOPMEM(vextractf128,1,ymm0,0x0,1,2,1) // vextractf128 $1,%%ymm0,(%1,%2,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)), // %4
    "m"(kAddUVJ128),  // %5
    "m"(kARGBToVJ),  // %6
    "m"(kARGBToUJ),  // %7
    "m"(kShufARGBToUV_AVX)  // %8
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
1068#endif  // HAS_ARGBTOUVJROW_AVX2
1069
1070#ifdef HAS_ARGBTOUVJROW_SSSE3
1071void ARGBToUVJRow_SSSE3(const uint8* src_argb0,
1072                        int src_stride_argb,
1073                        uint8* dst_u,
1074                        uint8* dst_v,
1075                        int width) {
1076  asm volatile (
1077    "movdqa    %5,%%xmm3                       \n"
1078    "movdqa    %6,%%xmm4                       \n"
1079    "movdqa    %7,%%xmm5                       \n"
1080    "sub       %1,%2                           \n"
1081
1082    LABELALIGN
1083    "1:                                        \n"
1084    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1085    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1086    "pavgb     %%xmm7,%%xmm0                   \n"
1087    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1088    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1089    "pavgb     %%xmm7,%%xmm1                   \n"
1090    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1091    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1092    "pavgb     %%xmm7,%%xmm2                   \n"
1093    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1094    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1095    "pavgb     %%xmm7,%%xmm6                   \n"
1096
1097    "lea       " MEMLEA(0x40,0) ",%0           \n"
1098    "movdqa    %%xmm0,%%xmm7                   \n"
1099    "shufps    $0x88,%%xmm1,%%xmm0             \n"
1100    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1101    "pavgb     %%xmm7,%%xmm0                   \n"
1102    "movdqa    %%xmm2,%%xmm7                   \n"
1103    "shufps    $0x88,%%xmm6,%%xmm2             \n"
1104    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1105    "pavgb     %%xmm7,%%xmm2                   \n"
1106    "movdqa    %%xmm0,%%xmm1                   \n"
1107    "movdqa    %%xmm2,%%xmm6                   \n"
1108    "pmaddubsw %%xmm4,%%xmm0                   \n"
1109    "pmaddubsw %%xmm4,%%xmm2                   \n"
1110    "pmaddubsw %%xmm3,%%xmm1                   \n"
1111    "pmaddubsw %%xmm3,%%xmm6                   \n"
1112    "phaddw    %%xmm2,%%xmm0                   \n"
1113    "phaddw    %%xmm6,%%xmm1                   \n"
1114    "paddw     %%xmm5,%%xmm0                   \n"
1115    "paddw     %%xmm5,%%xmm1                   \n"
1116    "psraw     $0x8,%%xmm0                     \n"
1117    "psraw     $0x8,%%xmm1                     \n"
1118    "packsswb  %%xmm1,%%xmm0                   \n"
1119    "movlps    %%xmm0," MEMACCESS(1) "         \n"
1120    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1121    "lea       " MEMLEA(0x8,1) ",%1            \n"
1122    "sub       $0x10,%3                        \n"
1123    "jg        1b                              \n"
1124  : "+r"(src_argb0),       // %0
1125    "+r"(dst_u),           // %1
1126    "+r"(dst_v),           // %2
1127    "+rm"(width)           // %3
1128  : "r"((intptr_t)(src_stride_argb)), // %4
1129    "m"(kARGBToVJ),  // %5
1130    "m"(kARGBToUJ),  // %6
1131    "m"(kAddUVJ128)  // %7
1132  : "memory", "cc", NACL_R14
1133    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1134  );
1135}
1136#endif  // HAS_ARGBTOUVJROW_SSSE3
1137
1138#ifdef HAS_ARGBTOUV444ROW_SSSE3
1139void ARGBToUV444Row_SSSE3(const uint8* src_argb,
1140                          uint8* dst_u,
1141                          uint8* dst_v,
1142                          int width) {
1143  asm volatile (
1144    "movdqa    %4,%%xmm3                       \n"
1145    "movdqa    %5,%%xmm4                       \n"
1146    "movdqa    %6,%%xmm5                       \n"
1147    "sub       %1,%2                           \n"
1148
1149    LABELALIGN
1150    "1:                                        \n"
1151    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1152    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1153    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1154    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1155    "pmaddubsw %%xmm4,%%xmm0                   \n"
1156    "pmaddubsw %%xmm4,%%xmm1                   \n"
1157    "pmaddubsw %%xmm4,%%xmm2                   \n"
1158    "pmaddubsw %%xmm4,%%xmm6                   \n"
1159    "phaddw    %%xmm1,%%xmm0                   \n"
1160    "phaddw    %%xmm6,%%xmm2                   \n"
1161    "psraw     $0x8,%%xmm0                     \n"
1162    "psraw     $0x8,%%xmm2                     \n"
1163    "packsswb  %%xmm2,%%xmm0                   \n"
1164    "paddb     %%xmm5,%%xmm0                   \n"
1165    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
1166    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1167    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1168    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1169    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1170    "pmaddubsw %%xmm3,%%xmm0                   \n"
1171    "pmaddubsw %%xmm3,%%xmm1                   \n"
1172    "pmaddubsw %%xmm3,%%xmm2                   \n"
1173    "pmaddubsw %%xmm3,%%xmm6                   \n"
1174    "phaddw    %%xmm1,%%xmm0                   \n"
1175    "phaddw    %%xmm6,%%xmm2                   \n"
1176    "psraw     $0x8,%%xmm0                     \n"
1177    "psraw     $0x8,%%xmm2                     \n"
1178    "packsswb  %%xmm2,%%xmm0                   \n"
1179    "paddb     %%xmm5,%%xmm0                   \n"
1180    "lea       " MEMLEA(0x40,0) ",%0           \n"
1181    MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
1182    "lea       " MEMLEA(0x10,1) ",%1           \n"
1183    "sub       $0x10,%3                        \n"
1184    "jg        1b                              \n"
1185  : "+r"(src_argb),        // %0
1186    "+r"(dst_u),           // %1
1187    "+r"(dst_v),           // %2
1188    "+rm"(width)           // %3
1189  : "m"(kARGBToV),  // %4
1190    "m"(kARGBToU),  // %5
1191    "m"(kAddUV128)  // %6
1192  : "memory", "cc", NACL_R14
1193    "xmm0", "xmm1", "xmm2", "xmm6"
1194  );
1195}
1196#endif  // HAS_ARGBTOUV444ROW_SSSE3
1197
// Convert 16 BGRA pixels (64 bytes) to 16 Y values.
// Same pipeline as ARGBToYRow_SSSE3, but with kBGRAToY coefficients
// matching the BGRA byte order.
void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kBGRAToY

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 BGRA pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // apply luma coefficients
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // 1 word per pixel
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // descale by 128
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"  // add 16 bias
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y values
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1232
1233void BGRAToUVRow_SSSE3(const uint8* src_bgra0,
1234                       int src_stride_bgra,
1235                       uint8* dst_u,
1236                       uint8* dst_v,
1237                       int width) {
1238  asm volatile (
1239    "movdqa    %5,%%xmm3                       \n"
1240    "movdqa    %6,%%xmm4                       \n"
1241    "movdqa    %7,%%xmm5                       \n"
1242    "sub       %1,%2                           \n"
1243
1244    LABELALIGN
1245    "1:                                        \n"
1246    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1247    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1248    "pavgb     %%xmm7,%%xmm0                   \n"
1249    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1250    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1251    "pavgb     %%xmm7,%%xmm1                   \n"
1252    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1253    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1254    "pavgb     %%xmm7,%%xmm2                   \n"
1255    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1256    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1257    "pavgb     %%xmm7,%%xmm6                   \n"
1258
1259    "lea       " MEMLEA(0x40,0) ",%0           \n"
1260    "movdqa    %%xmm0,%%xmm7                   \n"
1261    "shufps    $0x88,%%xmm1,%%xmm0             \n"
1262    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1263    "pavgb     %%xmm7,%%xmm0                   \n"
1264    "movdqa    %%xmm2,%%xmm7                   \n"
1265    "shufps    $0x88,%%xmm6,%%xmm2             \n"
1266    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1267    "pavgb     %%xmm7,%%xmm2                   \n"
1268    "movdqa    %%xmm0,%%xmm1                   \n"
1269    "movdqa    %%xmm2,%%xmm6                   \n"
1270    "pmaddubsw %%xmm4,%%xmm0                   \n"
1271    "pmaddubsw %%xmm4,%%xmm2                   \n"
1272    "pmaddubsw %%xmm3,%%xmm1                   \n"
1273    "pmaddubsw %%xmm3,%%xmm6                   \n"
1274    "phaddw    %%xmm2,%%xmm0                   \n"
1275    "phaddw    %%xmm6,%%xmm1                   \n"
1276    "psraw     $0x8,%%xmm0                     \n"
1277    "psraw     $0x8,%%xmm1                     \n"
1278    "packsswb  %%xmm1,%%xmm0                   \n"
1279    "paddb     %%xmm5,%%xmm0                   \n"
1280    "movlps    %%xmm0," MEMACCESS(1) "         \n"
1281    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1282    "lea       " MEMLEA(0x8,1) ",%1            \n"
1283    "sub       $0x10,%3                        \n"
1284    "jg        1b                              \n"
1285  : "+r"(src_bgra0),       // %0
1286    "+r"(dst_u),           // %1
1287    "+r"(dst_v),           // %2
1288    "+rm"(width)           // %3
1289  : "r"((intptr_t)(src_stride_bgra)), // %4
1290    "m"(kBGRAToV),  // %5
1291    "m"(kBGRAToU),  // %6
1292    "m"(kAddUV128)  // %7
1293  : "memory", "cc", NACL_R14
1294    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1295  );
1296}
1297
// Convert 16 ABGR pixels (64 bytes) to 16 Y values.
// Same pipeline as ARGBToYRow_SSSE3, but with kABGRToY coefficients
// matching the ABGR byte order.
void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kABGRToY

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 ABGR pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // apply luma coefficients
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // 1 word per pixel
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // descale by 128
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"  // add 16 bias
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y values
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1332
// Convert 16 RGBA pixels (64 bytes) to 16 Y values.
// Same pipeline as ARGBToYRow_SSSE3, but with kRGBAToY coefficients
// matching the RGBA byte order.
void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int width) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kRGBAToY

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 RGBA pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // apply luma coefficients
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // 1 word per pixel
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // descale by 128
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"  // add 16 bias
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y values
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  : "m"(kRGBAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1367
1368void ABGRToUVRow_SSSE3(const uint8* src_abgr0,
1369                       int src_stride_abgr,
1370                       uint8* dst_u,
1371                       uint8* dst_v,
1372                       int width) {
1373  asm volatile (
1374    "movdqa    %5,%%xmm3                       \n"
1375    "movdqa    %6,%%xmm4                       \n"
1376    "movdqa    %7,%%xmm5                       \n"
1377    "sub       %1,%2                           \n"
1378
1379    LABELALIGN
1380    "1:                                        \n"
1381    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1382    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1383    "pavgb     %%xmm7,%%xmm0                   \n"
1384    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1385    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1386    "pavgb     %%xmm7,%%xmm1                   \n"
1387    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1388    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1389    "pavgb     %%xmm7,%%xmm2                   \n"
1390    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1391    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1392    "pavgb     %%xmm7,%%xmm6                   \n"
1393
1394    "lea       " MEMLEA(0x40,0) ",%0           \n"
1395    "movdqa    %%xmm0,%%xmm7                   \n"
1396    "shufps    $0x88,%%xmm1,%%xmm0             \n"
1397    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1398    "pavgb     %%xmm7,%%xmm0                   \n"
1399    "movdqa    %%xmm2,%%xmm7                   \n"
1400    "shufps    $0x88,%%xmm6,%%xmm2             \n"
1401    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1402    "pavgb     %%xmm7,%%xmm2                   \n"
1403    "movdqa    %%xmm0,%%xmm1                   \n"
1404    "movdqa    %%xmm2,%%xmm6                   \n"
1405    "pmaddubsw %%xmm4,%%xmm0                   \n"
1406    "pmaddubsw %%xmm4,%%xmm2                   \n"
1407    "pmaddubsw %%xmm3,%%xmm1                   \n"
1408    "pmaddubsw %%xmm3,%%xmm6                   \n"
1409    "phaddw    %%xmm2,%%xmm0                   \n"
1410    "phaddw    %%xmm6,%%xmm1                   \n"
1411    "psraw     $0x8,%%xmm0                     \n"
1412    "psraw     $0x8,%%xmm1                     \n"
1413    "packsswb  %%xmm1,%%xmm0                   \n"
1414    "paddb     %%xmm5,%%xmm0                   \n"
1415    "movlps    %%xmm0," MEMACCESS(1) "         \n"
1416    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1417    "lea       " MEMLEA(0x8,1) ",%1            \n"
1418    "sub       $0x10,%3                        \n"
1419    "jg        1b                              \n"
1420  : "+r"(src_abgr0),       // %0
1421    "+r"(dst_u),           // %1
1422    "+r"(dst_v),           // %2
1423    "+rm"(width)           // %3
1424  : "r"((intptr_t)(src_stride_abgr)), // %4
1425    "m"(kABGRToV),  // %5
1426    "m"(kABGRToU),  // %6
1427    "m"(kAddUV128)  // %7
1428  : "memory", "cc", NACL_R14
1429    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1430  );
1431}
1432
1433void RGBAToUVRow_SSSE3(const uint8* src_rgba0,
1434                       int src_stride_rgba,
1435                       uint8* dst_u,
1436                       uint8* dst_v,
1437                       int width) {
1438  asm volatile (
1439    "movdqa    %5,%%xmm3                       \n"
1440    "movdqa    %6,%%xmm4                       \n"
1441    "movdqa    %7,%%xmm5                       \n"
1442    "sub       %1,%2                           \n"
1443
1444    LABELALIGN
1445    "1:                                        \n"
1446    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
1447    MEMOPREG(movdqu,0x00,0,4,1,xmm7)            //  movdqu (%0,%4,1),%%xmm7
1448    "pavgb     %%xmm7,%%xmm0                   \n"
1449    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
1450    MEMOPREG(movdqu,0x10,0,4,1,xmm7)            //  movdqu 0x10(%0,%4,1),%%xmm7
1451    "pavgb     %%xmm7,%%xmm1                   \n"
1452    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
1453    MEMOPREG(movdqu,0x20,0,4,1,xmm7)            //  movdqu 0x20(%0,%4,1),%%xmm7
1454    "pavgb     %%xmm7,%%xmm2                   \n"
1455    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
1456    MEMOPREG(movdqu,0x30,0,4,1,xmm7)            //  movdqu 0x30(%0,%4,1),%%xmm7
1457    "pavgb     %%xmm7,%%xmm6                   \n"
1458
1459    "lea       " MEMLEA(0x40,0) ",%0           \n"
1460    "movdqa    %%xmm0,%%xmm7                   \n"
1461    "shufps    $0x88,%%xmm1,%%xmm0             \n"
1462    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1463    "pavgb     %%xmm7,%%xmm0                   \n"
1464    "movdqa    %%xmm2,%%xmm7                   \n"
1465    "shufps    $0x88,%%xmm6,%%xmm2             \n"
1466    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1467    "pavgb     %%xmm7,%%xmm2                   \n"
1468    "movdqa    %%xmm0,%%xmm1                   \n"
1469    "movdqa    %%xmm2,%%xmm6                   \n"
1470    "pmaddubsw %%xmm4,%%xmm0                   \n"
1471    "pmaddubsw %%xmm4,%%xmm2                   \n"
1472    "pmaddubsw %%xmm3,%%xmm1                   \n"
1473    "pmaddubsw %%xmm3,%%xmm6                   \n"
1474    "phaddw    %%xmm2,%%xmm0                   \n"
1475    "phaddw    %%xmm6,%%xmm1                   \n"
1476    "psraw     $0x8,%%xmm0                     \n"
1477    "psraw     $0x8,%%xmm1                     \n"
1478    "packsswb  %%xmm1,%%xmm0                   \n"
1479    "paddb     %%xmm5,%%xmm0                   \n"
1480    "movlps    %%xmm0," MEMACCESS(1) "         \n"
1481    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
1482    "lea       " MEMLEA(0x8,1) ",%1            \n"
1483    "sub       $0x10,%3                        \n"
1484    "jg        1b                              \n"
1485  : "+r"(src_rgba0),       // %0
1486    "+r"(dst_u),           // %1
1487    "+r"(dst_v),           // %2
1488    "+rm"(width)           // %3
1489  : "r"((intptr_t)(src_stride_rgba)), // %4
1490    "m"(kRGBAToV),  // %5
1491    "m"(kRGBAToU),  // %6
1492    "m"(kAddUV128)  // %7
1493  : "memory", "cc", NACL_R14
1494    "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1495  );
1496}
1497
#if defined(HAS_I422TOARGBROW_SSSE3) || defined(HAS_I422TOARGBROW_AVX2)

// Read 8 UV from 444.
// Loads 8 U and 8 V bytes and interleaves them into xmm0 as U0V0 U1V1 ...
// NOTE: the calling functions rebase v_buf as an offset from u_buf
// ("sub %[u_buf],%[v_buf]"), hence the (u_buf, v_buf, 1) addressing in
// MEMOPREG.  Y is loaded into xmm4 and byte-duplicated (punpcklbw with
// itself) so YUVTORGB can scale it with pmulhuw as 16-bit values.
#define READYUV444 \
  "movq       " MEMACCESS([u_buf]) ",%%xmm0                     \n"            \
    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"

// Read 4 UV from 422, upsample to 8 UV.
// Same as READYUV444 but reads only 4 U/V and doubles each pair with
// punpcklwd (nearest-neighbor horizontal upsample).
#define READYUV422 \
  "movd       " MEMACCESS([u_buf]) ",%%xmm0                     \n"            \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"

// Read 4 UV from 422, upsample to 8 UV.  With 8 Alpha.
// As READYUV422, plus 8 alpha bytes loaded into xmm5 from a_buf.
#define READYUVA422 \
  "movd       " MEMACCESS([u_buf]) ",%%xmm0                     \n"            \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
    "movq       " MEMACCESS([a_buf]) ",%%xmm5                   \n"            \
    "lea        " MEMLEA(0x8, [a_buf]) ",%[a_buf]               \n"

// Read 4 UV from NV12, upsample to 8 UV.
// UV is already interleaved in memory, so only the word-duplication step
// is needed before use.
#define READNV12 \
  "movq       " MEMACCESS([uv_buf]) ",%%xmm0                    \n"            \
    "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"

// Read 4 VU from NV21, upsample to 8 UV.
// kShuffleNV21 both swaps V/U byte order and duplicates the pairs.
#define READNV21 \
  "movq       " MEMACCESS([vu_buf]) ",%%xmm0                    \n"            \
    "lea        " MEMLEA(0x8, [vu_buf]) ",%[vu_buf]             \n"            \
    "pshufb     %[kShuffleNV21], %%xmm0                         \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm4                   \n"            \
    "punpcklbw  %%xmm4,%%xmm4                                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"

// Read 4 YUY2 with 8 Y and update 4 UV to 8 UV.
// The same 16 bytes are read twice and shuffled into Y (xmm4) and UV (xmm0)
// lanes by the two pshufb masks.
#define READYUY2 \
  "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm4                  \n"            \
    "pshufb     %[kShuffleYUY2Y], %%xmm4                        \n"            \
    "movdqu     " MEMACCESS([yuy2_buf]) ",%%xmm0                \n"            \
    "pshufb     %[kShuffleYUY2UV], %%xmm0                       \n"            \
    "lea        " MEMLEA(0x10, [yuy2_buf]) ",%[yuy2_buf]        \n"

// Read 4 UYVY with 8 Y and update 4 UV to 8 UV.
// Mirror of READYUY2 for the byte-swapped UYVY packing.
#define READUYVY \
  "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm4                  \n"            \
    "pshufb     %[kShuffleUYVYY], %%xmm4                        \n"            \
    "movdqu     " MEMACCESS([uyvy_buf]) ",%%xmm0                \n"            \
    "pshufb     %[kShuffleUYVYUV], %%xmm0                       \n"            \
    "lea        " MEMLEA(0x10, [uyvy_buf]) ",%[uyvy_buf]        \n"
1567
#if defined(__x86_64__)
// 64-bit: preload all 7 rows of the YuvConstants table (at offsets
// 0,32,...,192) into xmm8-xmm14 once, outside the pixel loop.
#define YUVTORGB_SETUP(yuvconstants) \
  "movdqa     " MEMACCESS([yuvconstants]) ",%%xmm8              \n"            \
    "movdqa     " MEMACCESS2(32, [yuvconstants]) ",%%xmm9       \n"            \
    "movdqa     " MEMACCESS2(64, [yuvconstants]) ",%%xmm10      \n"            \
    "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm11      \n"            \
    "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm12     \n"            \
    "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm13     \n"            \
    "movdqa     " MEMACCESS2(192, [yuvconstants]) ",%%xmm14     \n"
// Convert 8 pixels: 8 UV and 8 Y
// In:  xmm0 = interleaved UV, xmm4 = duplicated Y (from a READ* macro).
// Out: xmm0/xmm1/xmm2 hold the three packed channel results (low 8 bytes
// each) for a STORE* macro; xmm1-xmm3 are used as scratch.
#define YUVTORGB(yuvconstants)                                    \
  "movdqa     %%xmm0,%%xmm1                                   \n" \
  "movdqa     %%xmm0,%%xmm2                                   \n" \
  "movdqa     %%xmm0,%%xmm3                                   \n" \
  "movdqa     %%xmm11,%%xmm0                                  \n" \
  "pmaddubsw  %%xmm8,%%xmm1                                   \n" \
  "psubw      %%xmm1,%%xmm0                                   \n" \
  "movdqa     %%xmm12,%%xmm1                                  \n" \
  "pmaddubsw  %%xmm9,%%xmm2                                   \n" \
  "psubw      %%xmm2,%%xmm1                                   \n" \
  "movdqa     %%xmm13,%%xmm2                                  \n" \
  "pmaddubsw  %%xmm10,%%xmm3                                  \n" \
  "psubw      %%xmm3,%%xmm2                                   \n" \
  "pmulhuw    %%xmm14,%%xmm4                                  \n" \
  "paddsw     %%xmm4,%%xmm0                                   \n" \
  "paddsw     %%xmm4,%%xmm1                                   \n" \
  "paddsw     %%xmm4,%%xmm2                                   \n" \
  "psraw      $0x6,%%xmm0                                     \n" \
  "psraw      $0x6,%%xmm1                                     \n" \
  "psraw      $0x6,%%xmm2                                     \n" \
  "packuswb   %%xmm0,%%xmm0                                   \n" \
  "packuswb   %%xmm1,%%xmm1                                   \n" \
  "packuswb   %%xmm2,%%xmm2                                   \n"
// Extra registers clobbered by the 64-bit variant.  The trailing comma is
// intentional: the macro expands inside a clobber list before "xmm0", ...
#define YUVTORGB_REGS \
  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

#else
// 32-bit: only xmm0-xmm7 exist, so the constants cannot be cached in
// registers; the setup macro is empty and YUVTORGB reads them from memory
// each iteration instead.
#define YUVTORGB_SETUP(yuvconstants)
// Convert 8 pixels: 8 UV and 8 Y
// Same math as the 64-bit variant, with memory operands replacing xmm8-14.
#define YUVTORGB(yuvconstants) \
  "movdqa     %%xmm0,%%xmm1                                     \n"            \
    "movdqa     %%xmm0,%%xmm2                                   \n"            \
    "movdqa     %%xmm0,%%xmm3                                   \n"            \
    "movdqa     " MEMACCESS2(96, [yuvconstants]) ",%%xmm0       \n"            \
    "pmaddubsw  " MEMACCESS([yuvconstants]) ",%%xmm1            \n"            \
    "psubw      %%xmm1,%%xmm0                                   \n"            \
    "movdqa     " MEMACCESS2(128, [yuvconstants]) ",%%xmm1      \n"            \
    "pmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%xmm2       \n"            \
    "psubw      %%xmm2,%%xmm1                                   \n"            \
    "movdqa     " MEMACCESS2(160, [yuvconstants]) ",%%xmm2      \n"            \
    "pmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%xmm3       \n"            \
    "psubw      %%xmm3,%%xmm2                                   \n"            \
    "pmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%xmm4      \n"            \
    "paddsw     %%xmm4,%%xmm0                                   \n"            \
    "paddsw     %%xmm4,%%xmm1                                   \n"            \
    "paddsw     %%xmm4,%%xmm2                                   \n"            \
    "psraw      $0x6,%%xmm0                                     \n"            \
    "psraw      $0x6,%%xmm1                                     \n"            \
    "psraw      $0x6,%%xmm2                                     \n"            \
    "packuswb   %%xmm0,%%xmm0                                   \n"            \
    "packuswb   %%xmm1,%%xmm1                                   \n"            \
    "packuswb   %%xmm2,%%xmm2                                   \n"
// No extra clobbers on 32-bit.
#define YUVTORGB_REGS
#endif
1632
// Store 8 ARGB values.
// In: xmm0/xmm1/xmm2 = the three packed channels from YUVTORGB, xmm5 = alpha.
// Interleaves bytes then words to produce 32 bytes of pixels and advances
// dst_argb by 0x20.
#define STOREARGB \
  "punpcklbw  %%xmm1,%%xmm0                                      \n"           \
    "punpcklbw  %%xmm5,%%xmm2                                    \n"           \
    "movdqa     %%xmm0,%%xmm1                                    \n"           \
    "punpcklwd  %%xmm2,%%xmm0                                    \n"           \
    "punpckhwd  %%xmm2,%%xmm1                                    \n"           \
    "movdqu     %%xmm0," MEMACCESS([dst_argb]) "                 \n"           \
    "movdqu     %%xmm1," MEMACCESS2(0x10, [dst_argb]) "          \n"           \
    "lea        " MEMLEA(0x20, [dst_argb]) ", %[dst_argb]        \n"
// Store 8 RGBA values.
// Alpha is regenerated locally as all-0xff (pcmpeqb xmm5) and interleaved
// first, giving the byte-swapped RGBA channel order relative to STOREARGB.
#define STORERGBA \
  "pcmpeqb   %%xmm5,%%xmm5                                       \n"           \
    "punpcklbw %%xmm2,%%xmm1                                     \n"           \
    "punpcklbw %%xmm0,%%xmm5                                     \n"           \
    "movdqa    %%xmm5,%%xmm0                                     \n"           \
    "punpcklwd %%xmm1,%%xmm5                                     \n"           \
    "punpckhwd %%xmm1,%%xmm0                                     \n"           \
    "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "                  \n"           \
    "movdqu    %%xmm0," MEMACCESS2(0x10, [dst_rgba]) "           \n"           \
    "lea       " MEMLEA(0x20, [dst_rgba]) ",%[dst_rgba]          \n"
1655
// Convert a row of I444 (full-resolution U and V planes) to ARGB, 8 pixels
// per loop iteration.  Alpha is forced to 0xff (pcmpeqb xmm5 = all ones).
// v_buf is rebased as an offset from u_buf so READYUV444 can address both
// planes through one base register.
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    READYUV444
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1684
// Convert a row of I422 to packed 24-bit RGB, 8 pixels (24 output bytes)
// per loop iteration.  The two pshufb masks plus palignr repack the 8
// intermediate 32-bit pixels down to 24 bytes.  Note the store writes
// 8 + 16 = 24 bytes but movdqu may touch past the 24th byte's slot only
// within that 16-byte store, so dst advances by exactly 0x18.
// On i386 the width operand is kept in memory ("+m") and decremented with
// an explicit subl — presumably due to register pressure; confirm before
// changing the constraint.
void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* dst_rgb24,
                                 const struct YuvConstants* yuvconstants,
                                 int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
    "sub       %[u_buf],%[v_buf]               \n"

    LABELALIGN
    "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
    // Interleave the three channel registers into 8 ARGB pixels, then
    // shuffle/align away the alpha bytes to leave 24 RGB bytes.
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm6,%%xmm1                   \n"
    "palignr   $0xc,%%xmm0,%%xmm1              \n"
    "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
    "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
    "subl      $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants),  // %[yuvconstants]
    [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
1730
// Convert a row of I422 (2x1 subsampled U/V) to ARGB, 8 pixels per loop
// iteration.  Alpha is forced to 0xff.  v_buf is rebased as an offset from
// u_buf for single-register U/V addressing in READYUV422.
void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1759
#ifdef HAS_I422ALPHATOARGBROW_SSSE3
// Convert a row of I422 plus a separate alpha plane to ARGB, 8 pixels per
// loop iteration.  READYUVA422 loads the 8 alpha bytes into xmm5, which
// STOREARGB interleaves as the A channel (no pcmpeqb here, unlike the
// opaque variants).  On i386 width lives in memory ("+m") and is
// decremented with subl — presumably register pressure; confirm before
// changing.
void OMITFP I422AlphaToARGBRow_SSSE3(const uint8* y_buf,
                                     const uint8* u_buf,
                                     const uint8* v_buf,
                                     const uint8* a_buf,
                                     uint8* dst_argb,
                                     const struct YuvConstants* yuvconstants,
                                     int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub       %[u_buf],%[v_buf]               \n"

    LABELALIGN
    "1:                                        \n"
    READYUVA422
    YUVTORGB(yuvconstants)
    STOREARGB
    "subl      $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
#endif  // HAS_I422ALPHATOARGBROW_SSSE3
1797
// Convert a row of NV12 (Y plane + interleaved UV plane) to ARGB, 8 pixels
// per loop iteration.  Alpha forced to 0xff.  Only two source pointers, so
// no u/v rebasing (and no r14 clobber under NaCl).
void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    READNV12
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
1825
// Convert a row of NV21 (Y plane + interleaved VU plane) to ARGB, 8 pixels
// per loop iteration.  READNV21 uses kShuffleNV21 to swap VU into UV order;
// alpha forced to 0xff.
void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* vu_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    READNV21
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleNV21]"m"(kShuffleNV21)
    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
1854
// Convert a row of packed YUY2 to ARGB, 8 pixels (16 source bytes) per
// loop iteration.  READYUY2 splits the packed data into Y and UV lanes via
// the two pshufb masks; alpha forced to 0xff.
void OMITFP YUY2ToARGBRow_SSSE3(const uint8* yuy2_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    READYUY2
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
1882
// Convert a row of packed UYVY to ARGB, 8 pixels (16 source bytes) per
// loop iteration.  Mirror of YUY2ToARGBRow_SSSE3 with the UYVY shuffle
// masks; alpha forced to 0xff.
void OMITFP UYVYToARGBRow_SSSE3(const uint8* uyvy_buf,
                                uint8* dst_argb,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    READUYVY
    YUVTORGB(yuvconstants)
    STOREARGB
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
    : "memory", "cc", YUVTORGB_REGS  // Does not use r14.
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
1910
// Convert a row of I422 to RGBA byte order, 8 pixels per loop iteration.
// Identical pipeline to I422ToARGBRow_SSSE3 but uses STORERGBA, which
// regenerates alpha itself — the pcmpeqb here primes xmm5 before the loop.
void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_rgba,
                                const struct YuvConstants* yuvconstants,
                                int width) {
  asm volatile (
    YUVTORGB_SETUP(yuvconstants)
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"

    LABELALIGN
    "1:                                        \n"
    READYUV422
    YUVTORGB(yuvconstants)
    STORERGBA
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
1939
1940#endif  // HAS_I422TOARGBROW_SSSE3
1941
1942// Read 16 UV from 444
1943#define READYUV444_AVX2 \
1944  "vmovdqu    " MEMACCESS([u_buf]) ",%%xmm0                         \n"        \
1945    MEMOPREG(vmovdqu, 0x00, [u_buf], [v_buf], 1, xmm1)                         \
1946    "lea        " MEMLEA(0x10, [u_buf]) ",%[u_buf]                  \n"        \
1947    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
1948    "vpermq     $0xd8,%%ymm1,%%ymm1                                 \n"        \
1949    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
1950    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
1951    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
1952    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
1953    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
1954
1955// Read 8 UV from 422, upsample to 16 UV.
1956#define READYUV422_AVX2 \
1957  "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                         \n"        \
1958    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
1959    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
1960    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
1961    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
1962    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
1963    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
1964    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
1965    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
1966    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
1967
1968// Read 8 UV from 422, upsample to 16 UV.  With 16 Alpha.
1969#define READYUVA422_AVX2 \
1970  "vmovq      " MEMACCESS([u_buf]) ",%%xmm0                         \n"        \
1971    MEMOPREG(vmovq, 0x00, [u_buf], [v_buf], 1, xmm1)                           \
1972    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]                   \n"        \
1973    "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                \n"        \
1974    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
1975    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
1976    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
1977    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
1978    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
1979    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"        \
1980    "vmovdqu    " MEMACCESS([a_buf]) ",%%xmm5                       \n"        \
1981    "vpermq     $0xd8,%%ymm5,%%ymm5                                 \n"        \
1982    "lea        " MEMLEA(0x10, [a_buf]) ",%[a_buf]                  \n"
1983
1984// Read 8 UV from NV12, upsample to 16 UV.
1985#define READNV12_AVX2 \
1986  "vmovdqu    " MEMACCESS([uv_buf]) ",%%xmm0                        \n"        \
1987    "lea        " MEMLEA(0x10, [uv_buf]) ",%[uv_buf]                \n"        \
1988    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
1989    "vpunpcklwd %%ymm0,%%ymm0,%%ymm0                                \n"        \
1990    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
1991    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
1992    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
1993    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
1994
1995// Read 8 VU from NV21, upsample to 16 UV.
1996#define READNV21_AVX2 \
1997  "vmovdqu    " MEMACCESS([vu_buf]) ",%%xmm0                        \n"        \
1998    "lea        " MEMLEA(0x10, [vu_buf]) ",%[vu_buf]                \n"        \
1999    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
2000    "vpshufb     %[kShuffleNV21], %%ymm0, %%ymm0                    \n"        \
2001    "vmovdqu    " MEMACCESS([y_buf]) ",%%xmm4                       \n"        \
2002    "vpermq     $0xd8,%%ymm4,%%ymm4                                 \n"        \
2003    "vpunpcklbw %%ymm4,%%ymm4,%%ymm4                                \n"        \
2004    "lea        " MEMLEA(0x10, [y_buf]) ",%[y_buf]                  \n"
2005
// Read 32 bytes of YUY2 (16 pixels) with 16 Y and upsample 8 UV to 16 UV.
// The same 32 bytes are loaded twice and shuffled two ways: kShuffleYUY2Y
// extracts/widens the Y samples into ymm4, kShuffleYUY2UV extracts and
// duplicates the UV samples into ymm0 (shuffle tables declared elsewhere).
#define READYUY2_AVX2 \
  "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm4                      \n"        \
    "vpshufb    %[kShuffleYUY2Y], %%ymm4, %%ymm4                    \n"        \
    "vmovdqu    " MEMACCESS([yuy2_buf]) ",%%ymm0                    \n"        \
    "vpshufb    %[kShuffleYUY2UV], %%ymm0, %%ymm0                   \n"        \
    "lea        " MEMLEA(0x20, [yuy2_buf]) ",%[yuy2_buf]            \n"
2013
// Read 32 bytes of UYVY (16 pixels) with 16 Y and upsample 8 UV to 16 UV.
// Mirror of READYUY2_AVX2 for the swapped byte order, using the
// kShuffleUYVYY / kShuffleUYVYUV tables (declared elsewhere in this file).
#define READUYVY_AVX2 \
  "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm4                     \n"        \
    "vpshufb     %[kShuffleUYVYY], %%ymm4, %%ymm4                   \n"        \
    "vmovdqu     " MEMACCESS([uyvy_buf]) ",%%ymm0                   \n"        \
    "vpshufb     %[kShuffleUYVYUV], %%ymm0, %%ymm0                  \n"        \
    "lea        " MEMLEA(0x20, [uyvy_buf]) ",%[uyvy_buf]            \n"
2021
#if defined(__x86_64__)
// 64-bit: preload all seven 32-byte rows of YuvConstants into ymm8-ymm14
// once before the pixel loop; the loop body then never touches memory for
// constants.  Callers must list YUVTORGB_REGS_AVX2 in their clobbers.
#define YUVTORGB_SETUP_AVX2(yuvconstants) \
  "vmovdqa     " MEMACCESS([yuvconstants]) ",%%ymm8              \n"           \
    "vmovdqa     " MEMACCESS2(32, [yuvconstants]) ",%%ymm9       \n"           \
    "vmovdqa     " MEMACCESS2(64, [yuvconstants]) ",%%ymm10      \n"           \
    "vmovdqa     " MEMACCESS2(96, [yuvconstants]) ",%%ymm11      \n"           \
    "vmovdqa     " MEMACCESS2(128, [yuvconstants]) ",%%ymm12     \n"           \
    "vmovdqa     " MEMACCESS2(160, [yuvconstants]) ",%%ymm13     \n"           \
    "vmovdqa     " MEMACCESS2(192, [yuvconstants]) ",%%ymm14     \n"

// Convert 16 pixels: 16 UV (in ymm0) and 16 Y (in ymm4) to B/G/R in
// ymm0/ymm1/ymm2.  UV contributions are computed with vpmaddubsw, subtracted
// from per-channel bias rows, combined with the 1.164-scaled Y (vpmulhuw),
// then shifted down 6 bits and saturated to bytes.
#define YUVTORGB_AVX2(yuvconstants)                                   \
  "vpmaddubsw  %%ymm10,%%ymm0,%%ymm2                              \n" \
  "vpmaddubsw  %%ymm9,%%ymm0,%%ymm1                               \n" \
  "vpmaddubsw  %%ymm8,%%ymm0,%%ymm0                               \n" \
  "vpsubw      %%ymm2,%%ymm13,%%ymm2                              \n" \
  "vpsubw      %%ymm1,%%ymm12,%%ymm1                              \n" \
  "vpsubw      %%ymm0,%%ymm11,%%ymm0                              \n" \
  "vpmulhuw    %%ymm14,%%ymm4,%%ymm4                              \n" \
  "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n" \
  "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n" \
  "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n" \
  "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n" \
  "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n" \
  "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n" \
  "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n" \
  "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n" \
  "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"

// Note trailing comma: this expands in the middle of a clobber list.
#define YUVTORGB_REGS_AVX2 \
  "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14",

#else  // 32-bit: xmm8+ unavailable; read constants from memory on each use.

#define YUVTORGB_SETUP_AVX2(yuvconstants)
// Same math as the 64-bit YUVTORGB_AVX2 above, but each constant row is a
// memory operand (ymm3 is used as a scratch for the bias rows).
#define YUVTORGB_AVX2(yuvconstants) \
  "vpmaddubsw  " MEMACCESS2(64, [yuvconstants]) ",%%ymm0,%%ymm2     \n"        \
    "vpmaddubsw  " MEMACCESS2(32, [yuvconstants]) ",%%ymm0,%%ymm1   \n"        \
    "vpmaddubsw  " MEMACCESS([yuvconstants]) ",%%ymm0,%%ymm0        \n"        \
    "vmovdqu     " MEMACCESS2(160, [yuvconstants]) ",%%ymm3         \n"        \
    "vpsubw      %%ymm2,%%ymm3,%%ymm2                               \n"        \
    "vmovdqu     " MEMACCESS2(128, [yuvconstants]) ",%%ymm3         \n"        \
    "vpsubw      %%ymm1,%%ymm3,%%ymm1                               \n"        \
    "vmovdqu     " MEMACCESS2(96, [yuvconstants]) ",%%ymm3          \n"        \
    "vpsubw      %%ymm0,%%ymm3,%%ymm0                               \n"        \
    "vpmulhuw    " MEMACCESS2(192, [yuvconstants]) ",%%ymm4,%%ymm4  \n"        \
    "vpaddsw     %%ymm4,%%ymm0,%%ymm0                               \n"        \
    "vpaddsw     %%ymm4,%%ymm1,%%ymm1                               \n"        \
    "vpaddsw     %%ymm4,%%ymm2,%%ymm2                               \n"        \
    "vpsraw      $0x6,%%ymm0,%%ymm0                                 \n"        \
    "vpsraw      $0x6,%%ymm1,%%ymm1                                 \n"        \
    "vpsraw      $0x6,%%ymm2,%%ymm2                                 \n"        \
    "vpackuswb   %%ymm0,%%ymm0,%%ymm0                               \n"        \
    "vpackuswb   %%ymm1,%%ymm1,%%ymm1                               \n"        \
    "vpackuswb   %%ymm2,%%ymm2,%%ymm2                               \n"
#define YUVTORGB_REGS_AVX2
#endif
2078
// Store 16 ARGB values (64 bytes).  Interleaves B (ymm0), G (ymm1), R (ymm2)
// and alpha (ymm5) into little-endian BGRA byte order; vpermq fixes the
// cross-lane mutation introduced by the 256-bit unpacks.  Advances dst_argb.
#define STOREARGB_AVX2 \
  "vpunpcklbw %%ymm1,%%ymm0,%%ymm0                                  \n"        \
    "vpermq     $0xd8,%%ymm0,%%ymm0                                 \n"        \
    "vpunpcklbw %%ymm5,%%ymm2,%%ymm2                                \n"        \
    "vpermq     $0xd8,%%ymm2,%%ymm2                                 \n"        \
    "vpunpcklwd %%ymm2,%%ymm0,%%ymm1                                \n"        \
    "vpunpckhwd %%ymm2,%%ymm0,%%ymm0                                \n"        \
    "vmovdqu    %%ymm1," MEMACCESS([dst_argb]) "                    \n"        \
    "vmovdqu    %%ymm0," MEMACCESS2(0x20, [dst_argb]) "             \n"        \
    "lea       " MEMLEA(0x40, [dst_argb]) ", %[dst_argb]            \n"
2090
2091#ifdef HAS_I444TOARGBROW_AVX2
2092// 16 pixels
2093// 16 UV values with 16 Y producing 16 ARGB (64 bytes).
void OMITFP I444ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // Converts `width` pixels (16 per iteration) of planar I444 to ARGB.
  // width is assumed to be a positive multiple of 16 — TODO confirm callers.
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes the v-u delta so V can be addressed relative to u_buf
    // inside READYUV444_AVX2 (defined earlier in this file).
    "sub       %[u_buf],%[v_buf]               \n"
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = 0xff alpha bytes

    LABELALIGN
    "1:                                        \n"
    READYUV444_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2123#endif  // HAS_I444TOARGBROW_AVX2
2124
2125#if defined(HAS_I422TOARGBROW_AVX2)
2126// 16 pixels
2127// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
void OMITFP I422ToARGBRow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // Converts `width` pixels (16 per iteration) of planar I422 to ARGB.
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    // v_buf becomes the v-u delta; V is addressed relative to u_buf in
    // READYUV422_AVX2 (defined earlier in this file).
    "sub       %[u_buf],%[v_buf]               \n"
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"  // ymm5 = 0xff alpha bytes

    LABELALIGN
    "1:                                        \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"

    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2158#endif  // HAS_I422TOARGBROW_AVX2
2159
2160#if defined(HAS_I422ALPHATOARGBROW_AVX2)
2161// 16 pixels
2162// 8 UV values upsampled to 16 UV, mixed with 16 Y and 16 A producing 16 ARGB.
void OMITFP I422AlphaToARGBRow_AVX2(const uint8* y_buf,
                                    const uint8* u_buf,
                                    const uint8* v_buf,
                                    const uint8* a_buf,
                                    uint8* dst_argb,
                                    const struct YuvConstants* yuvconstants,
                                    int width) {
  // Converts I422 plus a separate alpha plane to ARGB, 16 pixels/iteration.
  // Unlike I422ToARGBRow_AVX2 there is no constant-alpha setup; ymm5 is
  // loaded from a_buf by READYUVA422_AVX2 (defined earlier in this file).
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "sub       %[u_buf],%[v_buf]               \n"

    LABELALIGN
    "1:                                        \n"
    READYUVA422_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    // subl: width may be a 32-bit memory operand on i386 (see "+m" below).
    "subl      $0x10,%[width]                  \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [a_buf]"+r"(a_buf),    // %[a_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
#if defined(__i386__)
    [width]"+m"(width)     // %[width]
#else
    [width]"+rm"(width)    // %[width]
#endif
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
2199#endif  // HAS_I422ALPHATOARGBROW_AVX2
2200
2201#if defined(HAS_I422TORGBAROW_AVX2)
2202// 16 pixels
2203// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 RGBA (64 bytes).
void OMITFP I422ToRGBARow_AVX2(const uint8* y_buf,
                               const uint8* u_buf,
                               const uint8* v_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // Converts I422 to RGBA (alpha in the low byte), 16 pixels/iteration.
  // Same read/convert stages as I422ToARGBRow_AVX2 but the weave step
  // interleaves A,R,G,B order instead of using STOREARGB_AVX2.
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "sub       %[u_buf],%[v_buf]               \n"
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = 0xff alpha bytes

    LABELALIGN
    "1:                                        \n"
    READYUV422_AVX2
    YUVTORGB_AVX2(yuvconstants)

    // Step 3: Weave into RGBA
    "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
    "vpunpcklbw %%ymm0,%%ymm5,%%ymm2           \n"
    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
    "vpunpcklwd %%ymm1,%%ymm2,%%ymm0           \n"
    "vpunpckhwd %%ymm1,%%ymm2,%%ymm1           \n"
    "vmovdqu    %%ymm0," MEMACCESS([dst_argb]) "\n"
    "vmovdqu    %%ymm1," MEMACCESS2(0x20,[dst_argb]) "\n"
    "lea       " MEMLEA(0x40,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
  : "memory", "cc", NACL_R14 YUVTORGB_REGS_AVX2
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2243#endif  // HAS_I422TORGBAROW_AVX2
2244
2245#if defined(HAS_NV12TOARGBROW_AVX2)
2246// 16 pixels.
2247// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2248void OMITFP NV12ToARGBRow_AVX2(const uint8* y_buf,
2249                               const uint8* uv_buf,
2250                               uint8* dst_argb,
2251                               const struct YuvConstants* yuvconstants,
2252                               int width) {
2253  // clang-format off
2254  asm volatile (
2255    YUVTORGB_SETUP_AVX2(yuvconstants)
2256    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2257
2258    LABELALIGN
2259    "1:                                        \n"
2260    READNV12_AVX2
2261    YUVTORGB_AVX2(yuvconstants)
2262    STOREARGB_AVX2
2263    "sub       $0x10,%[width]                  \n"
2264    "jg        1b                              \n"
2265    "vzeroupper                                \n"
2266  : [y_buf]"+r"(y_buf),    // %[y_buf]
2267    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
2268    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2269    [width]"+rm"(width)    // %[width]
2270  : [yuvconstants]"r"(yuvconstants)  // %[yuvconstants]
2271    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
2272    "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2273  );
2274  // clang-format on
2275}
2276#endif  // HAS_NV12TOARGBROW_AVX2
2277
2278#if defined(HAS_NV21TOARGBROW_AVX2)
2279// 16 pixels.
2280// 8 VU values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
2281void OMITFP NV21ToARGBRow_AVX2(const uint8* y_buf,
2282                               const uint8* vu_buf,
2283                               uint8* dst_argb,
2284                               const struct YuvConstants* yuvconstants,
2285                               int width) {
2286  // clang-format off
2287  asm volatile (
2288    YUVTORGB_SETUP_AVX2(yuvconstants)
2289    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2290
2291    LABELALIGN
2292    "1:                                        \n"
2293    READNV21_AVX2
2294    YUVTORGB_AVX2(yuvconstants)
2295    STOREARGB_AVX2
2296    "sub       $0x10,%[width]                  \n"
2297    "jg        1b                              \n"
2298    "vzeroupper                                \n"
2299  : [y_buf]"+r"(y_buf),    // %[y_buf]
2300    [vu_buf]"+r"(vu_buf),    // %[vu_buf]
2301    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2302    [width]"+rm"(width)    // %[width]
2303  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2304    [kShuffleNV21]"m"(kShuffleNV21)
2305    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
2306      "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2307  );
2308  // clang-format on
2309}
2310#endif  // HAS_NV21TOARGBROW_AVX2
2311
2312#if defined(HAS_YUY2TOARGBROW_AVX2)
2313// 16 pixels.
2314// 8 YUY2 values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
2315void OMITFP YUY2ToARGBRow_AVX2(const uint8* yuy2_buf,
2316                               uint8* dst_argb,
2317                               const struct YuvConstants* yuvconstants,
2318                               int width) {
2319  // clang-format off
2320  asm volatile (
2321    YUVTORGB_SETUP_AVX2(yuvconstants)
2322    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
2323
2324    LABELALIGN
2325    "1:                                        \n"
2326    READYUY2_AVX2
2327    YUVTORGB_AVX2(yuvconstants)
2328    STOREARGB_AVX2
2329    "sub       $0x10,%[width]                  \n"
2330    "jg        1b                              \n"
2331    "vzeroupper                                \n"
2332  : [yuy2_buf]"+r"(yuy2_buf),    // %[yuy2_buf]
2333    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
2334    [width]"+rm"(width)    // %[width]
2335  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
2336    [kShuffleYUY2Y]"m"(kShuffleYUY2Y),
2337    [kShuffleYUY2UV]"m"(kShuffleYUY2UV)
2338    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
2339      "xmm0", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2340  );
2341  // clang-format on
2342}
2343#endif  // HAS_YUY2TOARGBROW_AVX2
2344
2345#if defined(HAS_UYVYTOARGBROW_AVX2)
2346// 16 pixels.
2347// 8 UYVY values with 16 Y and 8 UV producing 16 ARGB (64 bytes).
void OMITFP UYVYToARGBRow_AVX2(const uint8* uyvy_buf,
                               uint8* dst_argb,
                               const struct YuvConstants* yuvconstants,
                               int width) {
  // Converts packed UYVY to ARGB, 16 pixels per iteration.
  // clang-format off
  asm volatile (
    YUVTORGB_SETUP_AVX2(yuvconstants)
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"  // ymm5 = 0xff alpha bytes

    LABELALIGN
    "1:                                        \n"
    READUYVY_AVX2
    YUVTORGB_AVX2(yuvconstants)
    STOREARGB_AVX2
    "sub       $0x10,%[width]                  \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : [uyvy_buf]"+r"(uyvy_buf),    // %[uyvy_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [yuvconstants]"r"(yuvconstants), // %[yuvconstants]
    [kShuffleUYVYY]"m"(kShuffleUYVYY),
    [kShuffleUYVYUV]"m"(kShuffleUYVYUV)
    : "memory", "cc", YUVTORGB_REGS_AVX2  // Does not use r14.
      "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
  // clang-format on
}
2376#endif  // HAS_UYVYTOARGBROW_AVX2
2377
2378#ifdef HAS_I400TOARGBROW_SSE2
void I400ToARGBRow_SSE2(const uint8* y_buf, uint8* dst_argb, int width) {
  // Converts a grayscale (Y-only) row to ARGB, 8 pixels per iteration.
  // Gray value G = (y - 16) * 1.164 is replicated to B, G and R; alpha is
  // forced to 0xff via xmm4.
  asm volatile (
    "mov       $0x4a354a35,%%eax               \n"  // 4a35 = 18997 = 1.164
    "movd      %%eax,%%xmm2                    \n"
    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
    "mov       $0x04880488,%%eax               \n"  // 0488 = 1160 = 1.164 * 16
    "movd      %%eax,%%xmm3                    \n"
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // xmm4 = 0xff000000 lanes
    "pslld     $0x18,%%xmm4                    \n"

    LABELALIGN
    "1:                                        \n"
    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "psubusw   %%xmm3,%%xmm0                   \n"  // unsigned sat: clamps at 0
    "psrlw     $6, %%xmm0                      \n"
    "packuswb  %%xmm0,%%xmm0                   \n"

    // Step 2: Weave into ARGB
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm0                   \n"
    "punpckhwd %%xmm1,%%xmm1                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "por       %%xmm4,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"

    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(y_buf),     // %0
    "+r"(dst_argb),  // %1
    "+rm"(width)     // %2
  :
  : "memory", "cc", "eax"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
2422#endif  // HAS_I400TOARGBROW_SSE2
2423
2424#ifdef HAS_I400TOARGBROW_AVX2
2425// 16 pixels of Y converted to 16 pixels of ARGB (64 bytes).
2426// note: vpunpcklbw mutates and vpackuswb unmutates.
void I400ToARGBRow_AVX2(const uint8* y_buf, uint8* dst_argb, int width) {
  // Converts a grayscale (Y-only) row to ARGB, 16 pixels per iteration.
  // AVX2 counterpart of I400ToARGBRow_SSE2 above.
  asm volatile (
    "mov        $0x4a354a35,%%eax              \n" // 4a35 = 18997 = 1.164
    "vmovd      %%eax,%%xmm2                   \n"
    "vbroadcastss %%xmm2,%%ymm2                \n"
    "mov        $0x4880488,%%eax               \n" // 0488 = 1160 = 1.164 * 16
    "vmovd      %%eax,%%xmm3                   \n"
    "vbroadcastss %%xmm3,%%ymm3                \n"
    "vpcmpeqb   %%ymm4,%%ymm4,%%ymm4           \n" // ymm4 = 0xff000000 lanes
    "vpslld     $0x18,%%ymm4,%%ymm4            \n"

    LABELALIGN
    "1:                                        \n"
    // Step 1: Scale Y contribution to 16 G values. G = (y - 16) * 1.164
    "vmovdqu    " MEMACCESS(0) ",%%xmm0        \n"
    "lea        " MEMLEA(0x10,0) ",%0          \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
    "vpsubusw   %%ymm3,%%ymm0,%%ymm0           \n"
    "vpsrlw     $0x6,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm0,%%ymm0,%%ymm0           \n"
    // Step 2: Weave gray into all four channels, then OR in alpha.
    "vpunpcklbw %%ymm0,%%ymm0,%%ymm1           \n"
    "vpermq     $0xd8,%%ymm1,%%ymm1            \n"
    "vpunpcklwd %%ymm1,%%ymm1,%%ymm0           \n"
    "vpunpckhwd %%ymm1,%%ymm1,%%ymm1           \n"
    "vpor       %%ymm4,%%ymm0,%%ymm0           \n"
    "vpor       %%ymm4,%%ymm1,%%ymm1           \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "vmovdqu    %%ymm1," MEMACCESS2(0x20,1) "  \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub        $0x10,%2                       \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(y_buf),     // %0
    "+r"(dst_argb),  // %1
    "+rm"(width)     // %2
  :
  : "memory", "cc", "eax"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
  );
}
2469#endif  // HAS_I400TOARGBROW_AVX2
2470
2471#ifdef HAS_MIRRORROW_SSSE3
// Shuffle table for reversing the bytes.
static uvec8 kShuffleMirror = {15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u,
                               7u,  6u,  5u,  4u,  3u,  2u,  1u, 0u};

// Horizontally mirrors `width` bytes, 16 per iteration.  Reads src from the
// end (src + width - 16) backwards while writing dst forwards.
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);  // pointer-sized for addressing
  asm volatile (
    "movdqa    %3,%%xmm5                       \n"

    LABELALIGN
    "1:                                        \n"
    MEMOPREG(movdqu,-0x10,0,2,1,xmm0)          //  movdqu -0x10(%0,%2),%%xmm0
    "pshufb    %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
2497#endif  // HAS_MIRRORROW_SSSE3
2498
2499#ifdef HAS_MIRRORROW_AVX2
// Horizontally mirrors `width` bytes, 32 per iteration.  vpshufb reverses
// within each 128-bit lane; vpermq $0x4e swaps the two lanes to complete
// the 32-byte reversal.
void MirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);  // pointer-sized for addressing
  asm volatile (
    "vbroadcastf128 %3,%%ymm5                  \n"

    LABELALIGN
    "1:                                        \n"
    MEMOPREG(vmovdqu,-0x20,0,2,1,ymm0)         //  vmovdqu -0x20(%0,%2),%%ymm0
    "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n"
    "vpermq     $0x4e,%%ymm0,%%ymm0            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
2523#endif  // HAS_MIRRORROW_AVX2
2524
2525#ifdef HAS_MIRRORUVROW_SSSE3
// Shuffle table for reversing the bytes of UV channels: gathers the U bytes
// (even positions) reversed into the low half and the V bytes (odd
// positions) reversed into the high half.
static uvec8 kShuffleMirrorUV = {14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u,
                                 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u};
// Mirrors an interleaved UV row into separate, reversed U and V planes,
// 8 UV pairs per iteration.  dst_v is carried as a delta from dst_u.
void MirrorUVRow_SSSE3(const uint8* src,
                       uint8* dst_u,
                       uint8* dst_v,
                       int width) {
  intptr_t temp_width = (intptr_t)(width);  // pointer-sized for addressing
  asm volatile (
    "movdqa    %4,%%xmm1                       \n"
    "lea       " MEMLEA4(-0x10,0,3,2) ",%0     \n"  // src += width*2 - 16
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(-0x10,0) ",%0          \n"
    "pshufb    %%xmm1,%%xmm0                   \n"
    "movlpd    %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $8,%3                           \n"
    "jg        1b                              \n"
  : "+r"(src),      // %0
    "+r"(dst_u),    // %1
    "+r"(dst_v),    // %2
    "+r"(temp_width)  // %3
  : "m"(kShuffleMirrorUV)  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}
2558#endif  // HAS_MIRRORUVROW_SSSE3
2559
2560#ifdef HAS_ARGBMIRRORROW_SSE2
2561
// Horizontally mirrors `width` ARGB pixels (4 bytes each), 4 per iteration.
// pshufd $0x1b reverses the 4 dwords; bytes within a pixel stay in order.
void ARGBMirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);  // pointer-sized for addressing
  asm volatile (
    "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"  // src += width*4 - 16

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "pshufd    $0x1b,%%xmm0,%%xmm0             \n"
    "lea       " MEMLEA(-0x10,0) ",%0          \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  :
  : "memory", "cc"
    , "xmm0"
  );
}
2584#endif  // HAS_ARGBMIRRORROW_SSE2
2585
2586#ifdef HAS_ARGBMIRRORROW_AVX2
// Shuffle table for reversing the bytes: dword permutation indices for vpermd
// that reverse the 8 pixels of a ymm register.
static const ulvec32 kARGBShuffleMirror_AVX2 = {7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u};
// Horizontally mirrors `width` ARGB pixels, 8 per iteration, by loading
// with a reversing cross-lane vpermd directly from the tail of the row.
void ARGBMirrorRow_AVX2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);  // pointer-sized for addressing
  asm volatile (
    "vmovdqu    %3,%%ymm5                      \n"

    LABELALIGN
    "1:                                        \n"
    VMEMOPREG(vpermd,-0x20,0,2,4,ymm5,ymm0) // vpermd -0x20(%0,%2,4),ymm5,ymm0
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x8,%2                        \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kARGBShuffleMirror_AVX2) // %3
  : "memory", "cc", NACL_R14
    "xmm0", "xmm5"
  );
}
2610#endif  // HAS_ARGBMIRRORROW_AVX2
2611
2612#ifdef HAS_SPLITUVROW_AVX2
// Deinterleaves a UV row into separate U and V planes, 32 UV pairs per
// iteration.  ymm5 holds a 0x00ff word mask; even bytes (U) are masked,
// odd bytes (V) are shifted down, then each is packed and lane-fixed.
// dst_v is carried as a delta from dst_u.
void SplitUVRow_AVX2(const uint8* src_uv,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  asm volatile (
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
    "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"  // ymm5 = 0x00ff words
    "sub        %1,%2                          \n"  // %2 = dst_v - dst_u

    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "vmovdqu    " MEMACCESS2(0x20,0) ",%%ymm1  \n"
    "lea        " MEMLEA(0x40,0) ",%0          \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm2             \n"
    "vpsrlw     $0x8,%%ymm1,%%ymm3             \n"
    "vpand      %%ymm5,%%ymm0,%%ymm0           \n"
    "vpand      %%ymm5,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpackuswb  %%ymm3,%%ymm2,%%ymm2           \n"
    "vpermq     $0xd8,%%ymm0,%%ymm0            \n"
    "vpermq     $0xd8,%%ymm2,%%ymm2            \n"
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    MEMOPMEM(vmovdqu,ymm2,0x00,1,2,1)           //  vmovdqu %%ymm2,(%1,%2)
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "sub        $0x20,%3                       \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_uv),     // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+r"(width)         // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
2650#endif  // HAS_SPLITUVROW_AVX2
2651
2652#ifdef HAS_SPLITUVROW_SSE2
// Deinterleaves a UV row into separate U and V planes, 16 UV pairs per
// iteration.  SSE2 counterpart of SplitUVRow_AVX2 above.
void SplitUVRow_SSE2(const uint8* src_uv,
                     uint8* dst_u,
                     uint8* dst_v,
                     int width) {
  asm volatile (
    "pcmpeqb    %%xmm5,%%xmm5                  \n"
    "psrlw      $0x8,%%xmm5                    \n"  // xmm5 = 0x00ff words
    "sub        %1,%2                          \n"  // %2 = dst_v - dst_u

    LABELALIGN
    "1:                                        \n"
    "movdqu     " MEMACCESS(0) ",%%xmm0        \n"
    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1  \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "movdqa     %%xmm0,%%xmm2                  \n"
    "movdqa     %%xmm1,%%xmm3                  \n"
    "pand       %%xmm5,%%xmm0                  \n"
    "pand       %%xmm5,%%xmm1                  \n"
    "packuswb   %%xmm1,%%xmm0                  \n"
    "psrlw      $0x8,%%xmm2                    \n"
    "psrlw      $0x8,%%xmm3                    \n"
    "packuswb   %%xmm3,%%xmm2                  \n"
    "movdqu     %%xmm0," MEMACCESS(1) "        \n"
    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)           //  movdqu     %%xmm2,(%1,%2)
    "lea        " MEMLEA(0x10,1) ",%1          \n"
    "sub        $0x10,%3                       \n"
    "jg         1b                             \n"
  : "+r"(src_uv),     // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+r"(width)         // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
2689#endif  // HAS_SPLITUVROW_SSE2
2690
2691#ifdef HAS_MERGEUVROW_AVX2
// Interleaves separate U and V planes into a UV row, 32 pairs per iteration.
// src_v is carried as a delta from src_u.  vextractf128 stores are used to
// write the interleaved halves in the correct cross-lane order.
void MergeUVRow_AVX2(const uint8* src_u,
                     const uint8* src_v,
                     uint8* dst_uv,
                     int width) {
  asm volatile (
    "sub       %0,%1                           \n"  // %1 = src_v - src_u

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    MEMOPREG(vmovdqu,0x00,0,1,1,ymm1)           //  vmovdqu (%0,%1,1),%%ymm1
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "vpunpcklbw %%ymm1,%%ymm0,%%ymm2           \n"
    "vpunpckhbw %%ymm1,%%ymm0,%%ymm0           \n"
    "vextractf128 $0x0,%%ymm2," MEMACCESS(2) " \n"
    "vextractf128 $0x0,%%ymm0," MEMACCESS2(0x10,2) "\n"
    "vextractf128 $0x1,%%ymm2," MEMACCESS2(0x20,2) "\n"
    "vextractf128 $0x1,%%ymm0," MEMACCESS2(0x30,2) "\n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_u),     // %0
    "+r"(src_v),     // %1
    "+r"(dst_uv),    // %2
    "+r"(width)      // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2"
  );
}
2723#endif  // HAS_MERGEUVROW_AVX2
2724
2725#ifdef HAS_MERGEUVROW_SSE2
// Interleaves separate U and V planes into a UV row, 16 pairs per iteration.
// SSE2 counterpart of MergeUVRow_AVX2 above; src_v is a delta from src_u.
void MergeUVRow_SSE2(const uint8* src_u,
                     const uint8* src_v,
                     uint8* dst_uv,
                     int width) {
  asm volatile (
    "sub       %0,%1                           \n"  // %1 = src_v - src_u

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm2                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
    "lea       " MEMLEA(0x20,2) ",%2           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_u),     // %0
    "+r"(src_v),     // %1
    "+r"(dst_uv),    // %2
    "+r"(width)      // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2"
  );
}
2755#endif  // HAS_MERGEUVROW_SSE2
2756
2757#ifdef HAS_COPYROW_SSE2
// Copies 'count' bytes, 32 per iteration. Dispatches once up front:
// if either pointer is not 16-byte aligned, the movdqu loop at 2: is
// used; otherwise the movdqa (aligned) loop at 1:.
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
    // Test low 4 bits of both pointers for 16-byte alignment.
    "test       $0xf,%0                        \n"
    "jne        2f                             \n"
    "test       $0xf,%1                        \n"
    "jne        2f                             \n"

    LABELALIGN
    "1:                                        \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "jmp       9f                              \n"

    // Unaligned fallback: identical flow with movdqu.
    LABELALIGN
  "2:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        2b                              \n"
  "9:                                          \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
2796#endif  // HAS_COPYROW_SSE2
2797
2798#ifdef HAS_COPYROW_AVX
// Copies 'count' bytes, 64 per iteration, using unaligned 32-byte AVX
// loads/stores (vmovdqu handles any alignment).
void CopyRow_AVX(const uint8* src, uint8* dst, int count) {
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x40,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
2819#endif  // HAS_COPYROW_AVX
2820
2821#ifdef HAS_COPYROW_ERMS
// Multiple of 1: copies any byte count via 'rep movsb' (fast on CPUs
// with Enhanced REP MOVSB). src/dst/count are pinned to rsi/rdi/rcx by
// the S/D/c constraints, as rep movsb requires.
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile("rep movsb " MEMMOVESTRING(0, 1) "          \n"
               : "+S"(src),       // %0
               : "+D"(dst),       // %1
                 "+c"(width_tmp)  // %2
               :
               : "memory", "cc");
}
2832#endif  // HAS_COPYROW_ERMS
2833
2834#ifdef HAS_ARGBCOPYALPHAROW_SSE2
2835// width in pixels
// Copies only the alpha channel of src ARGB pixels into dst ARGB pixels,
// leaving dst's RGB intact. 8 pixels per iteration.
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // xmm0 = 0xff000000 per pixel (alpha mask), xmm1 = 0x00ffffff (RGB mask).
    "pcmpeqb   %%xmm0,%%xmm0                   \n"
    "pslld     $0x18,%%xmm0                    \n"
    "pcmpeqb   %%xmm1,%%xmm1                   \n"
    "psrld     $0x8,%%xmm1                     \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
    // Combine src alpha (xmm2/xmm3) with dst RGB (xmm4/xmm5).
    "pand      %%xmm0,%%xmm2                   \n"
    "pand      %%xmm0,%%xmm3                   \n"
    "pand      %%xmm1,%%xmm4                   \n"
    "pand      %%xmm1,%%xmm5                   \n"
    "por       %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2869#endif  // HAS_ARGBCOPYALPHAROW_SSE2
2870
2871#ifdef HAS_ARGBCOPYALPHAROW_AVX2
2872// width in pixels
// Copies only the alpha channel of src ARGB pixels into dst ARGB pixels,
// leaving dst's RGB intact. 16 pixels per iteration.
void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // ymm0 = 0x00ffffff per pixel: vpblendvb selects dst memory where the
    // mask byte's high bit is set (RGB) and src (ymm1/ymm2) where clear (A).
    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2"
  );
}
2899#endif  // HAS_ARGBCOPYALPHAROW_AVX2
2900
2901#ifdef HAS_ARGBEXTRACTALPHAROW_SSE2
2902// width in pixels
// Extracts the alpha channel of ARGB pixels into a planar alpha row.
// 8 pixels per iteration: psrld $24 isolates alpha per dword, then two
// packs narrow dwords -> words -> bytes.
void ARGBExtractAlphaRow_SSE2(const uint8* src_argb, uint8* dst_a, int width) {
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ", %%xmm0        \n"
    "movdqu    " MEMACCESS2(0x10, 0) ", %%xmm1 \n"
    "lea       " MEMLEA(0x20, 0) ", %0         \n"
    "psrld     $0x18, %%xmm0                   \n"
    "psrld     $0x18, %%xmm1                   \n"
    // Values are 0..255, so signed dword->word pack cannot saturate here.
    "packssdw  %%xmm1, %%xmm0                  \n"
    "packuswb  %%xmm0, %%xmm0                  \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8, 1) ", %1          \n"
    "sub       $0x8, %2                        \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_a),     // %1
    "+rm"(width)     // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
2926#endif  // HAS_ARGBEXTRACTALPHAROW_SSE2
2927
2928#ifdef HAS_ARGBEXTRACTALPHAROW_AVX2
// pshufb table: moves each ARGB pixel's alpha byte (offsets 3/7/11/15
// within a 128-bit lane) into the low byte of its dword; 128u has the
// high bit set, which makes pshufb write zero for the remaining bytes.
static const uvec8 kShuffleAlphaShort_AVX2 = {
    3u,  128u, 128u, 128u, 7u,  128u, 128u, 128u,
    11u, 128u, 128u, 128u, 15u, 128u, 128u, 128u};
2932
// Extracts the alpha channel of ARGB pixels into a planar alpha row.
// 32 pixels per iteration. vpshufb with kShuffleAlphaShort_AVX2 (ymm5)
// replaces a vpsrld $24; the per-lane vpackssdw/vpackuswb mutate byte
// order, which the final vpermd with kPermdARGBToY_AVX (ymm4) undoes.
void ARGBExtractAlphaRow_AVX2(const uint8* src_argb, uint8* dst_a, int width) {
  asm volatile (
    "vmovdqa    %3,%%ymm4                      \n"
    "vbroadcastf128 %4,%%ymm5                  \n"

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ", %%ymm0        \n"
    "vmovdqu   " MEMACCESS2(0x20, 0) ", %%ymm1 \n"
    "vpshufb    %%ymm5,%%ymm0,%%ymm0           \n" // vpsrld $0x18, %%ymm0
    "vpshufb    %%ymm5,%%ymm1,%%ymm1           \n"
    "vmovdqu   " MEMACCESS2(0x40, 0) ", %%ymm2 \n"
    "vmovdqu   " MEMACCESS2(0x60, 0) ", %%ymm3 \n"
    "lea       " MEMLEA(0x80, 0) ", %0         \n"
    "vpackssdw  %%ymm1, %%ymm0, %%ymm0         \n"  // mutates
    "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
    "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
    "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"  // mutates
    "vpackuswb  %%ymm2,%%ymm0,%%ymm0           \n"  // mutates.
    "vpermd     %%ymm0,%%ymm4,%%ymm0           \n"  // unmutate.
    "vmovdqu    %%ymm0," MEMACCESS(1) "        \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub        $0x20, %2                      \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_a),     // %1
    "+rm"(width)     // %2
  : "m"(kPermdARGBToY_AVX),  // %3
    "m"(kShuffleAlphaShort_AVX2)  // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
2967#endif  // HAS_ARGBEXTRACTALPHAROW_AVX2
2968
2969#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
2970// width in pixels
// Copies 8 Y (luma) bytes into the alpha channel of 8 dst ARGB pixels,
// leaving dst's RGB intact.
void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // xmm0 = 0xff000000 per pixel (alpha mask), xmm1 = 0x00ffffff (RGB mask).
    "pcmpeqb   %%xmm0,%%xmm0                   \n"
    "pslld     $0x18,%%xmm0                    \n"
    "pcmpeqb   %%xmm1,%%xmm1                   \n"
    "psrld     $0x8,%%xmm1                     \n"

    LABELALIGN
    "1:                                        \n"
    "movq      " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    // Widen each Y byte so it lands in byte 3 (alpha) of a dword.
    // punpckhwd mixes in whatever xmm3 held, but those bytes are
    // discarded by the 0xff000000 mask below, so xmm3 needs no init.
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpckhwd %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm2,%%xmm2                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm4         \n"
    "movdqu    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
    // Merge Y-as-alpha (xmm2/xmm3) with dst RGB (xmm4/xmm5).
    "pand      %%xmm0,%%xmm2                   \n"
    "pand      %%xmm0,%%xmm3                   \n"
    "pand      %%xmm1,%%xmm4                   \n"
    "pand      %%xmm1,%%xmm5                   \n"
    "por       %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
3006#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
3007
3008#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3009// width in pixels
// Copies 16 Y (luma) bytes into the alpha channel of 16 dst ARGB pixels,
// leaving dst's RGB intact.
void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // ymm0 = 0x00ffffff per pixel: vpblendvb keeps dst memory where the
    // mask byte's high bit is set (RGB) and takes ymm1/ymm2 for alpha.
    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"

    LABELALIGN
    "1:                                        \n"
    // Zero-extend each Y byte to a dword, then shift into byte 3 (alpha).
    "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"
    "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "vpslld    $0x18,%%ymm1,%%ymm1             \n"
    "vpslld    $0x18,%%ymm2,%%ymm2             \n"
    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2"
  );
}
3038#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
3039
3040#ifdef HAS_SETROW_X86
// Fills 'width' bytes with v8 using 'rep stosl' (dword stores): v8 is
// replicated into all 4 bytes and width/4 dwords are written, so width
// is expected to be a multiple of 4.
void SetRow_X86(uint8* dst, uint8 v8, int width) {
  size_t width_tmp = (size_t)(width >> 2);
  const uint32 v32 = v8 * 0x01010101u;  // Duplicate byte to all bytes.
  asm volatile("rep stosl " MEMSTORESTRING(eax, 0) "       \n"
               : "+D"(dst),       // %0
                 "+c"(width_tmp)  // %1
               : "a"(v32)         // %2
               : "memory", "cc");
}
3050
// Fills 'width' bytes with v8 using 'rep stosb' (any width; fast on
// CPUs with Enhanced REP MOVSB/STOSB).
void SetRow_ERMS(uint8* dst, uint8 v8, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile("rep stosb " MEMSTORESTRING(al, 0) "        \n"
               : "+D"(dst),       // %0
                 "+c"(width_tmp)  // %1
               : "a"(v8)          // %2
               : "memory", "cc");
}
3059
// Stores the 32-bit ARGB value v32 'width' times (width is in pixels,
// one dword per pixel) using 'rep stosl'.
void ARGBSetRow_X86(uint8* dst_argb, uint32 v32, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile("rep stosl " MEMSTORESTRING(eax, 0) "       \n"
               : "+D"(dst_argb),  // %0
                 "+c"(width_tmp)  // %1
               : "a"(v32)         // %2
               : "memory", "cc");
}
3068#endif  // HAS_SETROW_X86
3069
3070#ifdef HAS_YUY2TOYROW_SSE2
// Extracts the Y channel from packed YUY2 (byte order Y0 U Y1 V ...).
// The 0x00ff word mask in xmm5 keeps the even (Y) bytes, which are then
// packed down to 16 Y bytes per iteration.
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
3096
// Extracts U and V from YUY2 into separate planes, vertically averaging
// this row with the next (pavgb against the row at stride_yuy2).
// psrlw $8 keeps the odd (chroma) bytes; the final pand/psrlw pair then
// splits interleaved UV into U (even) and V (odd). dst_v is addressed
// relative to dst_u ("sub %1,%2").
void YUY2ToUVRow_SSE2(const uint8* src_yuy2,
                      int stride_yuy2,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(width)          // %3
  : "r"((intptr_t)(stride_yuy2))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
3138
// Extracts U and V from a single YUY2 row into separate planes (no
// vertical averaging — 4:2:2 output). Same chroma split as
// YUY2ToUVRow_SSE2: psrlw $8 keeps chroma bytes, then pand/psrlw
// separate U and V. dst_v is addressed relative to dst_u.
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                         uint8* dst_u,
                         uint8* dst_v,
                         int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(width)          // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
3175
// Extracts the Y channel from packed UYVY (byte order U Y0 V Y1 ...).
// Y occupies the odd bytes, so psrlw $8 isolates it; 16 Y bytes out per
// iteration.
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int width) {
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
3198
// Extracts U and V from UYVY into separate planes, vertically averaging
// this row with the next (pavgb against the row at stride_uyvy). In
// UYVY chroma occupies the even bytes, so pand with the 0x00ff mask
// isolates it; the later pand/psrlw pair splits U from V. dst_v is
// addressed relative to dst_u ("sub %1,%2").
void UYVYToUVRow_SSE2(const uint8* src_uyvy,
                      int stride_uyvy,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(width)          // %3
  : "r"((intptr_t)(stride_uyvy))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
3240
// Extracts U and V from a single UYVY row into separate planes (no
// vertical averaging — 4:2:2 output). pand with the 0x00ff mask keeps
// the even (chroma) bytes; the later pand/psrlw pair splits U from V.
// dst_v is addressed relative to dst_u.
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u,
                         uint8* dst_v,
                         int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    "sub       %1,%2                           \n"

    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(width)          // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
3277#endif  // HAS_YUY2TOYROW_SSE2
3278
3279#ifdef HAS_YUY2TOYROW_AVX2
// Extracts the Y channel from YUY2, 32 pixels per iteration. The 0x00ff
// word mask keeps even (Y) bytes; vpermq $0xd8 restores order after the
// per-lane vpackuswb.
void YUY2ToYRow_AVX2(const uint8* src_yuy2, uint8* dst_y, int width) {
  asm volatile (
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "lea      " MEMLEA(0x20,1) ",%1            \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
3307
// Extracts U and V from YUY2 into separate planes, vertically averaging
// this row with the next (vpavgb against the row at stride_yuy2).
// vpsrlw $8 keeps the odd (chroma) bytes; vpand/vpsrlw then split U
// from V, and vpermq $0xd8 undoes the per-lane packs. dst_v is
// addressed relative to dst_u ("sub %1,%2"). 32 source pixels per loop.
void YUY2ToUVRow_AVX2(const uint8* src_yuy2,
                      int stride_yuy2,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  asm volatile (
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
    "sub       %1,%2                           \n"

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea      " MEMLEA(0x10,1) ",%1            \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(width)          // %3
  : "r"((intptr_t)(stride_yuy2))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
3350
// Extracts U and V from a single YUY2 row into separate planes (no
// vertical averaging — 4:2:2 output). Same chroma split as
// YUY2ToUVRow_AVX2 minus the vpavgb step. dst_v is addressed relative
// to dst_u. 32 source pixels per loop.
void YUY2ToUV422Row_AVX2(const uint8* src_yuy2,
                         uint8* dst_u,
                         uint8* dst_v,
                         int width) {
  asm volatile (
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
    "sub       %1,%2                           \n"

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea      " MEMLEA(0x10,1) ",%1            \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(width)          // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
3390
// Extracts the Y channel from UYVY, 32 pixels per iteration. Y occupies
// the odd bytes, so vpsrlw $8 isolates it; vpermq $0xd8 restores order
// after the per-lane vpackuswb.
void UYVYToYRow_AVX2(const uint8* src_uyvy, uint8* dst_y, int width) {
  asm volatile (
    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpsrlw    $0x8,%%ymm1,%%ymm1              \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "lea      " MEMLEA(0x20,1) ",%1            \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(width)        // %2
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
// Extracts U and V from UYVY into separate planes, vertically averaging
// this row with the next (vpavgb against the row at stride_uyvy). In
// UYVY chroma occupies the even bytes, so vpand with the 0x00ff mask
// isolates it; the later vpand/vpsrlw pair splits U from V, and
// vpermq $0xd8 undoes the per-lane packs. dst_v is addressed relative
// to dst_u ("sub %1,%2"). 32 source pixels per loop.
void UYVYToUVRow_AVX2(const uint8* src_uyvy,
                      int stride_uyvy,
                      uint8* dst_u,
                      uint8* dst_v,
                      int width) {
  asm volatile (
    "vpcmpeqb  %%ymm5,%%ymm5,%%ymm5            \n"
    "vpsrlw    $0x8,%%ymm5,%%ymm5              \n"
    "sub       %1,%2                           \n"

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    VMEMOPREG(vpavgb,0x00,0,4,1,ymm0,ymm0)     // vpavgb (%0,%4,1),%%ymm0,%%ymm0
    VMEMOPREG(vpavgb,0x20,0,4,1,ymm1,ymm1)
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea      " MEMLEA(0x10,1) ",%1            \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(width)          // %3
  : "r"((intptr_t)(stride_uyvy))  // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
3457
// Extracts U and V from a single UYVY row into separate planes (no
// vertical averaging — 4:2:2 output). Same chroma split as
// UYVYToUVRow_AVX2 minus the vpavgb step. dst_v is addressed relative
// to dst_u. 32 source pixels per loop.
void UYVYToUV422Row_AVX2(const uint8* src_uyvy,
                         uint8* dst_u,
                         uint8* dst_v,
                         int width) {
  asm volatile (
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
    "vpsrlw     $0x8,%%ymm5,%%ymm5             \n"
    "sub       %1,%2                           \n"

    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpand     %%ymm5,%%ymm0,%%ymm0            \n"
    "vpand     %%ymm5,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm1,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vpand     %%ymm5,%%ymm0,%%ymm1            \n"
    "vpsrlw    $0x8,%%ymm0,%%ymm0              \n"
    "vpackuswb %%ymm1,%%ymm1,%%ymm1            \n"
    "vpackuswb %%ymm0,%%ymm0,%%ymm0            \n"
    "vpermq    $0xd8,%%ymm1,%%ymm1             \n"
    "vpermq    $0xd8,%%ymm0,%%ymm0             \n"
    "vextractf128 $0x0,%%ymm1," MEMACCESS(1) " \n"
    VEXTOPMEM(vextractf128,0,ymm0,0x00,1,2,1) // vextractf128 $0x0,%%ymm0,(%1,%2,1)
    "lea      " MEMLEA(0x10,1) ",%1            \n"
    "sub       $0x20,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(width)          // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
3497#endif  // HAS_YUY2TOYROW_AVX2
3498
3499#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha: replicates each pixel's alpha byte
// into two 16-bit lanes (0x80 entries make pshufb write zero, so each word
// holds the alpha value).
static uvec8 kShuffleAlpha = {3u,  0x80, 3u,  0x80, 7u,  0x80, 7u,  0x80,
                              11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80};

// Blend 4 pixels at a time; a 1-pixel tail loop handles the final 1-3.
// Computes dst = src0 + src1 * (256 - alpha(src0)) / 256 per channel and
// forces the result's alpha to 255 (src0 is treated as premultiplied).
void ARGBBlendRow_SSSE3(const uint8* src_argb0,
                        const uint8* src_argb1,
                        uint8* dst_argb,
                        int width) {
  asm volatile (
    // xmm7 = 0x0001 per word (rounding +1), xmm6 = 0x00ff per word,
    // xmm5 = 0xff00 per word, xmm4 = 0xff000000 per dword (alpha mask).
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0xf,%%xmm7                     \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x8,%%xmm6                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psllw     $0x8,%%xmm5                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"

    // 4 pixel loop.
    LABELALIGN
  "40:                                         \n"
    // xmm3 = src0 with alpha byte inverted (pxor 0xff000000), then spread
    // to words via kShuffleAlpha; +1 gives (256 - alpha) multipliers.
    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"
    "jl        99f                             \n"

    // 1 pixel loop.
  "91:                                         \n"
    "movd      " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      " MEMACCESS(1) ",%%xmm2         \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x4,2) ",%2            \n"
    "sub       $0x1,%3                         \n"
    "jge       91b                             \n"
  "99:                                         \n"
  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  : "m"(kShuffleAlpha)  // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
3585#endif  // HAS_ARGBBLENDROW_SSSE3
3586
3587#ifdef HAS_BLENDPLANEROW_SSSE3
// Blend 8 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// Blends the single-channel planes src0/src1 using the per-pixel weights
// in 'alpha', writing the result to dst. Uses the signed pmaddubsw form
// of the math above. Assumes width is a multiple of 8 (presumably
// guaranteed by the caller; no tail handling here).
void BlendPlaneRow_SSSE3(const uint8* src0,
                         const uint8* src1,
                         const uint8* alpha,
                         uint8* dst,
                         int width) {
  asm volatile(
      // xmm5 = 0xff00 per word (used to form (a, 255-a) byte pairs),
      // xmm6 = 0x80 per byte (bias to signed), xmm7 = 0x807f per word
      // (the +32768+127 rounding term from the formula above).
      "pcmpeqb    %%xmm5,%%xmm5                  \n"
      "psllw      $0x8,%%xmm5                    \n"
      "mov        $0x80808080,%%eax              \n"
      "movd       %%eax,%%xmm6                   \n"
      "pshufd     $0x0,%%xmm6,%%xmm6             \n"
      "mov        $0x807f807f,%%eax              \n"
      "movd       %%eax,%%xmm7                   \n"
      "pshufd     $0x0,%%xmm7,%%xmm7             \n"
      // Rebase src0, src1 and dst relative to the alpha pointer so the loop
      // increments only %2.
      "sub        %2,%0                          \n"
      "sub        %2,%1                          \n"
      "sub        %2,%3                          \n"

      // 8 pixel loop.
      LABELALIGN
      "1:                                        \n"
      // Expand alpha to (a, 255-a) byte pairs; interleave src0/src1 and
      // bias to signed, then pmaddubsw computes a*s0 + (255-a)*s1.
      "movq       (%2),%%xmm0                    \n"
      "punpcklbw  %%xmm0,%%xmm0                  \n"
      "pxor       %%xmm5,%%xmm0                  \n"
      "movq       (%0,%2,1),%%xmm1               \n"
      "movq       (%1,%2,1),%%xmm2               \n"
      "punpcklbw  %%xmm2,%%xmm1                  \n"
      "psubb      %%xmm6,%%xmm1                  \n"
      "pmaddubsw  %%xmm1,%%xmm0                  \n"
      "paddw      %%xmm7,%%xmm0                  \n"
      "psrlw      $0x8,%%xmm0                    \n"
      "packuswb   %%xmm0,%%xmm0                  \n"
      "movq       %%xmm0,(%3,%2,1)               \n"
      "lea        0x8(%2),%2                     \n"
      "sub        $0x8,%4                        \n"
      "jg        1b                              \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
        ::"memory",
        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm5", "xmm6", "xmm7");
}
3637#endif  // HAS_BLENDPLANEROW_SSSE3
3638
3639#ifdef HAS_BLENDPLANEROW_AVX2
// Blend 32 pixels at a time.
// unsigned version of math
// =((A2*C2)+(B2*(255-C2))+255)/256
// signed version of math
// =(((A2-128)*C2)+((B2-128)*(255-C2))+32768+127)/256
// AVX2 version of BlendPlaneRow_SSSE3: same signed pmaddubsw math, but
// processes 32 alpha bytes per iteration using both low/high unpack halves.
void BlendPlaneRow_AVX2(const uint8* src0,
                        const uint8* src1,
                        const uint8* alpha,
                        uint8* dst,
                        int width) {
  asm volatile(
      // ymm5 = 0xff00 per word, ymm6 = 0x80 per byte (signed bias),
      // ymm7 = 0x807f per word (rounding term) — same constants as the
      // SSSE3 version, broadcast to 256 bits.
      "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
      "vpsllw     $0x8,%%ymm5,%%ymm5             \n"
      "mov        $0x80808080,%%eax              \n"
      "vmovd      %%eax,%%xmm6                   \n"
      "vbroadcastss %%xmm6,%%ymm6                \n"
      "mov        $0x807f807f,%%eax              \n"
      "vmovd      %%eax,%%xmm7                   \n"
      "vbroadcastss %%xmm7,%%ymm7                \n"
      // Rebase src0, src1 and dst relative to the alpha pointer so the loop
      // increments only %2.
      "sub        %2,%0                          \n"
      "sub        %2,%1                          \n"
      "sub        %2,%3                          \n"

      // 32 pixel loop.
      LABELALIGN
      "1:                                        \n"
      "vmovdqu    (%2),%%ymm0                    \n"
      "vpunpckhbw %%ymm0,%%ymm0,%%ymm3           \n"
      "vpunpcklbw %%ymm0,%%ymm0,%%ymm0           \n"
      "vpxor      %%ymm5,%%ymm3,%%ymm3           \n"
      "vpxor      %%ymm5,%%ymm0,%%ymm0           \n"
      "vmovdqu    (%0,%2,1),%%ymm1               \n"
      "vmovdqu    (%1,%2,1),%%ymm2               \n"
      "vpunpckhbw %%ymm2,%%ymm1,%%ymm4           \n"
      "vpunpcklbw %%ymm2,%%ymm1,%%ymm1           \n"
      "vpsubb     %%ymm6,%%ymm4,%%ymm4           \n"
      "vpsubb     %%ymm6,%%ymm1,%%ymm1           \n"
      "vpmaddubsw %%ymm4,%%ymm3,%%ymm3           \n"
      "vpmaddubsw %%ymm1,%%ymm0,%%ymm0           \n"
      "vpaddw     %%ymm7,%%ymm3,%%ymm3           \n"
      "vpaddw     %%ymm7,%%ymm0,%%ymm0           \n"
      "vpsrlw     $0x8,%%ymm3,%%ymm3             \n"
      "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
      "vpackuswb  %%ymm3,%%ymm0,%%ymm0           \n"
      "vmovdqu    %%ymm0,(%3,%2,1)               \n"
      "lea        0x20(%2),%2                    \n"
      "sub        $0x20,%4                       \n"
      "jg        1b                              \n"
      "vzeroupper                                \n"
      : "+r"(src0),   // %0
        "+r"(src1),   // %1
        "+r"(alpha),  // %2
        "+r"(dst),    // %3
        "+rm"(width)  // %4
        ::"memory",
        "cc", "eax", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6",
        "xmm7");
}
3698#endif  // HAS_BLENDPLANEROW_AVX2
3699
3700#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle table duplicating alpha into the six B/G/R byte positions of the
// low two pixels (128 entries zero the alpha word lanes).
static uvec8 kShuffleAlpha0 = {3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u,
                               7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u};
// Same, for the high two pixels of a 16-byte load.
static uvec8 kShuffleAlpha1 = {11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
                               15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u};
// Attenuate 4 pixels at a time.
// Multiplies each B/G/R channel by the pixel's alpha (fixed point via
// pmulhuw on byte-duplicated words); the alpha channel itself is preserved
// through the 0xff000000 mask in xmm3.
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // xmm3 = 0xff000000 per pixel (alpha mask); xmm4/xmm5 = shuffle tables.
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "pslld     $0x18,%%xmm3                    \n"
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "punpcklbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm1,%%xmm0                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "punpckhbw %%xmm2,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "pand      %%xmm3,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  : "m"(kShuffleAlpha0),  // %3
    "m"(kShuffleAlpha1)  // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
3747#endif  // HAS_ARGBATTENUATEROW_SSSE3
3748
3749#ifdef HAS_ARGBATTENUATEROW_AVX2
// Shuffle table duplicating alpha. Applied after vpunpck[lh]bw doubled each
// byte into a word, so indices 6,7 / 14,15 pick the word holding alpha and
// spread it over the B/G/R word lanes (128 entries zero the alpha lanes).
static const uvec8 kShuffleAlpha_AVX2 = {6u,   7u,   6u,   7u,  6u,  7u,
                                         128u, 128u, 14u,  15u, 14u, 15u,
                                         14u,  15u,  128u, 128u};
// Attenuate 8 pixels at a time.
// AVX2 version of ARGBAttenuateRow_SSSE3: B/G/R * alpha via vpmulhuw on
// byte-duplicated words; alpha channel preserved via the ymm5 mask.
void ARGBAttenuateRow_AVX2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // ymm4 = alpha shuffle table, ymm5 = 0xff000000 per pixel.
    "vbroadcastf128 %3,%%ymm4                  \n"
    "vpcmpeqb   %%ymm5,%%ymm5,%%ymm5           \n"
    "vpslld     $0x18,%%ymm5,%%ymm5            \n"
    // %1 becomes (dst - src) so the store addresses (%0,%1).
    "sub        %0,%1                          \n"

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
    "vpshufb    %%ymm4,%%ymm0,%%ymm2           \n"
    "vpshufb    %%ymm4,%%ymm1,%%ymm3           \n"
    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
    "vpand      %%ymm5,%%ymm6,%%ymm6           \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vpor       %%ymm6,%%ymm0,%%ymm0           \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub        $0x8,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  : "m"(kShuffleAlpha_AVX2)  // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
3790#endif  // HAS_ARGBATTENUATEROW_AVX2
3791
3792#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time.
// Divides each B/G/R channel by its pixel's alpha using the fixed_invtbl8
// reciprocal table (indexed by the alpha byte, 4 bytes per entry), via
// pmulhuw against the byte-duplicated pixel words.
void ARGBUnattenuateRow_SSE2(const uint8* src_argb,
                             uint8* dst_argb,
                             int width) {
  uintptr_t alpha;  // scratch register holding the current alpha byte
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // Low 2 pixels: fetch 1/alpha for pixels 0 and 1, replicate each
    // reciprocal across three word lanes (pshuflw $0x40), multiply.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    // High 2 pixels: same for pixels 2 and 3.
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),     // %0
    "+r"(dst_argb),     // %1
    "+r"(width),        // %2
    "=&r"(alpha)        // %3
  : "r"(fixed_invtbl8)  // %4
  // NOTE(review): xmm4/xmm5 are listed as clobbers but not used by this
  // asm — harmless, just conservative.
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
3837#endif  // HAS_ARGBUNATTENUATEROW_SSE2
3838
3839#ifdef HAS_ARGBUNATTENUATEROW_AVX2
// Shuffle table duplicating the reciprocal-alpha words across the B/G/R
// lanes of each pixel (alpha word lanes keep their own value).
static const uvec8 kUnattenShuffleAlpha_AVX2 = {
    0u, 1u, 0u, 1u, 0u, 1u, 6u, 7u, 8u, 9u, 8u, 9u, 8u, 9u, 14u, 15u};
// Unattenuate 8 pixels at a time.
// AVX2 version of ARGBUnattenuateRow_SSE2: the 8 fixed_invtbl8 lookups are
// done with scalar movzb/vmovd loads (a hand-rolled replacement for
// VPGATHER, per the comments below) and assembled into ymm3.
void ARGBUnattenuateRow_AVX2(const uint8* src_argb,
                             uint8* dst_argb,
                             int width) {
  uintptr_t alpha;  // scratch register holding the current alpha byte
  asm volatile (
    // %1 becomes (dst - src) so the store addresses (%0,%1).
    "sub        %0,%1                          \n"
    "vbroadcastf128 %5,%%ymm5                  \n"

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // replace VPGATHER
    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
    "vpunpckldq %%xmm1,%%xmm0,%%xmm6           \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
    "movzb     " MEMACCESS2(0x13,0) ",%3       \n"
    "vpunpckldq %%xmm3,%%xmm2,%%xmm7           \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm0)             //  vmovd 0x0(%4,%3,4),%%xmm0
    "movzb     " MEMACCESS2(0x17,0) ",%3       \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm1)             //  vmovd 0x0(%4,%3,4),%%xmm1
    "movzb     " MEMACCESS2(0x1b,0) ",%3       \n"
    "vpunpckldq %%xmm1,%%xmm0,%%xmm0           \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm2)             //  vmovd 0x0(%4,%3,4),%%xmm2
    "movzb     " MEMACCESS2(0x1f,0) ",%3       \n"
    MEMOPREG(vmovd,0x00,4,3,4,xmm3)             //  vmovd 0x0(%4,%3,4),%%xmm3
    "vpunpckldq %%xmm3,%%xmm2,%%xmm2           \n"
    "vpunpcklqdq %%xmm7,%%xmm6,%%xmm3          \n"
    "vpunpcklqdq %%xmm2,%%xmm0,%%xmm0          \n"
    "vinserti128 $0x1,%%xmm0,%%ymm3,%%ymm3     \n"
    // end of VPGATHER

    // Multiply byte-duplicated pixel words by the shuffled reciprocals.
    "vmovdqu    " MEMACCESS(0) ",%%ymm6        \n"
    "vpunpcklbw %%ymm6,%%ymm6,%%ymm0           \n"
    "vpunpckhbw %%ymm6,%%ymm6,%%ymm1           \n"
    "vpunpcklwd %%ymm3,%%ymm3,%%ymm2           \n"
    "vpunpckhwd %%ymm3,%%ymm3,%%ymm3           \n"
    "vpshufb    %%ymm5,%%ymm2,%%ymm2           \n"
    "vpshufb    %%ymm5,%%ymm3,%%ymm3           \n"
    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,0,1,1)          //  vmovdqu %%ymm0,(%0,%1)
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub        $0x8,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),      // %0
    "+r"(dst_argb),      // %1
    "+r"(width),         // %2
    "=&r"(alpha)         // %3
  : "r"(fixed_invtbl8),  // %4
    "m"(kUnattenShuffleAlpha_AVX2)  // %5
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
3906#endif  // HAS_ARGBUNATTENUATEROW_AVX2
3907
3908#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// Luma uses the JPeg coefficients (kARGBToYJ: 15*B + 75*G + 38*R, >>7 with
// kAddYJ64 rounding — presumably 64 per lane; defined elsewhere in this
// file). Gray value is written to B, G and R; original alpha is kept.
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // Compute 8 luma bytes into xmm0.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // Extract the 8 alpha bytes into xmm2.
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrld     $0x18,%%xmm2                    \n"
    "psrld     $0x18,%%xmm3                    \n"
    "packuswb  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm2                   \n"
    // Re-interleave as (gray, gray, gray, alpha) per pixel.
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm3                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "m"(kARGBToYJ),   // %3
    "m"(kAddYJ64)     // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
3953#endif  // HAS_ARGBGRAYROW_SSSE3
3954
3955#ifdef HAS_ARGBSEPIAROW_SSSE3
//    b = (r * 35 + g * 68 + b * 17) >> 7
//    g = (r * 45 + g * 88 + b * 22) >> 7
//    r = (r * 50 + g * 98 + b * 24) >> 7
// Constant for ARGB color to sepia tone.
// Coefficients are laid out in memory B,G,R,A order for pmaddubsw.
static vec8 kARGBToSepiaB = {17, 68, 35, 0, 17, 68, 35, 0,
                             17, 68, 35, 0, 17, 68, 35, 0};

static vec8 kARGBToSepiaG = {22, 88, 45, 0, 22, 88, 45, 0,
                             22, 88, 45, 0, 22, 88, 45, 0};

static vec8 kARGBToSepiaR = {24, 98, 50, 0, 24, 98, 50, 0,
                             24, 98, 50, 0, 24, 98, 50, 0};

// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Operates in place on dst_argb; alpha is preserved.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    // xmm2/xmm3/xmm4 = sepia B/G/R coefficient rows.
    "movdqa    %2,%%xmm2                       \n"
    "movdqa    %3,%%xmm3                       \n"
    "movdqa    %4,%%xmm4                       \n"

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // New B channel for 8 pixels -> xmm0.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "phaddw    %%xmm6,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // New G channel -> xmm5, interleave with B.
    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    // New R channel -> xmm5.
    "movdqu    " MEMACCESS(0) ",%%xmm5         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    // Original alpha -> xmm6, interleave with R, then recombine BGRA.
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "psrld     $0x18,%%xmm6                    \n"
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "punpckhwd %%xmm5,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "sub       $0x8,%1                         \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)   // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
4025#endif  // HAS_ARGBSEPIAROW_SSSE3
4026
4027#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// matrix_argb holds 4 rows of 4 signed coefficients (one row per output
// channel), broadcast via pshufd; products are summed with phaddsw and
// scaled by >>6 (psraw), so coefficients are in 1/64 units.
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb,
                              uint8* dst_argb,
                              const int8* matrix_argb,
                              int width) {
  asm volatile (
    // Broadcast each 4-byte matrix row: xmm2=row0(B), xmm3=row1(G),
    // xmm4=row2(R), xmm5=row3(A).
    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
    "pshufd    $0x00,%%xmm5,%%xmm2             \n"
    "pshufd    $0x55,%%xmm5,%%xmm3             \n"
    "pshufd    $0xaa,%%xmm5,%%xmm4             \n"
    "pshufd    $0xff,%%xmm5,%%xmm5             \n"

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    // B and G output channels.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm7                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddsw   %%xmm7,%%xmm0                   \n"
    "phaddsw   %%xmm1,%%xmm6                   \n"
    "psraw     $0x6,%%xmm0                     \n"
    "psraw     $0x6,%%xmm6                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm0                   \n"
    // R and A output channels.
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm7                   \n"
    "phaddsw   %%xmm7,%%xmm1                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm7                   \n"
    "phaddsw   %%xmm7,%%xmm6                   \n"
    "psraw     $0x6,%%xmm1                     \n"
    "psraw     $0x6,%%xmm6                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm1                   \n"
    // Interleave BG with RA into full BGRA pixels and store.
    "movdqa    %%xmm0,%%xmm6                   \n"
    "punpcklwd %%xmm1,%%xmm0                   \n"
    "punpckhwd %%xmm1,%%xmm6                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),      // %0
    "+r"(dst_argb),      // %1
    "+r"(width)          // %2
  : "r"(matrix_argb)     // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
4091#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
4092
#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// In place on dst_argb: each B/G/R channel becomes
//   ((channel * scale) >> 16) * interval_size + interval_offset
// (pmulhuw / pmullw / paddw); the alpha byte is preserved via the
// 0xff000000 mask in xmm6.
void ARGBQuantizeRow_SSE2(uint8* dst_argb,
                          int scale,
                          int interval_size,
                          int interval_offset,
                          int width) {
  asm volatile (
    // Broadcast scale, interval_size and interval_offset into word lanes;
    // xmm5 = zero, xmm6 = 0xff000000 per pixel (alpha mask).
    "movd      %2,%%xmm2                       \n"
    "movd      %3,%%xmm3                       \n"
    "movd      %4,%%xmm4                       \n"
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
    "pxor      %%xmm5,%%xmm5                   \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "pslld     $0x18,%%xmm6                    \n"

    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "pmullw    %%xmm3,%%xmm0                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm7         \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "pand      %%xmm6,%%xmm7                   \n"
    "paddw     %%xmm4,%%xmm0                   \n"
    "paddw     %%xmm4,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm7,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(0) "         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x4,%1                         \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
4144#endif  // HAS_ARGBQUANTIZEROW_SSE2
4145
4146#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
// 'value' is one packed ARGB color; each source channel is scaled by the
// corresponding channel of 'value' in fixed point (byte-duplicated words,
// pmulhuw, then >>8).
void ARGBShadeRow_SSE2(const uint8* src_argb,
                       uint8* dst_argb,
                       int width,
                       uint32 value) {
  asm volatile (
    // Duplicate each byte of 'value' into a word and replicate across all
    // four pixel slots of xmm2.
    "movd      %3,%%xmm2                       \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm2                  \n"

    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)       // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2"
  );
}
4182#endif  // HAS_ARGBSHADEROW_SSE2
4183
#ifdef HAS_ARGBMULTIPLYROW_SSE2
// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
// Per channel: src0 bytes are duplicated into words (a -> a*257, i.e. 8.8
// fixed point) while src1 bytes are zero-extended; pmulhuw then yields
// (a*257*b) >> 16, an approximation of a*b/255.
void ARGBMultiplyRow_SSE2(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"  // zero, for zero-extension

    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "movdqu    %%xmm0,%%xmm1                   \n"  // register copy of src0
    "movdqu    %%xmm2,%%xmm3                   \n"  // register copy of src1
    "punpcklbw %%xmm0,%%xmm0                   \n"  // src0 low: bytes duplicated
    "punpckhbw %%xmm1,%%xmm1                   \n"  // src0 high: bytes duplicated
    "punpcklbw %%xmm5,%%xmm2                   \n"  // src1 low: zero-extended
    "punpckhbw %%xmm5,%%xmm3                   \n"  // src1 high: zero-extended
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    "pmulhuw   %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_ARGBMULTIPLYROW_SSE2
4223
#ifdef HAS_ARGBMULTIPLYROW_AVX2
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
// Same 8.8 fixed-point approximation as the SSE2 version: src0 bytes are
// duplicated into words (a -> a*257), src1 bytes zero-extended, and vpmulhuw
// yields (a*257*b) >> 16 ~= a*b/255, using 256-bit registers.
void ARGBMultiplyRow_AVX2(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"  // zero, for zero-extension

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm1        \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "vmovdqu    " MEMACCESS(1) ",%%ymm3        \n"
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "vpunpcklbw %%ymm1,%%ymm1,%%ymm0           \n"  // src0 low: bytes duplicated
    "vpunpckhbw %%ymm1,%%ymm1,%%ymm1           \n"  // src0 high: bytes duplicated
    "vpunpcklbw %%ymm5,%%ymm3,%%ymm2           \n"  // src1 low: zero-extended
    "vpunpckhbw %%ymm5,%%ymm3,%%ymm3           \n"  // src1 high: zero-extended
    "vpmulhuw   %%ymm2,%%ymm0,%%ymm0           \n"
    "vpmulhuw   %%ymm3,%%ymm1,%%ymm1           \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
    "lea       " MEMLEA(0x20,2) ",%2           \n"
    "sub        $0x8,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    // The asm always executes AVX2 instructions when this function is built,
    // so the register clobbers must be declared unconditionally; guarding
    // them with #if defined(__AVX2__) hid the clobbers from compilers that
    // build this file without -mavx2.
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_ARGBMULTIPLYROW_AVX2
4264
#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// Per-byte saturated add (paddusb): channels clamp at 255.
void ARGBAddRow_SSE2(const uint8* src_argb0,
                     const uint8* src_argb1,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "paddusb   %%xmm1,%%xmm0                   \n"  // saturated byte add
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
#endif  // HAS_ARGBADDROW_SSE2
4294
#ifdef HAS_ARGBADDROW_AVX2
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
// Per-byte saturated add (vpaddusb) with the second operand read directly
// from memory.
void ARGBAddRow_AVX2(const uint8* src_argb0,
                     const uint8* src_argb1,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "vpaddusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"  // saturated byte add
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
    "lea        " MEMLEA(0x20,2) ",%2          \n"
    "sub        $0x8,%3                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0"
  );
}
#endif  // HAS_ARGBADDROW_AVX2
4324
#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
// Per-byte saturated subtract (psubusb): channels clamp at 0.
void ARGBSubtractRow_SSE2(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psubusb   %%xmm1,%%xmm0                   \n"  // saturated byte subtract
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1"
  );
}
#endif  // HAS_ARGBSUBTRACTROW_SSE2
4354
#ifdef HAS_ARGBSUBTRACTROW_AVX2
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
// Per-byte saturated subtract (vpsubusb) with the subtrahend read directly
// from memory.
void ARGBSubtractRow_AVX2(const uint8* src_argb0,
                          const uint8* src_argb1,
                          uint8* dst_argb,
                          int width) {
  asm volatile (
    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm0        \n"
    "lea        " MEMLEA(0x20,0) ",%0          \n"
    "vpsubusb   " MEMACCESS(1) ",%%ymm0,%%ymm0 \n"  // saturated byte subtract
    "lea        " MEMLEA(0x20,1) ",%1          \n"
    "vmovdqu    %%ymm0," MEMACCESS(2) "        \n"
    "lea        " MEMLEA(0x20,2) ",%2          \n"
    "sub        $0x8,%3                        \n"
    "jg         1b                             \n"
    "vzeroupper                                \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
    , "xmm0"
  );
}
#endif  // HAS_ARGBSUBTRACTROW_AVX2
4384
#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
// For each of 8 pixels computes
//   |(y0[x]-y0[x+2]) + (y2[x]-y2[x+2]) + 2*(y1[x]-y1[x+2])|
// in 16-bit arithmetic, then packs with unsigned saturation.
void SobelXRow_SSE2(const uint8* src_y0,
                    const uint8* src_y1,
                    const uint8* src_y2,
                    uint8* dst_sobelx,
                    int width) {
  asm volatile (
    // Convert %1..%3 into offsets relative to %0 so one induction variable
    // advances all four pointers.
    "sub       %0,%1                           \n"
    "sub       %0,%2                           \n"
    "sub       %0,%3                           \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // zero, for byte->word unpack

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "psubw     %%xmm1,%%xmm0                   \n"  // row0: left - right
    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
    MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "psubw     %%xmm2,%%xmm1                   \n"  // row1: left - right
    MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
    MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "psubw     %%xmm3,%%xmm2                   \n"  // row2: left - right
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"  // row1 added twice = weight 2
    "paddw     %%xmm1,%%xmm0                   \n"
    // Absolute value: max(x, -x).
    "pxor      %%xmm1,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm1                   \n"
    "pmaxsw    %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "sub       $0x8,%4                         \n"
    "jg        1b                              \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(src_y2),      // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)        // %4
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SOBELXROW_SSE2
4441
#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
// For each of 8 pixels computes
//   |(y0[x]-y1[x]) + (y0[x+2]-y1[x+2]) + 2*(y0[x+1]-y1[x+1])|
// in 16-bit arithmetic, then packs with unsigned saturation.
void SobelYRow_SSE2(const uint8* src_y0,
                    const uint8* src_y1,
                    uint8* dst_sobely,
                    int width) {
  asm volatile (
    // Convert %1 and %2 into offsets relative to %0.
    "sub       %0,%1                           \n"
    "sub       %0,%2                           \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // zero, for byte->word unpack

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "psubw     %%xmm1,%%xmm0                   \n"  // col0: top - bottom
    "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
    MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "psubw     %%xmm2,%%xmm1                   \n"  // col1: top - bottom
    "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
    MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "psubw     %%xmm3,%%xmm2                   \n"  // col2: top - bottom
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"  // col1 added twice = weight 2
    "paddw     %%xmm1,%%xmm0                   \n"
    // Absolute value: max(x, -x).
    "pxor      %%xmm1,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm1                   \n"
    "pmaxsw    %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "sub       $0x8,%3                         \n"
    "jg        1b                              \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(dst_sobely),  // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SOBELYROW_SSE2
4495
#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
// Processes 16 source pixels (64 bytes of ARGB output) per iteration: the
// saturated sum s = x + y is replicated into B, G and R via two levels of
// unpacking, then alpha is forced to 0xff with the xmm5 mask.
void SobelRow_SSE2(const uint8* src_sobelx,
                   const uint8* src_sobely,
                   uint8* dst_argb,
                   int width) {
  asm volatile (
    "sub       %0,%1                           \n"  // %1 = sobely - sobelx offset
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pslld     $0x18,%%xmm5                    \n"  // xmm5 = 0xff000000 alpha mask

    // 16 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "paddusb   %%xmm1,%%xmm0                   \n"  // s = sobelx + sobely, saturated
    // Duplicate each s byte into pairs, then into ARGB dwords of ssss.
    "movdqa    %%xmm0,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm2                   \n"
    "punpckhbw %%xmm0,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm1                   \n"
    "punpckhwd %%xmm2,%%xmm2                   \n"
    "por       %%xmm5,%%xmm1                   \n"  // set alpha = 255
    "por       %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklwd %%xmm0,%%xmm3                   \n"
    "punpckhwd %%xmm0,%%xmm0                   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "movdqu    %%xmm1," MEMACCESS(2) "         \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "   \n"
    "movdqu    %%xmm3," MEMACCESS2(0x20,2) "   \n"
    "movdqu    %%xmm0," MEMACCESS2(0x30,2) "   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
  );
}
#endif  // HAS_SOBELROW_SSE2
4548
#ifdef HAS_SOBELTOPLANEROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into a plane, 16 pixels at a
// time, via a saturated byte add.
// Note: the previous version initialized xmm5 with an alpha mask
// (pcmpeqb/pslld) that was never used in the loop and was also missing from
// the clobber list; the dead setup has been removed, making the declared
// clobbers (xmm0, xmm1) accurate.
void SobelToPlaneRow_SSE2(const uint8* src_sobelx,
                          const uint8* src_sobely,
                          uint8* dst_y,
                          int width) {
  asm volatile (
    "sub       %0,%1                           \n"  // %1 = sobely - sobelx offset

    // 16 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "paddusb   %%xmm1,%%xmm0                   \n"  // saturated byte add
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_y),       // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1"
  );
}
#endif  // HAS_SOBELTOPLANEROW_SSE2
4581
#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
// Processes 16 source pixels (64 bytes of ARGB output) per iteration.
// xmm5 is all-ones (the 255 alpha bytes) and is interleaved during the
// unpacking instead of OR'd afterwards.
void SobelXYRow_SSE2(const uint8* src_sobelx,
                     const uint8* src_sobely,
                     uint8* dst_argb,
                     int width) {
  asm volatile (
    "sub       %0,%1                           \n"  // %1 = sobely - sobelx offset
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 16 x 0xff (alpha)

    // 16 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)           //  movdqu    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "paddusb   %%xmm1,%%xmm2                   \n"  // xmm2 = G = sobelx + sobely
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"  // interleave R (sobelx) with A
    "punpckhbw %%xmm5,%%xmm0                   \n"
    "movdqa    %%xmm1,%%xmm4                   \n"
    "punpcklbw %%xmm2,%%xmm4                   \n"  // interleave B (sobely) with G
    "punpckhbw %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm4,%%xmm6                   \n"
    "punpcklwd %%xmm3,%%xmm6                   \n"  // combine into BGRA dwords
    "punpckhwd %%xmm3,%%xmm4                   \n"
    "movdqa    %%xmm1,%%xmm7                   \n"
    "punpcklwd %%xmm0,%%xmm7                   \n"
    "punpckhwd %%xmm0,%%xmm1                   \n"
    "movdqu    %%xmm6," MEMACCESS(2) "         \n"
    "movdqu    %%xmm4," MEMACCESS2(0x10,2) "   \n"
    "movdqu    %%xmm7," MEMACCESS2(0x20,2) "   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x30,2) "   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_SOBELXYROW_SSE2
4633
#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
// 'row' is ARGB (4 bytes per pixel); 'cumsum' holds one int32 per channel.
// xmm0 carries the running per-channel sum across the row; each output adds
// that running sum to the matching entry of the previous row's cumsum.
void ComputeCumulativeSumRow_SSE2(const uint8* row,
                                  int32* cumsum,
                                  const int32* previous_cumsum,
                                  int width) {
  asm volatile (
    "pxor      %%xmm0,%%xmm0                   \n"  // running sum = 0
    "pxor      %%xmm1,%%xmm1                   \n"  // zero, for unpacking
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"
    // Use the 4-pixel loop only when cumsum is 16-byte aligned.
    "test      $0xf,%1                         \n"
    "jne       49f                             \n"

    // 4 pixel loop.
    LABELALIGN
    "40:                                       \n"
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm2,%%xmm4                   \n"
    // Widen 16 bytes to 4 sets of 4 int32 channel values (xmm2..xmm5).
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "punpckhwd %%xmm1,%%xmm3                   \n"
    "punpckhbw %%xmm1,%%xmm4                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "punpcklwd %%xmm1,%%xmm4                   \n"
    "punpckhwd %%xmm1,%%xmm5                   \n"
    // Accumulate each pixel into the running sum, then add the row above.
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
    "paddd     %%xmm0,%%xmm3                   \n"
    "paddd     %%xmm4,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
    "paddd     %%xmm0,%%xmm4                   \n"
    "paddd     %%xmm5,%%xmm0                   \n"
    "movdqu    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "paddd     %%xmm0,%%xmm5                   \n"
    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
    "movdqu    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "movdqu    %%xmm4," MEMACCESS2(0x20,1) "   \n"
    "movdqu    %%xmm5," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

    "49:                                       \n"
    "add       $0x3,%3                         \n"  // restore count for 1-pixel tail
    "jl        19f                             \n"

    // 1 pixel loop.
    LABELALIGN
    "10:                                       \n"
    "movd      " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"

    "19:                                       \n"
  : "+r"(row),  // %0
    "+r"(cumsum),  // %1
    "+r"(previous_cumsum),  // %2
    "+r"(width)  // %3
  :
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
4715
#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
// Box-filter average from an integral image (cumulative sums).
// For each output pixel the box sum is topleft - topright - botleft + botright
// (the right columns are 'width' entries past the left), divided by 'area'.
// Two vector paths: for area <= 128 a 16-bit fixed-point reciprocal multiply
// (pmulhuw) is used; for larger areas a float multiply by rcpss(area).
// 'count' is the number of output pixels to produce.
void CumulativeSumToAverageRow_SSE2(const int32* topleft,
                                    const int32* botleft,
                                    int width,
                                    int area,
                                    uint8* dst,
                                    int count) {
  asm volatile (
    "movd      %5,%%xmm5                       \n"
    "cvtdq2ps  %%xmm5,%%xmm5                   \n"
    "rcpss     %%xmm5,%%xmm4                   \n"  // xmm4 ~= 1.0f / area
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"
    "cmpl      $0x80,%5                        \n"  // small area (<= 128)?
    "ja        40f                             \n"

    // Build a 16-bit fixed-point reciprocal: (65536 + area - 1) / area.
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrld     $0x10,%%xmm6                    \n"  // xmm6 = 65535 per lane
    "cvtdq2ps  %%xmm6,%%xmm6                   \n"
    "addps     %%xmm6,%%xmm5                   \n"
    "mulps     %%xmm4,%%xmm5                   \n"
    "cvtps2dq  %%xmm5,%%xmm5                   \n"
    "packssdw  %%xmm5,%%xmm5                   \n"

    // 4 pixel small loop.
    LABELALIGN
  "4:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "packssdw  %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm5,%%xmm0                   \n"  // sum * (65536/area) >> 16
    "pmulhuw   %%xmm5,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jge       4b                              \n"
    "jmp       49f                             \n"

  // 4 pixel loop (float path, for area > 128).
    LABELALIGN
  "40:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"  // average = sum * (1/area)
    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm1                   \n"
    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
    "mulps     %%xmm4,%%xmm2                   \n"
    "mulps     %%xmm4,%%xmm3                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "cvtps2dq  %%xmm1,%%xmm1                   \n"
    "cvtps2dq  %%xmm2,%%xmm2                   \n"
    "cvtps2dq  %%xmm3,%%xmm3                   \n"
    "packssdw  %%xmm1,%%xmm0                   \n"
    "packssdw  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"  // restore count for 1-pixel tail
    "jl        19f                             \n"

  // 1 pixel loop (always float path).
    LABELALIGN
  "10:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
    "mulps     %%xmm4,%%xmm0                   \n"
    "cvtps2dq  %%xmm0,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x4,2) ",%2            \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(topleft),  // %0
    "+r"(botleft),  // %1
    "+r"(dst),      // %2
    "+rm"(count)    // %3
  : "r"((intptr_t)(width)),  // %4
    "rm"(area)     // %5
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
4849
#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy ARGB pixels from source image with slope to a row of destination.
// src_dudv points at 4 floats: the starting (u, v) source coordinate
// followed by the per-pixel (du, dv) step.  Each destination pixel samples
// the source pixel at the truncated (u, v), then advances by (du, dv).
// The byte offset is computed as x * 4 + y * src_argb_stride by pmaddwd
// against the packed constant (stride << 16) | 4 built below.
// NOTE(review): no clipping is performed here; the caller presumably
// guarantees all sampled coordinates lie inside the source image -- confirm.
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb,
                        int src_argb_stride,
                        uint8* dst_argb,
                        const float* src_dudv,
                        int width) {
  // Register-width copy of the stride so the asm can use it as %1.
  intptr_t src_argb_stride_temp = src_argb_stride;
  // Scratch register (%5) holding the second pixel's byte offset.
  intptr_t temp;
  asm volatile (
    "movq      " MEMACCESS(3) ",%%xmm2         \n"  // xmm2 = {u, v} start
    "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"  // xmm7 = {du, dv} step
    "shl       $0x10,%1                        \n"  // stride << 16
    "add       $0x4,%1                         \n"  // | 4 bytes per ARGB pixel
    "movd      %1,%%xmm5                       \n"
    "sub       $0x4,%4                         \n"
    "jl        49f                             \n"  // fewer than 4 pixels

    "pshufd    $0x44,%%xmm7,%%xmm7             \n"  // step in both 64-bit lanes
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"  // broadcast offset constant
    "movdqa    %%xmm2,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm0                   \n"  // coords of pixel 1
    "movlhps   %%xmm0,%%xmm2                   \n"  // xmm2 = coords of pixels 0,1
    "movdqa    %%xmm7,%%xmm4                   \n"
    "addps     %%xmm4,%%xmm4                   \n"  // xmm4 = 2 * step
    "movdqa    %%xmm2,%%xmm3                   \n"
    "addps     %%xmm4,%%xmm3                   \n"  // xmm3 = coords of pixels 2,3
    "addps     %%xmm4,%%xmm4                   \n"  // xmm4 = 4 * step

  // 4 pixel loop
    LABELALIGN
  "40:                                         \n"
    "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
    "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
    "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
    "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
    "movd      %%xmm0,%k1                      \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    "movd      %%xmm0,%k5                      \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm1                   \n"  // combine pixels 0 and 1
    "addps     %%xmm4,%%xmm2                   \n"  // advance coords by 4 steps
    "movq      %%xmm1," MEMACCESS(2) "         \n"
    "movd      %%xmm0,%k1                      \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    "movd      %%xmm0,%k5                      \n"
    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm0                   \n"  // combine pixels 2 and 3
    "addps     %%xmm4,%%xmm3                   \n"
    "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "sub       $0x4,%4                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%4                         \n"  // restore remainder count
    "jl        19f                             \n"

  // 1 pixel loop
    LABELALIGN
  "10:                                         \n"
    "cvttps2dq %%xmm2,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "pmaddwd   %%xmm5,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm2                   \n"  // advance coords by 1 step
    "movd      %%xmm0,%k1                      \n"
    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x04,2) ",%2           \n"
    "sub       $0x1,%4                         \n"
    "jge       10b                             \n"
  "19:                                         \n"
  : "+r"(src_argb),  // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),  // %2
    "+r"(src_dudv),  // %3
    "+rm"(width),    // %4
    "=&r"(temp)      // %5
  :
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBAFFINEROW_SSE2
4938
#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
// Blends src_ptr with src_ptr + src_stride by source_y_fraction / 256:
// dst = (src * (256 - f) + src1 * f) >> 8.  Fast paths handle f == 0
// (plain copy, second row never read) and f == 128 (pavgb average).
// Processes 16 bytes per iteration; NOTE(review): widths that are not a
// multiple of 16 over-read/over-write up to the 16-byte boundary -- callers
// presumably pad rows accordingly; confirm.
void InterpolateRow_SSSE3(uint8* dst_ptr,
                          const uint8* src_ptr,
                          ptrdiff_t src_stride,
                          int dst_width,
                          int source_y_fraction) {
  asm volatile (
    "sub       %1,%0                           \n"  // %0 = dst - src; stores use (%1,%0)
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"  // fraction 0: copy row
    "cmp       $0x80,%3                        \n"
    "je        50f                             \n"  // fraction 128: 50/50 avg

    // Build xmm5 = 16 x {256 - f, f} byte pairs for pmaddubsw.
    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
    "add       $0x100,%3                       \n"
    "movd      %3,%%xmm5                       \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    // xmm4 = 0x80 bias so unsigned pixels can feed signed pmaddubsw;
    // the bias is added back (paddw) before the >> 8.
    "mov       $0x80808080,%%eax               \n"
    "movd      %%eax,%%xmm4                    \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"

    // General purpose row blend.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm2)
    "movdqa     %%xmm0,%%xmm1                  \n"
    "punpcklbw  %%xmm2,%%xmm0                  \n"
    "punpckhbw  %%xmm2,%%xmm1                  \n"
    "psubb      %%xmm4,%%xmm0                  \n"
    "psubb      %%xmm4,%%xmm1                  \n"
    "movdqa     %%xmm5,%%xmm2                  \n"
    "movdqa     %%xmm5,%%xmm3                  \n"
    "pmaddubsw  %%xmm0,%%xmm2                  \n"
    "pmaddubsw  %%xmm1,%%xmm3                  \n"
    "paddw      %%xmm4,%%xmm2                  \n"
    "paddw      %%xmm4,%%xmm3                  \n"
    "psrlw      $0x8,%%xmm2                    \n"
    "psrlw      $0x8,%%xmm3                    \n"
    "packuswb   %%xmm3,%%xmm2                  \n"
    MEMOPMEM(movdqu,xmm2,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
    "pavgb     %%xmm1,%%xmm0                   \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        100b                            \n"

  "99:                                         \n"
  : "+r"(dst_ptr),     // %0
    "+r"(src_ptr),     // %1
    "+rm"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_INTERPOLATEROW_SSSE3
5021
#ifdef HAS_INTERPOLATEROW_AVX2
// Bilinear filter 32x2 -> 32x1
// AVX2 version of InterpolateRow: blends src_ptr and src_ptr + src_stride
// by source_y_fraction / 256, with fast paths for fraction 0 (rep movsb
// copy) and 128 (vpavgb).  Processes 32 bytes per iteration.
// Note dst_ptr/src_ptr/dst_width are pinned to rdi/rsi/rcx so the
// fraction-0 path can use "rep movsb" directly.
void InterpolateRow_AVX2(uint8* dst_ptr,
                         const uint8* src_ptr,
                         ptrdiff_t src_stride,
                         int dst_width,
                         int source_y_fraction) {
  asm volatile (
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"  // fraction 0: rep movsb copy
    "sub       %1,%0                           \n"  // %0 = dst - src; stores use (%1,%0)
    "cmp       $0x80,%3                        \n"
    "je        50f                             \n"  // fraction 128: 50/50 avg

    // Build ymm5 = 32 x {256 - f, f} byte pairs for vpmaddubsw.
    "vmovd      %3,%%xmm0                      \n"
    "neg        %3                             \n"
    "add        $0x100,%3                      \n"
    "vmovd      %3,%%xmm5                      \n"
    "vpunpcklbw %%xmm0,%%xmm5,%%xmm5           \n"
    "vpunpcklwd %%xmm5,%%xmm5,%%xmm5           \n"
    "vbroadcastss %%xmm5,%%ymm5                \n"
    // ymm4 = 0x80 bias so unsigned pixels can feed signed vpmaddubsw;
    // the bias is added back before the >> 8.
    "mov        $0x80808080,%%eax              \n"
    "vmovd      %%eax,%%xmm4                   \n"
    "vbroadcastss %%xmm4,%%ymm4                \n"

    // General purpose row blend.
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
    MEMOPREG(vmovdqu,0x00,1,4,1,ymm2)
    "vpunpckhbw %%ymm2,%%ymm0,%%ymm1           \n"
    "vpunpcklbw %%ymm2,%%ymm0,%%ymm0           \n"
    "vpsubb     %%ymm4,%%ymm1,%%ymm1           \n"
    "vpsubb     %%ymm4,%%ymm0,%%ymm0           \n"
    "vpmaddubsw %%ymm1,%%ymm5,%%ymm1           \n"
    "vpmaddubsw %%ymm0,%%ymm5,%%ymm0           \n"
    "vpaddw     %%ymm4,%%ymm1,%%ymm1           \n"
    "vpaddw     %%ymm4,%%ymm0,%%ymm0           \n"
    "vpsrlw     $0x8,%%ymm1,%%ymm1             \n"
    "vpsrlw     $0x8,%%ymm0,%%ymm0             \n"
    "vpackuswb  %%ymm1,%%ymm0,%%ymm0           \n"
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
    "vmovdqu    " MEMACCESS(1) ",%%ymm0        \n"
    VMEMOPREG(vpavgb,0x00,1,4,1,ymm0,ymm0)     // vpavgb (%1,%4,1),%%ymm0,%%ymm0
    MEMOPMEM(vmovdqu,ymm0,0x00,1,0,1)
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    "rep movsb " MEMMOVESTRING(1,0) "          \n"  // rcx = dst_width bytes
    "jmp       999f                            \n"  // skip vzeroupper: no ymm touched

  "99:                                         \n"
    "vzeroupper                                \n"
  "999:                                        \n"
  : "+D"(dst_ptr),    // %0
    "+S"(src_ptr),    // %1
    "+cm"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc", "eax", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm4", "xmm5"
  );
}
#endif  // HAS_INTERPOLATEROW_AVX2
5099
#ifdef HAS_ARGBSHUFFLEROW_SSSE3
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the 4 channels of each ARGB pixel according to 'shuffler',
// a 16-byte pshufb control mask (the same 4-byte pattern repeated 4x).
// Processes 8 pixels (32 bytes) per iteration; NOTE(review): width is
// presumably a multiple of 8 or the caller tolerates over-read/write.
void ARGBShuffleRow_SSSE3(const uint8* src_argb,
                          uint8* dst_argb,
                          const uint8* shuffler,
                          int width) {
  asm volatile (
    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"  // load shuffle mask once
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)        // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_SSSE3
5129
#ifdef HAS_ARGBSHUFFLEROW_AVX2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// AVX2 version of ARGBShuffleRow: the 16-byte shuffler mask is broadcast
// to both 128-bit lanes (vpshufb operates per-lane, so the per-pixel
// reorder is unaffected by lane crossing).  16 pixels per iteration.
void ARGBShuffleRow_AVX2(const uint8* src_argb,
                         uint8* dst_argb,
                         const uint8* shuffler,
                         int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"  // mask in both lanes
    LABELALIGN
    "1:                                        \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
    "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)        // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_AVX2
5160
#ifdef HAS_ARGBSHUFFLEROW_SSE2
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// SSE2 fallback (no pshufb): loads the first 4 shuffler bytes as a dword
// and dispatches to a specialized path for the four common channel orders
// (implemented by widening to 16 bits, pshuflw/pshufhw with a matching
// immediate, then repacking).  Any other shuffle falls back to a scalar
// byte-by-byte loop (1 pixel per iteration); the SIMD paths do 4 pixels.
void ARGBShuffleRow_SSE2(const uint8* src_argb,
                         uint8* dst_argb,
                         const uint8* shuffler,
                         int width) {
  uintptr_t pixel_temp;  // scratch byte register (%2, must allow %b2 -> dl)
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for unpack widening
    "mov       " MEMACCESS(4) ",%k2            \n"  // first 4 shuffler bytes
    "cmp       $0x3000102,%k2                  \n"
    "je        3012f                           \n"
    "cmp       $0x10203,%k2                    \n"
    "je        123f                            \n"
    "cmp       $0x30201,%k2                    \n"
    "je        321f                            \n"
    "cmp       $0x2010003,%k2                  \n"
    "je        2103f                           \n"

    // Generic scalar path: table-lookup each of the 4 channels per pixel.
    LABELALIGN
    "1:                                        \n"
    "movzb     " MEMACCESS(4) ",%2             \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS(1) "            \n"
    "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x1,1) "       \n"
    "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x2,1) "       \n"
    "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x3,1) "       \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "sub       $0x1,%3                         \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Shuffle {0,1,2,3} -> reverse channels within each pixel (imm 0x1b).
    LABELALIGN
  "123:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jg        123b                            \n"
    "jmp       99f                             \n"

    // Shuffle {1,2,3,0} -> rotate channels left (imm 0x39).
    LABELALIGN
  "321:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jg        321b                            \n"
    "jmp       99f                             \n"

    // Shuffle {3,0,1,2} -> rotate channels right (imm 0x93).
    LABELALIGN
  "2103:                                       \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jg        2103b                           \n"
    "jmp       99f                             \n"

    // Shuffle {2,1,0,3} -> swap channels 0 and 2 (imm 0xc6).
    LABELALIGN
  "3012:                                       \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
    "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
    "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
    "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jg        3012b                           \n"

  "99:                                         \n"
  : "+r"(src_argb),     // %0
    "+r"(dst_argb),     // %1
    "=&d"(pixel_temp),  // %2
    "+r"(width)         // %3
  : "r"(shuffler)       // %4
  : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm5"
  );
}
#endif  // HAS_ARGBSHUFFLEROW_SSE2
5282
#ifdef HAS_I422TOYUY2ROW_SSE2
// Interleave planar I422 (Y plane plus half-width U and V planes) into
// packed YUY2 (Y0 U0 Y1 V0 ...).  Processes 16 Y pixels (8 UV pairs,
// 32 output bytes) per iteration.
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame,
                        int width) {
  asm volatile (
    "sub       %1,%2                             \n"  // %2 = v - u; V reads use (%1,%2)
    LABELALIGN
    "1:                                        \n"
    "movq      " MEMACCESS(1) ",%%xmm2           \n"  // 8 U bytes
    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
    "lea       " MEMLEA(0x8,1) ",%1              \n"
    "punpcklbw %%xmm3,%%xmm2                     \n"  // interleave U,V
    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"  // 16 Y bytes
    "lea       " MEMLEA(0x10,0) ",%0             \n"
    "movdqa    %%xmm0,%%xmm1                     \n"
    "punpcklbw %%xmm2,%%xmm0                     \n"  // Y first -> YUY2 order
    "punpckhbw %%xmm2,%%xmm1                     \n"
    "movdqu    %%xmm0," MEMACCESS(3) "           \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"
    "lea       " MEMLEA(0x20,3) ",%3             \n"
    "sub       $0x10,%4                          \n"
    "jg         1b                               \n"
    : "+r"(src_y),  // %0
      "+r"(src_u),  // %1
      "+r"(src_v),  // %2
      "+r"(dst_frame),  // %3
      "+rm"(width)  // %4
    :
    : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif  // HAS_I422TOYUY2ROW_SSE2
5318
#ifdef HAS_I422TOUYVYROW_SSE2
// Interleave planar I422 into packed UYVY (U0 Y0 V0 Y1 ...).  Same data
// flow as I422ToYUY2Row_SSE2 but the unpack order puts UV bytes first.
// Processes 16 Y pixels (8 UV pairs, 32 output bytes) per iteration.
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame,
                        int width) {
  asm volatile (
    "sub        %1,%2                            \n"  // %2 = v - u; V reads use (%1,%2)
    LABELALIGN
    "1:                                        \n"
    "movq      " MEMACCESS(1) ",%%xmm2           \n"  // 8 U bytes
    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
    "lea       " MEMLEA(0x8,1) ",%1              \n"
    "punpcklbw %%xmm3,%%xmm2                     \n"  // interleave U,V
    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"  // 16 Y bytes
    "movdqa    %%xmm2,%%xmm1                     \n"
    "lea       " MEMLEA(0x10,0) ",%0             \n"
    "punpcklbw %%xmm0,%%xmm1                     \n"  // UV first -> UYVY order
    "punpckhbw %%xmm0,%%xmm2                     \n"
    "movdqu    %%xmm1," MEMACCESS(3) "           \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"
    "lea       " MEMLEA(0x20,3) ",%3             \n"
    "sub       $0x10,%4                          \n"
    "jg         1b                               \n"
    : "+r"(src_y),  // %0
      "+r"(src_u),  // %1
      "+r"(src_v),  // %2
      "+r"(dst_frame),  // %3
      "+rm"(width)  // %4
    :
    : "memory", "cc", NACL_R14
    "xmm0", "xmm1", "xmm2", "xmm3"
  );
}
#endif  // HAS_I422TOUYVYROW_SSE2
5354
#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Apply a cubic polynomial to every channel of every ARGB pixel:
// out = clamp(C0 + C1*x + C2*x^2 + C3*x^3).  'poly' points at 4 groups
// of 4 floats (one coefficient per channel) laid out C0[4], C1[4],
// C2[4], C3[4].  Processes 2 pixels (8 channels) per iteration; width
// is presumably even -- an odd tail would be over-processed.
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile (
    "pxor      %%xmm3,%%xmm3                   \n"  // zero for unpack widening

    // 2 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // 2 ARGB pixels
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm3,%%xmm0                   \n"  // bytes -> shorts
    "movdqa    %%xmm0,%%xmm4                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"  // shorts -> ints
    "punpckhwd %%xmm3,%%xmm4                   \n"
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"  // x as floats, pixel 0
    "cvtdq2ps  %%xmm4,%%xmm4                   \n"  // x as floats, pixel 1
    "movdqa    %%xmm0,%%xmm1                   \n"  // keep raw x for x^2, x^3
    "movdqa    %%xmm4,%%xmm5                   \n"
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"  // C1 * x
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
    "addps     " MEMACCESS(3) ",%%xmm0         \n"  // + C0
    "addps     " MEMACCESS(3) ",%%xmm4         \n"
    "movdqa    %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm1,%%xmm2                   \n"  // x^2
    "mulps     %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm2,%%xmm1                   \n"  // x^3
    "mulps     %%xmm6,%%xmm5                   \n"
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"  // C2 * x^2
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"  // C3 * x^3
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
    "addps     %%xmm2,%%xmm0                   \n"
    "addps     %%xmm6,%%xmm4                   \n"
    "addps     %%xmm1,%%xmm0                   \n"
    "addps     %%xmm5,%%xmm4                   \n"
    "cvttps2dq %%xmm0,%%xmm0                   \n"  // truncate to ints
    "cvttps2dq %%xmm4,%%xmm4                   \n"
    "packuswb  %%xmm4,%%xmm0                   \n"  // saturating repack
    "packuswb  %%xmm0,%%xmm0                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x2,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
5411
#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// AVX2+FMA3 version of ARGBPolynomialRow: evaluates the same cubic
// out = C0 + C1*x + C2*x^2 + C3*x^3 per channel, with each 4-float
// coefficient group broadcast to both 128-bit lanes so 2 pixels are
// processed per iteration.  Requires FMA3 (vfmadd*); width is
// presumably even.
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb,
                            const float* poly,
                            int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"  // C0 per channel
    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"  // C1
    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"  // C2
    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"  // C3

    // 2 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
    "lea         " MEMLEA(0x8,0) ",%0          \n"
    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
    "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
    "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
    "vcvttps2dq  %%ymm0,%%ymm0                 \n"
    "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
    "vpermq      $0xd8,%%ymm0,%%ymm0           \n"  // undo in-lane pack mutation
    "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
    "vmovq       %%xmm0," MEMACCESS(1) "       \n"
    "lea         " MEMLEA(0x8,1) ",%1          \n"
    "sub         $0x2,%2                       \n"
    "jg          1b                            \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc",
    "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
  );
}
#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
5452
#ifdef HAS_HALFFLOATROW_SSE2
// Exponent-bias compensation constant, 1.9259299444e-34f == 2^-112.
// Pre-multiplying by scale * 2^-112 shifts the float exponent so that
// after the 13-bit right shift below, the upper bits of the binary32
// value form a binary16 (half float) value directly.
static float kScaleBias = 1.9259299444e-34f;
// Convert a row of uint16 values to scaled half floats:
// dst[i] = (half)(src[i] * scale).  Conversion is done by float multiply
// against scale * kScaleBias, then truncating the low 13 mantissa bits
// (psrld $0xd); NOTE(review): this truncates rather than rounds -- the
// F16C path presumably matches closely enough for callers; confirm.
void HalfFloatRow_SSE2(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    "pshufd      $0x0,%3,%%xmm4                \n"  // broadcast combined scale
    "pxor        %%xmm5,%%xmm5                 \n"
    "sub         %0,%1                         \n"  // %1 = dst - src

    // 8 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu      " MEMACCESS(0) ",%%xmm2       \n"  // 8 shorts
    "add         $0x10,%0                      \n"
    "movdqa      %%xmm2,%%xmm3                 \n"
    "punpcklwd   %%xmm5,%%xmm2                 \n"  // 8 ints in xmm2/3
    "cvtdq2ps    %%xmm2,%%xmm2                 \n"  // 8 floats
    "punpckhwd   %%xmm5,%%xmm3                 \n"
    "cvtdq2ps    %%xmm3,%%xmm3                 \n"
    "mulps       %%xmm4,%%xmm2                 \n"
    "mulps       %%xmm4,%%xmm3                 \n"
    "psrld       $0xd,%%xmm2                   \n"  // drop 13 mantissa bits
    "psrld       $0xd,%%xmm3                   \n"
    "packssdw    %%xmm3,%%xmm2                 \n"  // 8 half floats
    MEMOPMEM(movdqu,xmm2,-0x10,0,1,1)
    "sub         $0x8,%2                       \n"
    "jg          1b                            \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "x"(scale * kScaleBias)   // %3
  : "memory", "cc",
    "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_HALFFLOATROW_SSE2
5488
#ifdef HAS_HALFFLOATROW_AVX2
// AVX2 version of HalfFloatRow: same scale * kScaleBias multiply and
// 13-bit shift trick as the SSE2 path, 16 values per iteration.
// The vpunpck*/vpackssdw pair mutates and then restores element order
// within 128-bit lanes, so output order matches the input.
void HalfFloatRow_AVX2(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    "vbroadcastss  %3, %%ymm4                  \n"  // combined scale
    "vpxor      %%ymm5,%%ymm5,%%ymm5           \n"
    "sub        %0,%1                          \n"  // %1 = dst - src

    // 16 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vmovdqu    " MEMACCESS(0) ",%%ymm2        \n"  // 16 shorts
    "add        $0x20,%0                       \n"
    "vpunpckhwd %%ymm5,%%ymm2,%%ymm3           \n"  // mutates
    "vpunpcklwd %%ymm5,%%ymm2,%%ymm2           \n"
    "vcvtdq2ps  %%ymm3,%%ymm3                  \n"
    "vcvtdq2ps  %%ymm2,%%ymm2                  \n"
    "vmulps     %%ymm3,%%ymm4,%%ymm3           \n"
    "vmulps     %%ymm2,%%ymm4,%%ymm2           \n"
    "vpsrld     $0xd,%%ymm3,%%ymm3             \n"  // drop 13 mantissa bits
    "vpsrld     $0xd,%%ymm2,%%ymm2             \n"
    "vpackssdw  %%ymm3, %%ymm2, %%ymm2         \n"  // unmutates
    MEMOPMEM(vmovdqu,ymm2,-0x20,0,1,1)
    "sub        $0x10,%2                       \n"
    "jg         1b                             \n"

    "vzeroupper                                \n"
  : "+r"(src),    // %0
    "+r"(dst),    // %1
    "+r"(width)   // %2
  : "x"(scale * kScaleBias)   // %3
  : "memory", "cc",
    "xmm2", "xmm3", "xmm4", "xmm5"
  );
}
#endif  // HAS_HALFFLOATROW_AVX2
5524
#ifdef HAS_HALFFLOATROW_F16C
// F16C version of HalfFloatRow: dst[i] = (half)(src[i] * scale) using the
// hardware vcvtps2ph conversion (immediate 3 = round toward zero),
// 16 values per iteration.  Uses the raw 'scale' directly -- no
// kScaleBias exponent trick is needed here.
void HalfFloatRow_F16C(const uint16* src, uint16* dst, float scale, int width) {
  asm volatile (
    "vbroadcastss  %3, %%ymm4                  \n"  // broadcast scale
    "sub        %0,%1                          \n"  // %1 = dst - src

    // 16 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vpmovzxwd   " MEMACCESS(0) ",%%ymm2       \n"  // 16 shorts -> 16 ints
    "vpmovzxwd   " MEMACCESS2(0x10,0) ",%%ymm3 \n"
    "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
    "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
    "vmulps      %%ymm2,%%ymm4,%%ymm2          \n"
    "vmulps      %%ymm3,%%ymm4,%%ymm3          \n"
    "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"  // floats -> half floats
    "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
    MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
    MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
    "add         $0x20,%0                      \n"
    "sub         $0x10,%2                      \n"
    "jg          1b                            \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  : "x"(scale)   // %3
  : "memory", "cc",
    "xmm2", "xmm3", "xmm4"
  );
}
#endif  // HAS_HALFFLOATROW_F16C
5557
#ifdef HAS_HALFFLOATROW_F16C
// Scale-free F16C variant: dst[i] = (half)src[i].  The unnamed float
// parameter exists only so this matches the HalfFloatRow function
// signature; it is intentionally unused (scale == 1.0 case).
void HalfFloat1Row_F16C(const uint16* src, uint16* dst, float, int width) {
  asm volatile (
    "sub        %0,%1                          \n"  // %1 = dst - src
    // 16 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "vpmovzxwd   " MEMACCESS(0) ",%%ymm2       \n"  // 16 shorts -> 16 ints
    "vpmovzxwd   " MEMACCESS2(0x10,0) ",%%ymm3 \n"
    "vcvtdq2ps   %%ymm2,%%ymm2                 \n"
    "vcvtdq2ps   %%ymm3,%%ymm3                 \n"
    "vcvtps2ph   $3, %%ymm2, %%xmm2            \n"  // floats -> half floats
    "vcvtps2ph   $3, %%ymm3, %%xmm3            \n"
    MEMOPMEM(vmovdqu,xmm2,0x00,0,1,1)
    MEMOPMEM(vmovdqu,xmm3,0x10,0,1,1)
    "add         $0x20,%0                      \n"
    "sub         $0x10,%2                      \n"
    "jg          1b                            \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc",
    "xmm2", "xmm3"
  );
}
#endif  // HAS_HALFFLOATROW_F16C
5586
#ifdef HAS_ARGBCOLORTABLEROW_X86
// Transform ARGB pixels with color table, in place.  Each of the 4
// channels is replaced via table_argb[value * 4 + channel], i.e. the
// table holds 256 interleaved BGRA entries.  Scalar, 1 pixel per
// iteration; width must be > 0.
void ARGBColorTableRow_X86(uint8* dst_argb,
                           const uint8* table_argb,
                           int width) {
  uintptr_t pixel_temp;  // scratch byte register (%1, must allow %b1 -> dl)
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movzb     " MEMACCESS(0) ",%1             \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"
    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),     // %0
    "=&d"(pixel_temp),  // %1
    "+r"(width)         // %2
  : "r"(table_argb)     // %3
  : "memory", "cc");
}
#endif  // HAS_ARGBCOLORTABLEROW_X86
5619
5620#ifdef HAS_RGBCOLORTABLEROW_X86
// Transform RGB pixels with color table, in place.
// Same table layout as ARGBColorTableRow_X86 (256 four-byte entries, one
// column per channel), but only the first 3 bytes of each pixel are looked
// up; byte 3 (alpha) is left untouched.  The destination pointer still
// advances 4 bytes per pixel.  Processes 1 pixel per iteration; width must
// be > 0.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  uintptr_t pixel_temp;  // scratch; "=&d" pins it to rdx/edx so %b1 (dl) is valid
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movzb     " MEMACCESS(0) ",%1             \n"  // load pixel byte 0
    "lea       " MEMLEA(0x4,0) ",%0            \n"  // advance dst early; refs below use negative offsets
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"  // store looked-up byte 0
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"  // load pixel byte 1
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"  // store looked-up byte 1
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"  // load pixel byte 2
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"  // store looked-up byte 2; byte 3 untouched
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),     // %0
    "=&d"(pixel_temp),  // %1
    "+r"(width)         // %2
  : "r"(table_argb)     // %3
  : "memory", "cc");
}
5646#endif  // HAS_RGBCOLORTABLEROW_X86
5647
5648#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
// Transform ARGB pixels with a luma-dependent color table.
// For each pixel a luma value is computed from its channel bytes via
// pmaddubsw/phaddw against the broadcast lumacoeff, then masked to the
// 0xFF00 word bits (a multiple of 256) and added to the luma base pointer
// to select a 256-byte sub-table.  Bytes 0..2 of the pixel are each looked
// up in that sub-table; byte 3 (alpha) is copied unchanged.  Processes 4
// pixels per iteration; a width that is not a multiple of 4 is rounded up
// (callers presumably pad accordingly — NOTE(review): confirm).
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb,
                                 uint8* dst_argb,
                                 int width,
                                 const uint8* luma,
                                 uint32 lumacoeff) {
  uintptr_t pixel_temp;  // scratch byte; "=&d" pins to rdx/edx for %b0 (dl)
  uintptr_t table_temp;  // per-pixel sub-table pointer; "=&a" pins to rax/eax
  asm volatile (
    "movd      %6,%%xmm3                       \n"  // xmm3 = lumacoeff broadcast to all 4 dwords
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"  // xmm4 = 0xFF00 in each word (mask)
    "psllw     $0x8,%%xmm4                     \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // xmm5 = zero, for widening words to dwords

    // 4 pixel loop.
    LABELALIGN
    "1:                                        \n"
    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"  // load 4 ARGB pixels
    "pmaddubsw %%xmm3,%%xmm0                   \n"  // weight bytes by lumacoeff, pairwise add
    "phaddw    %%xmm0,%%xmm0                   \n"  // finish per-pixel luma words
    "pand      %%xmm4,%%xmm0                   \n"  // keep high byte * 256 (sub-table offset)
    "punpcklwd %%xmm5,%%xmm0                   \n"  // widen 4 offsets to dwords
    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"  // %1 = luma + offset for pixel 0
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"  // rotate next pixel's offset into lane 0

    // Pixel 0: look up bytes 0..2 in sub-table, copy byte 3.
    "movzb     " MEMACCESS(2) ",%0             \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS(3) "            \n"
    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0x3,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"  // %1 = luma + offset for pixel 1
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    // Pixel 1.
    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0x7,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"  // %1 = luma + offset for pixel 2
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    // Pixel 2.
    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0xb,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"  // %1 = luma + offset for pixel 3

    // Pixel 3 — NOTE: this one also looks up byte 3 rather than copying it.
    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"  // src += 16 bytes (4 pixels)
    "lea       " MEMLEA(0x10,3) ",%3           \n"  // dst += 16 bytes
    "sub       $0x4,%4                         \n"
    "jg        1b                              \n"
  : "=&d"(pixel_temp),  // %0
    "=&a"(table_temp),  // %1
    "+r"(src_argb),     // %2
    "+r"(dst_argb),     // %3
    "+rm"(width)        // %4
  : "r"(luma),          // %5
    "rm"(lumacoeff)     // %6
  : "memory", "cc", "xmm0", "xmm3", "xmm4", "xmm5"
  );
}
5748#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
5749
5750#endif  // defined(__x86_64__) || defined(__i386__)
5751
5752#ifdef __cplusplus
5753}  // extern "C"
5754}  // namespace libyuv
5755#endif
5756