1/*
2 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS.  All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/row.h"
12
13#include "libyuv/basic_types.h"
14
15#ifdef __cplusplus
16namespace libyuv {
17extern "C" {
18#endif
19
20// This module is for GCC x86 and x64
21#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
22
23// GCC 4.2 on OSX has link error when passing static or const to inline.
24// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
25#ifdef __APPLE__
26#define CONST
27#else
28#define CONST static const
29#endif
30
31#ifdef HAS_ARGBTOYROW_SSSE3
32
33// Constants for ARGB
34CONST vec8 kARGBToY = {
35  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
36};
37
38CONST vec8 kARGBToU = {
39  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
40};
41
42CONST vec8 kARGBToV = {
43  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
44};
45
46// Constants for BGRA
47CONST vec8 kBGRAToY = {
48  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
49};
50
51CONST vec8 kBGRAToU = {
52  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
53};
54
55CONST vec8 kBGRAToV = {
56  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
57};
58
59// Constants for ABGR
60CONST vec8 kABGRToY = {
61  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
62};
63
64CONST vec8 kABGRToU = {
65  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
66};
67
68CONST vec8 kABGRToV = {
69  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
70};
71
72CONST uvec8 kAddY16 = {
73  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
74};
75
76CONST uvec8 kAddUV128 = {
77  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
78  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
79};
80
81// Shuffle table for converting RGB24 to ARGB.
82CONST uvec8 kShuffleMaskRGB24ToARGB = {
83  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
84};
85
86// Shuffle table for converting RAW to ARGB.
87CONST uvec8 kShuffleMaskRAWToARGB = {
88  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
89};
90
91// Shuffle table for converting ABGR to ARGB.
92CONST uvec8 kShuffleMaskABGRToARGB = {
93  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
94};
95
96// Shuffle table for converting BGRA to ARGB.
97CONST uvec8 kShuffleMaskBGRAToARGB = {
98  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
99};
100
101// Shuffle table for converting RGBA to ARGB.
102CONST uvec8 kShuffleMaskRGBAToARGB = {
103  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
104};
105
106// Shuffle table for converting ARGB to RGBA.
107CONST uvec8 kShuffleMaskARGBToRGBA = {
108  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
109};
110
111// Shuffle table for converting ARGB to RGB24.
112CONST uvec8 kShuffleMaskARGBToRGB24 = {
113  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
114};
115
116// Shuffle table for converting ARGB to RAW.
117CONST uvec8 kShuffleMaskARGBToRAW = {
118  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
119};
120
121void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
122  asm volatile (
123    "pcmpeqb   %%xmm5,%%xmm5                   \n"
124    "pslld     $0x18,%%xmm5                    \n"
125    ".p2align  4                               \n"
126  "1:                                          \n"
127    "movq      (%0),%%xmm0                     \n"
128    "lea       0x8(%0),%0                      \n"
129    "punpcklbw %%xmm0,%%xmm0                   \n"
130    "movdqa    %%xmm0,%%xmm1                   \n"
131    "punpcklwd %%xmm0,%%xmm0                   \n"
132    "punpckhwd %%xmm1,%%xmm1                   \n"
133    "por       %%xmm5,%%xmm0                   \n"
134    "por       %%xmm5,%%xmm1                   \n"
135    "movdqa    %%xmm0,(%1)                     \n"
136    "movdqa    %%xmm1,0x10(%1)                 \n"
137    "lea       0x20(%1),%1                     \n"
138    "sub       $0x8,%2                         \n"
139    "jg        1b                              \n"
140  : "+r"(src_y),     // %0
141    "+r"(dst_argb),  // %1
142    "+r"(pix)        // %2
143  :
144  : "memory", "cc"
145#if defined(__SSE2__)
146    , "xmm0", "xmm1", "xmm5"
147#endif
148  );
149}
150
151void ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
152  asm volatile (
153    "movdqa    %3,%%xmm5                       \n"
154    "sub       %0,%1                           \n"
155    ".p2align  4                               \n"
156  "1:                                          \n"
157    "movdqa    (%0),%%xmm0                     \n"
158    "pshufb    %%xmm5,%%xmm0                   \n"
159    "sub       $0x4,%2                         \n"
160    "movdqa    %%xmm0,(%0,%1,1)                \n"
161    "lea       0x10(%0),%0                     \n"
162    "jg        1b                              \n"
163
164  : "+r"(src_abgr),  // %0
165    "+r"(dst_argb),  // %1
166    "+r"(pix)        // %2
167  : "m"(kShuffleMaskABGRToARGB)  // %3
168  : "memory", "cc"
169#if defined(__SSE2__)
170    , "xmm0", "xmm5"
171#endif
172  );
173}
174
175void BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
176  asm volatile (
177    "movdqa    %3,%%xmm5                       \n"
178    "sub       %0,%1                           \n"
179    ".p2align  4                               \n"
180  "1:                                          \n"
181    "movdqa    (%0),%%xmm0                     \n"
182    "pshufb    %%xmm5,%%xmm0                   \n"
183    "sub       $0x4,%2                         \n"
184    "movdqa    %%xmm0,(%0,%1,1)                \n"
185    "lea       0x10(%0),%0                     \n"
186    "jg        1b                              \n"
187  : "+r"(src_bgra),  // %0
188    "+r"(dst_argb),  // %1
189    "+r"(pix)        // %2
190  : "m"(kShuffleMaskBGRAToARGB)  // %3
191  : "memory", "cc"
192#if defined(__SSE2__)
193    , "xmm0", "xmm5"
194#endif
195  );
196}
197
198void RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
199  asm volatile (
200    "movdqa    %3,%%xmm5                       \n"
201    "sub       %0,%1                           \n"
202    ".p2align  4                               \n"
203  "1:                                          \n"
204    "movdqa    (%0),%%xmm0                     \n"
205    "pshufb    %%xmm5,%%xmm0                   \n"
206    "sub       $0x4,%2                         \n"
207    "movdqa    %%xmm0,(%0,%1,1)                \n"
208    "lea       0x10(%0),%0                     \n"
209    "jg        1b                              \n"
210
211  : "+r"(src_rgba),  // %0
212    "+r"(dst_argb),  // %1
213    "+r"(pix)        // %2
214  : "m"(kShuffleMaskRGBAToARGB)  // %3
215  : "memory", "cc"
216#if defined(__SSE2__)
217    , "xmm0", "xmm5"
218#endif
219  );
220}
221
222void ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
223  asm volatile (
224    "movdqa    %3,%%xmm5                       \n"
225    "sub       %0,%1                           \n"
226    ".p2align  4                               \n"
227  "1:                                          \n"
228    "movdqa    (%0),%%xmm0                     \n"
229    "pshufb    %%xmm5,%%xmm0                   \n"
230    "sub       $0x4,%2                         \n"
231    "movdqa    %%xmm0,(%0,%1,1)                \n"
232    "lea       0x10(%0),%0                     \n"
233    "jg        1b                              \n"
234
235  : "+r"(src_argb),  // %0
236    "+r"(dst_rgba),  // %1
237    "+r"(pix)        // %2
238  : "m"(kShuffleMaskARGBToRGBA)  // %3
239  : "memory", "cc"
240#if defined(__SSE2__)
241    , "xmm0", "xmm5"
242#endif
243  );
244}
245
246void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
247  asm volatile (
248    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
249    "pslld     $0x18,%%xmm5                    \n"
250    "movdqa    %3,%%xmm4                       \n"
251    ".p2align  4                               \n"
252  "1:                                          \n"
253    "movdqu    (%0),%%xmm0                     \n"
254    "movdqu    0x10(%0),%%xmm1                 \n"
255    "movdqu    0x20(%0),%%xmm3                 \n"
256    "lea       0x30(%0),%0                     \n"
257    "movdqa    %%xmm3,%%xmm2                   \n"
258    "palignr   $0x8,%%xmm1,%%xmm2              \n"
259    "pshufb    %%xmm4,%%xmm2                   \n"
260    "por       %%xmm5,%%xmm2                   \n"
261    "palignr   $0xc,%%xmm0,%%xmm1              \n"
262    "pshufb    %%xmm4,%%xmm0                   \n"
263    "movdqa    %%xmm2,0x20(%1)                 \n"
264    "por       %%xmm5,%%xmm0                   \n"
265    "pshufb    %%xmm4,%%xmm1                   \n"
266    "movdqa    %%xmm0,(%1)                     \n"
267    "por       %%xmm5,%%xmm1                   \n"
268    "palignr   $0x4,%%xmm3,%%xmm3              \n"
269    "pshufb    %%xmm4,%%xmm3                   \n"
270    "movdqa    %%xmm1,0x10(%1)                 \n"
271    "por       %%xmm5,%%xmm3                   \n"
272    "sub       $0x10,%2                        \n"
273    "movdqa    %%xmm3,0x30(%1)                 \n"
274    "lea       0x40(%1),%1                     \n"
275    "jg        1b                              \n"
276  : "+r"(src_rgb24),  // %0
277    "+r"(dst_argb),  // %1
278    "+r"(pix)        // %2
279  : "m"(kShuffleMaskRGB24ToARGB)  // %3
280  : "memory", "cc"
281#if defined(__SSE2__)
282    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
283#endif
284  );
285}
286
287void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
288  asm volatile (
289    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
290    "pslld     $0x18,%%xmm5                    \n"
291    "movdqa    %3,%%xmm4                       \n"
292    ".p2align  4                               \n"
293  "1:                                          \n"
294    "movdqu    (%0),%%xmm0                     \n"
295    "movdqu    0x10(%0),%%xmm1                 \n"
296    "movdqu    0x20(%0),%%xmm3                 \n"
297    "lea       0x30(%0),%0                     \n"
298    "movdqa    %%xmm3,%%xmm2                   \n"
299    "palignr   $0x8,%%xmm1,%%xmm2              \n"
300    "pshufb    %%xmm4,%%xmm2                   \n"
301    "por       %%xmm5,%%xmm2                   \n"
302    "palignr   $0xc,%%xmm0,%%xmm1              \n"
303    "pshufb    %%xmm4,%%xmm0                   \n"
304    "movdqa    %%xmm2,0x20(%1)                 \n"
305    "por       %%xmm5,%%xmm0                   \n"
306    "pshufb    %%xmm4,%%xmm1                   \n"
307    "movdqa    %%xmm0,(%1)                     \n"
308    "por       %%xmm5,%%xmm1                   \n"
309    "palignr   $0x4,%%xmm3,%%xmm3              \n"
310    "pshufb    %%xmm4,%%xmm3                   \n"
311    "movdqa    %%xmm1,0x10(%1)                 \n"
312    "por       %%xmm5,%%xmm3                   \n"
313    "sub       $0x10,%2                        \n"
314    "movdqa    %%xmm3,0x30(%1)                 \n"
315    "lea       0x40(%1),%1                     \n"
316    "jg        1b                              \n"
317  : "+r"(src_raw),   // %0
318    "+r"(dst_argb),  // %1
319    "+r"(pix)        // %2
320  : "m"(kShuffleMaskRAWToARGB)  // %3
321  : "memory", "cc"
322#if defined(__SSE2__)
323    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
324#endif
325  );
326}
327
328void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
329  asm volatile (
330    "mov       $0x1080108,%%eax                \n"
331    "movd      %%eax,%%xmm5                    \n"
332    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
333    "mov       $0x20802080,%%eax               \n"
334    "movd      %%eax,%%xmm6                    \n"
335    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
336    "pcmpeqb   %%xmm3,%%xmm3                   \n"
337    "psllw     $0xb,%%xmm3                     \n"
338    "pcmpeqb   %%xmm4,%%xmm4                   \n"
339    "psllw     $0xa,%%xmm4                     \n"
340    "psrlw     $0x5,%%xmm4                     \n"
341    "pcmpeqb   %%xmm7,%%xmm7                   \n"
342    "psllw     $0x8,%%xmm7                     \n"
343    "sub       %0,%1                           \n"
344    "sub       %0,%1                           \n"
345    ".p2align  4                               \n"
346  "1:                                          \n"
347    "movdqu    (%0),%%xmm0                     \n"
348    "movdqa    %%xmm0,%%xmm1                   \n"
349    "movdqa    %%xmm0,%%xmm2                   \n"
350    "pand      %%xmm3,%%xmm1                   \n"
351    "psllw     $0xb,%%xmm2                     \n"
352    "pmulhuw   %%xmm5,%%xmm1                   \n"
353    "pmulhuw   %%xmm5,%%xmm2                   \n"
354    "psllw     $0x8,%%xmm1                     \n"
355    "por       %%xmm2,%%xmm1                   \n"
356    "pand      %%xmm4,%%xmm0                   \n"
357    "pmulhuw   %%xmm6,%%xmm0                   \n"
358    "por       %%xmm7,%%xmm0                   \n"
359    "movdqa    %%xmm1,%%xmm2                   \n"
360    "punpcklbw %%xmm0,%%xmm1                   \n"
361    "punpckhbw %%xmm0,%%xmm2                   \n"
362    "movdqa    %%xmm1,(%1,%0,2)                \n"
363    "movdqa    %%xmm2,0x10(%1,%0,2)            \n"
364    "lea       0x10(%0),%0                     \n"
365    "sub       $0x8,%2                         \n"
366    "jg        1b                              \n"
367  : "+r"(src),  // %0
368    "+r"(dst),  // %1
369    "+r"(pix)   // %2
370  :
371  : "memory", "cc", "eax"
372#if defined(__SSE2__)
373    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
374#endif
375  );
376}
377
378void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
379  asm volatile (
380    "mov       $0x1080108,%%eax                \n"
381    "movd      %%eax,%%xmm5                    \n"
382    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
383    "mov       $0x42004200,%%eax               \n"
384    "movd      %%eax,%%xmm6                    \n"
385    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
386    "pcmpeqb   %%xmm3,%%xmm3                   \n"
387    "psllw     $0xb,%%xmm3                     \n"
388    "movdqa    %%xmm3,%%xmm4                   \n"
389    "psrlw     $0x6,%%xmm4                     \n"
390    "pcmpeqb   %%xmm7,%%xmm7                   \n"
391    "psllw     $0x8,%%xmm7                     \n"
392    "sub       %0,%1                           \n"
393    "sub       %0,%1                           \n"
394    ".p2align  4                               \n"
395  "1:                                          \n"
396    "movdqu    (%0),%%xmm0                     \n"
397    "movdqa    %%xmm0,%%xmm1                   \n"
398    "movdqa    %%xmm0,%%xmm2                   \n"
399    "psllw     $0x1,%%xmm1                     \n"
400    "psllw     $0xb,%%xmm2                     \n"
401    "pand      %%xmm3,%%xmm1                   \n"
402    "pmulhuw   %%xmm5,%%xmm2                   \n"
403    "pmulhuw   %%xmm5,%%xmm1                   \n"
404    "psllw     $0x8,%%xmm1                     \n"
405    "por       %%xmm2,%%xmm1                   \n"
406    "movdqa    %%xmm0,%%xmm2                   \n"
407    "pand      %%xmm4,%%xmm0                   \n"
408    "psraw     $0x8,%%xmm2                     \n"
409    "pmulhuw   %%xmm6,%%xmm0                   \n"
410    "pand      %%xmm7,%%xmm2                   \n"
411    "por       %%xmm2,%%xmm0                   \n"
412    "movdqa    %%xmm1,%%xmm2                   \n"
413    "punpcklbw %%xmm0,%%xmm1                   \n"
414    "punpckhbw %%xmm0,%%xmm2                   \n"
415    "movdqa    %%xmm1,(%1,%0,2)                \n"
416    "movdqa    %%xmm2,0x10(%1,%0,2)            \n"
417    "lea       0x10(%0),%0                     \n"
418    "sub       $0x8,%2                         \n"
419    "jg        1b                              \n"
420  : "+r"(src),  // %0
421    "+r"(dst),  // %1
422    "+r"(pix)   // %2
423  :
424  : "memory", "cc", "eax"
425#if defined(__SSE2__)
426    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
427#endif
428  );
429}
430
431void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
432  asm volatile (
433    "mov       $0xf0f0f0f,%%eax                \n"
434    "movd      %%eax,%%xmm4                    \n"
435    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
436    "movdqa    %%xmm4,%%xmm5                   \n"
437    "pslld     $0x4,%%xmm5                     \n"
438    "sub       %0,%1                           \n"
439    "sub       %0,%1                           \n"
440    ".p2align  4                               \n"
441  "1:                                          \n"
442    "movdqu    (%0),%%xmm0                     \n"
443    "movdqa    %%xmm0,%%xmm2                   \n"
444    "pand      %%xmm4,%%xmm0                   \n"
445    "pand      %%xmm5,%%xmm2                   \n"
446    "movdqa    %%xmm0,%%xmm1                   \n"
447    "movdqa    %%xmm2,%%xmm3                   \n"
448    "psllw     $0x4,%%xmm1                     \n"
449    "psrlw     $0x4,%%xmm3                     \n"
450    "por       %%xmm1,%%xmm0                   \n"
451    "por       %%xmm3,%%xmm2                   \n"
452    "movdqa    %%xmm0,%%xmm1                   \n"
453    "punpcklbw %%xmm2,%%xmm0                   \n"
454    "punpckhbw %%xmm2,%%xmm1                   \n"
455    "movdqa    %%xmm0,(%1,%0,2)                \n"
456    "movdqa    %%xmm1,0x10(%1,%0,2)            \n"
457    "lea       0x10(%0),%0                     \n"
458    "sub       $0x8,%2                         \n"
459    "jg        1b                              \n"
460  : "+r"(src),  // %0
461    "+r"(dst),  // %1
462    "+r"(pix)   // %2
463  :
464  : "memory", "cc", "eax"
465#if defined(__SSE2__)
466    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
467#endif
468  );
469}
470
471void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
472  asm volatile (
473    "movdqa    %3,%%xmm6                       \n"
474    ".p2align  4                               \n"
475  "1:                                          \n"
476    "movdqa    (%0),%%xmm0                     \n"
477    "movdqa    0x10(%0),%%xmm1                 \n"
478    "movdqa    0x20(%0),%%xmm2                 \n"
479    "movdqa    0x30(%0),%%xmm3                 \n"
480    "lea       0x40(%0),%0                     \n"
481    "pshufb    %%xmm6,%%xmm0                   \n"
482    "pshufb    %%xmm6,%%xmm1                   \n"
483    "pshufb    %%xmm6,%%xmm2                   \n"
484    "pshufb    %%xmm6,%%xmm3                   \n"
485    "movdqa    %%xmm1,%%xmm4                   \n"
486    "psrldq    $0x4,%%xmm1                     \n"
487    "pslldq    $0xc,%%xmm4                     \n"
488    "movdqa    %%xmm2,%%xmm5                   \n"
489    "por       %%xmm4,%%xmm0                   \n"
490    "pslldq    $0x8,%%xmm5                     \n"
491    "movdqa    %%xmm0,(%1)                     \n"
492    "por       %%xmm5,%%xmm1                   \n"
493    "psrldq    $0x8,%%xmm2                     \n"
494    "pslldq    $0x4,%%xmm3                     \n"
495    "por       %%xmm3,%%xmm2                   \n"
496    "movdqa    %%xmm1,0x10(%1)                 \n"
497    "movdqa    %%xmm2,0x20(%1)                 \n"
498    "lea       0x30(%1),%1                     \n"
499    "sub       $0x10,%2                        \n"
500    "jg        1b                              \n"
501  : "+r"(src),  // %0
502    "+r"(dst),  // %1
503    "+r"(pix)   // %2
504  : "m"(kShuffleMaskARGBToRGB24)  // %3
505  : "memory", "cc"
506#if defined(__SSE2__)
507    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
508#endif
509  );
510}
511
512void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
513  asm volatile (
514    "movdqa    %3,%%xmm6                       \n"
515    ".p2align  4                               \n"
516  "1:                                          \n"
517    "movdqa    (%0),%%xmm0                     \n"
518    "movdqa    0x10(%0),%%xmm1                 \n"
519    "movdqa    0x20(%0),%%xmm2                 \n"
520    "movdqa    0x30(%0),%%xmm3                 \n"
521    "lea       0x40(%0),%0                     \n"
522    "pshufb    %%xmm6,%%xmm0                   \n"
523    "pshufb    %%xmm6,%%xmm1                   \n"
524    "pshufb    %%xmm6,%%xmm2                   \n"
525    "pshufb    %%xmm6,%%xmm3                   \n"
526    "movdqa    %%xmm1,%%xmm4                   \n"
527    "psrldq    $0x4,%%xmm1                     \n"
528    "pslldq    $0xc,%%xmm4                     \n"
529    "movdqa    %%xmm2,%%xmm5                   \n"
530    "por       %%xmm4,%%xmm0                   \n"
531    "pslldq    $0x8,%%xmm5                     \n"
532    "movdqa    %%xmm0,(%1)                     \n"
533    "por       %%xmm5,%%xmm1                   \n"
534    "psrldq    $0x8,%%xmm2                     \n"
535    "pslldq    $0x4,%%xmm3                     \n"
536    "por       %%xmm3,%%xmm2                   \n"
537    "movdqa    %%xmm1,0x10(%1)                 \n"
538    "movdqa    %%xmm2,0x20(%1)                 \n"
539    "lea       0x30(%1),%1                     \n"
540    "sub       $0x10,%2                        \n"
541    "jg        1b                              \n"
542  : "+r"(src),  // %0
543    "+r"(dst),  // %1
544    "+r"(pix)   // %2
545  : "m"(kShuffleMaskARGBToRAW)  // %3
546  : "memory", "cc"
547#if defined(__SSE2__)
548    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
549#endif
550  );
551}
552
553void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
554  asm volatile (
555    "pcmpeqb   %%xmm3,%%xmm3                   \n"
556    "psrld     $0x1b,%%xmm3                    \n"
557    "pcmpeqb   %%xmm4,%%xmm4                   \n"
558    "psrld     $0x1a,%%xmm4                    \n"
559    "pslld     $0x5,%%xmm4                     \n"
560    "pcmpeqb   %%xmm5,%%xmm5                   \n"
561    "pslld     $0xb,%%xmm5                     \n"
562    ".p2align  4                               \n"
563  "1:                                          \n"
564    "movdqa    (%0),%%xmm0                     \n"
565    "movdqa    %%xmm0,%%xmm1                   \n"
566    "movdqa    %%xmm0,%%xmm2                   \n"
567    "pslld     $0x8,%%xmm0                     \n"
568    "psrld     $0x3,%%xmm1                     \n"
569    "psrld     $0x5,%%xmm2                     \n"
570    "psrad     $0x10,%%xmm0                    \n"
571    "pand      %%xmm3,%%xmm1                   \n"
572    "pand      %%xmm4,%%xmm2                   \n"
573    "pand      %%xmm5,%%xmm0                   \n"
574    "por       %%xmm2,%%xmm1                   \n"
575    "por       %%xmm1,%%xmm0                   \n"
576    "packssdw  %%xmm0,%%xmm0                   \n"
577    "lea       0x10(%0),%0                     \n"
578    "movq      %%xmm0,(%1)                     \n"
579    "lea       0x8(%1),%1                      \n"
580    "sub       $0x4,%2                         \n"
581    "jg        1b                              \n"
582  : "+r"(src),  // %0
583    "+r"(dst),  // %1
584    "+r"(pix)   // %2
585  :
586  : "memory", "cc"
587#if defined(__SSE2__)
588    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
589#endif
590  );
591}
592
593void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
594  asm volatile (
595    "pcmpeqb   %%xmm4,%%xmm4                   \n"
596    "psrld     $0x1b,%%xmm4                    \n"
597    "movdqa    %%xmm4,%%xmm5                   \n"
598    "pslld     $0x5,%%xmm5                     \n"
599    "movdqa    %%xmm4,%%xmm6                   \n"
600    "pslld     $0xa,%%xmm6                     \n"
601    "pcmpeqb   %%xmm7,%%xmm7                   \n"
602    "pslld     $0xf,%%xmm7                     \n"
603    ".p2align  4                               \n"
604  "1:                                          \n"
605    "movdqa    (%0),%%xmm0                     \n"
606    "movdqa    %%xmm0,%%xmm1                   \n"
607    "movdqa    %%xmm0,%%xmm2                   \n"
608    "movdqa    %%xmm0,%%xmm3                   \n"
609    "psrad     $0x10,%%xmm0                    \n"
610    "psrld     $0x3,%%xmm1                     \n"
611    "psrld     $0x6,%%xmm2                     \n"
612    "psrld     $0x9,%%xmm3                     \n"
613    "pand      %%xmm7,%%xmm0                   \n"
614    "pand      %%xmm4,%%xmm1                   \n"
615    "pand      %%xmm5,%%xmm2                   \n"
616    "pand      %%xmm6,%%xmm3                   \n"
617    "por       %%xmm1,%%xmm0                   \n"
618    "por       %%xmm3,%%xmm2                   \n"
619    "por       %%xmm2,%%xmm0                   \n"
620    "packssdw  %%xmm0,%%xmm0                   \n"
621    "lea       0x10(%0),%0                     \n"
622    "movq      %%xmm0,(%1)                     \n"
623    "lea       0x8(%1),%1                      \n"
624    "sub       $0x4,%2                         \n"
625    "jg        1b                              \n"
626  : "+r"(src),  // %0
627    "+r"(dst),  // %1
628    "+r"(pix)   // %2
629  :
630  : "memory", "cc"
631#if defined(__SSE2__)
632    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
633#endif
634  );
635}
636
637void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
638  asm volatile (
639    "pcmpeqb   %%xmm4,%%xmm4                   \n"
640    "psllw     $0xc,%%xmm4                     \n"
641    "movdqa    %%xmm4,%%xmm3                   \n"
642    "psrlw     $0x8,%%xmm3                     \n"
643    ".p2align  4                               \n"
644  "1:                                          \n"
645    "movdqa    (%0),%%xmm0                     \n"
646    "movdqa    %%xmm0,%%xmm1                   \n"
647    "pand      %%xmm3,%%xmm0                   \n"
648    "pand      %%xmm4,%%xmm1                   \n"
649    "psrlq     $0x4,%%xmm0                     \n"
650    "psrlq     $0x8,%%xmm1                     \n"
651    "por       %%xmm1,%%xmm0                   \n"
652    "packuswb  %%xmm0,%%xmm0                   \n"
653    "lea       0x10(%0),%0                     \n"
654    "movq      %%xmm0,(%1)                     \n"
655    "lea       0x8(%1),%1                      \n"
656    "sub       $0x4,%2                         \n"
657    "jg        1b                              \n"
658  : "+r"(src),  // %0
659    "+r"(dst),  // %1
660    "+r"(pix)   // %2
661  :
662  : "memory", "cc"
663#if defined(__SSE2__)
664    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
665#endif
666  );
667}
668
669void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
670  asm volatile (
671    "movdqa    %4,%%xmm5                       \n"
672    "movdqa    %3,%%xmm4                       \n"
673    ".p2align  4                               \n"
674  "1:                                          \n"
675    "movdqa    (%0),%%xmm0                     \n"
676    "movdqa    0x10(%0),%%xmm1                 \n"
677    "movdqa    0x20(%0),%%xmm2                 \n"
678    "movdqa    0x30(%0),%%xmm3                 \n"
679    "pmaddubsw %%xmm4,%%xmm0                   \n"
680    "pmaddubsw %%xmm4,%%xmm1                   \n"
681    "pmaddubsw %%xmm4,%%xmm2                   \n"
682    "pmaddubsw %%xmm4,%%xmm3                   \n"
683    "lea       0x40(%0),%0                     \n"
684    "phaddw    %%xmm1,%%xmm0                   \n"
685    "phaddw    %%xmm3,%%xmm2                   \n"
686    "psrlw     $0x7,%%xmm0                     \n"
687    "psrlw     $0x7,%%xmm2                     \n"
688    "packuswb  %%xmm2,%%xmm0                   \n"
689    "paddb     %%xmm5,%%xmm0                   \n"
690    "sub       $0x10,%2                        \n"
691    "movdqa    %%xmm0,(%1)                     \n"
692    "lea       0x10(%1),%1                     \n"
693    "jg        1b                              \n"
694  : "+r"(src_argb),  // %0
695    "+r"(dst_y),     // %1
696    "+r"(pix)        // %2
697  : "m"(kARGBToY),   // %3
698    "m"(kAddY16)     // %4
699  : "memory", "cc"
700#if defined(__SSE2__)
701    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
702#endif
703  );
704}
705
706void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
707  asm volatile (
708    "movdqa    %4,%%xmm5                       \n"
709    "movdqa    %3,%%xmm4                       \n"
710    ".p2align  4                               \n"
711  "1:                                          \n"
712    "movdqu    (%0),%%xmm0                     \n"
713    "movdqu    0x10(%0),%%xmm1                 \n"
714    "movdqu    0x20(%0),%%xmm2                 \n"
715    "movdqu    0x30(%0),%%xmm3                 \n"
716    "pmaddubsw %%xmm4,%%xmm0                   \n"
717    "pmaddubsw %%xmm4,%%xmm1                   \n"
718    "pmaddubsw %%xmm4,%%xmm2                   \n"
719    "pmaddubsw %%xmm4,%%xmm3                   \n"
720    "lea       0x40(%0),%0                     \n"
721    "phaddw    %%xmm1,%%xmm0                   \n"
722    "phaddw    %%xmm3,%%xmm2                   \n"
723    "psrlw     $0x7,%%xmm0                     \n"
724    "psrlw     $0x7,%%xmm2                     \n"
725    "packuswb  %%xmm2,%%xmm0                   \n"
726    "paddb     %%xmm5,%%xmm0                   \n"
727    "sub       $0x10,%2                        \n"
728    "movdqu    %%xmm0,(%1)                     \n"
729    "lea       0x10(%1),%1                     \n"
730    "jg        1b                              \n"
731  : "+r"(src_argb),  // %0
732    "+r"(dst_y),     // %1
733    "+r"(pix)        // %2
734  : "m"(kARGBToY),   // %3
735    "m"(kAddY16)     // %4
736  : "memory", "cc"
737#if defined(__SSE2__)
738    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
739#endif
740  );
741}
742
743// TODO(fbarchard): pass xmm constants to single block of assembly.
744// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
745// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
746// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
747// and considered unsafe.
748void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
749                       uint8* dst_u, uint8* dst_v, int width) {
750  asm volatile (
751    "movdqa    %0,%%xmm4                       \n"
752    "movdqa    %1,%%xmm3                       \n"
753    "movdqa    %2,%%xmm5                       \n"
754  :
755  : "m"(kARGBToU),  // %0
756    "m"(kARGBToV),  // %1
757    "m"(kAddUV128)  // %2
758  );
759  asm volatile (
760    "sub       %1,%2                           \n"
761    ".p2align  4                               \n"
762  "1:                                          \n"
763    "movdqa    (%0),%%xmm0                     \n"
764    "movdqa    0x10(%0),%%xmm1                 \n"
765    "movdqa    0x20(%0),%%xmm2                 \n"
766    "movdqa    0x30(%0),%%xmm6                 \n"
767    "pavgb     (%0,%4,1),%%xmm0                \n"
768    "pavgb     0x10(%0,%4,1),%%xmm1            \n"
769    "pavgb     0x20(%0,%4,1),%%xmm2            \n"
770    "pavgb     0x30(%0,%4,1),%%xmm6            \n"
771    "lea       0x40(%0),%0                     \n"
772    "movdqa    %%xmm0,%%xmm7                   \n"
773    "shufps    $0x88,%%xmm1,%%xmm0             \n"
774    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
775    "pavgb     %%xmm7,%%xmm0                   \n"
776    "movdqa    %%xmm2,%%xmm7                   \n"
777    "shufps    $0x88,%%xmm6,%%xmm2             \n"
778    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
779    "pavgb     %%xmm7,%%xmm2                   \n"
780    "movdqa    %%xmm0,%%xmm1                   \n"
781    "movdqa    %%xmm2,%%xmm6                   \n"
782    "pmaddubsw %%xmm4,%%xmm0                   \n"
783    "pmaddubsw %%xmm4,%%xmm2                   \n"
784    "pmaddubsw %%xmm3,%%xmm1                   \n"
785    "pmaddubsw %%xmm3,%%xmm6                   \n"
786    "phaddw    %%xmm2,%%xmm0                   \n"
787    "phaddw    %%xmm6,%%xmm1                   \n"
788    "psraw     $0x8,%%xmm0                     \n"
789    "psraw     $0x8,%%xmm1                     \n"
790    "packsswb  %%xmm1,%%xmm0                   \n"
791    "paddb     %%xmm5,%%xmm0                   \n"
792    "sub       $0x10,%3                        \n"
793    "movlps    %%xmm0,(%1)                     \n"
794    "movhps    %%xmm0,(%1,%2,1)                \n"
795    "lea       0x8(%1),%1                      \n"
796    "jg        1b                              \n"
797  : "+r"(src_argb0),       // %0
798    "+r"(dst_u),           // %1
799    "+r"(dst_v),           // %2
800    "+rm"(width)           // %3
801  : "r"(static_cast<intptr_t>(src_stride_argb))
802  : "memory", "cc"
803#if defined(__SSE2__)
804    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
805#endif
806  );
807}
808
809void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
810                                 uint8* dst_u, uint8* dst_v, int width) {
811  asm volatile (
812    "movdqa    %0,%%xmm4                       \n"
813    "movdqa    %1,%%xmm3                       \n"
814    "movdqa    %2,%%xmm5                       \n"
815  :
816  : "m"(kARGBToU),         // %0
817    "m"(kARGBToV),         // %1
818    "m"(kAddUV128)         // %2
819  );
820  asm volatile (
821    "sub       %1,%2                           \n"
822    ".p2align  4                               \n"
823  "1:                                          \n"
824    "movdqu    (%0),%%xmm0                     \n"
825    "movdqu    0x10(%0),%%xmm1                 \n"
826    "movdqu    0x20(%0),%%xmm2                 \n"
827    "movdqu    0x30(%0),%%xmm6                 \n"
828    "movdqu    (%0,%4,1),%%xmm7                \n"
829    "pavgb     %%xmm7,%%xmm0                   \n"
830    "movdqu    0x10(%0,%4,1),%%xmm7            \n"
831    "pavgb     %%xmm7,%%xmm1                   \n"
832    "movdqu    0x20(%0,%4,1),%%xmm7            \n"
833    "pavgb     %%xmm7,%%xmm2                   \n"
834    "movdqu    0x30(%0,%4,1),%%xmm7            \n"
835    "pavgb     %%xmm7,%%xmm6                   \n"
836    "lea       0x40(%0),%0                     \n"
837    "movdqa    %%xmm0,%%xmm7                   \n"
838    "shufps    $0x88,%%xmm1,%%xmm0             \n"
839    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
840    "pavgb     %%xmm7,%%xmm0                   \n"
841    "movdqa    %%xmm2,%%xmm7                   \n"
842    "shufps    $0x88,%%xmm6,%%xmm2             \n"
843    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
844    "pavgb     %%xmm7,%%xmm2                   \n"
845    "movdqa    %%xmm0,%%xmm1                   \n"
846    "movdqa    %%xmm2,%%xmm6                   \n"
847    "pmaddubsw %%xmm4,%%xmm0                   \n"
848    "pmaddubsw %%xmm4,%%xmm2                   \n"
849    "pmaddubsw %%xmm3,%%xmm1                   \n"
850    "pmaddubsw %%xmm3,%%xmm6                   \n"
851    "phaddw    %%xmm2,%%xmm0                   \n"
852    "phaddw    %%xmm6,%%xmm1                   \n"
853    "psraw     $0x8,%%xmm0                     \n"
854    "psraw     $0x8,%%xmm1                     \n"
855    "packsswb  %%xmm1,%%xmm0                   \n"
856    "paddb     %%xmm5,%%xmm0                   \n"
857    "sub       $0x10,%3                        \n"
858    "movlps    %%xmm0,(%1)                     \n"
859    "movhps    %%xmm0,(%1,%2,1)                \n"
860    "lea       0x8(%1),%1                      \n"
861    "jg        1b                              \n"
862  : "+r"(src_argb0),       // %0
863    "+r"(dst_u),           // %1
864    "+r"(dst_v),           // %2
865    "+rm"(width)           // %3
866  : "r"(static_cast<intptr_t>(src_stride_argb))
867  : "memory", "cc"
868#if defined(__SSE2__)
869    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
870#endif
871  );
872}
873
874void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
875  asm volatile (
876    "movdqa    %4,%%xmm5                       \n"
877    "movdqa    %3,%%xmm4                       \n"
878    ".p2align  4                               \n"
879  "1:                                          \n"
880    "movdqa    (%0),%%xmm0                     \n"
881    "movdqa    0x10(%0),%%xmm1                 \n"
882    "movdqa    0x20(%0),%%xmm2                 \n"
883    "movdqa    0x30(%0),%%xmm3                 \n"
884    "pmaddubsw %%xmm4,%%xmm0                   \n"
885    "pmaddubsw %%xmm4,%%xmm1                   \n"
886    "pmaddubsw %%xmm4,%%xmm2                   \n"
887    "pmaddubsw %%xmm4,%%xmm3                   \n"
888    "lea       0x40(%0),%0                     \n"
889    "phaddw    %%xmm1,%%xmm0                   \n"
890    "phaddw    %%xmm3,%%xmm2                   \n"
891    "psrlw     $0x7,%%xmm0                     \n"
892    "psrlw     $0x7,%%xmm2                     \n"
893    "packuswb  %%xmm2,%%xmm0                   \n"
894    "paddb     %%xmm5,%%xmm0                   \n"
895    "sub       $0x10,%2                        \n"
896    "movdqa    %%xmm0,(%1)                     \n"
897    "lea       0x10(%1),%1                     \n"
898    "jg        1b                              \n"
899  : "+r"(src_bgra),  // %0
900    "+r"(dst_y),     // %1
901    "+r"(pix)        // %2
902  : "m"(kBGRAToY),   // %3
903    "m"(kAddY16)     // %4
904  : "memory", "cc"
905#if defined(__SSE2__)
906    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
907#endif
908  );
909}
910
911void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
912  asm volatile (
913    "movdqa    %4,%%xmm5                       \n"
914    "movdqa    %3,%%xmm4                       \n"
915    ".p2align  4                               \n"
916  "1:                                          \n"
917    "movdqu    (%0),%%xmm0                     \n"
918    "movdqu    0x10(%0),%%xmm1                 \n"
919    "movdqu    0x20(%0),%%xmm2                 \n"
920    "movdqu    0x30(%0),%%xmm3                 \n"
921    "pmaddubsw %%xmm4,%%xmm0                   \n"
922    "pmaddubsw %%xmm4,%%xmm1                   \n"
923    "pmaddubsw %%xmm4,%%xmm2                   \n"
924    "pmaddubsw %%xmm4,%%xmm3                   \n"
925    "lea       0x40(%0),%0                     \n"
926    "phaddw    %%xmm1,%%xmm0                   \n"
927    "phaddw    %%xmm3,%%xmm2                   \n"
928    "psrlw     $0x7,%%xmm0                     \n"
929    "psrlw     $0x7,%%xmm2                     \n"
930    "packuswb  %%xmm2,%%xmm0                   \n"
931    "paddb     %%xmm5,%%xmm0                   \n"
932    "sub       $0x10,%2                        \n"
933    "movdqu    %%xmm0,(%1)                     \n"
934    "lea       0x10(%1),%1                     \n"
935    "jg        1b                              \n"
936  : "+r"(src_bgra),  // %0
937    "+r"(dst_y),     // %1
938    "+r"(pix)        // %2
939  : "m"(kBGRAToY),   // %3
940    "m"(kAddY16)     // %4
941  : "memory", "cc"
942#if defined(__SSE2__)
943    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
944#endif
945  );
946}
947
948void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
949                       uint8* dst_u, uint8* dst_v, int width) {
950  asm volatile (
951    "movdqa    %0,%%xmm4                       \n"
952    "movdqa    %1,%%xmm3                       \n"
953    "movdqa    %2,%%xmm5                       \n"
954  :
955  : "m"(kBGRAToU),         // %0
956    "m"(kBGRAToV),         // %1
957    "m"(kAddUV128)         // %2
958  );
959  asm volatile (
960    "sub       %1,%2                           \n"
961    ".p2align  4                               \n"
962  "1:                                          \n"
963    "movdqa    (%0),%%xmm0                     \n"
964    "movdqa    0x10(%0),%%xmm1                 \n"
965    "movdqa    0x20(%0),%%xmm2                 \n"
966    "movdqa    0x30(%0),%%xmm6                 \n"
967    "pavgb     (%0,%4,1),%%xmm0                \n"
968    "pavgb     0x10(%0,%4,1),%%xmm1            \n"
969    "pavgb     0x20(%0,%4,1),%%xmm2            \n"
970    "pavgb     0x30(%0,%4,1),%%xmm6            \n"
971    "lea       0x40(%0),%0                     \n"
972    "movdqa    %%xmm0,%%xmm7                   \n"
973    "shufps    $0x88,%%xmm1,%%xmm0             \n"
974    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
975    "pavgb     %%xmm7,%%xmm0                   \n"
976    "movdqa    %%xmm2,%%xmm7                   \n"
977    "shufps    $0x88,%%xmm6,%%xmm2             \n"
978    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
979    "pavgb     %%xmm7,%%xmm2                   \n"
980    "movdqa    %%xmm0,%%xmm1                   \n"
981    "movdqa    %%xmm2,%%xmm6                   \n"
982    "pmaddubsw %%xmm4,%%xmm0                   \n"
983    "pmaddubsw %%xmm4,%%xmm2                   \n"
984    "pmaddubsw %%xmm3,%%xmm1                   \n"
985    "pmaddubsw %%xmm3,%%xmm6                   \n"
986    "phaddw    %%xmm2,%%xmm0                   \n"
987    "phaddw    %%xmm6,%%xmm1                   \n"
988    "psraw     $0x8,%%xmm0                     \n"
989    "psraw     $0x8,%%xmm1                     \n"
990    "packsswb  %%xmm1,%%xmm0                   \n"
991    "paddb     %%xmm5,%%xmm0                   \n"
992    "sub       $0x10,%3                        \n"
993    "movlps    %%xmm0,(%1)                     \n"
994    "movhps    %%xmm0,(%1,%2,1)                \n"
995    "lea       0x8(%1),%1                      \n"
996    "jg        1b                              \n"
997  : "+r"(src_bgra0),       // %0
998    "+r"(dst_u),           // %1
999    "+r"(dst_v),           // %2
1000    "+rm"(width)           // %3
1001  : "r"(static_cast<intptr_t>(src_stride_bgra))
1002  : "memory", "cc"
1003#if defined(__SSE2__)
1004    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1005#endif
1006  );
1007}
1008
1009void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
1010                                 uint8* dst_u, uint8* dst_v, int width) {
1011  asm volatile (
1012    "movdqa    %0,%%xmm4                       \n"
1013    "movdqa    %1,%%xmm3                       \n"
1014    "movdqa    %2,%%xmm5                       \n"
1015  :
1016  : "m"(kBGRAToU),         // %0
1017    "m"(kBGRAToV),         // %1
1018    "m"(kAddUV128)         // %2
1019  );
1020  asm volatile (
1021    "sub       %1,%2                           \n"
1022    ".p2align  4                               \n"
1023  "1:                                          \n"
1024    "movdqu    (%0),%%xmm0                     \n"
1025    "movdqu    0x10(%0),%%xmm1                 \n"
1026    "movdqu    0x20(%0),%%xmm2                 \n"
1027    "movdqu    0x30(%0),%%xmm6                 \n"
1028    "movdqu    (%0,%4,1),%%xmm7                \n"
1029    "pavgb     %%xmm7,%%xmm0                   \n"
1030    "movdqu    0x10(%0,%4,1),%%xmm7            \n"
1031    "pavgb     %%xmm7,%%xmm1                   \n"
1032    "movdqu    0x20(%0,%4,1),%%xmm7            \n"
1033    "pavgb     %%xmm7,%%xmm2                   \n"
1034    "movdqu    0x30(%0,%4,1),%%xmm7            \n"
1035    "pavgb     %%xmm7,%%xmm6                   \n"
1036    "lea       0x40(%0),%0                     \n"
1037    "movdqa    %%xmm0,%%xmm7                   \n"
1038    "shufps    $0x88,%%xmm1,%%xmm0             \n"
1039    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1040    "pavgb     %%xmm7,%%xmm0                   \n"
1041    "movdqa    %%xmm2,%%xmm7                   \n"
1042    "shufps    $0x88,%%xmm6,%%xmm2             \n"
1043    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1044    "pavgb     %%xmm7,%%xmm2                   \n"
1045    "movdqa    %%xmm0,%%xmm1                   \n"
1046    "movdqa    %%xmm2,%%xmm6                   \n"
1047    "pmaddubsw %%xmm4,%%xmm0                   \n"
1048    "pmaddubsw %%xmm4,%%xmm2                   \n"
1049    "pmaddubsw %%xmm3,%%xmm1                   \n"
1050    "pmaddubsw %%xmm3,%%xmm6                   \n"
1051    "phaddw    %%xmm2,%%xmm0                   \n"
1052    "phaddw    %%xmm6,%%xmm1                   \n"
1053    "psraw     $0x8,%%xmm0                     \n"
1054    "psraw     $0x8,%%xmm1                     \n"
1055    "packsswb  %%xmm1,%%xmm0                   \n"
1056    "paddb     %%xmm5,%%xmm0                   \n"
1057    "sub       $0x10,%3                        \n"
1058    "movlps    %%xmm0,(%1)                     \n"
1059    "movhps    %%xmm0,(%1,%2,1)                \n"
1060    "lea       0x8(%1),%1                      \n"
1061    "jg        1b                              \n"
1062  : "+r"(src_bgra0),       // %0
1063    "+r"(dst_u),           // %1
1064    "+r"(dst_v),           // %2
1065    "+rm"(width)           // %3
1066  : "r"(static_cast<intptr_t>(src_stride_bgra))
1067  : "memory", "cc"
1068#if defined(__SSE2__)
1069    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1070#endif
1071  );
1072}
1073
1074void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
1075  asm volatile (
1076    "movdqa    %4,%%xmm5                       \n"
1077    "movdqa    %3,%%xmm4                       \n"
1078    ".p2align  4                               \n"
1079  "1:                                          \n"
1080    "movdqa    (%0),%%xmm0                     \n"
1081    "movdqa    0x10(%0),%%xmm1                 \n"
1082    "movdqa    0x20(%0),%%xmm2                 \n"
1083    "movdqa    0x30(%0),%%xmm3                 \n"
1084    "pmaddubsw %%xmm4,%%xmm0                   \n"
1085    "pmaddubsw %%xmm4,%%xmm1                   \n"
1086    "pmaddubsw %%xmm4,%%xmm2                   \n"
1087    "pmaddubsw %%xmm4,%%xmm3                   \n"
1088    "lea       0x40(%0),%0                     \n"
1089    "phaddw    %%xmm1,%%xmm0                   \n"
1090    "phaddw    %%xmm3,%%xmm2                   \n"
1091    "psrlw     $0x7,%%xmm0                     \n"
1092    "psrlw     $0x7,%%xmm2                     \n"
1093    "packuswb  %%xmm2,%%xmm0                   \n"
1094    "paddb     %%xmm5,%%xmm0                   \n"
1095    "sub       $0x10,%2                        \n"
1096    "movdqa    %%xmm0,(%1)                     \n"
1097    "lea       0x10(%1),%1                     \n"
1098    "jg        1b                              \n"
1099  : "+r"(src_abgr),  // %0
1100    "+r"(dst_y),     // %1
1101    "+r"(pix)        // %2
1102  : "m"(kABGRToY),   // %3
1103    "m"(kAddY16)     // %4
1104  : "memory", "cc"
1105#if defined(__SSE2__)
1106    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1107#endif
1108  );
1109}
1110
1111void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
1112  asm volatile (
1113    "movdqa    %4,%%xmm5                       \n"
1114    "movdqa    %3,%%xmm4                       \n"
1115    ".p2align  4                               \n"
1116  "1:                                          \n"
1117    "movdqu    (%0),%%xmm0                     \n"
1118    "movdqu    0x10(%0),%%xmm1                 \n"
1119    "movdqu    0x20(%0),%%xmm2                 \n"
1120    "movdqu    0x30(%0),%%xmm3                 \n"
1121    "pmaddubsw %%xmm4,%%xmm0                   \n"
1122    "pmaddubsw %%xmm4,%%xmm1                   \n"
1123    "pmaddubsw %%xmm4,%%xmm2                   \n"
1124    "pmaddubsw %%xmm4,%%xmm3                   \n"
1125    "lea       0x40(%0),%0                     \n"
1126    "phaddw    %%xmm1,%%xmm0                   \n"
1127    "phaddw    %%xmm3,%%xmm2                   \n"
1128    "psrlw     $0x7,%%xmm0                     \n"
1129    "psrlw     $0x7,%%xmm2                     \n"
1130    "packuswb  %%xmm2,%%xmm0                   \n"
1131    "paddb     %%xmm5,%%xmm0                   \n"
1132    "sub       $0x10,%2                        \n"
1133    "movdqu    %%xmm0,(%1)                     \n"
1134    "lea       0x10(%1),%1                     \n"
1135    "jg        1b                              \n"
1136  : "+r"(src_abgr),  // %0
1137    "+r"(dst_y),     // %1
1138    "+r"(pix)        // %2
1139  : "m"(kABGRToY),   // %3
1140    "m"(kAddY16)     // %4
1141  : "memory", "cc"
1142#if defined(__SSE2__)
1143    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1144#endif
1145  );
1146}
1147
1148void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1149                       uint8* dst_u, uint8* dst_v, int width) {
1150  asm volatile (
1151    "movdqa    %0,%%xmm4                       \n"
1152    "movdqa    %1,%%xmm3                       \n"
1153    "movdqa    %2,%%xmm5                       \n"
1154  :
1155  : "m"(kABGRToU),         // %0
1156    "m"(kABGRToV),         // %1
1157    "m"(kAddUV128)         // %2
1158  );
1159  asm volatile (
1160    "sub       %1,%2                           \n"
1161    ".p2align  4                               \n"
1162  "1:                                          \n"
1163    "movdqa    (%0),%%xmm0                     \n"
1164    "movdqa    0x10(%0),%%xmm1                 \n"
1165    "movdqa    0x20(%0),%%xmm2                 \n"
1166    "movdqa    0x30(%0),%%xmm6                 \n"
1167    "pavgb     (%0,%4,1),%%xmm0                \n"
1168    "pavgb     0x10(%0,%4,1),%%xmm1            \n"
1169    "pavgb     0x20(%0,%4,1),%%xmm2            \n"
1170    "pavgb     0x30(%0,%4,1),%%xmm6            \n"
1171    "lea       0x40(%0),%0                     \n"
1172    "movdqa    %%xmm0,%%xmm7                   \n"
1173    "shufps    $0x88,%%xmm1,%%xmm0             \n"
1174    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1175    "pavgb     %%xmm7,%%xmm0                   \n"
1176    "movdqa    %%xmm2,%%xmm7                   \n"
1177    "shufps    $0x88,%%xmm6,%%xmm2             \n"
1178    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1179    "pavgb     %%xmm7,%%xmm2                   \n"
1180    "movdqa    %%xmm0,%%xmm1                   \n"
1181    "movdqa    %%xmm2,%%xmm6                   \n"
1182    "pmaddubsw %%xmm4,%%xmm0                   \n"
1183    "pmaddubsw %%xmm4,%%xmm2                   \n"
1184    "pmaddubsw %%xmm3,%%xmm1                   \n"
1185    "pmaddubsw %%xmm3,%%xmm6                   \n"
1186    "phaddw    %%xmm2,%%xmm0                   \n"
1187    "phaddw    %%xmm6,%%xmm1                   \n"
1188    "psraw     $0x8,%%xmm0                     \n"
1189    "psraw     $0x8,%%xmm1                     \n"
1190    "packsswb  %%xmm1,%%xmm0                   \n"
1191    "paddb     %%xmm5,%%xmm0                   \n"
1192    "sub       $0x10,%3                        \n"
1193    "movlps    %%xmm0,(%1)                     \n"
1194    "movhps    %%xmm0,(%1,%2,1)                \n"
1195    "lea       0x8(%1),%1                      \n"
1196    "jg        1b                              \n"
1197  : "+r"(src_abgr0),       // %0
1198    "+r"(dst_u),           // %1
1199    "+r"(dst_v),           // %2
1200    "+rm"(width)           // %3
1201  : "r"(static_cast<intptr_t>(src_stride_abgr))
1202  : "memory", "cc"
1203#if defined(__SSE2__)
1204    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1205#endif
1206  );
1207}
1208
1209void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
1210                                 uint8* dst_u, uint8* dst_v, int width) {
1211  asm volatile (
1212    "movdqa    %0,%%xmm4                       \n"
1213    "movdqa    %1,%%xmm3                       \n"
1214    "movdqa    %2,%%xmm5                       \n"
1215  :
1216  : "m"(kABGRToU),         // %0
1217    "m"(kABGRToV),         // %1
1218    "m"(kAddUV128)         // %2
1219  );
1220  asm volatile (
1221    "sub       %1,%2                           \n"
1222    ".p2align  4                               \n"
1223  "1:                                          \n"
1224    "movdqu    (%0),%%xmm0                     \n"
1225    "movdqu    0x10(%0),%%xmm1                 \n"
1226    "movdqu    0x20(%0),%%xmm2                 \n"
1227    "movdqu    0x30(%0),%%xmm6                 \n"
1228    "movdqu    (%0,%4,1),%%xmm7                \n"
1229    "pavgb     %%xmm7,%%xmm0                   \n"
1230    "movdqu    0x10(%0,%4,1),%%xmm7            \n"
1231    "pavgb     %%xmm7,%%xmm1                   \n"
1232    "movdqu    0x20(%0,%4,1),%%xmm7            \n"
1233    "pavgb     %%xmm7,%%xmm2                   \n"
1234    "movdqu    0x30(%0,%4,1),%%xmm7            \n"
1235    "pavgb     %%xmm7,%%xmm6                   \n"
1236    "lea       0x40(%0),%0                     \n"
1237    "movdqa    %%xmm0,%%xmm7                   \n"
1238    "shufps    $0x88,%%xmm1,%%xmm0             \n"
1239    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
1240    "pavgb     %%xmm7,%%xmm0                   \n"
1241    "movdqa    %%xmm2,%%xmm7                   \n"
1242    "shufps    $0x88,%%xmm6,%%xmm2             \n"
1243    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
1244    "pavgb     %%xmm7,%%xmm2                   \n"
1245    "movdqa    %%xmm0,%%xmm1                   \n"
1246    "movdqa    %%xmm2,%%xmm6                   \n"
1247    "pmaddubsw %%xmm4,%%xmm0                   \n"
1248    "pmaddubsw %%xmm4,%%xmm2                   \n"
1249    "pmaddubsw %%xmm3,%%xmm1                   \n"
1250    "pmaddubsw %%xmm3,%%xmm6                   \n"
1251    "phaddw    %%xmm2,%%xmm0                   \n"
1252    "phaddw    %%xmm6,%%xmm1                   \n"
1253    "psraw     $0x8,%%xmm0                     \n"
1254    "psraw     $0x8,%%xmm1                     \n"
1255    "packsswb  %%xmm1,%%xmm0                   \n"
1256    "paddb     %%xmm5,%%xmm0                   \n"
1257    "sub       $0x10,%3                        \n"
1258    "movlps    %%xmm0,(%1)                     \n"
1259    "movhps    %%xmm0,(%1,%2,1)                \n"
1260    "lea       0x8(%1),%1                      \n"
1261    "jg        1b                              \n"
1262  : "+r"(src_abgr0),       // %0
1263    "+r"(dst_u),           // %1
1264    "+r"(dst_v),           // %2
1265    "+rm"(width)           // %3
1266  : "r"(static_cast<intptr_t>(src_stride_abgr))
1267  : "memory", "cc"
1268#if defined(__SSE2__)
1269    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
1270#endif
1271  );
1272}
1273#endif  // HAS_ARGBTOYROW_SSSE3
1274
1275#ifdef HAS_I422TOARGBROW_SSSE3
1276#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */
1277#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */
1278#define UR 0
1279
1280#define VB 0
1281#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */
1282#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */
1283
1284// Bias
1285#define BB UB * 128 + VB * 128
1286#define BG UG * 128 + VG * 128
1287#define BR UR * 128 + VR * 128
1288
1289#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
1290
1291struct {
1292  vec8 kUVToB;  // 0
1293  vec8 kUVToG;  // 16
1294  vec8 kUVToR;  // 32
1295  vec16 kUVBiasB;  // 48
1296  vec16 kUVBiasG;  // 64
1297  vec16 kUVBiasR;  // 80
1298  vec16 kYSub16;  // 96
1299  vec16 kYToRgb;  // 112
1300  vec8 kVUToB;  // 128
1301  vec8 kVUToG;  // 144
1302  vec8 kVUToR;  // 160
1303} CONST SIMD_ALIGNED(kYuvConstants) = {
1304  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
1305  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
1306  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
1307  { BB, BB, BB, BB, BB, BB, BB, BB },
1308  { BG, BG, BG, BG, BG, BG, BG, BG },
1309  { BR, BR, BR, BR, BR, BR, BR, BR },
1310  { 16, 16, 16, 16, 16, 16, 16, 16 },
1311  { YG, YG, YG, YG, YG, YG, YG, YG },
1312  { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
1313  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
1314  { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
1315};
1316
1317
1318// Read 8 UV from 411
1319#define READYUV444                                                             \
1320    "movq       (%[u_buf]),%%xmm0              \n"                             \
1321    "movq       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
1322    "lea        0x8(%[u_buf]),%[u_buf]         \n"                             \
1323    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
1324
1325// Read 4 UV from 422, upsample to 8 UV
1326#define READYUV422                                                             \
1327    "movd       (%[u_buf]),%%xmm0              \n"                             \
1328    "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
1329    "lea        0x4(%[u_buf]),%[u_buf]         \n"                             \
1330    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
1331    "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
1332
1333// Read 2 UV from 411, upsample to 8 UV
1334#define READYUV411                                                             \
1335    "movd       (%[u_buf]),%%xmm0              \n"                             \
1336    "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
1337    "lea        0x2(%[u_buf]),%[u_buf]         \n"                             \
1338    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
1339    "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
1340    "punpckldq  %%xmm0,%%xmm0                  \n"                             \
1341
1342// Read 4 UV from NV12, upsample to 8 UV
1343#define READNV12                                                               \
1344    "movq       (%[uv_buf]),%%xmm0             \n"                             \
1345    "lea        0x8(%[uv_buf]),%[uv_buf]       \n"                             \
1346    "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
1347
1348// Convert 8 pixels: 8 UV and 8 Y
1349#define YUVTORGB                                                               \
1350    "movdqa     %%xmm0,%%xmm1                  \n"                             \
1351    "movdqa     %%xmm0,%%xmm2                  \n"                             \
1352    "pmaddubsw  (%[kYuvConstants]),%%xmm0      \n"                             \
1353    "pmaddubsw  16(%[kYuvConstants]),%%xmm1    \n"                             \
1354    "pmaddubsw  32(%[kYuvConstants]),%%xmm2    \n"                             \
1355    "psubw      48(%[kYuvConstants]),%%xmm0    \n"                             \
1356    "psubw      64(%[kYuvConstants]),%%xmm1    \n"                             \
1357    "psubw      80(%[kYuvConstants]),%%xmm2    \n"                             \
1358    "movq       (%[y_buf]),%%xmm3              \n"                             \
1359    "lea        0x8(%[y_buf]),%[y_buf]         \n"                             \
1360    "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
1361    "psubsw     96(%[kYuvConstants]),%%xmm3    \n"                             \
1362    "pmullw     112(%[kYuvConstants]),%%xmm3   \n"                             \
1363    "paddsw     %%xmm3,%%xmm0                  \n"                             \
1364    "paddsw     %%xmm3,%%xmm1                  \n"                             \
1365    "paddsw     %%xmm3,%%xmm2                  \n"                             \
1366    "psraw      $0x6,%%xmm0                    \n"                             \
1367    "psraw      $0x6,%%xmm1                    \n"                             \
1368    "psraw      $0x6,%%xmm2                    \n"                             \
1369    "packuswb   %%xmm0,%%xmm0                  \n"                             \
1370    "packuswb   %%xmm1,%%xmm1                  \n"                             \
1371    "packuswb   %%xmm2,%%xmm2                  \n"                             \
1372
1373// Convert 8 pixels: 8 VU and 8 Y
1374#define YVUTORGB                                                               \
1375    "movdqa     %%xmm0,%%xmm1                  \n"                             \
1376    "movdqa     %%xmm0,%%xmm2                  \n"                             \
1377    "pmaddubsw  128(%[kYuvConstants]),%%xmm0   \n"                             \
1378    "pmaddubsw  144(%[kYuvConstants]),%%xmm1   \n"                             \
1379    "pmaddubsw  160(%[kYuvConstants]),%%xmm2   \n"                             \
1380    "psubw      48(%[kYuvConstants]),%%xmm0    \n"                             \
1381    "psubw      64(%[kYuvConstants]),%%xmm1    \n"                             \
1382    "psubw      80(%[kYuvConstants]),%%xmm2    \n"                             \
1383    "movq       (%[y_buf]),%%xmm3              \n"                             \
1384    "lea        0x8(%[y_buf]),%[y_buf]         \n"                             \
1385    "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
1386    "psubsw     96(%[kYuvConstants]),%%xmm3    \n"                             \
1387    "pmullw     112(%[kYuvConstants]),%%xmm3   \n"                             \
1388    "paddsw     %%xmm3,%%xmm0                  \n"                             \
1389    "paddsw     %%xmm3,%%xmm1                  \n"                             \
1390    "paddsw     %%xmm3,%%xmm2                  \n"                             \
1391    "psraw      $0x6,%%xmm0                    \n"                             \
1392    "psraw      $0x6,%%xmm1                    \n"                             \
1393    "psraw      $0x6,%%xmm2                    \n"                             \
1394    "packuswb   %%xmm0,%%xmm0                  \n"                             \
1395    "packuswb   %%xmm1,%%xmm1                  \n"                             \
1396    "packuswb   %%xmm2,%%xmm2                  \n"                             \
1397
1398void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
1399                                const uint8* u_buf,
1400                                const uint8* v_buf,
1401                                uint8* argb_buf,
1402                                int width) {
1403  asm volatile (
1404    "sub       %[u_buf],%[v_buf]               \n"
1405    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1406    "pxor      %%xmm4,%%xmm4                   \n"
1407    ".p2align  4                               \n"
1408  "1:                                          \n"
1409    READYUV444
1410    YUVTORGB
1411    "punpcklbw %%xmm1,%%xmm0                   \n"
1412    "punpcklbw %%xmm5,%%xmm2                   \n"
1413    "movdqa    %%xmm0,%%xmm1                   \n"
1414    "punpcklwd %%xmm2,%%xmm0                   \n"
1415    "punpckhwd %%xmm2,%%xmm1                   \n"
1416    "movdqa    %%xmm0,(%[argb_buf])            \n"
1417    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
1418    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
1419    "sub       $0x8,%[width]                   \n"
1420    "jg        1b                              \n"
1421  : [y_buf]"+r"(y_buf),    // %[y_buf]
1422    [u_buf]"+r"(u_buf),    // %[u_buf]
1423    [v_buf]"+r"(v_buf),    // %[v_buf]
1424    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
1425    [width]"+rm"(width)    // %[width]
1426  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1427  : "memory", "cc"
1428#if defined(__SSE2__)
1429    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1430#endif
1431  );
1432}
1433
1434void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
1435                                const uint8* u_buf,
1436                                const uint8* v_buf,
1437                                uint8* argb_buf,
1438                                int width) {
1439  asm volatile (
1440    "sub       %[u_buf],%[v_buf]               \n"
1441    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1442    "pxor      %%xmm4,%%xmm4                   \n"
1443    ".p2align  4                               \n"
1444  "1:                                          \n"
1445    READYUV422
1446    YUVTORGB
1447    "punpcklbw %%xmm1,%%xmm0                   \n"
1448    "punpcklbw %%xmm5,%%xmm2                   \n"
1449    "movdqa    %%xmm0,%%xmm1                   \n"
1450    "punpcklwd %%xmm2,%%xmm0                   \n"
1451    "punpckhwd %%xmm2,%%xmm1                   \n"
1452    "movdqa    %%xmm0,(%[argb_buf])            \n"
1453    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
1454    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
1455    "sub       $0x8,%[width]                   \n"
1456    "jg        1b                              \n"
1457  : [y_buf]"+r"(y_buf),    // %[y_buf]
1458    [u_buf]"+r"(u_buf),    // %[u_buf]
1459    [v_buf]"+r"(v_buf),    // %[v_buf]
1460    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
1461    [width]"+rm"(width)    // %[width]
1462  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1463  : "memory", "cc"
1464#if defined(__SSE2__)
1465    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1466#endif
1467  );
1468}
1469
1470void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
1471                                const uint8* u_buf,
1472                                const uint8* v_buf,
1473                                uint8* argb_buf,
1474                                int width) {
1475  asm volatile (
1476    "sub       %[u_buf],%[v_buf]               \n"
1477    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1478    "pxor      %%xmm4,%%xmm4                   \n"
1479    ".p2align  4                               \n"
1480  "1:                                          \n"
1481    READYUV411
1482    YUVTORGB
1483    "punpcklbw %%xmm1,%%xmm0                   \n"
1484    "punpcklbw %%xmm5,%%xmm2                   \n"
1485    "movdqa    %%xmm0,%%xmm1                   \n"
1486    "punpcklwd %%xmm2,%%xmm0                   \n"
1487    "punpckhwd %%xmm2,%%xmm1                   \n"
1488    "movdqa    %%xmm0,(%[argb_buf])            \n"
1489    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
1490    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
1491    "sub       $0x8,%[width]                   \n"
1492    "jg        1b                              \n"
1493  : [y_buf]"+r"(y_buf),    // %[y_buf]
1494    [u_buf]"+r"(u_buf),    // %[u_buf]
1495    [v_buf]"+r"(v_buf),    // %[v_buf]
1496    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
1497    [width]"+rm"(width)    // %[width]
1498  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1499  : "memory", "cc"
1500#if defined(__SSE2__)
1501    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1502#endif
1503  );
1504}
1505
1506void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
1507                                const uint8* uv_buf,
1508                                uint8* argb_buf,
1509                                int width) {
1510  asm volatile (
1511    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1512    "pxor      %%xmm4,%%xmm4                   \n"
1513    ".p2align  4                               \n"
1514  "1:                                          \n"
1515    READNV12
1516    YUVTORGB
1517    "punpcklbw %%xmm1,%%xmm0                   \n"
1518    "punpcklbw %%xmm5,%%xmm2                   \n"
1519    "movdqa    %%xmm0,%%xmm1                   \n"
1520    "punpcklwd %%xmm2,%%xmm0                   \n"
1521    "punpckhwd %%xmm2,%%xmm1                   \n"
1522    "movdqa    %%xmm0,(%[argb_buf])            \n"
1523    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
1524    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
1525    "sub       $0x8,%[width]                   \n"
1526    "jg        1b                              \n"
1527  : [y_buf]"+r"(y_buf),    // %[y_buf]
1528    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
1529    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
1530    [width]"+rm"(width)    // %[width]
1531  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1532  : "memory", "cc"
1533#if defined(__SSE2__)
1534    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1535#endif
1536  );
1537}
1538
1539void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
1540                                const uint8* vu_buf,
1541                                uint8* argb_buf,
1542                                int width) {
1543  asm volatile (
1544    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1545    "pxor      %%xmm4,%%xmm4                   \n"
1546    ".p2align  4                               \n"
1547  "1:                                          \n"
1548    READNV12
1549    YVUTORGB
1550    "punpcklbw %%xmm1,%%xmm0                   \n"
1551    "punpcklbw %%xmm5,%%xmm2                   \n"
1552    "movdqa    %%xmm0,%%xmm1                   \n"
1553    "punpcklwd %%xmm2,%%xmm0                   \n"
1554    "punpckhwd %%xmm2,%%xmm1                   \n"
1555    "movdqa    %%xmm0,(%[argb_buf])            \n"
1556    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
1557    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
1558    "sub       $0x8,%[width]                   \n"
1559    "jg        1b                              \n"
1560  : [y_buf]"+r"(y_buf),    // %[y_buf]
1561    [uv_buf]"+r"(vu_buf),    // %[uv_buf]
1562    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
1563    [width]"+rm"(width)    // %[width]
1564  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1565  : "memory", "cc"
1566#if defined(__SSE2__)
1567    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1568#endif
1569  );
1570}
1571
1572void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1573                                          const uint8* u_buf,
1574                                          const uint8* v_buf,
1575                                          uint8* argb_buf,
1576                                          int width) {
1577  asm volatile (
1578    "sub       %[u_buf],%[v_buf]               \n"
1579    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1580    "pxor      %%xmm4,%%xmm4                   \n"
1581    ".p2align  4                               \n"
1582  "1:                                          \n"
1583    READYUV444
1584    YUVTORGB
1585    "punpcklbw %%xmm1,%%xmm0                   \n"
1586    "punpcklbw %%xmm5,%%xmm2                   \n"
1587    "movdqa    %%xmm0,%%xmm1                   \n"
1588    "punpcklwd %%xmm2,%%xmm0                   \n"
1589    "punpckhwd %%xmm2,%%xmm1                   \n"
1590    "movdqu    %%xmm0,(%[argb_buf])            \n"
1591    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
1592    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
1593    "sub       $0x8,%[width]                   \n"
1594    "jg        1b                              \n"
1595  : [y_buf]"+r"(y_buf),    // %[y_buf]
1596    [u_buf]"+r"(u_buf),    // %[u_buf]
1597    [v_buf]"+r"(v_buf),    // %[v_buf]
1598    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
1599    [width]"+rm"(width)    // %[width]
1600  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1601  : "memory", "cc"
1602#if defined(__SSE2__)
1603    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1604#endif
1605  );
1606}
1607
1608void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1609                                          const uint8* u_buf,
1610                                          const uint8* v_buf,
1611                                          uint8* argb_buf,
1612                                          int width) {
1613  asm volatile (
1614    "sub       %[u_buf],%[v_buf]               \n"
1615    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1616    "pxor      %%xmm4,%%xmm4                   \n"
1617    ".p2align  4                               \n"
1618  "1:                                          \n"
1619    READYUV422
1620    YUVTORGB
1621    "punpcklbw %%xmm1,%%xmm0                   \n"
1622    "punpcklbw %%xmm5,%%xmm2                   \n"
1623    "movdqa    %%xmm0,%%xmm1                   \n"
1624    "punpcklwd %%xmm2,%%xmm0                   \n"
1625    "punpckhwd %%xmm2,%%xmm1                   \n"
1626    "movdqu    %%xmm0,(%[argb_buf])            \n"
1627    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
1628    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
1629    "sub       $0x8,%[width]                   \n"
1630    "jg        1b                              \n"
1631  : [y_buf]"+r"(y_buf),    // %[y_buf]
1632    [u_buf]"+r"(u_buf),    // %[u_buf]
1633    [v_buf]"+r"(v_buf),    // %[v_buf]
1634    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
1635    [width]"+rm"(width)    // %[width]
1636  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1637  : "memory", "cc"
1638#if defined(__SSE2__)
1639    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1640#endif
1641  );
1642}
1643
1644void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1645                                          const uint8* u_buf,
1646                                          const uint8* v_buf,
1647                                          uint8* argb_buf,
1648                                          int width) {
1649  asm volatile (
1650    "sub       %[u_buf],%[v_buf]               \n"
1651    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1652    "pxor      %%xmm4,%%xmm4                   \n"
1653    ".p2align  4                               \n"
1654  "1:                                          \n"
1655    READYUV411
1656    YUVTORGB
1657    "punpcklbw %%xmm1,%%xmm0                   \n"
1658    "punpcklbw %%xmm5,%%xmm2                   \n"
1659    "movdqa    %%xmm0,%%xmm1                   \n"
1660    "punpcklwd %%xmm2,%%xmm0                   \n"
1661    "punpckhwd %%xmm2,%%xmm1                   \n"
1662    "movdqu    %%xmm0,(%[argb_buf])            \n"
1663    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
1664    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
1665    "sub       $0x8,%[width]                   \n"
1666    "jg        1b                              \n"
1667  : [y_buf]"+r"(y_buf),    // %[y_buf]
1668    [u_buf]"+r"(u_buf),    // %[u_buf]
1669    [v_buf]"+r"(v_buf),    // %[v_buf]
1670    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
1671    [width]"+rm"(width)    // %[width]
1672  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1673  : "memory", "cc"
1674#if defined(__SSE2__)
1675    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1676#endif
1677  );
1678}
1679
1680void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1681                                          const uint8* uv_buf,
1682                                          uint8* argb_buf,
1683                                          int width) {
1684  asm volatile (
1685    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1686    "pxor      %%xmm4,%%xmm4                   \n"
1687    ".p2align  4                               \n"
1688  "1:                                          \n"
1689    READNV12
1690    YUVTORGB
1691    "punpcklbw %%xmm1,%%xmm0                   \n"
1692    "punpcklbw %%xmm5,%%xmm2                   \n"
1693    "movdqa    %%xmm0,%%xmm1                   \n"
1694    "punpcklwd %%xmm2,%%xmm0                   \n"
1695    "punpckhwd %%xmm2,%%xmm1                   \n"
1696    "movdqu    %%xmm0,(%[argb_buf])            \n"
1697    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
1698    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
1699    "sub       $0x8,%[width]                   \n"
1700    "jg        1b                              \n"
1701  : [y_buf]"+r"(y_buf),    // %[y_buf]
1702    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
1703    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
1704    [width]"+rm"(width)    // %[width]
1705  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1706  : "memory", "cc"
1707#if defined(__SSE2__)
1708    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1709#endif
1710  );
1711}
1712
1713void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
1714                                          const uint8* vu_buf,
1715                                          uint8* argb_buf,
1716                                          int width) {
1717  asm volatile (
1718    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1719    "pxor      %%xmm4,%%xmm4                   \n"
1720    ".p2align  4                               \n"
1721  "1:                                          \n"
1722    READNV12
1723    YVUTORGB
1724    "punpcklbw %%xmm1,%%xmm0                   \n"
1725    "punpcklbw %%xmm5,%%xmm2                   \n"
1726    "movdqa    %%xmm0,%%xmm1                   \n"
1727    "punpcklwd %%xmm2,%%xmm0                   \n"
1728    "punpckhwd %%xmm2,%%xmm1                   \n"
1729    "movdqu    %%xmm0,(%[argb_buf])            \n"
1730    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
1731    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
1732    "sub       $0x8,%[width]                   \n"
1733    "jg        1b                              \n"
1734  : [y_buf]"+r"(y_buf),    // %[y_buf]
1735    [uv_buf]"+r"(vu_buf),    // %[uv_buf]
1736    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
1737    [width]"+rm"(width)    // %[width]
1738  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1739  : "memory", "cc"
1740#if defined(__SSE2__)
1741    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1742#endif
1743  );
1744}
1745
1746void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
1747                                const uint8* u_buf,
1748                                const uint8* v_buf,
1749                                uint8* bgra_buf,
1750                                int width) {
1751  asm volatile (
1752    "sub       %[u_buf],%[v_buf]               \n"
1753    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1754    "pxor      %%xmm4,%%xmm4                   \n"
1755    ".p2align  4                               \n"
1756  "1:                                          \n"
1757    READYUV422
1758    YUVTORGB
1759    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1760    "punpcklbw %%xmm0,%%xmm1                   \n"
1761    "punpcklbw %%xmm2,%%xmm5                   \n"
1762    "movdqa    %%xmm5,%%xmm0                   \n"
1763    "punpcklwd %%xmm1,%%xmm5                   \n"
1764    "punpckhwd %%xmm1,%%xmm0                   \n"
1765    "movdqa    %%xmm5,(%[argb_buf])            \n"
1766    "movdqa    %%xmm0,0x10(%[argb_buf])        \n"
1767    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
1768    "sub       $0x8,%[width]                   \n"
1769    "jg        1b                              \n"
1770  : [y_buf]"+r"(y_buf),    // %[y_buf]
1771    [u_buf]"+r"(u_buf),    // %[u_buf]
1772    [v_buf]"+r"(v_buf),    // %[v_buf]
1773    [argb_buf]"+r"(bgra_buf),  // %[argb_buf]
1774    [width]"+rm"(width)    // %[width]
1775  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1776  : "memory", "cc"
1777#if defined(__SSE2__)
1778    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1779#endif
1780  );
1781}
1782
1783void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
1784                                const uint8* u_buf,
1785                                const uint8* v_buf,
1786                                uint8* abgr_buf,
1787                                int width) {
1788  asm volatile (
1789    "sub       %[u_buf],%[v_buf]               \n"
1790    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1791    "pxor      %%xmm4,%%xmm4                   \n"
1792    ".p2align  4                               \n"
1793  "1:                                          \n"
1794    READYUV422
1795    YUVTORGB
1796    "punpcklbw %%xmm1,%%xmm2                   \n"
1797    "punpcklbw %%xmm5,%%xmm0                   \n"
1798    "movdqa    %%xmm2,%%xmm1                   \n"
1799    "punpcklwd %%xmm0,%%xmm2                   \n"
1800    "punpckhwd %%xmm0,%%xmm1                   \n"
1801    "movdqa    %%xmm2,(%[argb_buf])            \n"
1802    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
1803    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
1804    "sub       $0x8,%[width]                   \n"
1805    "jg        1b                              \n"
1806  : [y_buf]"+r"(y_buf),    // %[y_buf]
1807    [u_buf]"+r"(u_buf),    // %[u_buf]
1808    [v_buf]"+r"(v_buf),    // %[v_buf]
1809    [argb_buf]"+r"(abgr_buf),  // %[argb_buf]
1810    [width]"+rm"(width)    // %[width]
1811  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1812  : "memory", "cc"
1813#if defined(__SSE2__)
1814    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1815#endif
1816  );
1817}
1818
1819void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
1820                                          const uint8* u_buf,
1821                                          const uint8* v_buf,
1822                                          uint8* bgra_buf,
1823                                          int width) {
1824  asm volatile (
1825    "sub       %[u_buf],%[v_buf]               \n"
1826    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1827    "pxor      %%xmm4,%%xmm4                   \n"
1828    ".p2align  4                               \n"
1829  "1:                                          \n"
1830    READYUV422
1831    YUVTORGB
1832    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1833    "punpcklbw %%xmm0,%%xmm1                   \n"
1834    "punpcklbw %%xmm2,%%xmm5                   \n"
1835    "movdqa    %%xmm5,%%xmm0                   \n"
1836    "punpcklwd %%xmm1,%%xmm5                   \n"
1837    "punpckhwd %%xmm1,%%xmm0                   \n"
1838    "movdqu    %%xmm5,(%[argb_buf])            \n"
1839    "movdqu    %%xmm0,0x10(%[argb_buf])        \n"
1840    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
1841    "sub       $0x8,%[width]                   \n"
1842    "jg        1b                              \n"
1843  : [y_buf]"+r"(y_buf),    // %[y_buf]
1844    [u_buf]"+r"(u_buf),    // %[u_buf]
1845    [v_buf]"+r"(v_buf),    // %[v_buf]
1846    [argb_buf]"+r"(bgra_buf),  // %[argb_buf]
1847    [width]"+rm"(width)    // %[width]
1848  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1849  : "memory", "cc"
1850#if defined(__SSE2__)
1851    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1852#endif
1853  );
1854}
1855
1856void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
1857                                          const uint8* u_buf,
1858                                          const uint8* v_buf,
1859                                          uint8* abgr_buf,
1860                                          int width) {
1861  asm volatile (
1862    "sub       %[u_buf],%[v_buf]               \n"
1863    "pcmpeqb   %%xmm5,%%xmm5                   \n"
1864    "pxor      %%xmm4,%%xmm4                   \n"
1865    ".p2align  4                               \n"
1866  "1:                                          \n"
1867    READYUV422
1868    YUVTORGB
1869    "punpcklbw %%xmm1,%%xmm2                   \n"
1870    "punpcklbw %%xmm5,%%xmm0                   \n"
1871    "movdqa    %%xmm2,%%xmm1                   \n"
1872    "punpcklwd %%xmm0,%%xmm2                   \n"
1873    "punpckhwd %%xmm0,%%xmm1                   \n"
1874    "movdqu    %%xmm2,(%[argb_buf])            \n"
1875    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
1876    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
1877    "sub       $0x8,%[width]                   \n"
1878    "jg        1b                              \n"
1879  : [y_buf]"+r"(y_buf),    // %[y_buf]
1880    [u_buf]"+r"(u_buf),    // %[u_buf]
1881    [v_buf]"+r"(v_buf),    // %[v_buf]
1882    [argb_buf]"+r"(abgr_buf),  // %[argb_buf]
1883    [width]"+rm"(width)    // %[width]
1884  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
1885  : "memory", "cc"
1886#if defined(__SSE2__)
1887    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
1888#endif
1889  );
1890}
1891#endif  // HAS_I422TOARGBROW_SSSE3
1892
1893#ifdef HAS_YTOARGBROW_SSE2
1894void YToARGBRow_SSE2(const uint8* y_buf,
1895                     uint8* rgb_buf,
1896                     int width) {
1897  asm volatile (
1898    "pcmpeqb   %%xmm4,%%xmm4                   \n"
1899    "pslld     $0x18,%%xmm4                    \n"
1900    "mov       $0x10001000,%%eax               \n"
1901    "movd      %%eax,%%xmm3                    \n"
1902    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
1903    "mov       $0x012a012a,%%eax               \n"
1904    "movd      %%eax,%%xmm2                    \n"
1905    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
1906    ".p2align  4                               \n"
1907  "1:                                          \n"
1908    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
1909    "movq      (%0),%%xmm0                     \n"
1910    "lea       0x8(%0),%0                      \n"
1911    "punpcklbw %%xmm0,%%xmm0                   \n"
1912    "psubusw   %%xmm3,%%xmm0                   \n"
1913    "pmulhuw   %%xmm2,%%xmm0                   \n"
1914    "packuswb  %%xmm0,%%xmm0                   \n"
1915
1916    // Step 2: Weave into ARGB
1917    "punpcklbw %%xmm0,%%xmm0                   \n"
1918    "movdqa    %%xmm0,%%xmm1                   \n"
1919    "punpcklwd %%xmm0,%%xmm0                   \n"
1920    "punpckhwd %%xmm1,%%xmm1                   \n"
1921    "por       %%xmm4,%%xmm0                   \n"
1922    "por       %%xmm4,%%xmm1                   \n"
1923    "movdqa    %%xmm0,(%1)                     \n"
1924    "movdqa    %%xmm1,16(%1)                   \n"
1925    "lea       32(%1),%1                       \n"
1926
1927    "sub       $0x8,%2                         \n"
1928    "jg        1b                              \n"
1929  : "+r"(y_buf),    // %0
1930    "+r"(rgb_buf),  // %1
1931    "+rm"(width)    // %2
1932  :
1933  : "memory", "cc", "eax"
1934#if defined(__SSE2__)
1935    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
1936#endif
1937  );
1938}
1939#endif  // HAS_YTOARGBROW_SSE2
1940
1941#ifdef HAS_MIRRORROW_SSSE3
1942// Shuffle table for reversing the bytes.
1943CONST uvec8 kShuffleMirror = {
1944  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
1945};
1946
1947void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
1948  intptr_t temp_width = static_cast<intptr_t>(width);
1949  asm volatile (
1950    "movdqa    %3,%%xmm5                       \n"
1951    "lea       -0x10(%0),%0                    \n"
1952    ".p2align  4                               \n"
1953  "1:                                          \n"
1954    "movdqa    (%0,%2),%%xmm0                  \n"
1955    "pshufb    %%xmm5,%%xmm0                   \n"
1956    "sub       $0x10,%2                        \n"
1957    "movdqa    %%xmm0,(%1)                     \n"
1958    "lea       0x10(%1),%1                     \n"
1959    "jg        1b                              \n"
1960  : "+r"(src),  // %0
1961    "+r"(dst),  // %1
1962    "+r"(temp_width)  // %2
1963  : "m"(kShuffleMirror) // %3
1964  : "memory", "cc"
1965#if defined(__SSE2__)
1966    , "xmm0", "xmm5"
1967#endif
1968  );
1969}
1970#endif  // HAS_MIRRORROW_SSSE3
1971
1972#ifdef HAS_MIRRORROW_SSE2
1973void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
1974  intptr_t temp_width = static_cast<intptr_t>(width);
1975  asm volatile (
1976    "lea       -0x10(%0),%0                    \n"
1977    ".p2align  4                               \n"
1978  "1:                                          \n"
1979    "movdqu    (%0,%2),%%xmm0                  \n"
1980    "movdqa    %%xmm0,%%xmm1                   \n"
1981    "psllw     $0x8,%%xmm0                     \n"
1982    "psrlw     $0x8,%%xmm1                     \n"
1983    "por       %%xmm1,%%xmm0                   \n"
1984    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
1985    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
1986    "pshufd    $0x4e,%%xmm0,%%xmm0             \n"
1987    "sub       $0x10,%2                        \n"
1988    "movdqu    %%xmm0,(%1)                     \n"
1989    "lea       0x10(%1),%1                     \n"
1990    "jg        1b                              \n"
1991  : "+r"(src),  // %0
1992    "+r"(dst),  // %1
1993    "+r"(temp_width)  // %2
1994  :
1995  : "memory", "cc"
1996#if defined(__SSE2__)
1997    , "xmm0", "xmm1"
1998#endif
1999  );
2000}
2001#endif  // HAS_MIRRORROW_SSE2
2002
2003#ifdef HAS_MIRRORROW_UV_SSSE3
2004// Shuffle table for reversing the bytes of UV channels.
2005CONST uvec8 kShuffleMirrorUV = {
2006  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
2007};
2008void MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
2009                       int width) {
2010  intptr_t temp_width = static_cast<intptr_t>(width);
2011  asm volatile (
2012    "movdqa    %4,%%xmm1                       \n"
2013    "lea       -16(%0,%3,2),%0                 \n"
2014    "sub       %1,%2                           \n"
2015    ".p2align  4                               \n"
2016  "1:                                          \n"
2017    "movdqa    (%0),%%xmm0                     \n"
2018    "lea       -16(%0),%0                      \n"
2019    "pshufb    %%xmm1,%%xmm0                   \n"
2020    "sub       $8,%3                           \n"
2021    "movlpd    %%xmm0,(%1)                     \n"
2022    "movhpd    %%xmm0,(%1,%2)                  \n"
2023    "lea       8(%1),%1                        \n"
2024    "jg        1b                              \n"
2025  : "+r"(src),      // %0
2026    "+r"(dst_u),    // %1
2027    "+r"(dst_v),    // %2
2028    "+r"(temp_width)  // %3
2029  : "m"(kShuffleMirrorUV)  // %4
2030  : "memory", "cc"
2031#if defined(__SSE2__)
2032    , "xmm0", "xmm1"
2033#endif
2034  );
2035}
2036#endif  // HAS_MIRRORROW_UV_SSSE3
2037
2038#ifdef HAS_ARGBMIRRORROW_SSSE3
2039// Shuffle table for reversing the bytes.
2040CONST uvec8 kARGBShuffleMirror = {
2041  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
2042};
2043
2044void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
2045  intptr_t temp_width = static_cast<intptr_t>(width);
2046  asm volatile (
2047    "movdqa    %3,%%xmm5                       \n"
2048    "lea       -0x10(%0),%0                    \n"
2049    ".p2align  4                               \n"
2050  "1:                                          \n"
2051    "movdqa    (%0,%2,4),%%xmm0                \n"
2052    "pshufb    %%xmm5,%%xmm0                   \n"
2053    "sub       $0x4,%2                         \n"
2054    "movdqa    %%xmm0,(%1)                     \n"
2055    "lea       0x10(%1),%1                     \n"
2056    "jg        1b                              \n"
2057  : "+r"(src),  // %0
2058    "+r"(dst),  // %1
2059    "+r"(temp_width)  // %2
2060  : "m"(kARGBShuffleMirror)  // %3
2061  : "memory", "cc"
2062#if defined(__SSE2__)
2063    , "xmm0", "xmm5"
2064#endif
2065  );
2066}
2067#endif  // HAS_ARGBMIRRORROW_SSSE3
2068
2069#ifdef HAS_SPLITUV_SSE2
2070void SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
2071  asm volatile (
2072    "pcmpeqb    %%xmm5,%%xmm5                    \n"
2073    "psrlw      $0x8,%%xmm5                      \n"
2074    "sub        %1,%2                            \n"
2075    ".p2align  4                               \n"
2076  "1:                                            \n"
2077    "movdqa     (%0),%%xmm0                      \n"
2078    "movdqa     0x10(%0),%%xmm1                  \n"
2079    "lea        0x20(%0),%0                      \n"
2080    "movdqa     %%xmm0,%%xmm2                    \n"
2081    "movdqa     %%xmm1,%%xmm3                    \n"
2082    "pand       %%xmm5,%%xmm0                    \n"
2083    "pand       %%xmm5,%%xmm1                    \n"
2084    "packuswb   %%xmm1,%%xmm0                    \n"
2085    "psrlw      $0x8,%%xmm2                      \n"
2086    "psrlw      $0x8,%%xmm3                      \n"
2087    "packuswb   %%xmm3,%%xmm2                    \n"
2088    "movdqa     %%xmm0,(%1)                      \n"
2089    "movdqa     %%xmm2,(%1,%2)                   \n"
2090    "lea        0x10(%1),%1                      \n"
2091    "sub        $0x10,%3                         \n"
2092    "jg         1b                               \n"
2093  : "+r"(src_uv),     // %0
2094    "+r"(dst_u),      // %1
2095    "+r"(dst_v),      // %2
2096    "+r"(pix)         // %3
2097  :
2098  : "memory", "cc"
2099#if defined(__SSE2__)
2100    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2101#endif
2102  );
2103}
2104#endif  // HAS_SPLITUV_SSE2
2105
2106#ifdef HAS_COPYROW_SSE2
2107void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
2108  asm volatile (
2109    "sub        %0,%1                          \n"
2110    ".p2align  4                               \n"
2111  "1:                                          \n"
2112    "movdqa    (%0),%%xmm0                     \n"
2113    "movdqa    0x10(%0),%%xmm1                 \n"
2114    "movdqa    %%xmm0,(%0,%1)                  \n"
2115    "movdqa    %%xmm1,0x10(%0,%1)              \n"
2116    "lea       0x20(%0),%0                     \n"
2117    "sub       $0x20,%2                        \n"
2118    "jg        1b                              \n"
2119  : "+r"(src),   // %0
2120    "+r"(dst),   // %1
2121    "+r"(count)  // %2
2122  :
2123  : "memory", "cc"
2124#if defined(__SSE2__)
2125    , "xmm0", "xmm1"
2126#endif
2127  );
2128}
2129#endif  // HAS_COPYROW_SSE2
2130
2131#ifdef HAS_COPYROW_X86
2132void CopyRow_X86(const uint8* src, uint8* dst, int width) {
2133  size_t width_tmp = static_cast<size_t>(width);
2134  asm volatile (
2135    "shr       $0x2,%2                         \n"
2136    "rep movsl                                 \n"
2137  : "+S"(src),  // %0
2138    "+D"(dst),  // %1
2139    "+c"(width_tmp) // %2
2140  :
2141  : "memory", "cc"
2142  );
2143}
2144#endif  // HAS_COPYROW_X86
2145
2146#ifdef HAS_SETROW_X86
2147void SetRow8_X86(uint8* dst, uint32 v32, int width) {
2148  size_t width_tmp = static_cast<size_t>(width);
2149  asm volatile (
2150    "shr       $0x2,%1                         \n"
2151    "rep stosl                                 \n"
2152    : "+D"(dst),       // %0
2153      "+c"(width_tmp)  // %1
2154    : "a"(v32)         // %2
2155    : "memory", "cc");
2156}
2157
2158void SetRows32_X86(uint8* dst, uint32 v32, int width,
2159                   int dst_stride, int height) {
2160  for (int y = 0; y < height; ++y) {
2161    size_t width_tmp = static_cast<size_t>(width);
2162    uint32* d = reinterpret_cast<uint32*>(dst);
2163    asm volatile (
2164      "rep stosl                               \n"
2165      : "+D"(d),         // %0
2166        "+c"(width_tmp)  // %1
2167      : "a"(v32)         // %2
2168      : "memory", "cc");
2169    dst += dst_stride;
2170  }
2171}
2172#endif  // HAS_SETROW_X86
2173
2174#ifdef HAS_YUY2TOYROW_SSE2
2175void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
2176  asm volatile (
2177    "pcmpeqb   %%xmm5,%%xmm5                   \n"
2178    "psrlw     $0x8,%%xmm5                     \n"
2179    ".p2align  4                               \n"
2180  "1:                                          \n"
2181    "movdqa    (%0),%%xmm0                     \n"
2182    "movdqa    0x10(%0),%%xmm1                 \n"
2183    "lea       0x20(%0),%0                     \n"
2184    "pand      %%xmm5,%%xmm0                   \n"
2185    "pand      %%xmm5,%%xmm1                   \n"
2186    "packuswb  %%xmm1,%%xmm0                   \n"
2187    "movdqa    %%xmm0,(%1)                     \n"
2188    "lea       0x10(%1),%1                     \n"
2189    "sub       $0x10,%2                        \n"
2190    "jg        1b                              \n"
2191  : "+r"(src_yuy2),  // %0
2192    "+r"(dst_y),     // %1
2193    "+r"(pix)        // %2
2194  :
2195  : "memory", "cc"
2196#if defined(__SSE2__)
2197    , "xmm0", "xmm1", "xmm5"
2198#endif
2199  );
2200}
2201
2202void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
2203                      uint8* dst_u, uint8* dst_v, int pix) {
2204  asm volatile (
2205    "pcmpeqb   %%xmm5,%%xmm5                   \n"
2206    "psrlw     $0x8,%%xmm5                     \n"
2207    "sub       %1,%2                           \n"
2208    ".p2align  4                               \n"
2209  "1:                                          \n"
2210    "movdqa    (%0),%%xmm0                     \n"
2211    "movdqa    0x10(%0),%%xmm1                 \n"
2212    "movdqa    (%0,%4,1),%%xmm2                \n"
2213    "movdqa    0x10(%0,%4,1),%%xmm3            \n"
2214    "lea       0x20(%0),%0                     \n"
2215    "pavgb     %%xmm2,%%xmm0                   \n"
2216    "pavgb     %%xmm3,%%xmm1                   \n"
2217    "psrlw     $0x8,%%xmm0                     \n"
2218    "psrlw     $0x8,%%xmm1                     \n"
2219    "packuswb  %%xmm1,%%xmm0                   \n"
2220    "movdqa    %%xmm0,%%xmm1                   \n"
2221    "pand      %%xmm5,%%xmm0                   \n"
2222    "packuswb  %%xmm0,%%xmm0                   \n"
2223    "psrlw     $0x8,%%xmm1                     \n"
2224    "packuswb  %%xmm1,%%xmm1                   \n"
2225    "movq      %%xmm0,(%1)                     \n"
2226    "movq      %%xmm1,(%1,%2)                  \n"
2227    "lea       0x8(%1),%1                      \n"
2228    "sub       $0x10,%3                        \n"
2229    "jg        1b                              \n"
2230  : "+r"(src_yuy2),    // %0
2231    "+r"(dst_u),       // %1
2232    "+r"(dst_v),       // %2
2233    "+r"(pix)          // %3
2234  : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
2235  : "memory", "cc"
2236#if defined(__SSE2__)
2237    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2238#endif
2239  );
2240}
2241
2242void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
2243                         uint8* dst_u, uint8* dst_v, int pix) {
2244  asm volatile (
2245    "pcmpeqb   %%xmm5,%%xmm5                   \n"
2246    "psrlw     $0x8,%%xmm5                     \n"
2247    "sub       %1,%2                           \n"
2248    ".p2align  4                               \n"
2249  "1:                                          \n"
2250    "movdqa    (%0),%%xmm0                     \n"
2251    "movdqa    0x10(%0),%%xmm1                 \n"
2252    "lea       0x20(%0),%0                     \n"
2253    "psrlw     $0x8,%%xmm0                     \n"
2254    "psrlw     $0x8,%%xmm1                     \n"
2255    "packuswb  %%xmm1,%%xmm0                   \n"
2256    "movdqa    %%xmm0,%%xmm1                   \n"
2257    "pand      %%xmm5,%%xmm0                   \n"
2258    "packuswb  %%xmm0,%%xmm0                   \n"
2259    "psrlw     $0x8,%%xmm1                     \n"
2260    "packuswb  %%xmm1,%%xmm1                   \n"
2261    "movq      %%xmm0,(%1)                     \n"
2262    "movq      %%xmm1,(%1,%2)                  \n"
2263    "lea       0x8(%1),%1                      \n"
2264    "sub       $0x10,%3                        \n"
2265    "jg        1b                              \n"
2266  : "+r"(src_yuy2),    // %0
2267    "+r"(dst_u),       // %1
2268    "+r"(dst_v),       // %2
2269    "+r"(pix)          // %3
2270  :
2271  : "memory", "cc"
2272#if defined(__SSE2__)
2273    , "xmm0", "xmm1", "xmm5"
2274#endif
2275  );
2276}
2277
2278void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
2279                               uint8* dst_y, int pix) {
2280  asm volatile (
2281    "pcmpeqb   %%xmm5,%%xmm5                   \n"
2282    "psrlw     $0x8,%%xmm5                     \n"
2283    ".p2align  4                               \n"
2284  "1:                                          \n"
2285    "movdqu    (%0),%%xmm0                     \n"
2286    "movdqu    0x10(%0),%%xmm1                 \n"
2287    "lea       0x20(%0),%0                     \n"
2288    "pand      %%xmm5,%%xmm0                   \n"
2289    "pand      %%xmm5,%%xmm1                   \n"
2290    "packuswb  %%xmm1,%%xmm0                   \n"
2291    "sub       $0x10,%2                        \n"
2292    "movdqu    %%xmm0,(%1)                     \n"
2293    "lea       0x10(%1),%1                     \n"
2294    "jg        1b                              \n"
2295  : "+r"(src_yuy2),  // %0
2296    "+r"(dst_y),     // %1
2297    "+r"(pix)        // %2
2298  :
2299  : "memory", "cc"
2300#if defined(__SSE2__)
2301    , "xmm0", "xmm1", "xmm5"
2302#endif
2303  );
2304}
2305
2306void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
2307                                int stride_yuy2,
2308                                uint8* dst_u, uint8* dst_v, int pix) {
2309  asm volatile (
2310    "pcmpeqb   %%xmm5,%%xmm5                   \n"
2311    "psrlw     $0x8,%%xmm5                     \n"
2312    "sub       %1,%2                           \n"
2313    ".p2align  4                               \n"
2314  "1:                                          \n"
2315    "movdqu    (%0),%%xmm0                     \n"
2316    "movdqu    0x10(%0),%%xmm1                 \n"
2317    "movdqu    (%0,%4,1),%%xmm2                \n"
2318    "movdqu    0x10(%0,%4,1),%%xmm3            \n"
2319    "lea       0x20(%0),%0                     \n"
2320    "pavgb     %%xmm2,%%xmm0                   \n"
2321    "pavgb     %%xmm3,%%xmm1                   \n"
2322    "psrlw     $0x8,%%xmm0                     \n"
2323    "psrlw     $0x8,%%xmm1                     \n"
2324    "packuswb  %%xmm1,%%xmm0                   \n"
2325    "movdqa    %%xmm0,%%xmm1                   \n"
2326    "pand      %%xmm5,%%xmm0                   \n"
2327    "packuswb  %%xmm0,%%xmm0                   \n"
2328    "psrlw     $0x8,%%xmm1                     \n"
2329    "packuswb  %%xmm1,%%xmm1                   \n"
2330    "movq      %%xmm0,(%1)                     \n"
2331    "movq      %%xmm1,(%1,%2)                  \n"
2332    "lea       0x8(%1),%1                      \n"
2333    "sub       $0x10,%3                        \n"
2334    "jg        1b                              \n"
2335  : "+r"(src_yuy2),    // %0
2336    "+r"(dst_u),       // %1
2337    "+r"(dst_v),       // %2
2338    "+r"(pix)          // %3
2339  : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
2340  : "memory", "cc"
2341#if defined(__SSE2__)
2342    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2343#endif
2344  );
2345}
2346
2347void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
2348                                   uint8* dst_u, uint8* dst_v, int pix) {
2349  asm volatile (
2350    "pcmpeqb   %%xmm5,%%xmm5                   \n"
2351    "psrlw     $0x8,%%xmm5                     \n"
2352    "sub       %1,%2                           \n"
2353    ".p2align  4                               \n"
2354  "1:                                          \n"
2355    "movdqu    (%0),%%xmm0                     \n"
2356    "movdqu    0x10(%0),%%xmm1                 \n"
2357    "lea       0x20(%0),%0                     \n"
2358    "psrlw     $0x8,%%xmm0                     \n"
2359    "psrlw     $0x8,%%xmm1                     \n"
2360    "packuswb  %%xmm1,%%xmm0                   \n"
2361    "movdqa    %%xmm0,%%xmm1                   \n"
2362    "pand      %%xmm5,%%xmm0                   \n"
2363    "packuswb  %%xmm0,%%xmm0                   \n"
2364    "psrlw     $0x8,%%xmm1                     \n"
2365    "packuswb  %%xmm1,%%xmm1                   \n"
2366    "movq      %%xmm0,(%1)                     \n"
2367    "movq      %%xmm1,(%1,%2)                  \n"
2368    "lea       0x8(%1),%1                      \n"
2369    "sub       $0x10,%3                        \n"
2370    "jg        1b                              \n"
2371  : "+r"(src_yuy2),    // %0
2372    "+r"(dst_u),       // %1
2373    "+r"(dst_v),       // %2
2374    "+r"(pix)          // %3
2375  :
2376  : "memory", "cc"
2377#if defined(__SSE2__)
2378    , "xmm0", "xmm1", "xmm5"
2379#endif
2380  );
2381}
2382
2383void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
2384  asm volatile (
2385    ".p2align  4                               \n"
2386  "1:                                          \n"
2387    "movdqa    (%0),%%xmm0                     \n"
2388    "movdqa    0x10(%0),%%xmm1                 \n"
2389    "lea       0x20(%0),%0                     \n"
2390    "psrlw     $0x8,%%xmm0                     \n"
2391    "psrlw     $0x8,%%xmm1                     \n"
2392    "packuswb  %%xmm1,%%xmm0                   \n"
2393    "sub       $0x10,%2                        \n"
2394    "movdqa    %%xmm0,(%1)                     \n"
2395    "lea       0x10(%1),%1                     \n"
2396    "jg        1b                              \n"
2397  : "+r"(src_uyvy),  // %0
2398    "+r"(dst_y),     // %1
2399    "+r"(pix)        // %2
2400  :
2401  : "memory", "cc"
2402#if defined(__SSE2__)
2403    , "xmm0", "xmm1"
2404#endif
2405  );
2406}
2407
2408void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
2409                      uint8* dst_u, uint8* dst_v, int pix) {
2410  asm volatile (
2411    "pcmpeqb   %%xmm5,%%xmm5                   \n"
2412    "psrlw     $0x8,%%xmm5                     \n"
2413    "sub       %1,%2                           \n"
2414    ".p2align  4                               \n"
2415  "1:                                          \n"
2416    "movdqa    (%0),%%xmm0                     \n"
2417    "movdqa    0x10(%0),%%xmm1                 \n"
2418    "movdqa    (%0,%4,1),%%xmm2                \n"
2419    "movdqa    0x10(%0,%4,1),%%xmm3            \n"
2420    "lea       0x20(%0),%0                     \n"
2421    "pavgb     %%xmm2,%%xmm0                   \n"
2422    "pavgb     %%xmm3,%%xmm1                   \n"
2423    "pand      %%xmm5,%%xmm0                   \n"
2424    "pand      %%xmm5,%%xmm1                   \n"
2425    "packuswb  %%xmm1,%%xmm0                   \n"
2426    "movdqa    %%xmm0,%%xmm1                   \n"
2427    "pand      %%xmm5,%%xmm0                   \n"
2428    "packuswb  %%xmm0,%%xmm0                   \n"
2429    "psrlw     $0x8,%%xmm1                     \n"
2430    "packuswb  %%xmm1,%%xmm1                   \n"
2431    "movq      %%xmm0,(%1)                     \n"
2432    "movq      %%xmm1,(%1,%2)                  \n"
2433    "lea       0x8(%1),%1                      \n"
2434    "sub       $0x10,%3                        \n"
2435    "jg        1b                              \n"
2436  : "+r"(src_uyvy),    // %0
2437    "+r"(dst_u),       // %1
2438    "+r"(dst_v),       // %2
2439    "+r"(pix)          // %3
2440  : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
2441  : "memory", "cc"
2442#if defined(__SSE2__)
2443    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2444#endif
2445  );
2446}
2447
2448void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
2449                         uint8* dst_u, uint8* dst_v, int pix) {
2450  asm volatile (
2451    "pcmpeqb   %%xmm5,%%xmm5                   \n"
2452    "psrlw     $0x8,%%xmm5                     \n"
2453    "sub       %1,%2                           \n"
2454    ".p2align  4                               \n"
2455  "1:                                          \n"
2456    "movdqa    (%0),%%xmm0                     \n"
2457    "movdqa    0x10(%0),%%xmm1                 \n"
2458    "lea       0x20(%0),%0                     \n"
2459    "pand      %%xmm5,%%xmm0                   \n"
2460    "pand      %%xmm5,%%xmm1                   \n"
2461    "packuswb  %%xmm1,%%xmm0                   \n"
2462    "movdqa    %%xmm0,%%xmm1                   \n"
2463    "pand      %%xmm5,%%xmm0                   \n"
2464    "packuswb  %%xmm0,%%xmm0                   \n"
2465    "psrlw     $0x8,%%xmm1                     \n"
2466    "packuswb  %%xmm1,%%xmm1                   \n"
2467    "movq      %%xmm0,(%1)                     \n"
2468    "movq      %%xmm1,(%1,%2)                  \n"
2469    "lea       0x8(%1),%1                      \n"
2470    "sub       $0x10,%3                        \n"
2471    "jg        1b                              \n"
2472  : "+r"(src_uyvy),    // %0
2473    "+r"(dst_u),       // %1
2474    "+r"(dst_v),       // %2
2475    "+r"(pix)          // %3
2476  :
2477  : "memory", "cc"
2478#if defined(__SSE2__)
2479    , "xmm0", "xmm1", "xmm5"
2480#endif
2481  );
2482}
2483
2484void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
2485                               uint8* dst_y, int pix) {
2486  asm volatile (
2487    ".p2align  4                               \n"
2488  "1:                                          \n"
2489    "movdqu    (%0),%%xmm0                     \n"
2490    "movdqu    0x10(%0),%%xmm1                 \n"
2491    "lea       0x20(%0),%0                     \n"
2492    "psrlw     $0x8,%%xmm0                     \n"
2493    "psrlw     $0x8,%%xmm1                     \n"
2494    "packuswb  %%xmm1,%%xmm0                   \n"
2495    "sub       $0x10,%2                        \n"
2496    "movdqu    %%xmm0,(%1)                     \n"
2497    "lea       0x10(%1),%1                     \n"
2498    "jg        1b                              \n"
2499  : "+r"(src_uyvy),  // %0
2500    "+r"(dst_y),     // %1
2501    "+r"(pix)        // %2
2502  :
2503  : "memory", "cc"
2504#if defined(__SSE2__)
2505    , "xmm0", "xmm1"
2506#endif
2507  );
2508}
2509
2510void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
2511                                uint8* dst_u, uint8* dst_v, int pix) {
2512  asm volatile (
2513    "pcmpeqb   %%xmm5,%%xmm5                   \n"
2514    "psrlw     $0x8,%%xmm5                     \n"
2515    "sub       %1,%2                           \n"
2516    ".p2align  4                               \n"
2517  "1:                                          \n"
2518    "movdqu    (%0),%%xmm0                     \n"
2519    "movdqu    0x10(%0),%%xmm1                 \n"
2520    "movdqu    (%0,%4,1),%%xmm2                \n"
2521    "movdqu    0x10(%0,%4,1),%%xmm3            \n"
2522    "lea       0x20(%0),%0                     \n"
2523    "pavgb     %%xmm2,%%xmm0                   \n"
2524    "pavgb     %%xmm3,%%xmm1                   \n"
2525    "pand      %%xmm5,%%xmm0                   \n"
2526    "pand      %%xmm5,%%xmm1                   \n"
2527    "packuswb  %%xmm1,%%xmm0                   \n"
2528    "movdqa    %%xmm0,%%xmm1                   \n"
2529    "pand      %%xmm5,%%xmm0                   \n"
2530    "packuswb  %%xmm0,%%xmm0                   \n"
2531    "psrlw     $0x8,%%xmm1                     \n"
2532    "packuswb  %%xmm1,%%xmm1                   \n"
2533    "movq      %%xmm0,(%1)                     \n"
2534    "movq      %%xmm1,(%1,%2)                  \n"
2535    "lea       0x8(%1),%1                      \n"
2536    "sub       $0x10,%3                        \n"
2537    "jg        1b                              \n"
2538  : "+r"(src_uyvy),    // %0
2539    "+r"(dst_u),       // %1
2540    "+r"(dst_v),       // %2
2541    "+r"(pix)          // %3
2542  : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
2543  : "memory", "cc"
2544#if defined(__SSE2__)
2545    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
2546#endif
2547  );
2548}
2549
2550void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
2551                                   uint8* dst_u, uint8* dst_v, int pix) {
2552  asm volatile (
2553    "pcmpeqb   %%xmm5,%%xmm5                   \n"
2554    "psrlw     $0x8,%%xmm5                     \n"
2555    "sub       %1,%2                           \n"
2556    ".p2align  4                               \n"
2557  "1:                                          \n"
2558    "movdqu    (%0),%%xmm0                     \n"
2559    "movdqu    0x10(%0),%%xmm1                 \n"
2560    "lea       0x20(%0),%0                     \n"
2561    "pand      %%xmm5,%%xmm0                   \n"
2562    "pand      %%xmm5,%%xmm1                   \n"
2563    "packuswb  %%xmm1,%%xmm0                   \n"
2564    "movdqa    %%xmm0,%%xmm1                   \n"
2565    "pand      %%xmm5,%%xmm0                   \n"
2566    "packuswb  %%xmm0,%%xmm0                   \n"
2567    "psrlw     $0x8,%%xmm1                     \n"
2568    "packuswb  %%xmm1,%%xmm1                   \n"
2569    "movq      %%xmm0,(%1)                     \n"
2570    "movq      %%xmm1,(%1,%2)                  \n"
2571    "lea       0x8(%1),%1                      \n"
2572    "sub       $0x10,%3                        \n"
2573    "jg        1b                              \n"
2574  : "+r"(src_uyvy),    // %0
2575    "+r"(dst_u),       // %1
2576    "+r"(dst_v),       // %2
2577    "+r"(pix)          // %3
2578  :
2579  : "memory", "cc"
2580#if defined(__SSE2__)
2581    , "xmm0", "xmm1", "xmm5"
2582#endif
2583  );
2584}
2585#endif  // HAS_YUY2TOYROW_SSE2
2586
2587#ifdef HAS_ARGBBLENDROW_SSE2
2588// Blend 8 pixels at a time.
2589void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
2590                       uint8* dst_argb, int width) {
2591  asm volatile (
2592    "pcmpeqb   %%xmm7,%%xmm7                   \n"
2593    "psrlw     $0xf,%%xmm7                     \n"
2594    "pcmpeqb   %%xmm6,%%xmm6                   \n"
2595    "psrlw     $0x8,%%xmm6                     \n"
2596    "pcmpeqb   %%xmm5,%%xmm5                   \n"
2597    "psllw     $0x8,%%xmm5                     \n"
2598    "pcmpeqb   %%xmm4,%%xmm4                   \n"
2599    "pslld     $0x18,%%xmm4                    \n"
2600    "sub       $0x1,%3                         \n"
2601    "je        91f                             \n"
2602    "jl        99f                             \n"
2603
2604    // 1 pixel loop until destination pointer is aligned.
2605  "10:                                         \n"
2606    "test      $0xf,%2                         \n"
2607    "je        19f                             \n"
2608    "movd      (%0),%%xmm3                     \n"
2609    "lea       0x4(%0),%0                      \n"
2610    "movdqa    %%xmm3,%%xmm0                   \n"
2611    "pxor      %%xmm4,%%xmm3                   \n"
2612    "movd      (%1),%%xmm2                     \n"
2613    "psrlw     $0x8,%%xmm3                     \n"
2614    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
2615    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
2616    "pand      %%xmm6,%%xmm2                   \n"
2617    "paddw     %%xmm7,%%xmm3                   \n"
2618    "pmullw    %%xmm3,%%xmm2                   \n"
2619    "movd      (%1),%%xmm1                     \n"
2620    "lea       0x4(%1),%1                      \n"
2621    "psrlw     $0x8,%%xmm1                     \n"
2622    "por       %%xmm4,%%xmm0                   \n"
2623    "pmullw    %%xmm3,%%xmm1                   \n"
2624    "psrlw     $0x8,%%xmm2                     \n"
2625    "paddusb   %%xmm2,%%xmm0                   \n"
2626    "pand      %%xmm5,%%xmm1                   \n"
2627    "paddusb   %%xmm1,%%xmm0                   \n"
2628    "sub       $0x1,%3                         \n"
2629    "movd      %%xmm0,(%2)                     \n"
2630    "lea       0x4(%2),%2                      \n"
2631    "jge       10b                             \n"
2632
2633  "19:                                         \n"
2634    "add       $1-4,%3                         \n"
2635    "jl        49f                             \n"
2636
2637    // 4 pixel loop.
2638    ".p2align  2                               \n"
2639  "41:                                         \n"
2640    "movdqu    (%0),%%xmm3                     \n"
2641    "lea       0x10(%0),%0                     \n"
2642    "movdqa    %%xmm3,%%xmm0                   \n"
2643    "pxor      %%xmm4,%%xmm3                   \n"
2644    "movdqu    (%1),%%xmm2                     \n"
2645    "psrlw     $0x8,%%xmm3                     \n"
2646    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
2647    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
2648    "pand      %%xmm6,%%xmm2                   \n"
2649    "paddw     %%xmm7,%%xmm3                   \n"
2650    "pmullw    %%xmm3,%%xmm2                   \n"
2651    "movdqu    (%1),%%xmm1                     \n"
2652    "lea       0x10(%1),%1                     \n"
2653    "psrlw     $0x8,%%xmm1                     \n"
2654    "por       %%xmm4,%%xmm0                   \n"
2655    "pmullw    %%xmm3,%%xmm1                   \n"
2656    "psrlw     $0x8,%%xmm2                     \n"
2657    "paddusb   %%xmm2,%%xmm0                   \n"
2658    "pand      %%xmm5,%%xmm1                   \n"
2659    "paddusb   %%xmm1,%%xmm0                   \n"
2660    "sub       $0x4,%3                         \n"
2661    "movdqa    %%xmm0,(%2)                     \n"
2662    "lea       0x10(%2),%2                     \n"
2663    "jge       41b                             \n"
2664
2665  "49:                                         \n"
2666    "add       $0x3,%3                         \n"
2667    "jl        99f                             \n"
2668
2669    // 1 pixel loop.
2670  "91:                                         \n"
2671    "movd      (%0),%%xmm3                     \n"
2672    "lea       0x4(%0),%0                      \n"
2673    "movdqa    %%xmm3,%%xmm0                   \n"
2674    "pxor      %%xmm4,%%xmm3                   \n"
2675    "movd      (%1),%%xmm2                     \n"
2676    "psrlw     $0x8,%%xmm3                     \n"
2677    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
2678    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
2679    "pand      %%xmm6,%%xmm2                   \n"
2680    "paddw     %%xmm7,%%xmm3                   \n"
2681    "pmullw    %%xmm3,%%xmm2                   \n"
2682    "movd      (%1),%%xmm1                     \n"
2683    "lea       0x4(%1),%1                      \n"
2684    "psrlw     $0x8,%%xmm1                     \n"
2685    "por       %%xmm4,%%xmm0                   \n"
2686    "pmullw    %%xmm3,%%xmm1                   \n"
2687    "psrlw     $0x8,%%xmm2                     \n"
2688    "paddusb   %%xmm2,%%xmm0                   \n"
2689    "pand      %%xmm5,%%xmm1                   \n"
2690    "paddusb   %%xmm1,%%xmm0                   \n"
2691    "sub       $0x1,%3                         \n"
2692    "movd      %%xmm0,(%2)                     \n"
2693    "lea       0x4(%2),%2                      \n"
2694    "jge       91b                             \n"
2695  "99:                                         \n"
2696  : "+r"(src_argb0),    // %0
2697    "+r"(src_argb1),    // %1
2698    "+r"(dst_argb),     // %2
2699    "+r"(width)         // %3
2700  :
2701  : "memory", "cc"
2702#if defined(__SSE2__)
2703    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2704#endif
2705  );
2706}
2707#endif  // HAS_ARGBBLENDROW_SSE2
2708
2709#ifdef HAS_ARGBBLENDROW_SSSE3
2710// Shuffle table for isolating alpha.
2711CONST uvec8 kShuffleAlpha = {
2712  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
2713  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
2714};
2715
2716// Blend 8 pixels at a time
2717// Shuffle table for reversing the bytes.
2718
2719// Same as SSE2, but replaces
2720//    psrlw      xmm3, 8          // alpha
2721//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
2722//    pshuflw    xmm3, xmm3,0F5h
2723// with..
2724//    pshufb     xmm3, kShuffleAlpha // alpha
2725
2726void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
2727                        uint8* dst_argb, int width) {
2728  asm volatile (
2729    "pcmpeqb   %%xmm7,%%xmm7                   \n"
2730    "psrlw     $0xf,%%xmm7                     \n"
2731    "pcmpeqb   %%xmm6,%%xmm6                   \n"
2732    "psrlw     $0x8,%%xmm6                     \n"
2733    "pcmpeqb   %%xmm5,%%xmm5                   \n"
2734    "psllw     $0x8,%%xmm5                     \n"
2735    "pcmpeqb   %%xmm4,%%xmm4                   \n"
2736    "pslld     $0x18,%%xmm4                    \n"
2737    "sub       $0x1,%3                         \n"
2738    "je        91f                             \n"
2739    "jl        99f                             \n"
2740
2741    // 1 pixel loop until destination pointer is aligned.
2742  "10:                                         \n"
2743    "test      $0xf,%2                         \n"
2744    "je        19f                             \n"
2745    "movd      (%0),%%xmm3                     \n"
2746    "lea       0x4(%0),%0                      \n"
2747    "movdqa    %%xmm3,%%xmm0                   \n"
2748    "pxor      %%xmm4,%%xmm3                   \n"
2749    "movd      (%1),%%xmm2                     \n"
2750    "pshufb    %4,%%xmm3                       \n"
2751    "pand      %%xmm6,%%xmm2                   \n"
2752    "paddw     %%xmm7,%%xmm3                   \n"
2753    "pmullw    %%xmm3,%%xmm2                   \n"
2754    "movd      (%1),%%xmm1                     \n"
2755    "lea       0x4(%1),%1                      \n"
2756    "psrlw     $0x8,%%xmm1                     \n"
2757    "por       %%xmm4,%%xmm0                   \n"
2758    "pmullw    %%xmm3,%%xmm1                   \n"
2759    "psrlw     $0x8,%%xmm2                     \n"
2760    "paddusb   %%xmm2,%%xmm0                   \n"
2761    "pand      %%xmm5,%%xmm1                   \n"
2762    "paddusb   %%xmm1,%%xmm0                   \n"
2763    "sub       $0x1,%3                         \n"
2764    "movd      %%xmm0,(%2)                     \n"
2765    "lea       0x4(%2),%2                      \n"
2766    "jge       10b                             \n"
2767
2768  "19:                                         \n"
2769    "add       $1-4,%3                         \n"
2770    "jl        49f                             \n"
2771    "test      $0xf,%0                         \n"
2772    "jne       41f                             \n"
2773    "test      $0xf,%1                         \n"
2774    "jne       41f                             \n"
2775
2776    // 4 pixel loop.
2777    ".p2align  2                               \n"
2778  "40:                                         \n"
2779    "movdqa    (%0),%%xmm3                     \n"
2780    "lea       0x10(%0),%0                     \n"
2781    "movdqa    %%xmm3,%%xmm0                   \n"
2782    "pxor      %%xmm4,%%xmm3                   \n"
2783    "movdqa    (%1),%%xmm2                     \n"
2784    "pshufb    %4,%%xmm3                       \n"
2785    "pand      %%xmm6,%%xmm2                   \n"
2786    "paddw     %%xmm7,%%xmm3                   \n"
2787    "pmullw    %%xmm3,%%xmm2                   \n"
2788    "movdqa    (%1),%%xmm1                     \n"
2789    "lea       0x10(%1),%1                     \n"
2790    "psrlw     $0x8,%%xmm1                     \n"
2791    "por       %%xmm4,%%xmm0                   \n"
2792    "pmullw    %%xmm3,%%xmm1                   \n"
2793    "psrlw     $0x8,%%xmm2                     \n"
2794    "paddusb   %%xmm2,%%xmm0                   \n"
2795    "pand      %%xmm5,%%xmm1                   \n"
2796    "paddusb   %%xmm1,%%xmm0                   \n"
2797    "sub       $0x4,%3                         \n"
2798    "movdqa    %%xmm0,(%2)                     \n"
2799    "lea       0x10(%2),%2                     \n"
2800    "jge       40b                             \n"
2801    "jmp       49f                             \n"
2802
2803    // 4 pixel unaligned loop.
2804    ".p2align  2                               \n"
2805  "41:                                         \n"
2806    "movdqu    (%0),%%xmm3                     \n"
2807    "lea       0x10(%0),%0                     \n"
2808    "movdqa    %%xmm3,%%xmm0                   \n"
2809    "pxor      %%xmm4,%%xmm3                   \n"
2810    "movdqu    (%1),%%xmm2                     \n"
2811    "pshufb    %4,%%xmm3                       \n"
2812    "pand      %%xmm6,%%xmm2                   \n"
2813    "paddw     %%xmm7,%%xmm3                   \n"
2814    "pmullw    %%xmm3,%%xmm2                   \n"
2815    "movdqu    (%1),%%xmm1                     \n"
2816    "lea       0x10(%1),%1                     \n"
2817    "psrlw     $0x8,%%xmm1                     \n"
2818    "por       %%xmm4,%%xmm0                   \n"
2819    "pmullw    %%xmm3,%%xmm1                   \n"
2820    "psrlw     $0x8,%%xmm2                     \n"
2821    "paddusb   %%xmm2,%%xmm0                   \n"
2822    "pand      %%xmm5,%%xmm1                   \n"
2823    "paddusb   %%xmm1,%%xmm0                   \n"
2824    "sub       $0x4,%3                         \n"
2825    "movdqa    %%xmm0,(%2)                     \n"
2826    "lea       0x10(%2),%2                     \n"
2827    "jge       41b                             \n"
2828
2829  "49:                                         \n"
2830    "add       $0x3,%3                         \n"
2831    "jl        99f                             \n"
2832
2833    // 1 pixel loop.
2834  "91:                                         \n"
2835    "movd      (%0),%%xmm3                     \n"
2836    "lea       0x4(%0),%0                      \n"
2837    "movdqa    %%xmm3,%%xmm0                   \n"
2838    "pxor      %%xmm4,%%xmm3                   \n"
2839    "movd      (%1),%%xmm2                     \n"
2840    "pshufb    %4,%%xmm3                       \n"
2841    "pand      %%xmm6,%%xmm2                   \n"
2842    "paddw     %%xmm7,%%xmm3                   \n"
2843    "pmullw    %%xmm3,%%xmm2                   \n"
2844    "movd      (%1),%%xmm1                     \n"
2845    "lea       0x4(%1),%1                      \n"
2846    "psrlw     $0x8,%%xmm1                     \n"
2847    "por       %%xmm4,%%xmm0                   \n"
2848    "pmullw    %%xmm3,%%xmm1                   \n"
2849    "psrlw     $0x8,%%xmm2                     \n"
2850    "paddusb   %%xmm2,%%xmm0                   \n"
2851    "pand      %%xmm5,%%xmm1                   \n"
2852    "paddusb   %%xmm1,%%xmm0                   \n"
2853    "sub       $0x1,%3                         \n"
2854    "movd      %%xmm0,(%2)                     \n"
2855    "lea       0x4(%2),%2                      \n"
2856    "jge       91b                             \n"
2857  "99:                                         \n"
2858  : "+r"(src_argb0),    // %0
2859    "+r"(src_argb1),    // %1
2860    "+r"(dst_argb),     // %2
2861    "+r"(width)         // %3
2862  : "m"(kShuffleAlpha)  // %4
2863  : "memory", "cc"
2864#if defined(__SSE2__)
2865    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
2866#endif
2867  );
2868}
2869#endif  // HAS_ARGBBLENDROW_SSSE3
2870
2871#ifdef HAS_ARGBATTENUATE_SSE2
2872// Attenuate 4 pixels at a time.
2873// aligned to 16 bytes
2874void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
2875  asm volatile (
2876    "sub       %0,%1                           \n"
2877    "pcmpeqb   %%xmm4,%%xmm4                   \n"
2878    "pslld     $0x18,%%xmm4                    \n"
2879    "pcmpeqb   %%xmm5,%%xmm5                   \n"
2880    "psrld     $0x8,%%xmm5                     \n"
2881
2882    // 4 pixel loop.
2883    ".p2align  4                               \n"
2884  "1:                                          \n"
2885    "movdqa    (%0),%%xmm0                     \n"
2886    "punpcklbw %%xmm0,%%xmm0                   \n"
2887    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"
2888    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
2889    "pmulhuw   %%xmm2,%%xmm0                   \n"
2890    "movdqa    (%0),%%xmm1                     \n"
2891    "punpckhbw %%xmm1,%%xmm1                   \n"
2892    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
2893    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
2894    "pmulhuw   %%xmm2,%%xmm1                   \n"
2895    "movdqa    (%0),%%xmm2                     \n"
2896    "psrlw     $0x8,%%xmm0                     \n"
2897    "pand      %%xmm4,%%xmm2                   \n"
2898    "psrlw     $0x8,%%xmm1                     \n"
2899    "packuswb  %%xmm1,%%xmm0                   \n"
2900    "pand      %%xmm5,%%xmm0                   \n"
2901    "por       %%xmm2,%%xmm0                   \n"
2902    "sub       $0x4,%2                         \n"
2903    "movdqa    %%xmm0,(%0,%1,1)                \n"
2904    "lea       0x10(%0),%0                     \n"
2905    "jg        1b                              \n"
2906  : "+r"(src_argb),    // %0
2907    "+r"(dst_argb),    // %1
2908    "+r"(width)        // %2
2909  :
2910  : "memory", "cc"
2911#if defined(__SSE2__)
2912    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2913#endif
2914  );
2915}
2916#endif  // HAS_ARGBATTENUATE_SSE2
2917
2918#ifdef HAS_ARGBATTENUATEROW_SSSE3
2919// Shuffle table duplicating alpha
2920CONST uvec8 kShuffleAlpha0 = {
2921  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
2922};
2923CONST uvec8 kShuffleAlpha1 = {
2924  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
2925  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
2926};
2927// Attenuate 4 pixels at a time.
2928// aligned to 16 bytes
2929void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
2930  asm volatile (
2931    "sub       %0,%1                           \n"
2932    "pcmpeqb   %%xmm3,%%xmm3                   \n"
2933    "pslld     $0x18,%%xmm3                    \n"
2934    "movdqa    %3,%%xmm4                       \n"
2935    "movdqa    %4,%%xmm5                       \n"
2936
2937    // 4 pixel loop.
2938    ".p2align  4                               \n"
2939  "1:                                          \n"
2940    "movdqa    (%0),%%xmm0                     \n"
2941    "pshufb    %%xmm4,%%xmm0                   \n"
2942    "movdqa    (%0),%%xmm1                     \n"
2943    "punpcklbw %%xmm1,%%xmm1                   \n"
2944    "pmulhuw   %%xmm1,%%xmm0                   \n"
2945    "movdqa    (%0),%%xmm1                     \n"
2946    "pshufb    %%xmm5,%%xmm1                   \n"
2947    "movdqa    (%0),%%xmm2                     \n"
2948    "punpckhbw %%xmm2,%%xmm2                   \n"
2949    "pmulhuw   %%xmm2,%%xmm1                   \n"
2950    "movdqa    (%0),%%xmm2                     \n"
2951    "pand      %%xmm3,%%xmm2                   \n"
2952    "psrlw     $0x8,%%xmm0                     \n"
2953    "psrlw     $0x8,%%xmm1                     \n"
2954    "packuswb  %%xmm1,%%xmm0                   \n"
2955    "por       %%xmm2,%%xmm0                   \n"
2956    "sub       $0x4,%2                         \n"
2957    "movdqa    %%xmm0,(%0,%1,1)                \n"
2958    "lea       0x10(%0),%0                     \n"
2959    "jg        1b                              \n"
2960  : "+r"(src_argb),    // %0
2961    "+r"(dst_argb),    // %1
2962    "+r"(width)        // %2
2963  : "m"(kShuffleAlpha0),  // %3
2964    "m"(kShuffleAlpha1)  // %4
2965  : "memory", "cc"
2966#if defined(__SSE2__)
2967    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
2968#endif
2969  );
2970}
2971#endif  // HAS_ARGBATTENUATEROW_SSSE3
2972
2973#ifdef HAS_ARGBUNATTENUATEROW_SSE2
2974// Unattenuate 4 pixels at a time.
2975// aligned to 16 bytes
2976void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
2977                             int width) {
2978  uintptr_t alpha = 0;
2979  asm volatile (
2980    "sub       %0,%1                           \n"
2981    "pcmpeqb   %%xmm4,%%xmm4                   \n"
2982    "pslld     $0x18,%%xmm4                    \n"
2983
2984    // 4 pixel loop.
2985    ".p2align  4                               \n"
2986  "1:                                          \n"
2987    "movdqa    (%0),%%xmm0                     \n"
2988    "movzb     0x3(%0),%3                      \n"
2989    "punpcklbw %%xmm0,%%xmm0                   \n"
2990    "movd      0x0(%4,%3,4),%%xmm2             \n"
2991    "movzb     0x7(%0),%3                      \n"
2992    "movd      0x0(%4,%3,4),%%xmm3             \n"
2993    "pshuflw   $0xc0,%%xmm2,%%xmm2             \n"
2994    "pshuflw   $0xc0,%%xmm3,%%xmm3             \n"
2995    "movlhps   %%xmm3,%%xmm2                   \n"
2996    "pmulhuw   %%xmm2,%%xmm0                   \n"
2997    "movdqa    (%0),%%xmm1                     \n"
2998    "movzb     0xb(%0),%3                      \n"
2999    "punpckhbw %%xmm1,%%xmm1                   \n"
3000    "movd      0x0(%4,%3,4),%%xmm2             \n"
3001    "movzb     0xf(%0),%3                      \n"
3002    "movd      0x0(%4,%3,4),%%xmm3             \n"
3003    "pshuflw   $0xc0,%%xmm2,%%xmm2             \n"
3004    "pshuflw   $0xc0,%%xmm3,%%xmm3             \n"
3005    "movlhps   %%xmm3,%%xmm2                   \n"
3006    "pmulhuw   %%xmm2,%%xmm1                   \n"
3007    "movdqa    (%0),%%xmm2                     \n"
3008    "pand      %%xmm4,%%xmm2                   \n"
3009    "packuswb  %%xmm1,%%xmm0                   \n"
3010    "por       %%xmm2,%%xmm0                   \n"
3011    "sub       $0x4,%2                         \n"
3012    "movdqa    %%xmm0,(%0,%1,1)                \n"
3013    "lea       0x10(%0),%0                     \n"
3014    "jg        1b                              \n"
3015  : "+r"(src_argb),    // %0
3016    "+r"(dst_argb),    // %1
3017    "+r"(width),       // %2
3018    "+r"(alpha)        // %3
3019  : "r"(fixed_invtbl8)  // %4
3020  : "memory", "cc"
3021#if defined(__SSE2__)
3022    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3023#endif
3024  );
3025}
3026#endif  // HAS_ARGBUNATTENUATEROW_SSE2
3027
3028#ifdef HAS_ARGBGRAYROW_SSSE3
3029// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
3030CONST vec8 kARGBToGray = {
3031  14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
3032};
3033
3034// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
3035void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
3036  asm volatile (
3037    "movdqa    %3,%%xmm4                       \n"
3038    "sub       %0,%1                           \n"
3039
3040    // 8 pixel loop.
3041    ".p2align  4                               \n"
3042  "1:                                          \n"
3043    "movdqa    (%0),%%xmm0                     \n"
3044    "movdqa    0x10(%0),%%xmm1                 \n"
3045    "pmaddubsw %%xmm4,%%xmm0                   \n"
3046    "pmaddubsw %%xmm4,%%xmm1                   \n"
3047    "phaddw    %%xmm1,%%xmm0                   \n"
3048    "psrlw     $0x7,%%xmm0                     \n"
3049    "packuswb  %%xmm0,%%xmm0                   \n"
3050    "movdqa    (%0),%%xmm2                     \n"
3051    "movdqa    0x10(%0),%%xmm3                 \n"
3052    "psrld     $0x18,%%xmm2                    \n"
3053    "psrld     $0x18,%%xmm3                    \n"
3054    "packuswb  %%xmm3,%%xmm2                   \n"
3055    "packuswb  %%xmm2,%%xmm2                   \n"
3056    "movdqa    %%xmm0,%%xmm3                   \n"
3057    "punpcklbw %%xmm0,%%xmm0                   \n"
3058    "punpcklbw %%xmm2,%%xmm3                   \n"
3059    "movdqa    %%xmm0,%%xmm1                   \n"
3060    "punpcklwd %%xmm3,%%xmm0                   \n"
3061    "punpckhwd %%xmm3,%%xmm1                   \n"
3062    "sub       $0x8,%2                         \n"
3063    "movdqa    %%xmm0,(%0,%1,1)                \n"
3064    "movdqa    %%xmm1,0x10(%0,%1,1)            \n"
3065    "lea       0x20(%0),%0                     \n"
3066    "jg        1b                              \n"
3067  : "+r"(src_argb),   // %0
3068    "+r"(dst_argb),   // %1
3069    "+r"(width)       // %2
3070  : "m"(kARGBToGray)  // %3
3071  : "memory", "cc"
3072#if defined(__SSE2__)
3073    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
3074#endif
3075  );
3076}
3077#endif  // HAS_ARGBGRAYROW_SSSE3
3078
3079#ifdef HAS_ARGBSEPIAROW_SSSE3
3080//    b = (r * 35 + g * 68 + b * 17) >> 7
3081//    g = (r * 45 + g * 88 + b * 22) >> 7
3082//    r = (r * 50 + g * 98 + b * 24) >> 7
3083// Constant for ARGB color to sepia tone
3084CONST vec8 kARGBToSepiaB = {
3085  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
3086};
3087
3088CONST vec8 kARGBToSepiaG = {
3089  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
3090};
3091
3092CONST vec8 kARGBToSepiaR = {
3093  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
3094};
3095
3096// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
3097void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
3098  asm volatile (
3099    "movdqa    %2,%%xmm2                       \n"
3100    "movdqa    %3,%%xmm3                       \n"
3101    "movdqa    %4,%%xmm4                       \n"
3102
3103    // 8 pixel loop.
3104    ".p2align  4                               \n"
3105  "1:                                          \n"
3106    "movdqa    (%0),%%xmm0                     \n"
3107    "movdqa    0x10(%0),%%xmm6                 \n"
3108    "pmaddubsw %%xmm2,%%xmm0                   \n"
3109    "pmaddubsw %%xmm2,%%xmm6                   \n"
3110    "phaddw    %%xmm6,%%xmm0                   \n"
3111    "psrlw     $0x7,%%xmm0                     \n"
3112    "packuswb  %%xmm0,%%xmm0                   \n"
3113    "movdqa    (%0),%%xmm5                     \n"
3114    "movdqa    0x10(%0),%%xmm1                 \n"
3115    "pmaddubsw %%xmm3,%%xmm5                   \n"
3116    "pmaddubsw %%xmm3,%%xmm1                   \n"
3117    "phaddw    %%xmm1,%%xmm5                   \n"
3118    "psrlw     $0x7,%%xmm5                     \n"
3119    "packuswb  %%xmm5,%%xmm5                   \n"
3120    "punpcklbw %%xmm5,%%xmm0                   \n"
3121    "movdqa    (%0),%%xmm5                     \n"
3122    "movdqa    0x10(%0),%%xmm1                 \n"
3123    "pmaddubsw %%xmm4,%%xmm5                   \n"
3124    "pmaddubsw %%xmm4,%%xmm1                   \n"
3125    "phaddw    %%xmm1,%%xmm5                   \n"
3126    "psrlw     $0x7,%%xmm5                     \n"
3127    "packuswb  %%xmm5,%%xmm5                   \n"
3128    "movdqa    (%0),%%xmm6                     \n"
3129    "movdqa    0x10(%0),%%xmm1                 \n"
3130    "psrld     $0x18,%%xmm6                    \n"
3131    "psrld     $0x18,%%xmm1                    \n"
3132    "packuswb  %%xmm1,%%xmm6                   \n"
3133    "packuswb  %%xmm6,%%xmm6                   \n"
3134    "punpcklbw %%xmm6,%%xmm5                   \n"
3135    "movdqa    %%xmm0,%%xmm1                   \n"
3136    "punpcklwd %%xmm5,%%xmm0                   \n"
3137    "punpckhwd %%xmm5,%%xmm1                   \n"
3138    "sub       $0x8,%1                         \n"
3139    "movdqa    %%xmm0,(%0)                     \n"
3140    "movdqa    %%xmm1,0x10(%0)                 \n"
3141    "lea       0x20(%0),%0                     \n"
3142    "jg        1b                              \n"
3143  : "+r"(dst_argb),      // %0
3144    "+r"(width)          // %1
3145  : "m"(kARGBToSepiaB),  // %2
3146    "m"(kARGBToSepiaG),  // %3
3147    "m"(kARGBToSepiaR)   // %4
3148  : "memory", "cc"
3149#if defined(__SSE2__)
3150    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3151#endif
3152  );
3153}
3154#endif  // HAS_ARGBSEPIAROW_SSSE3
3155
3156#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
3157// Tranform 8 ARGB pixels (32 bytes) with color matrix.
3158// Same as Sepia except matrix is provided.
3159void ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
3160                              int width) {
3161  asm volatile (
3162    "movd      (%2),%%xmm2                     \n"
3163    "movd      0x4(%2),%%xmm3                  \n"
3164    "movd      0x8(%2),%%xmm4                  \n"
3165    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
3166    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
3167    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
3168
3169    // 8 pixel loop.
3170    ".p2align  4                               \n"
3171  "1:                                          \n"
3172    "movdqa    (%0),%%xmm0                     \n"
3173    "movdqa    0x10(%0),%%xmm6                 \n"
3174    "pmaddubsw %%xmm2,%%xmm0                   \n"
3175    "pmaddubsw %%xmm2,%%xmm6                   \n"
3176    "movdqa    (%0),%%xmm5                     \n"
3177    "movdqa    0x10(%0),%%xmm1                 \n"
3178    "pmaddubsw %%xmm3,%%xmm5                   \n"
3179    "pmaddubsw %%xmm3,%%xmm1                   \n"
3180    "phaddsw   %%xmm6,%%xmm0                   \n"
3181    "phaddsw   %%xmm1,%%xmm5                   \n"
3182    "psraw     $0x7,%%xmm0                     \n"
3183    "psraw     $0x7,%%xmm5                     \n"
3184    "packuswb  %%xmm0,%%xmm0                   \n"
3185    "packuswb  %%xmm5,%%xmm5                   \n"
3186    "punpcklbw %%xmm5,%%xmm0                   \n"
3187    "movdqa    (%0),%%xmm5                     \n"
3188    "movdqa    0x10(%0),%%xmm1                 \n"
3189    "pmaddubsw %%xmm4,%%xmm5                   \n"
3190    "pmaddubsw %%xmm4,%%xmm1                   \n"
3191    "phaddsw   %%xmm1,%%xmm5                   \n"
3192    "psraw     $0x7,%%xmm5                     \n"
3193    "packuswb  %%xmm5,%%xmm5                   \n"
3194    "movdqa    (%0),%%xmm6                     \n"
3195    "movdqa    0x10(%0),%%xmm1                 \n"
3196    "psrld     $0x18,%%xmm6                    \n"
3197    "psrld     $0x18,%%xmm1                    \n"
3198    "packuswb  %%xmm1,%%xmm6                   \n"
3199    "packuswb  %%xmm6,%%xmm6                   \n"
3200    "movdqa    %%xmm0,%%xmm1                   \n"
3201    "punpcklbw %%xmm6,%%xmm5                   \n"
3202    "punpcklwd %%xmm5,%%xmm0                   \n"
3203    "punpckhwd %%xmm5,%%xmm1                   \n"
3204    "sub       $0x8,%1                         \n"
3205    "movdqa    %%xmm0,(%0)                     \n"
3206    "movdqa    %%xmm1,0x10(%0)                 \n"
3207    "lea       0x20(%0),%0                     \n"
3208    "jg        1b                              \n"
3209  : "+r"(dst_argb),      // %0
3210    "+r"(width)          // %1
3211  : "r"(matrix_argb)     // %2
3212  : "memory", "cc"
3213#if defined(__SSE2__)
3214    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
3215#endif
3216  );
3217}
3218#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
3219
3220#ifdef HAS_ARGBQUANTIZEROW_SSE2
3221// Quantize 4 ARGB pixels (16 bytes).
3222// aligned to 16 bytes
3223void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
3224                          int interval_offset, int width) {
3225  asm volatile (
3226    "movd      %2,%%xmm2                       \n"
3227    "movd      %3,%%xmm3                       \n"
3228    "movd      %4,%%xmm4                       \n"
3229    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
3230    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
3231    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
3232    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
3233    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
3234    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
3235    "pxor      %%xmm5,%%xmm5                   \n"
3236    "pcmpeqb   %%xmm6,%%xmm6                   \n"
3237    "pslld     $0x18,%%xmm6                    \n"
3238
3239    // 4 pixel loop.
3240    ".p2align  2                               \n"
3241  "1:                                          \n"
3242    "movdqa    (%0),%%xmm0                     \n"
3243    "punpcklbw %%xmm5,%%xmm0                   \n"
3244    "pmulhuw   %%xmm2,%%xmm0                   \n"
3245    "movdqa    (%0),%%xmm1                     \n"
3246    "punpckhbw %%xmm5,%%xmm1                   \n"
3247    "pmulhuw   %%xmm2,%%xmm1                   \n"
3248    "pmullw    %%xmm3,%%xmm0                   \n"
3249    "movdqa    (%0),%%xmm7                     \n"
3250    "pmullw    %%xmm3,%%xmm1                   \n"
3251    "pand      %%xmm6,%%xmm7                   \n"
3252    "paddw     %%xmm4,%%xmm0                   \n"
3253    "paddw     %%xmm4,%%xmm1                   \n"
3254    "packuswb  %%xmm1,%%xmm0                   \n"
3255    "por       %%xmm7,%%xmm0                   \n"
3256    "sub       $0x4,%1                         \n"
3257    "movdqa    %%xmm0,(%0)                     \n"
3258    "lea       0x10(%0),%0                     \n"
3259    "jg        1b                              \n"
3260  : "+r"(dst_argb),       // %0
3261    "+r"(width)           // %1
3262  : "r"(scale),           // %2
3263    "r"(interval_size),   // %3
3264    "r"(interval_offset)  // %4
3265  : "memory", "cc"
3266#if defined(__SSE2__)
3267    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3268#endif
3269  );
3270}
3271#endif  // HAS_ARGBQUANTIZEROW_SSE2
3272
3273#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
3274// Creates a table of cumulative sums where each value is a sum of all values
3275// above and to the left of the value, inclusive of the value.
3276void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
3277                                  const int32* previous_cumsum, int width) {
3278  asm volatile (
3279    "sub       %1,%2                           \n"
3280    "pxor      %%xmm0,%%xmm0                   \n"
3281    "pxor      %%xmm1,%%xmm1                   \n"
3282    "sub       $0x4,%3                         \n"
3283    "jl        49f                             \n"
3284    "test      $0xf,%1                         \n"
3285    "jne       49f                             \n"
3286
3287  // 4 pixel loop                              \n"
3288    ".p2align  2                               \n"
3289  "40:                                         \n"
3290    "movdqu    (%0),%%xmm2                     \n"
3291    "lea       0x10(%0),%0                     \n"
3292    "movdqa    %%xmm2,%%xmm4                   \n"
3293    "punpcklbw %%xmm1,%%xmm2                   \n"
3294    "movdqa    %%xmm2,%%xmm3                   \n"
3295    "punpcklwd %%xmm1,%%xmm2                   \n"
3296    "punpckhwd %%xmm1,%%xmm3                   \n"
3297    "punpckhbw %%xmm1,%%xmm4                   \n"
3298    "movdqa    %%xmm4,%%xmm5                   \n"
3299    "punpcklwd %%xmm1,%%xmm4                   \n"
3300    "punpckhwd %%xmm1,%%xmm5                   \n"
3301    "paddd     %%xmm2,%%xmm0                   \n"
3302    "movdqa    (%1,%2,1),%%xmm2                \n"
3303    "paddd     %%xmm0,%%xmm2                   \n"
3304    "paddd     %%xmm3,%%xmm0                   \n"
3305    "movdqa    0x10(%1,%2,1),%%xmm3            \n"
3306    "paddd     %%xmm0,%%xmm3                   \n"
3307    "paddd     %%xmm4,%%xmm0                   \n"
3308    "movdqa    0x20(%1,%2,1),%%xmm4            \n"
3309    "paddd     %%xmm0,%%xmm4                   \n"
3310    "paddd     %%xmm5,%%xmm0                   \n"
3311    "movdqa    0x30(%1,%2,1),%%xmm5            \n"
3312    "paddd     %%xmm0,%%xmm5                   \n"
3313    "movdqa    %%xmm2,(%1)                     \n"
3314    "movdqa    %%xmm3,0x10(%1)                 \n"
3315    "movdqa    %%xmm4,0x20(%1)                 \n"
3316    "movdqa    %%xmm5,0x30(%1)                 \n"
3317    "lea       0x40(%1),%1                     \n"
3318    "sub       $0x4,%3                         \n"
3319    "jge       40b                             \n"
3320
3321  "49:                                         \n"
3322    "add       $0x3,%3                         \n"
3323    "jl        19f                             \n"
3324
3325  // 1 pixel loop                              \n"
3326    ".p2align  2                               \n"
3327  "10:                                         \n"
3328    "movd      (%0),%%xmm2                     \n"
3329    "lea       0x4(%0),%0                      \n"
3330    "punpcklbw %%xmm1,%%xmm2                   \n"
3331    "punpcklwd %%xmm1,%%xmm2                   \n"
3332    "paddd     %%xmm2,%%xmm0                   \n"
3333    "movdqu    (%1,%2,1),%%xmm2                \n"
3334    "paddd     %%xmm0,%%xmm2                   \n"
3335    "movdqu    %%xmm2,(%1)                     \n"
3336    "lea       0x10(%1),%1                     \n"
3337    "sub       $0x1,%3                         \n"
3338    "jge       10b                             \n"
3339
3340  "19:                                         \n"
3341  : "+r"(row),  // %0
3342    "+r"(cumsum),  // %1
3343    "+r"(previous_cumsum),  // %2
3344    "+r"(width)  // %3
3345  :
3346  : "memory", "cc"
3347#if defined(__SSE2__)
3348    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
3349#endif
3350  );
3351}
3352#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
3353
3354#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
3355void CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
3356                                 int width, int area, uint8* dst, int count) {
3357  asm volatile (
3358    "movd      %5,%%xmm4                       \n"
3359    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
3360    "rcpss     %%xmm4,%%xmm4                   \n"
3361    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
3362    "sub       $0x4,%3                         \n"
3363    "jl        49f                             \n"
3364
3365  // 4 pixel loop                              \n"
3366    ".p2align  2                               \n"
3367  "40:                                         \n"
3368    "movdqa    (%0),%%xmm0                     \n"
3369    "movdqa    0x10(%0),%%xmm1                 \n"
3370    "movdqa    0x20(%0),%%xmm2                 \n"
3371    "movdqa    0x30(%0),%%xmm3                 \n"
3372    "psubd     (%0,%4,4),%%xmm0                \n"
3373    "psubd     0x10(%0,%4,4),%%xmm1            \n"
3374    "psubd     0x20(%0,%4,4),%%xmm2            \n"
3375    "psubd     0x30(%0,%4,4),%%xmm3            \n"
3376    "lea       0x40(%0),%0                     \n"
3377    "psubd     (%1),%%xmm0                     \n"
3378    "psubd     0x10(%1),%%xmm1                 \n"
3379    "psubd     0x20(%1),%%xmm2                 \n"
3380    "psubd     0x30(%1),%%xmm3                 \n"
3381    "paddd     (%1,%4,4),%%xmm0                \n"
3382    "paddd     0x10(%1,%4,4),%%xmm1            \n"
3383    "paddd     0x20(%1,%4,4),%%xmm2            \n"
3384    "paddd     0x30(%1,%4,4),%%xmm3            \n"
3385    "lea       0x40(%1),%1                     \n"
3386    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
3387    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
3388    "mulps     %%xmm4,%%xmm0                   \n"
3389    "mulps     %%xmm4,%%xmm1                   \n"
3390    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
3391    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
3392    "mulps     %%xmm4,%%xmm2                   \n"
3393    "mulps     %%xmm4,%%xmm3                   \n"
3394    "cvtps2dq  %%xmm0,%%xmm0                   \n"
3395    "cvtps2dq  %%xmm1,%%xmm1                   \n"
3396    "cvtps2dq  %%xmm2,%%xmm2                   \n"
3397    "cvtps2dq  %%xmm3,%%xmm3                   \n"
3398    "packssdw  %%xmm1,%%xmm0                   \n"
3399    "packssdw  %%xmm3,%%xmm2                   \n"
3400    "packuswb  %%xmm2,%%xmm0                   \n"
3401    "movdqu    %%xmm0,(%2)                     \n"
3402    "lea       0x10(%2),%2                     \n"
3403    "sub       $0x4,%3                         \n"
3404    "jge       40b                             \n"
3405
3406  "49:                                         \n"
3407    "add       $0x3,%3                         \n"
3408    "jl        19f                             \n"
3409
3410  // 1 pixel loop                              \n"
3411    ".p2align  2                               \n"
3412  "10:                                         \n"
3413    "movdqa    (%0),%%xmm0                     \n"
3414    "psubd     (%0,%4,4),%%xmm0                \n"
3415    "lea       0x10(%0),%0                     \n"
3416    "psubd     (%1),%%xmm0                     \n"
3417    "paddd     (%1,%4,4),%%xmm0                \n"
3418    "lea       0x10(%1),%1                     \n"
3419    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
3420    "mulps     %%xmm4,%%xmm0                   \n"
3421    "cvtps2dq  %%xmm0,%%xmm0                   \n"
3422    "packssdw  %%xmm0,%%xmm0                   \n"
3423    "packuswb  %%xmm0,%%xmm0                   \n"
3424    "movd      %%xmm0,(%2)                     \n"
3425    "lea       0x4(%2),%2                      \n"
3426    "sub       $0x1,%3                         \n"
3427    "jge       10b                             \n"
3428  "19:                                         \n"
3429  : "+r"(topleft),  // %0
3430    "+r"(botleft),  // %1
3431    "+r"(dst),      // %2
3432    "+rm"(count)    // %3
3433  : "r"(static_cast<intptr_t>(width)),  // %4
3434    "rm"(area)     // %5
3435  : "memory", "cc"
3436#if defined(__SSE2__)
3437    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
3438#endif
3439  );
3440}
3441#endif  // HAS_CUMULATIVESUMTOAVERAGE_SSE2
3442#ifdef HAS_ARGBSHADE_SSE2
3443// Shade 4 pixels at a time by specified value.
3444// Aligned to 16 bytes.
3445void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
3446                       uint32 value) {
3447  asm volatile (
3448    "movd      %3,%%xmm2                       \n"
3449    "sub       %0,%1                           \n"
3450    "punpcklbw %%xmm2,%%xmm2                   \n"
3451    "punpcklqdq %%xmm2,%%xmm2                  \n"
3452
3453    // 4 pixel loop.
3454    ".p2align  2                               \n"
3455  "1:                                          \n"
3456    "movdqa    (%0),%%xmm0                     \n"
3457    "movdqa    %%xmm0,%%xmm1                   \n"
3458    "punpcklbw %%xmm0,%%xmm0                   \n"
3459    "punpckhbw %%xmm1,%%xmm1                   \n"
3460    "pmulhuw   %%xmm2,%%xmm0                   \n"
3461    "pmulhuw   %%xmm2,%%xmm1                   \n"
3462    "psrlw     $0x8,%%xmm0                     \n"
3463    "psrlw     $0x8,%%xmm1                     \n"
3464    "packuswb  %%xmm1,%%xmm0                   \n"
3465    "sub       $0x4,%2                         \n"
3466    "movdqa    %%xmm0,(%0,%1,1)                \n"
3467    "lea       0x10(%0),%0                     \n"
3468    "jg        1b                              \n"
3469  : "+r"(src_argb),       // %0
3470    "+r"(dst_argb),       // %1
3471    "+r"(width)           // %2
3472  : "r"(value)            // %3
3473  : "memory", "cc"
3474#if defined(__SSE2__)
3475    , "xmm0", "xmm1", "xmm2"
3476#endif
3477  );
3478}
3479#endif  // HAS_ARGBSHADE_SSE2
3480
3481#ifdef HAS_ARGBAFFINEROW_SSE2
3482// TODO(fbarchard): Find 64 bit way to avoid masking.
3483// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
3484// Copy ARGB pixels from source image with slope to a row of destination.
3485// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
3486// an error if movq is used. movd  %%xmm0,%1
3487
3488LIBYUV_API
3489void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
3490                        uint8* dst_argb, const float* uv_dudv, int width) {
3491  intptr_t src_argb_stride_temp = src_argb_stride;
3492  intptr_t temp = 0;
3493  asm volatile (
3494    "movq      (%3),%%xmm2                     \n"
3495    "movq      0x8(%3),%%xmm7                  \n"
3496    "shl       $0x10,%1                        \n"
3497    "add       $0x4,%1                         \n"
3498    "movd      %1,%%xmm5                       \n"
3499    "sub       $0x4,%4                         \n"
3500    "jl        49f                             \n"
3501
3502    "pshufd    $0x44,%%xmm7,%%xmm7             \n"
3503    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
3504    "movdqa    %%xmm2,%%xmm0                   \n"
3505    "addps     %%xmm7,%%xmm0                   \n"
3506    "movlhps   %%xmm0,%%xmm2                   \n"
3507    "movdqa    %%xmm7,%%xmm4                   \n"
3508    "addps     %%xmm4,%%xmm4                   \n"
3509    "movdqa    %%xmm2,%%xmm3                   \n"
3510    "addps     %%xmm4,%%xmm3                   \n"
3511    "addps     %%xmm4,%%xmm4                   \n"
3512
3513  // 4 pixel loop                              \n"
3514    ".p2align  4                               \n"
3515  "40:                                         \n"
3516    "cvttps2dq %%xmm2,%%xmm0                   \n"
3517    "cvttps2dq %%xmm3,%%xmm1                   \n"
3518    "packssdw  %%xmm1,%%xmm0                   \n"
3519    "pmaddwd   %%xmm5,%%xmm0                   \n"
3520#if defined(__x86_64__)
3521    "movd      %%xmm0,%1                       \n"
3522    "mov       %1,%5                           \n"
3523    "and       $0x0fffffff,%1                  \n"
3524    "shr       $32,%5                          \n"
3525    "pshufd    $0xEE,%%xmm0,%%xmm0             \n"
3526#else
3527    "movd      %%xmm0,%1                       \n"
3528    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
3529    "movd      %%xmm0,%5                       \n"
3530    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
3531#endif
3532    "movd      (%0,%1,1),%%xmm1                \n"
3533    "movd      (%0,%5,1),%%xmm6                \n"
3534    "punpckldq %%xmm6,%%xmm1                   \n"
3535    "addps     %%xmm4,%%xmm2                   \n"
3536    "movq      %%xmm1,(%2)                     \n"
3537#if defined(__x86_64__)
3538    "movd      %%xmm0,%1                       \n"
3539    "mov       %1,%5                           \n"
3540    "and       $0x0fffffff,%1                  \n"
3541    "shr       $32,%5                          \n"
3542#else
3543    "movd      %%xmm0,%1                       \n"
3544    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
3545    "movd      %%xmm0,%5                       \n"
3546#endif
3547    "movd      (%0,%1,1),%%xmm0                \n"
3548    "movd      (%0,%5,1),%%xmm6                \n"
3549    "punpckldq %%xmm6,%%xmm0                   \n"
3550    "addps     %%xmm4,%%xmm3                   \n"
3551    "sub       $0x4,%4                         \n"
3552    "movq      %%xmm0,0x08(%2)                 \n"
3553    "lea       0x10(%2),%2                     \n"
3554    "jge       40b                             \n"
3555
3556  "49:                                         \n"
3557    "add       $0x3,%4                         \n"
3558    "jl        19f                             \n"
3559
3560  // 1 pixel loop                              \n"
3561    ".p2align  4                               \n"
3562  "10:                                         \n"
3563    "cvttps2dq %%xmm2,%%xmm0                   \n"
3564    "packssdw  %%xmm0,%%xmm0                   \n"
3565    "pmaddwd   %%xmm5,%%xmm0                   \n"
3566    "addps     %%xmm7,%%xmm2                   \n"
3567    "movd      %%xmm0,%1                       \n"
3568#if defined(__x86_64__)
3569    "and       $0x0fffffff,%1                  \n"
3570#endif
3571    "movd      (%0,%1,1),%%xmm0                \n"
3572    "sub       $0x1,%4                         \n"
3573    "movd      %%xmm0,(%2)                     \n"
3574    "lea       0x4(%2),%2                      \n"
3575    "jge       10b                             \n"
3576  "19:                                         \n"
3577  : "+r"(src_argb),  // %0
3578    "+r"(src_argb_stride_temp),  // %1
3579    "+r"(dst_argb),  // %2
3580    "+r"(uv_dudv),   // %3
3581    "+rm"(width),    // %4
3582    "+r"(temp)   // %5
3583  :
3584  : "memory", "cc"
3585#if defined(__SSE2__)
3586    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3587#endif
3588  );
3589}
3590#endif  // HAS_ARGBAFFINEROW_SSE2
3591
3592// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
3593void ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
3594                              ptrdiff_t src_stride, int dst_width,
3595                              int source_y_fraction) {
3596  asm volatile (
3597    "sub       %1,%0                           \n"
3598    "shr       %3                              \n"
3599    "cmp       $0x0,%3                         \n"
3600    "je        2f                              \n"
3601    "cmp       $0x40,%3                        \n"
3602    "je        3f                              \n"
3603    "movd      %3,%%xmm0                       \n"
3604    "neg       %3                              \n"
3605    "add       $0x80,%3                        \n"
3606    "movd      %3,%%xmm5                       \n"
3607    "punpcklbw %%xmm0,%%xmm5                   \n"
3608    "punpcklwd %%xmm5,%%xmm5                   \n"
3609    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
3610    ".p2align  4                               \n"
3611  "1:                                          \n"
3612    "movdqa    (%1),%%xmm0                     \n"
3613    "movdqa    (%1,%4,1),%%xmm2                \n"
3614    "movdqa    %%xmm0,%%xmm1                   \n"
3615    "punpcklbw %%xmm2,%%xmm0                   \n"
3616    "punpckhbw %%xmm2,%%xmm1                   \n"
3617    "pmaddubsw %%xmm5,%%xmm0                   \n"
3618    "pmaddubsw %%xmm5,%%xmm1                   \n"
3619    "psrlw     $0x7,%%xmm0                     \n"
3620    "psrlw     $0x7,%%xmm1                     \n"
3621    "packuswb  %%xmm1,%%xmm0                   \n"
3622    "sub       $0x4,%2                         \n"
3623    "movdqa    %%xmm0,(%1,%0,1)                \n"
3624    "lea       0x10(%1),%1                     \n"
3625    "jg        1b                              \n"
3626    "jmp       4f                              \n"
3627    ".p2align  4                               \n"
3628  "2:                                          \n"
3629    "movdqa    (%1),%%xmm0                     \n"
3630    "sub       $0x4,%2                         \n"
3631    "movdqa    %%xmm0,(%1,%0,1)                \n"
3632    "lea       0x10(%1),%1                     \n"
3633    "jg        2b                              \n"
3634    "jmp       4f                              \n"
3635    ".p2align  4                               \n"
3636  "3:                                          \n"
3637    "movdqa    (%1),%%xmm0                     \n"
3638    "pavgb     (%1,%4,1),%%xmm0                \n"
3639    "sub       $0x4,%2                         \n"
3640    "movdqa    %%xmm0,(%1,%0,1)                \n"
3641    "lea       0x10(%1),%1                     \n"
3642    "jg        3b                              \n"
3643  "4:                                          \n"
3644    ".p2align  4                               \n"
3645  : "+r"(dst_ptr),     // %0
3646    "+r"(src_ptr),     // %1
3647    "+r"(dst_width),   // %2
3648    "+r"(source_y_fraction)  // %3
3649  : "r"(static_cast<intptr_t>(src_stride))  // %4
3650  : "memory", "cc"
3651#if defined(__SSE2__)
3652    , "xmm0", "xmm1", "xmm2", "xmm5"
3653#endif
3654  );
3655}
3656
3657#endif  // defined(__x86_64__) || defined(__i386__)
3658
3659#ifdef __cplusplus
3660}  // extern "C"
3661}  // namespace libyuv
3662#endif
3663