1/*
2 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
3 *
4 *  Use of this source code is governed by a BSD-style license
5 *  that can be found in the LICENSE file in the root of the source
6 *  tree. An additional intellectual property rights grant can be found
7 *  in the file PATENTS. All contributing project authors may
8 *  be found in the AUTHORS file in the root of the source tree.
9 */
10
11#include "libyuv/row.h"
12
13#ifdef __cplusplus
14namespace libyuv {
15extern "C" {
16#endif
17
18// This module is for GCC x86 and x64.
19#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
20
21#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
22
// Constants for ARGB
// Luma coefficients in B,G,R,A byte order, consumed with pmaddubsw by the
// ARGBToYRow functions below (signed 8-bit, so values stay <= 127).
static vec8 kARGBToY = {
  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
};

// JPeg full range.
// Full-range (JPEG) luma coefficients; paired with kAddYJ64 rounding below.
static vec8 kARGBToYJ = {
  15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0, 15, 75, 38, 0
};
32#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_ARGBGRAYROW_SSSE3)
33
34#if defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
35
// Chroma (U) coefficients for ARGB, in B,G,R,A byte order, used with
// pmaddubsw.  The kBGRATo*/kABGRTo*/kRGBATo* tables below are the same
// coefficients permuted for each pixel format's byte order.
static vec8 kARGBToU = {
  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
};

// Full-range (JPEG) U coefficients; paired with kAddUVJ128.
static vec8 kARGBToUJ = {
  127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0, 127, -84, -43, 0
};

// Chroma (V) coefficients for ARGB.
static vec8 kARGBToV = {
  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
};

// Full-range (JPEG) V coefficients.
static vec8 kARGBToVJ = {
  -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0, -20, -107, 127, 0
};

// Constants for BGRA
static vec8 kBGRAToY = {
  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
};

static vec8 kBGRAToU = {
  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
};

static vec8 kBGRAToV = {
  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
};

// Constants for ABGR
static vec8 kABGRToY = {
  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
};

static vec8 kABGRToU = {
  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
};

static vec8 kABGRToV = {
  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
};

// Constants for RGBA.
static vec8 kRGBAToY = {
  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
};

static vec8 kRGBAToU = {
  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
};

static vec8 kRGBAToV = {
  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
};

// Bias added to Y with paddb after the coefficient multiply (limited range).
static uvec8 kAddY16 = {
  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
};

// Rounding constant for full-range (JPEG) Y.
static vec16 kAddYJ64 = {
  64, 64, 64, 64, 64, 64, 64, 64
};

// 128 bias added to U/V to move signed chroma into the 0..255 byte range.
static uvec8 kAddUV128 = {
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
};

// Word-sized 128 bias (0x80 in each byte) for full-range (JPEG) U/V.
static uvec16 kAddUVJ128 = {
  0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u, 0x8080u
};
107#endif  // defined(HAS_ARGBTOYROW_SSSE3) || defined(HAS_I422TOARGBROW_SSSE3)
108
109#ifdef HAS_RGB24TOARGBROW_SSSE3
110
// Shuffle table for converting RGB24 to ARGB.
// Indices >= 128 in these pshufb masks produce a zero byte in the result.
static uvec8 kShuffleMaskRGB24ToARGB = {
  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
};

// Shuffle table for converting RAW to ARGB.
static uvec8 kShuffleMaskRAWToARGB = {
  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
};

// Shuffle table for converting ARGB to RGB24.
static uvec8 kShuffleMaskARGBToRGB24 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGB to RAW.
static uvec8 kShuffleMaskARGBToRAW = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
};

// Shuffle table for converting ARGBToRGB24 for I422ToRGB24.  First 8 + next 4
static uvec8 kShuffleMaskARGBToRGB24_0 = {
  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 128u, 128u, 128u, 128u, 10u, 12u, 13u, 14u
};

// Shuffle table for converting ARGBToRAW for I422ToRAW.  First 8 + next 4
static uvec8 kShuffleMaskARGBToRAW_0 = {
  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 128u, 128u, 128u, 128u, 8u, 14u, 13u, 12u
};
140#endif  // HAS_RGB24TOARGBROW_SSSE3
141
142#if defined(TESTING) && defined(__x86_64__)
// TESTING-only routine: exercises a range of x64 instruction encodings
// (mov/lea/add on every 32-bit register form, at various alignments) ahead of
// a simple copy loop.  Not part of the public conversion API.
// NOTE(review): the "add 0x10,%eax" forms lack a '$', so they assemble as
// memory-operand adds rather than immediates — presumably intentional for
// encoding tests; confirm before reusing this pattern elsewhere.
void TestRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    // Register-to-itself moves on all legacy 32-bit registers.
    ".p2align  5                               \n"
    "mov       %%eax,%%eax                     \n"
    "mov       %%ebx,%%ebx                     \n"
    "mov       %%ecx,%%ecx                     \n"
    "mov       %%edx,%%edx                     \n"
    "mov       %%esi,%%esi                     \n"
    "mov       %%edi,%%edi                     \n"
    "mov       %%ebp,%%ebp                     \n"
    "mov       %%esp,%%esp                     \n"
    // Same for the x64-only r8d..r15d registers (REX-prefixed encodings).
    ".p2align  5                               \n"
    "mov       %%r8d,%%r8d                     \n"
    "mov       %%r9d,%%r9d                     \n"
    "mov       %%r10d,%%r10d                   \n"
    "mov       %%r11d,%%r11d                   \n"
    "mov       %%r12d,%%r12d                   \n"
    "mov       %%r13d,%%r13d                   \n"
    "mov       %%r14d,%%r14d                   \n"
    "mov       %%r15d,%%r15d                   \n"
    // lea with zero displacement, 64-bit base to 32-bit destination.
    ".p2align  5                               \n"
    "lea       (%%rax),%%eax                   \n"
    "lea       (%%rbx),%%ebx                   \n"
    "lea       (%%rcx),%%ecx                   \n"
    "lea       (%%rdx),%%edx                   \n"
    "lea       (%%rsi),%%esi                   \n"
    "lea       (%%rdi),%%edi                   \n"
    "lea       (%%rbp),%%ebp                   \n"
    "lea       (%%rsp),%%esp                   \n"
    ".p2align  5                               \n"
    "lea       (%%r8),%%r8d                    \n"
    "lea       (%%r9),%%r9d                    \n"
    "lea       (%%r10),%%r10d                  \n"
    "lea       (%%r11),%%r11d                  \n"
    "lea       (%%r12),%%r12d                  \n"
    "lea       (%%r13),%%r13d                  \n"
    "lea       (%%r14),%%r14d                  \n"
    "lea       (%%r15),%%r15d                  \n"

    // lea with an 8-bit displacement.
    ".p2align  5                               \n"
    "lea       0x10(%%rax),%%eax               \n"
    "lea       0x10(%%rbx),%%ebx               \n"
    "lea       0x10(%%rcx),%%ecx               \n"
    "lea       0x10(%%rdx),%%edx               \n"
    "lea       0x10(%%rsi),%%esi               \n"
    "lea       0x10(%%rdi),%%edi               \n"
    "lea       0x10(%%rbp),%%ebp               \n"
    "lea       0x10(%%rsp),%%esp               \n"
    ".p2align  5                               \n"
    "lea       0x10(%%r8),%%r8d                \n"
    "lea       0x10(%%r9),%%r9d                \n"
    "lea       0x10(%%r10),%%r10d              \n"
    "lea       0x10(%%r11),%%r11d              \n"
    "lea       0x10(%%r12),%%r12d              \n"
    "lea       0x10(%%r13),%%r13d              \n"
    "lea       0x10(%%r14),%%r14d              \n"
    "lea       0x10(%%r15),%%r15d              \n"

    // add encodings (see NOTE above regarding the missing '$').
    ".p2align  5                               \n"
    "add       0x10,%%eax                      \n"
    "add       0x10,%%ebx                      \n"
    "add       0x10,%%ecx                      \n"
    "add       0x10,%%edx                      \n"
    "add       0x10,%%esi                      \n"
    "add       0x10,%%edi                      \n"
    "add       0x10,%%ebp                      \n"
    "add       0x10,%%esp                      \n"
    ".p2align  5                               \n"
    "add       0x10,%%r8d                      \n"
    "add       0x10,%%r9d                      \n"
    "add       0x10,%%r10d                     \n"
    "add       0x10,%%r11d                     \n"
    "add       0x10,%%r12d                     \n"
    "add       0x10,%%r13d                     \n"
    "add       0x10,%%r14d                     \n"
    "add       0x10,%%r15d                     \n"

    // Simple copy loop: reads 8 bytes from src, stores 16 to dst, 8 per pass.
    ".p2align  2                               \n"
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    // xmm1/xmm5 are listed conservatively; only xmm0 is used above.
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
238#endif  // TESTING
239
240#ifdef HAS_I400TOARGBROW_SSE2
// Expands a row of 8-bit luma (I400/grey) to ARGB: each Y byte is replicated
// into B, G and R, and alpha is forced to 0xff.  Processes 8 pixels per loop
// iteration (loop assumes pix is a multiple of 8); dst_argb must be 16-byte
// aligned for the movdqa stores — see the _Unaligned variant below otherwise.
void I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = all ones
    "pslld     $0x18,%%xmm5                    \n"  // xmm5 = 0xff000000 alpha mask
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // load 8 Y bytes
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // duplicate: Y Y pairs
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm0                   \n"  // low 4 pixels:  Y Y Y Y per dword
    "punpckhwd %%xmm1,%%xmm1                   \n"  // high 4 pixels
    "por       %%xmm5,%%xmm0                   \n"  // overwrite top byte with 0xff alpha
    "por       %%xmm5,%%xmm1                   \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"  // store 8 ARGB pixels (32 bytes)
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
270
// Same as I400ToARGBRow_SSE2 but uses movdqu stores, so dst_argb may be
// unaligned.  Processes 8 pixels per loop iteration.
void I400ToARGBRow_Unaligned_SSE2(const uint8* src_y, uint8* dst_argb,
                                  int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xff000000 alpha mask
    "pslld     $0x18,%%xmm5                    \n"
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // load 8 Y bytes
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // replicate Y into all channels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm0                   \n"
    "punpckhwd %%xmm1,%%xmm1                   \n"
    "por       %%xmm5,%%xmm0                   \n"  // alpha = 0xff
    "por       %%xmm5,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // unaligned stores
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src_y),     // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
301#endif  // HAS_I400TOARGBROW_SSE2
302
303#ifdef HAS_RGB24TOARGBROW_SSSE3
// Converts a row of packed 24-bit RGB24 to 32-bit ARGB with alpha = 0xff.
// Reads 48 bytes (16 pixels) per loop iteration with unaligned loads,
// writes 64 bytes with aligned stores (dst_argb must be 16-byte aligned).
// palignr re-splits the three 16-byte source vectors into four 12-byte
// pixel groups, then pshufb (kShuffleMaskRGB24ToARGB) expands each group
// to 4-byte pixels.
void RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = shuffle mask
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // pixels 0-4 (+2 bytes)
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x30,0) ",%0           \n"
    "movdqa    %%xmm3,%%xmm2                   \n"
    "palignr   $0x8,%%xmm1,%%xmm2              \n"  // xmm2 = pixels 8-11
    "pshufb    %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm2                   \n"  // set alpha
    "palignr   $0xc,%%xmm0,%%xmm1              \n"  // xmm1 = pixels 4-7
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqa    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "palignr   $0x4,%%xmm3,%%xmm3              \n"  // xmm3 = pixels 12-15
    "pshufb    %%xmm4,%%xmm3                   \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "sub       $0x10,%2                        \n"  // 16 pixels per iteration
    "movdqa    %%xmm3," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_rgb24),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRGB24ToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
344
// Converts a row of packed 24-bit RAW (RGB byte order reversed vs RGB24) to
// ARGB with alpha = 0xff.  Identical structure to RGB24ToARGBRow_SSSE3;
// only the shuffle mask (kShuffleMaskRAWToARGB) differs, swapping R and B.
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
    "pslld     $0x18,%%xmm5                    \n"
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = shuffle mask
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 48 bytes = 16 pixels in
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x30,0) ",%0           \n"
    "movdqa    %%xmm3,%%xmm2                   \n"
    "palignr   $0x8,%%xmm1,%%xmm2              \n"  // xmm2 = pixels 8-11
    "pshufb    %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm2                   \n"  // set alpha
    "palignr   $0xc,%%xmm0,%%xmm1              \n"  // xmm1 = pixels 4-7
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqa    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm4,%%xmm1                   \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "palignr   $0x4,%%xmm3,%%xmm3              \n"  // xmm3 = pixels 12-15
    "pshufb    %%xmm4,%%xmm3                   \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "sub       $0x10,%2                        \n"  // 16 pixels per iteration
    "movdqa    %%xmm3," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_raw),   // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "m"(kShuffleMaskRAWToARGB)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
385
// Converts a row of 16-bit RGB565 pixels to 32-bit ARGB (alpha = 0xff).
// Processes 8 pixels per loop iteration.  5/6-bit channels are widened to
// 8 bits via pmulhuw scaling constants so the top bits are replicated into
// the low bits (full 0..255 range).
void RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov       $0x1080108,%%eax                \n"
    "movd      %%eax,%%xmm5                    \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"  // xmm5 = 0x0108 per word: 5->8 bit scale
    "mov       $0x20802080,%%eax               \n"
    "movd      %%eax,%%xmm6                    \n"
    "pshufd    $0x0,%%xmm6,%%xmm6              \n"  // xmm6 = 0x2080 per word: 6->8 bit scale
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "psllw     $0xb,%%xmm3                     \n"  // xmm3 = 0xf800 red mask
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psllw     $0xa,%%xmm4                     \n"
    "psrlw     $0x5,%%xmm4                     \n"  // xmm4 = 0x07e0 green mask
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psllw     $0x8,%%xmm7                     \n"  // xmm7 = 0xff00 alpha bytes
    "sub       %0,%1                           \n"  // %1 = dst - 2*src so that the
    "sub       %0,%1                           \n"  // stores can address (%1,%0,2)
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 8 RGB565 pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm3,%%xmm1                   \n"  // isolate red
    "psllw     $0xb,%%xmm2                     \n"  // blue to high bits
    "pmulhuw   %%xmm5,%%xmm1                   \n"  // expand red 5->8 bits
    "pmulhuw   %%xmm5,%%xmm2                   \n"  // expand blue 5->8 bits
    "psllw     $0x8,%%xmm1                     \n"
    "por       %%xmm2,%%xmm1                   \n"  // xmm1 = G? no: R|B words (B low, R high)
    "pand      %%xmm4,%%xmm0                   \n"  // isolate green
    "pmulhuw   %%xmm6,%%xmm0                   \n"  // expand green 6->8 bits
    "por       %%xmm7,%%xmm0                   \n"  // alpha = 0xff in high byte
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"  // interleave to B,G,R,A
    "punpckhbw %%xmm0,%%xmm2                   \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm1,0x00,1,0,2)           //  movdqa  %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqa,xmm2,0x10,1,0,2)           //  movdqa  %%xmm2,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
439
// Converts a row of 16-bit ARGB1555 pixels to 32-bit ARGB.  Processes 8
// pixels per loop iteration.  5-bit color channels are widened to 8 bits
// with pmulhuw scaling; the 1-bit alpha is sign-replicated to 0x00 or 0xff.
void ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov       $0x1080108,%%eax                \n"
    "movd      %%eax,%%xmm5                    \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"  // xmm5 = 0x0108 per word: 5->8 bit scale
    "mov       $0x42004200,%%eax               \n"
    "movd      %%eax,%%xmm6                    \n"
    "pshufd    $0x0,%%xmm6,%%xmm6              \n"  // xmm6 = 0x4200 per word: green scale
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "psllw     $0xb,%%xmm3                     \n"  // xmm3 = 0xf800 mask
    "movdqa    %%xmm3,%%xmm4                   \n"
    "psrlw     $0x6,%%xmm4                     \n"  // xmm4 = 0x03e0 green mask
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psllw     $0x8,%%xmm7                     \n"  // xmm7 = 0xff00 alpha byte mask
    "sub       %0,%1                           \n"  // %1 = dst - 2*src so that the
    "sub       %0,%1                           \n"  // stores can address (%1,%0,2)
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 8 ARGB1555 pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "psllw     $0x1,%%xmm1                     \n"  // red into the 0xf800 field
    "psllw     $0xb,%%xmm2                     \n"  // blue to high bits
    "pand      %%xmm3,%%xmm1                   \n"
    "pmulhuw   %%xmm5,%%xmm2                   \n"  // expand blue 5->8 bits
    "pmulhuw   %%xmm5,%%xmm1                   \n"  // expand red 5->8 bits
    "psllw     $0x8,%%xmm1                     \n"
    "por       %%xmm2,%%xmm1                   \n"  // words: blue low, red high
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm4,%%xmm0                   \n"  // isolate green
    "psraw     $0x8,%%xmm2                     \n"  // replicate alpha bit via sign shift
    "pmulhuw   %%xmm6,%%xmm0                   \n"  // expand green 5->8 bits
    "pand      %%xmm7,%%xmm2                   \n"  // alpha = 0x00 or 0xff in high byte
    "por       %%xmm2,%%xmm0                   \n"  // words: green low, alpha high
    "movdqa    %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"  // interleave to B,G,R,A
    "punpckhbw %%xmm0,%%xmm2                   \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm1,0x00,1,0,2)           //  movdqa  %%xmm1,(%1,%0,2)
    MEMOPMEM(movdqa,xmm2,0x10,1,0,2)           //  movdqa  %%xmm2,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
496
// Converts a row of 16-bit ARGB4444 pixels to 32-bit ARGB.  Processes 8
// pixels per loop iteration.  Each 4-bit channel is widened to 8 bits by
// replicating the nibble into both halves of the byte (0xN -> 0xNN).
void ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "mov       $0xf0f0f0f,%%eax                \n"
    "movd      %%eax,%%xmm4                    \n"
    "pshufd    $0x0,%%xmm4,%%xmm4              \n"  // xmm4 = 0x0f0f... low-nibble mask
    "movdqa    %%xmm4,%%xmm5                   \n"
    "pslld     $0x4,%%xmm5                     \n"  // xmm5 = 0xf0f0... high-nibble mask
    "sub       %0,%1                           \n"  // %1 = dst - 2*src so that the
    "sub       %0,%1                           \n"  // stores can address (%1,%0,2)
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 8 ARGB4444 pixels
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pand      %%xmm4,%%xmm0                   \n"  // xmm0 = low nibbles (B,R)
    "pand      %%xmm5,%%xmm2                   \n"  // xmm2 = high nibbles (G,A)
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "psllw     $0x4,%%xmm1                     \n"
    "psrlw     $0x4,%%xmm3                     \n"
    "por       %%xmm1,%%xmm0                   \n"  // replicate nibble: 0xN -> 0xNN
    "por       %%xmm3,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm0                   \n"  // interleave to B,G,R,A bytes
    "punpckhbw %%xmm2,%%xmm1                   \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,2)           //  movdqa  %%xmm0,(%1,%0,2)
    MEMOPMEM(movdqa,xmm1,0x10,1,0,2)           //  movdqa  %%xmm1,0x10(%1,%0,2)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc", "eax"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
540
// Converts a row of 32-bit ARGB pixels to packed 24-bit RGB24 (alpha
// dropped).  Processes 16 pixels per loop iteration: reads 64 bytes, each
// pshufb compacts 16 bytes to 12, and shifted ORs splice the four 12-byte
// groups into 48 contiguous output bytes.  Unaligned loads and stores.
void ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm6                       \n"  // xmm6 = compaction shuffle mask
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 16 ARGB pixels in
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "pshufb    %%xmm6,%%xmm0                   \n"  // 12 RGB bytes + 4 zero bytes each
    "pshufb    %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm6,%%xmm2                   \n"
    "pshufb    %%xmm6,%%xmm3                   \n"
    "movdqa    %%xmm1,%%xmm4                   \n"
    "psrldq    $0x4,%%xmm1                     \n"  // splice group 1 across outputs 0/1
    "pslldq    $0xc,%%xmm4                     \n"
    "movdqa    %%xmm2,%%xmm5                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pslldq    $0x8,%%xmm5                     \n"  // splice group 2 across outputs 1/2
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "psrldq    $0x8,%%xmm2                     \n"
    "pslldq    $0x4,%%xmm3                     \n"  // group 3 fills the tail of output 2
    "por       %%xmm3,%%xmm2                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x30,1) ",%1           \n"  // 48 bytes out
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRGB24)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
581
// Converts a row of 32-bit ARGB pixels to packed 24-bit RAW (R and B
// swapped relative to RGB24, alpha dropped).  Identical structure to
// ARGBToRGB24Row_SSSE3; only the shuffle mask (kShuffleMaskARGBToRAW)
// differs.  Processes 16 pixels per loop iteration.
void ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm6                       \n"  // xmm6 = compaction shuffle mask
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 16 ARGB pixels in
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "pshufb    %%xmm6,%%xmm0                   \n"  // 12 bytes + 4 zero bytes each
    "pshufb    %%xmm6,%%xmm1                   \n"
    "pshufb    %%xmm6,%%xmm2                   \n"
    "pshufb    %%xmm6,%%xmm3                   \n"
    "movdqa    %%xmm1,%%xmm4                   \n"
    "psrldq    $0x4,%%xmm1                     \n"  // splice 12-byte groups into
    "pslldq    $0xc,%%xmm4                     \n"  // three contiguous 16-byte stores
    "movdqa    %%xmm2,%%xmm5                   \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pslldq    $0x8,%%xmm5                     \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "por       %%xmm5,%%xmm1                   \n"
    "psrldq    $0x8,%%xmm2                     \n"
    "pslldq    $0x4,%%xmm3                     \n"
    "por       %%xmm3,%%xmm2                   \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "movdqu    %%xmm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x30,1) ",%1           \n"  // 48 bytes out
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  : "m"(kShuffleMaskARGBToRAW)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
622
// Converts a row of 32-bit ARGB pixels to 16-bit RGB565.  Processes 4
// pixels per loop iteration; src must be 16-byte aligned (movdqa load).
// Channels are truncated (shifted) into 5:6:5 fields and packed to words.
void ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "psrld     $0x1b,%%xmm3                    \n"  // xmm3 = 0x0000001f blue mask
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psrld     $0x1a,%%xmm4                    \n"
    "pslld     $0x5,%%xmm4                     \n"  // xmm4 = 0x000007e0 green mask
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pslld     $0xb,%%xmm5                     \n"  // xmm5 = 0xfffff800 red field
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // load 4 ARGB pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "pslld     $0x8,%%xmm0                     \n"  // position red for psrad below
    "psrld     $0x3,%%xmm1                     \n"  // blue -> bits 0..4
    "psrld     $0x5,%%xmm2                     \n"  // green -> bits 5..10
    "psrad     $0x10,%%xmm0                    \n"  // red -> bits 11..15 (sign-filled high)
    "pand      %%xmm3,%%xmm1                   \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "por       %%xmm2,%%xmm1                   \n"  // combine G | B
    "por       %%xmm1,%%xmm0                   \n"  // combine R | G | B
    "packssdw  %%xmm0,%%xmm0                   \n"  // 4 dwords -> 4 words
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"  // store 4 RGB565 pixels (8 bytes)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
662
663void ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
664  asm volatile (
665    "pcmpeqb   %%xmm4,%%xmm4                   \n"
666    "psrld     $0x1b,%%xmm4                    \n"
667    "movdqa    %%xmm4,%%xmm5                   \n"
668    "pslld     $0x5,%%xmm5                     \n"
669    "movdqa    %%xmm4,%%xmm6                   \n"
670    "pslld     $0xa,%%xmm6                     \n"
671    "pcmpeqb   %%xmm7,%%xmm7                   \n"
672    "pslld     $0xf,%%xmm7                     \n"
673    LABELALIGN
674  "1:                                          \n"
675    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
676    "movdqa    %%xmm0,%%xmm1                   \n"
677    "movdqa    %%xmm0,%%xmm2                   \n"
678    "movdqa    %%xmm0,%%xmm3                   \n"
679    "psrad     $0x10,%%xmm0                    \n"
680    "psrld     $0x3,%%xmm1                     \n"
681    "psrld     $0x6,%%xmm2                     \n"
682    "psrld     $0x9,%%xmm3                     \n"
683    "pand      %%xmm7,%%xmm0                   \n"
684    "pand      %%xmm4,%%xmm1                   \n"
685    "pand      %%xmm5,%%xmm2                   \n"
686    "pand      %%xmm6,%%xmm3                   \n"
687    "por       %%xmm1,%%xmm0                   \n"
688    "por       %%xmm3,%%xmm2                   \n"
689    "por       %%xmm2,%%xmm0                   \n"
690    "packssdw  %%xmm0,%%xmm0                   \n"
691    "lea       " MEMLEA(0x10,0) ",%0           \n"
692    "movq      %%xmm0," MEMACCESS(1) "         \n"
693    "lea       " MEMACCESS2(0x8,1) ",%1        \n"
694    "sub       $0x4,%2                         \n"
695    "jg        1b                              \n"
696  : "+r"(src),  // %0
697    "+r"(dst),  // %1
698    "+r"(pix)   // %2
699  :
700  : "memory", "cc"
701#if defined(__SSE2__)
702    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
703#endif
704  );
705}
706
// Converts a row of 32-bit ARGB pixels to 16-bit ARGB4444.  Processes 4
// pixels per loop iteration; src must be 16-byte aligned (movdqa load).
// Keeps the high nibble of each byte and packs pairs of bytes per channel.
// NOTE(review): xmm2 appears in the clobber list but is never used here —
// harmless over-declaration, left as-is.
void ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psllw     $0xc,%%xmm4                     \n"  // xmm4 = 0xf000 per word
    "movdqa    %%xmm4,%%xmm3                   \n"
    "psrlw     $0x8,%%xmm3                     \n"  // xmm3 = 0x00f0 per word
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // load 4 ARGB pixels
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm3,%%xmm0                   \n"  // high nibbles of even bytes
    "pand      %%xmm4,%%xmm1                   \n"  // high nibbles of odd bytes
    "psrlq     $0x4,%%xmm0                     \n"  // align nibbles into 4:4 fields
    "psrlq     $0x8,%%xmm1                     \n"
    "por       %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"  // pack words to bytes
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"  // store 4 ARGB4444 pixels (8 bytes)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x4,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(pix)   // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
#endif
  );
}
738#endif  // HAS_RGB24TOARGBROW_SSSE3
739
740#ifdef HAS_ARGBTOYROW_SSSE3
// Converts a row of ARGB pixels to 8-bit luma (Y).  Processes 16 pixels per
// loop iteration; src_argb and dst_y must be 16-byte aligned (movdqa).
// Each pixel is dotted with kARGBToY via pmaddubsw, horizontally summed,
// scaled down by the 7-bit coefficient shift, and biased by kAddY16.
void ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16 bias
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToY coefficients
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // 16 ARGB pixels in
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // B*cb+G*cg, R*cr+A*0 per pixel
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"  // sum the two words per pixel
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // descale 7-bit coefficients
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 words -> 16 Y bytes
    "paddb     %%xmm5,%%xmm0                   \n"  // add 16 bias
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
777
// Same as ARGBToYRow_SSSE3 but uses unaligned loads/stores (movdqu),
// so src_argb and dst_y need no 16-byte alignment. pix is still
// processed 16 pixels per iteration.
void ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToY
    LABELALIGN
  "1:                                          \n"
    // Load 16 ARGB pixels (64 bytes), unaligned.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"  // add luma offset
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
814#endif  // HAS_ARGBTOYROW_SSSE3
815
816#ifdef HAS_ARGBTOYJROW_SSSE3
// Convert 16 ARGB pixels per loop iteration to full-range (JPEG) Y
// using kARGBToYJ. Unlike ARGBToYRow_SSSE3, kAddYJ64 is added as
// words BEFORE the >> 7 shift (rounding, given the name suggests 64
// == 0.5 in 1/128 units -- TODO confirm constant layout) and no byte
// bias is added afterwards.
// Aligned loads/stores (movdqa); pix processed 16 pixels at a time.
void ARGBToYJRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToYJ
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddYJ64
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"  // round before shift
    "paddw     %%xmm5,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
854
// Same as ARGBToYJRow_SSSE3 but with unaligned loads/stores (movdqu):
// no 16-byte alignment requirement on src_argb or dst_y.
void ARGBToYJRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kARGBToYJ
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddYJ64
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"  // round before shift
    "paddw     %%xmm5,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kARGBToYJ),  // %3
    "m"(kAddYJ64)    // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
892#endif  // HAS_ARGBTOYJROW_SSSE3
893
894#ifdef HAS_ARGBTOUVROW_SSSE3
895// TODO(fbarchard): pass xmm constants to single block of assembly.
896// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
897// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
898// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
899// and considered unsafe.
// Compute 8 U and 8 V values from 16 ARGB pixels, 2x2 subsampled:
// each source row is averaged with the row at src_argb0 +
// src_stride_argb (pavgb), then horizontal pixel pairs are averaged
// (shufps 0x88/0xdd + pavgb) before applying kARGBToU / kARGBToV and
// the kAddUV128 bias. Aligned loads (movdqa).
// NOTE(review): xmm3/xmm4/xmm5 hold the coefficients loaded by the
// first asm block and are deliberately absent from the second block's
// clobber list -- the fpic register-pressure workaround described in
// the comment above; it relies on the compiler not using those
// registers between the two asm statements.
void ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                       uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"  // xmm4 = kARGBToU
    "movdqa    %1,%%xmm3                       \n"  // xmm3 = kARGBToV
    "movdqa    %2,%%xmm5                       \n"  // xmm5 = kAddUV128
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    // Average with the second source row.
    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Average horizontal pixel pairs: even pixels vs odd pixels.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U coefficients
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V coefficients
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"  // bias to unsigned
    "sub       $0x10,%3                        \n"
    // Low 8 bytes -> dst_u, high 8 bytes -> dst_v (via %1 + %2).
    "movlps    %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps    %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
965
966// TODO(fbarchard): Share code with ARGBToUVRow_SSSE3.
// Full-range (JPEG) variant of ARGBToUVRow_SSSE3: same 2x2 subsample
// structure but uses kARGBToUJ / kARGBToVJ and adds kAddUVJ128 as
// words BEFORE the arithmetic shift (folding bias and rounding into
// one constant -- TODO confirm constant layout) instead of a byte add
// afterwards. Aligned loads; same split asm-block constant preload
// scheme as ARGBToUVRow_SSSE3 (xmm3/4/5 not in the clobber list).
void ARGBToUVJRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
                        uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"  // xmm4 = kARGBToUJ
    "movdqa    %1,%%xmm3                       \n"  // xmm3 = kARGBToVJ
    "movdqa    %2,%%xmm5                       \n"  // xmm5 = kAddUVJ128
  :
  : "m"(kARGBToUJ),  // %0
    "m"(kARGBToVJ),  // %1
    "m"(kAddUVJ128)  // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    // Average with the second source row.
    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Average horizontal pixel pairs.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U coefficients
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V coefficients
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"  // bias + round
    "paddw     %%xmm5,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    // Low 8 bytes -> dst_u, high 8 bytes -> dst_v (via %1 + %2).
    "movlps    %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
1033
// Same as ARGBToUVRow_SSSE3 but with unaligned loads (movdqu): the
// second row is loaded through xmm7 before pavgb, since pavgb with a
// memory operand would require alignment. Same split asm-block
// constant preload scheme (xmm3/4/5 not in the clobber list).
void ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                 uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"  // xmm4 = kARGBToU
    "movdqa    %1,%%xmm3                       \n"  // xmm3 = kARGBToV
    "movdqa    %2,%%xmm5                       \n"  // xmm5 = kAddUV128
  :
  : "m"(kARGBToU),         // %0
    "m"(kARGBToV),         // %1
    "m"(kAddUV128)         // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    // Average with the second source row (loaded via xmm7).
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Average horizontal pixel pairs.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U coefficients
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V coefficients
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"  // bias to unsigned
    "sub       $0x10,%3                        \n"
    // Low 8 bytes -> dst_u, high 8 bytes -> dst_v (via %1 + %2).
    "movlps    %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
1103
// Unaligned (movdqu) variant of ARGBToUVJRow_SSSE3: second source row
// is loaded through xmm7 before pavgb. Uses the J (full-range)
// constants with paddw-before-shift bias/rounding. Same split
// asm-block constant preload scheme (xmm3/4/5 not in the clobber
// list).
void ARGBToUVJRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
                                  uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"  // xmm4 = kARGBToUJ
    "movdqa    %1,%%xmm3                       \n"  // xmm3 = kARGBToVJ
    "movdqa    %2,%%xmm5                       \n"  // xmm5 = kAddUVJ128
  :
  : "m"(kARGBToUJ),         // %0
    "m"(kARGBToVJ),         // %1
    "m"(kAddUVJ128)         // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    // Average with the second source row (loaded via xmm7).
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Average horizontal pixel pairs.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U coefficients
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V coefficients
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"  // bias + round
    "paddw     %%xmm5,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    // Low 8 bytes -> dst_u, high 8 bytes -> dst_v (via %1 + %2).
    "movlps    %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_argb)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
1174
// Compute 16 U and 16 V values from 16 ARGB pixels with NO
// subsampling (4:4:4): the same 64 source bytes are read twice, once
// multiplied by kARGBToU and once by kARGBToV, each biased by
// kAddUV128. Aligned loads/stores (movdqa). Same split asm-block
// constant preload scheme (xmm3/4/5 not in the clobber list).
void ARGBToUV444Row_SSSE3(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
                          int width) {
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"  // xmm4 = kARGBToU
    "movdqa    %1,%%xmm3                       \n"  // xmm3 = kARGBToV
    "movdqa    %2,%%xmm5                       \n"  // xmm5 = kAddUV128
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    // First pass: U plane.
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm6                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm2                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm2                     \n"
    "packsswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    // Second pass: reload the same pixels for the V plane.
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm3,%%xmm0                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm2                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm2                     \n"
    "packsswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,2,1)           //  movdqa  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),        // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6"
#endif
  );
}
1239
// Same as ARGBToUV444Row_SSSE3 (4:4:4, no subsampling) but with
// unaligned loads/stores (movdqu); note the final store via
// MEMOPMEM(movdqu,...) is also unaligned.
void ARGBToUV444Row_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_u,
                                    uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"  // xmm4 = kARGBToU
    "movdqa    %1,%%xmm3                       \n"  // xmm3 = kARGBToV
    "movdqa    %2,%%xmm5                       \n"  // xmm5 = kAddUV128
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    // First pass: U plane.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm6                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm2                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm2                     \n"
    "packsswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    // Second pass: reload the same pixels for the V plane.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm3,%%xmm0                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "pmaddubsw %%xmm3,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm2                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm2                     \n"
    "packsswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,2,1)           //  movdqu  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),        // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6"
#endif
  );
}
1304
// Compute 8 U and 8 V values from 16 ARGB pixels with horizontal-only
// (4:2:2) subsampling: averages adjacent pixel pairs (shufps + pavgb)
// but, unlike ARGBToUVRow_SSSE3, takes no second row / stride. Aligned
// loads (movdqa). Same split asm-block constant preload scheme
// (xmm3/4/5 not in the clobber list).
void ARGBToUV422Row_SSSE3(const uint8* src_argb0,
                          uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"  // xmm4 = kARGBToU
    "movdqa    %1,%%xmm3                       \n"  // xmm3 = kARGBToV
    "movdqa    %2,%%xmm5                       \n"  // xmm5 = kAddUV128
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Average horizontal pixel pairs: even pixels vs odd pixels.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U coefficients
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V coefficients
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"  // bias to unsigned
    "sub       $0x10,%3                        \n"
    // Low 8 bytes -> dst_u, high 8 bytes -> dst_v (via %1 + %2).
    "movlps    %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
1365
// Same as ARGBToUV422Row_SSSE3 (horizontal-only 4:2:2 subsampling) but
// with unaligned source loads (movdqu).
void ARGBToUV422Row_Unaligned_SSSE3(const uint8* src_argb0,
                                    uint8* dst_u, uint8* dst_v, int width) {
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"  // xmm4 = kARGBToU
    "movdqa    %1,%%xmm3                       \n"  // xmm3 = kARGBToV
    "movdqa    %2,%%xmm5                       \n"  // xmm5 = kAddUV128
  :
  : "m"(kARGBToU),  // %0
    "m"(kARGBToV),  // %1
    "m"(kAddUV128)  // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    // Average horizontal pixel pairs.
    "movdqa    %%xmm0,%%xmm7                   \n"
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U coefficients
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V coefficients
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"  // bias to unsigned
    "sub       $0x10,%3                        \n"
    // Low 8 bytes -> dst_u, high 8 bytes -> dst_v (via %1 + %2).
    "movlps    %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
1426
// Convert 16 BGRA pixels per loop iteration to Y (luma). Identical
// structure to ARGBToYRow_SSSE3 but uses kBGRAToY, whose coefficient
// layout matches the BGRA byte order. Aligned loads/stores (movdqa).
void BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kBGRAToY
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"  // add luma offset
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
1463
// Same as BGRAToYRow_SSSE3 but with unaligned loads/stores (movdqu):
// no 16-byte alignment requirement on src_bgra or dst_y.
void BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kBGRAToY
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"
    "paddb     %%xmm5,%%xmm0                   \n"  // add luma offset
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_bgra),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kBGRAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
1500
// Convert two adjacent rows of BGRA to one row of U and one row of V at half
// width (2x2 subsampled chroma).  Per iteration: 16 pixels from each row are
// averaged vertically (pavgb against the stride-offset row), then averaged
// horizontally in column pairs (shufps + pavgb), and the 8 resulting average
// pixels are weighted with kBGRAToU / kBGRAToV, arithmetically >>8, packed
// and offset by kAddUV128.  Aligned variant: movdqa requires both source
// rows 16-byte aligned.  width is assumed positive and a multiple of 16.
void BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                       uint8* dst_u, uint8* dst_v, int width) {
  // Preload the constant tables into xmm3/xmm4/xmm5.
  // NOTE(review): these registers must survive into the second asm statement
  // below, but the compiler is not told they are live across the statement
  // boundary -- this relies on it not using xmm3-xmm5 in between.  Verify /
  // consider merging the two asm statements.
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"  // xmm4 = kBGRAToU weights
    "movdqa    %1,%%xmm3                       \n"  // xmm3 = kBGRAToV weights
    "movdqa    %2,%%xmm5                       \n"  // xmm5 = kAddUV128 bias
  :
  : "m"(kBGRAToU),         // %0
    "m"(kBGRAToV),         // %1
    "m"(kAddUV128)         // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // 16 pixels of row 0
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // advance src by 64 bytes
    "movdqa    %%xmm0,%%xmm7                   \n"  // average column pairs:
    "shufps    $0x88,%%xmm1,%%xmm0             \n"  // even pixels
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"  // odd pixels
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"  // copy for the V weights
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U = pixels . kBGRAToU
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V = pixels . kBGRAToV
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"  // signed >>8
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 = U, high 8 = V
    "paddb     %%xmm5,%%xmm0                   \n"  // add 128 offset
    "sub       $0x10,%3                        \n"  // 16 pixels consumed
    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // store 8 U bytes
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_bgra0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_bgra)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
1566
// Same 2x2-subsampled BGRA -> U/V conversion as BGRAToUVRow_SSSE3, but the
// unaligned variant: all row loads use movdqu (the second row is loaded into
// xmm7 then pavgb'd, since pavgb with a memory operand needs alignment).
// width is assumed positive and a multiple of 16.
void BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
                                 uint8* dst_u, uint8* dst_v, int width) {
  // Preload constants into xmm3/xmm4/xmm5.
  // NOTE(review): relies on xmm3-xmm5 surviving between the two asm
  // statements without being declared live across the boundary -- verify.
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"  // xmm4 = kBGRAToU weights
    "movdqa    %1,%%xmm3                       \n"  // xmm3 = kBGRAToV weights
    "movdqa    %2,%%xmm5                       \n"  // xmm5 = kAddUV128 bias
  :
  : "m"(kBGRAToU),         // %0
    "m"(kBGRAToV),         // %1
    "m"(kAddUV128)         // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 16 pixels of row 0
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"  // average with row 1
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // advance src by 64 bytes
    "movdqa    %%xmm0,%%xmm7                   \n"  // average column pairs
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"  // copy for the V weights
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U = pixels . kBGRAToU
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V = pixels . kBGRAToV
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"  // signed >>8
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 = U, high 8 = V
    "paddb     %%xmm5,%%xmm0                   \n"  // add 128 offset
    "sub       $0x10,%3                        \n"
    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // store 8 U bytes
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_bgra0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_bgra)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
1636
// Convert ABGR pixels to Y (luma), 16 pixels (64 bytes) per loop iteration:
// pmaddubsw with kABGRToY, phaddw, >>7, then add the kAddY16 bias.
// Aligned variant: movdqa requires src_abgr and dst_y 16-byte aligned.
// pix is assumed positive and a multiple of 16.
void ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16 bias
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kABGRToY weights
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 ABGR pixels
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // weight channel pairs
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // advance src by 64 bytes
    "phaddw    %%xmm1,%%xmm0                   \n"  // finish per-pixel sums
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // >>7: remove weight scale
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 words -> 16 bytes
    "paddb     %%xmm5,%%xmm0                   \n"  // add luma bias
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y bytes
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
1673
// Convert ABGR pixels to Y (luma), 16 pixels (64 bytes) per loop iteration:
// pmaddubsw with kABGRToY, phaddw, >>7, then add the kAddY16 bias.
// Unaligned variant: movdqu, so src_abgr and dst_y may be at any address.
// pix is assumed positive and a multiple of 16.
void ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16 bias
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kABGRToY weights
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 ABGR pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // weight channel pairs
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // advance src by 64 bytes
    "phaddw    %%xmm1,%%xmm0                   \n"  // finish per-pixel sums
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // >>7: remove weight scale
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 words -> 16 bytes
    "paddb     %%xmm5,%%xmm0                   \n"  // add luma bias
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y bytes
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_abgr),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kABGRToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
1710
// Convert RGBA pixels to Y (luma), 16 pixels (64 bytes) per loop iteration:
// pmaddubsw with kRGBAToY, phaddw, >>7, then add the kAddY16 bias.
// Aligned variant: movdqa requires src_rgba and dst_y 16-byte aligned.
// pix is assumed positive and a multiple of 16.
void RGBAToYRow_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16 bias
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kRGBAToY weights
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 RGBA pixels
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // weight channel pairs
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // advance src by 64 bytes
    "phaddw    %%xmm1,%%xmm0                   \n"  // finish per-pixel sums
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // >>7: remove weight scale
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 words -> 16 bytes
    "paddb     %%xmm5,%%xmm0                   \n"  // add luma bias
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y bytes
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kRGBAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
1747
// Convert RGBA pixels to Y (luma), 16 pixels (64 bytes) per loop iteration:
// pmaddubsw with kRGBAToY, phaddw, >>7, then add the kAddY16 bias.
// Unaligned variant: movdqu, so src_rgba and dst_y may be at any address.
// pix is assumed positive and a multiple of 16.
void RGBAToYRow_Unaligned_SSSE3(const uint8* src_rgba, uint8* dst_y, int pix) {
  asm volatile (
    "movdqa    %4,%%xmm5                       \n"  // xmm5 = kAddY16 bias
    "movdqa    %3,%%xmm4                       \n"  // xmm4 = kRGBAToY weights
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // load 16 RGBA pixels
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // weight channel pairs
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm4,%%xmm3                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // advance src by 64 bytes
    "phaddw    %%xmm1,%%xmm0                   \n"  // finish per-pixel sums
    "phaddw    %%xmm3,%%xmm2                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // >>7: remove weight scale
    "psrlw     $0x7,%%xmm2                     \n"
    "packuswb  %%xmm2,%%xmm0                   \n"  // 16 words -> 16 bytes
    "paddb     %%xmm5,%%xmm0                   \n"  // add luma bias
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"  // store 16 Y bytes
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_rgba),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  : "m"(kRGBAToY),   // %3
    "m"(kAddY16)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
1784
// Convert two adjacent rows of ABGR to one row of U and one row of V at half
// width (2x2 subsampled chroma): vertical pavgb against the stride-offset
// row, horizontal column-pair average via shufps + pavgb, then pmaddubsw
// with kABGRToU / kABGRToV, signed >>8, pack and offset by kAddUV128.
// Aligned variant: movdqa requires both source rows 16-byte aligned.
// width is assumed positive and a multiple of 16.
void ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                       uint8* dst_u, uint8* dst_v, int width) {
  // Preload the constant tables into xmm3/xmm4/xmm5.
  // NOTE(review): relies on xmm3-xmm5 surviving between the two asm
  // statements without being declared live across the boundary -- verify.
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"  // xmm4 = kABGRToU weights
    "movdqa    %1,%%xmm3                       \n"  // xmm3 = kABGRToV weights
    "movdqa    %2,%%xmm5                       \n"  // xmm5 = kAddUV128 bias
  :
  : "m"(kABGRToU),         // %0
    "m"(kABGRToV),         // %1
    "m"(kAddUV128)         // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // 16 pixels of row 0
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // advance src by 64 bytes
    "movdqa    %%xmm0,%%xmm7                   \n"  // average column pairs
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"  // copy for the V weights
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U = pixels . kABGRToU
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V = pixels . kABGRToV
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"  // signed >>8
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 = U, high 8 = V
    "paddb     %%xmm5,%%xmm0                   \n"  // add 128 offset
    "sub       $0x10,%3                        \n"
    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // store 8 U bytes
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_abgr0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_abgr)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
1850
// Same 2x2-subsampled ABGR -> U/V conversion as ABGRToUVRow_SSSE3, but the
// unaligned variant: all row loads use movdqu (second row goes through xmm7
// because pavgb with a memory operand requires alignment).
// width is assumed positive and a multiple of 16.
void ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
                                 uint8* dst_u, uint8* dst_v, int width) {
  // Preload constants into xmm3/xmm4/xmm5.
  // NOTE(review): relies on xmm3-xmm5 surviving between the two asm
  // statements without being declared live across the boundary -- verify.
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"  // xmm4 = kABGRToU weights
    "movdqa    %1,%%xmm3                       \n"  // xmm3 = kABGRToV weights
    "movdqa    %2,%%xmm5                       \n"  // xmm5 = kAddUV128 bias
  :
  : "m"(kABGRToU),         // %0
    "m"(kABGRToV),         // %1
    "m"(kAddUV128)         // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 16 pixels of row 0
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"  // average with row 1
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // advance src by 64 bytes
    "movdqa    %%xmm0,%%xmm7                   \n"  // average column pairs
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"  // copy for the V weights
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U = pixels . kABGRToU
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V = pixels . kABGRToV
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"  // signed >>8
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 = U, high 8 = V
    "paddb     %%xmm5,%%xmm0                   \n"  // add 128 offset
    "sub       $0x10,%3                        \n"
    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // store 8 U bytes
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_abgr0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_abgr)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
1920
// Convert two adjacent rows of RGBA to one row of U and one row of V at half
// width (2x2 subsampled chroma): vertical pavgb against the stride-offset
// row, horizontal column-pair average via shufps + pavgb, then pmaddubsw
// with kRGBAToU / kRGBAToV, signed >>8, pack and offset by kAddUV128.
// Aligned variant: movdqa requires both source rows 16-byte aligned.
// width is assumed positive and a multiple of 16.
void RGBAToUVRow_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
                       uint8* dst_u, uint8* dst_v, int width) {
  // Preload the constant tables into xmm3/xmm4/xmm5.
  // NOTE(review): relies on xmm3-xmm5 surviving between the two asm
  // statements without being declared live across the boundary -- verify.
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"  // xmm4 = kRGBAToU weights
    "movdqa    %1,%%xmm3                       \n"  // xmm3 = kRGBAToV weights
    "movdqa    %2,%%xmm5                       \n"  // xmm5 = kAddUV128 bias
  :
  : "m"(kRGBAToU),         // %0
    "m"(kRGBAToV),         // %1
    "m"(kAddUV128)         // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // 16 pixels of row 0
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    MEMOPREG(pavgb,0x00,0,4,1,xmm0)            //  pavgb   (%0,%4,1),%%xmm0
    MEMOPREG(pavgb,0x10,0,4,1,xmm1)            //  pavgb   0x10(%0,%4,1),%%xmm1
    MEMOPREG(pavgb,0x20,0,4,1,xmm2)            //  pavgb   0x20(%0,%4,1),%%xmm2
    MEMOPREG(pavgb,0x30,0,4,1,xmm6)            //  pavgb   0x30(%0,%4,1),%%xmm6
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // advance src by 64 bytes
    "movdqa    %%xmm0,%%xmm7                   \n"  // average column pairs
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"  // copy for the V weights
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U = pixels . kRGBAToU
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V = pixels . kRGBAToV
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"  // signed >>8
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 = U, high 8 = V
    "paddb     %%xmm5,%%xmm0                   \n"  // add 128 offset
    "sub       $0x10,%3                        \n"
    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // store 8 U bytes
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_rgba0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_rgba)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
1986
// Same 2x2-subsampled RGBA -> U/V conversion as RGBAToUVRow_SSSE3, but the
// unaligned variant: all row loads use movdqu (second row goes through xmm7
// because pavgb with a memory operand requires alignment).
// width is assumed positive and a multiple of 16.
void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_rgba0, int src_stride_rgba,
                                 uint8* dst_u, uint8* dst_v, int width) {
  // Preload constants into xmm3/xmm4/xmm5.
  // NOTE(review): relies on xmm3-xmm5 surviving between the two asm
  // statements without being declared live across the boundary -- verify.
  asm volatile (
    "movdqa    %0,%%xmm4                       \n"  // xmm4 = kRGBAToU weights
    "movdqa    %1,%%xmm3                       \n"  // xmm3 = kRGBAToV weights
    "movdqa    %2,%%xmm5                       \n"  // xmm5 = kAddUV128 bias
  :
  : "m"(kRGBAToU),         // %0
    "m"(kRGBAToV),         // %1
    "m"(kAddUV128)         // %2
  );
  asm volatile (
    "sub       %1,%2                           \n"  // %2 = dst_v - dst_u
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"  // 16 pixels of row 0
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "movdqu    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
    "movdqu    " MEMACCESS2(0x30,0) ",%%xmm6   \n"
    BUNDLEALIGN
    MEMOPREG(movdqu,0x00,0,4,1,xmm7)           //  movdqu  (%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm0                   \n"  // average with row 1
    MEMOPREG(movdqu,0x10,0,4,1,xmm7)           //  movdqu  0x10(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm1                   \n"
    MEMOPREG(movdqu,0x20,0,4,1,xmm7)           //  movdqu  0x20(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm2                   \n"
    MEMOPREG(movdqu,0x30,0,4,1,xmm7)           //  movdqu  0x30(%0,%4,1),%%xmm7
    "pavgb     %%xmm7,%%xmm6                   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"  // advance src by 64 bytes
    "movdqa    %%xmm0,%%xmm7                   \n"  // average column pairs
    "shufps    $0x88,%%xmm1,%%xmm0             \n"
    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm7                   \n"
    "shufps    $0x88,%%xmm6,%%xmm2             \n"
    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
    "pavgb     %%xmm7,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"  // copy for the V weights
    "movdqa    %%xmm2,%%xmm6                   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"  // U = pixels . kRGBAToU
    "pmaddubsw %%xmm4,%%xmm2                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"  // V = pixels . kRGBAToV
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "phaddw    %%xmm2,%%xmm0                   \n"
    "phaddw    %%xmm6,%%xmm1                   \n"
    "psraw     $0x8,%%xmm0                     \n"  // signed >>8
    "psraw     $0x8,%%xmm1                     \n"
    "packsswb  %%xmm1,%%xmm0                   \n"  // low 8 = U, high 8 = V
    "paddb     %%xmm5,%%xmm0                   \n"  // add 128 offset
    "sub       $0x10,%3                        \n"
    "movlps    %%xmm0," MEMACCESS(1) "         \n"  // store 8 U bytes
    BUNDLEALIGN
    MEMOPMEM(movhps,xmm0,0x00,1,2,1)           //  movhps  %%xmm0,(%1,%2,1)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_rgba0),       // %0
    "+r"(dst_u),           // %1
    "+r"(dst_v),           // %2
    "+rm"(width)           // %3
  : "r"((intptr_t)(src_stride_rgba)) // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
#endif
  );
}
2056#endif  // HAS_ARGBTOUVROW_SSSE3
2057
2058#ifdef HAS_I422TOARGBROW_SSSE3
// YUV -> RGB conversion coefficients in Q6 fixed point (value * 64), stored
// as int8 for use with pmaddubsw.
#define UB 127 /* 2.018 * 64 = 129, saturated to the int8 maximum 127 */
#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
#define VR 102 /* (int8)(1.596 * 64 + 0.5) */

// Bias: cancels the +128 offset of the stored U/V samples per channel.
#define BB UB * 128 + VB * 128
#define BG UG * 128 + VG * 128
#define BR UR * 128 + VR * 128

#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
2073
// All YUV->RGB constant tables packed into one aligned struct so the inline
// asm below can reach every table at a fixed byte offset (noted per field)
// from a single base pointer.  kUVTo* interleave U,V coefficients for
// pmaddubsw on U,V-interleaved input; kVUTo* hold the same coefficients in
// V,U order for NV21-style input.
struct {
  vec8 kUVToB;  // 0
  vec8 kUVToG;  // 16
  vec8 kUVToR;  // 32
  vec16 kUVBiasB;  // 48
  vec16 kUVBiasG;  // 64
  vec16 kUVBiasR;  // 80
  vec16 kYSub16;  // 96
  vec16 kYToRgb;  // 112
  vec8 kVUToB;  // 128
  vec8 kVUToG;  // 144
  vec8 kVUToR;  // 160
} static SIMD_ALIGNED(kYuvConstants) = {
  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
  { BB, BB, BB, BB, BB, BB, BB, BB },
  { BG, BG, BG, BG, BG, BG, BG, BG },
  { BR, BR, BR, BR, BR, BR, BR, BR },
  { 16, 16, 16, 16, 16, 16, 16, 16 },
  { YG, YG, YG, YG, YG, YG, YG, YG },
  { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
  { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
};
2099
2100
// Read 8 UV from 444: 8 U bytes and 8 V bytes (one UV pair per pixel),
// interleaved into xmm0 as U0V0 U1V1 ...  The V plane is read via
// base+index addressing off u_buf (v_buf register holds the v-u offset --
// presumably set up by the calling row function; confirm at call sites).
#define READYUV444                                                             \
    "movq       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
    BUNDLEALIGN                                                                \
    MEMOPREG(movq, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x8, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"
2108
// Read 4 UV from 422, upsample to 8 UV: 4 U and 4 V bytes are interleaved
// (punpcklbw) and each 16-bit UV pair is doubled (punpcklwd) so one pair
// covers two output pixels.
#define READYUV422                                                             \
    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
    BUNDLEALIGN                                                                \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x4, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"
2117
// Read 2 UV from 411, upsample to 8 UV: 2 U and 2 V bytes are interleaved,
// then each UV pair is quadrupled (punpcklwd + punpckldq) so one pair
// covers four output pixels.
#define READYUV411                                                             \
    "movd       " MEMACCESS([u_buf]) ",%%xmm0                   \n"            \
    BUNDLEALIGN                                                                \
    MEMOPREG(movd, 0x00, [u_buf], [v_buf], 1, xmm1)                            \
    "lea        " MEMLEA(0x2, [u_buf]) ",%[u_buf]               \n"            \
    "punpcklbw  %%xmm1,%%xmm0                                   \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"            \
    "punpckldq  %%xmm0,%%xmm0                                   \n"
2127
// Read 4 UV from NV12 (UV already interleaved in one plane), upsample to
// 8 UV by doubling each 16-bit UV pair with punpcklwd.
#define READNV12                                                               \
    "movq       " MEMACCESS([uv_buf]) ",%%xmm0                  \n"            \
    "lea        " MEMLEA(0x8, [uv_buf]) ",%[uv_buf]             \n"            \
    "punpcklwd  %%xmm0,%%xmm0                                   \n"
2133
// Convert 8 pixels: 8 interleaved UV (xmm0) and 8 Y bytes from y_buf to
// unsigned-byte B/G/R in xmm0/xmm1/xmm2:
//   chan = clamp(((UV . kUVToChan) - kUVBiasChan + (Y - 16) * YG) >> 6)
// Table offsets (0/16/32/48/64/80/96/112) index into kYuvConstants.
// Clobbers xmm3; expects xmm4 == 0 for the punpcklbw that widens Y --
// presumably zeroed by the calling row function; confirm at call sites.
#define YUVTORGB                                                               \
    "movdqa     %%xmm0,%%xmm1                                   \n"            \
    "movdqa     %%xmm0,%%xmm2                                   \n"            \
    "pmaddubsw  " MEMACCESS([kYuvConstants]) ",%%xmm0           \n"            \
    "pmaddubsw  " MEMACCESS2(16, [kYuvConstants]) ",%%xmm1      \n"            \
    "pmaddubsw  " MEMACCESS2(32, [kYuvConstants]) ",%%xmm2      \n"            \
    "psubw      " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0      \n"            \
    "psubw      " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1      \n"            \
    "psubw      " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2      \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
    "punpcklbw  %%xmm4,%%xmm3                                   \n"            \
    "psubsw     " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3      \n"            \
    "pmullw     " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3     \n"            \
    "paddsw     %%xmm3,%%xmm0                                   \n"            \
    "paddsw     %%xmm3,%%xmm1                                   \n"            \
    "paddsw     %%xmm3,%%xmm2                                   \n"            \
    "psraw      $0x6,%%xmm0                                     \n"            \
    "psraw      $0x6,%%xmm1                                     \n"            \
    "psraw      $0x6,%%xmm2                                     \n"            \
    "packuswb   %%xmm0,%%xmm0                                   \n"            \
    "packuswb   %%xmm1,%%xmm1                                   \n"            \
    "packuswb   %%xmm2,%%xmm2                                   \n"
2158
// Convert 8 pixels: 8 interleaved VU (NV21-style order, xmm0) and 8 Y bytes
// to unsigned-byte B/G/R in xmm0/xmm1/xmm2.  Identical math to YUVTORGB but
// uses the V,U-ordered coefficient tables kVUToB/G/R at offsets 128/144/160
// of kYuvConstants.  Clobbers xmm3; expects xmm4 == 0 for the punpcklbw
// that widens Y -- presumably zeroed by the caller; confirm at call sites.
#define YVUTORGB                                                               \
    "movdqa     %%xmm0,%%xmm1                                   \n"            \
    "movdqa     %%xmm0,%%xmm2                                   \n"            \
    "pmaddubsw  " MEMACCESS2(128, [kYuvConstants]) ",%%xmm0     \n"            \
    "pmaddubsw  " MEMACCESS2(144, [kYuvConstants]) ",%%xmm1     \n"            \
    "pmaddubsw  " MEMACCESS2(160, [kYuvConstants]) ",%%xmm2     \n"            \
    "psubw      " MEMACCESS2(48, [kYuvConstants]) ",%%xmm0      \n"            \
    "psubw      " MEMACCESS2(64, [kYuvConstants]) ",%%xmm1      \n"            \
    "psubw      " MEMACCESS2(80, [kYuvConstants]) ",%%xmm2      \n"            \
    "movq       " MEMACCESS([y_buf]) ",%%xmm3                   \n"            \
    "lea        " MEMLEA(0x8, [y_buf]) ",%[y_buf]               \n"            \
    "punpcklbw  %%xmm4,%%xmm3                                   \n"            \
    "psubsw     " MEMACCESS2(96, [kYuvConstants]) ",%%xmm3      \n"            \
    "pmullw     " MEMACCESS2(112, [kYuvConstants]) ",%%xmm3     \n"            \
    "paddsw     %%xmm3,%%xmm0                                   \n"            \
    "paddsw     %%xmm3,%%xmm1                                   \n"            \
    "paddsw     %%xmm3,%%xmm2                                   \n"            \
    "psraw      $0x6,%%xmm0                                     \n"            \
    "psraw      $0x6,%%xmm1                                     \n"            \
    "psraw      $0x6,%%xmm2                                     \n"            \
    "packuswb   %%xmm0,%%xmm0                                   \n"            \
    "packuswb   %%xmm1,%%xmm1                                   \n"            \
    "packuswb   %%xmm2,%%xmm2                                   \n"
2183
// Converts one row of I444 (4:4:4, one U and one V byte per pixel) YUV to
// ARGB with opaque alpha, 8 pixels per loop iteration.
// dst_argb is stored with movdqa and must be 16-byte aligned.
// The loop body executes at least once and always writes whole groups of
// 8 pixels, so width must be > 0; NOTE(review): a width that is not a
// multiple of 8 writes past `width` pixels — callers presumably pad rows.
void OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    // Turn v_buf into an offset from u_buf so READYUV444 can address both
    // planes through one register.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = zero for unpacks
    LABELALIGN
  "1:                                          \n"
    READYUV444
    YUVTORGB
    // Interleave the three channel registers with alpha into 8 ARGB pixels.
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "   \n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb]  \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2222
2223void OMITFP I422ToRGB24Row_SSSE3(const uint8* y_buf,
2224                                 const uint8* u_buf,
2225                                 const uint8* v_buf,
2226                                 uint8* dst_rgb24,
2227                                 int width) {
2228// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
2229#if defined(__i386__)
2230  asm volatile (
2231    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
2232    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
2233  :: [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
2234    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24));
2235#endif
2236
2237  asm volatile (
2238#if !defined(__i386__)
2239    "movdqa    %[kShuffleMaskARGBToRGB24_0],%%xmm5 \n"
2240    "movdqa    %[kShuffleMaskARGBToRGB24],%%xmm6   \n"
2241#endif
2242    "sub       %[u_buf],%[v_buf]               \n"
2243    "pxor      %%xmm4,%%xmm4                   \n"
2244    LABELALIGN
2245  "1:                                          \n"
2246    READYUV422
2247    YUVTORGB
2248    "punpcklbw %%xmm1,%%xmm0                   \n"
2249    "punpcklbw %%xmm2,%%xmm2                   \n"
2250    "movdqa    %%xmm0,%%xmm1                   \n"
2251    "punpcklwd %%xmm2,%%xmm0                   \n"
2252    "punpckhwd %%xmm2,%%xmm1                   \n"
2253    "pshufb    %%xmm5,%%xmm0                   \n"
2254    "pshufb    %%xmm6,%%xmm1                   \n"
2255    "palignr   $0xc,%%xmm0,%%xmm1              \n"
2256    "movq      %%xmm0," MEMACCESS([dst_rgb24]) "\n"
2257    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_rgb24]) "\n"
2258    "lea       " MEMLEA(0x18,[dst_rgb24]) ",%[dst_rgb24] \n"
2259    "sub       $0x8,%[width]                   \n"
2260    "jg        1b                              \n"
2261  : [y_buf]"+r"(y_buf),    // %[y_buf]
2262    [u_buf]"+r"(u_buf),    // %[u_buf]
2263    [v_buf]"+r"(v_buf),    // %[v_buf]
2264    [dst_rgb24]"+r"(dst_rgb24),  // %[dst_rgb24]
2265    [width]"+rm"(width)    // %[width]
2266  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
2267#if !defined(__i386__)
2268    , [kShuffleMaskARGBToRGB24_0]"m"(kShuffleMaskARGBToRGB24_0),
2269    [kShuffleMaskARGBToRGB24]"m"(kShuffleMaskARGBToRGB24)
2270#endif
2271  : "memory", "cc"
2272#if defined(__native_client__) && defined(__x86_64__)
2273    , "r14"
2274#endif
2275#if defined(__SSE2__)
2276    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
2277#endif
2278  );
2279}
2280
2281void OMITFP I422ToRAWRow_SSSE3(const uint8* y_buf,
2282                               const uint8* u_buf,
2283                               const uint8* v_buf,
2284                               uint8* dst_raw,
2285                               int width) {
2286// fpic 32 bit gcc 4.2 on OSX runs out of GPR regs.
2287#if defined(__i386__)
2288  asm volatile (
2289    "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
2290    "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"
2291  :: [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
2292    [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW));
2293#endif
2294
2295  asm volatile (
2296#if !defined(__i386__)
2297    "movdqa    %[kShuffleMaskARGBToRAW_0],%%xmm5 \n"
2298    "movdqa    %[kShuffleMaskARGBToRAW],%%xmm6   \n"
2299#endif
2300    "sub       %[u_buf],%[v_buf]               \n"
2301    "pxor      %%xmm4,%%xmm4                   \n"
2302    LABELALIGN
2303  "1:                                          \n"
2304    READYUV422
2305    YUVTORGB
2306    "punpcklbw %%xmm1,%%xmm0                   \n"
2307    "punpcklbw %%xmm2,%%xmm2                   \n"
2308    "movdqa    %%xmm0,%%xmm1                   \n"
2309    "punpcklwd %%xmm2,%%xmm0                   \n"
2310    "punpckhwd %%xmm2,%%xmm1                   \n"
2311    "pshufb    %%xmm5,%%xmm0                   \n"
2312    "pshufb    %%xmm6,%%xmm1                   \n"
2313    "palignr   $0xc,%%xmm0,%%xmm1              \n"
2314    "movq      %%xmm0," MEMACCESS([dst_raw]) " \n"
2315    "movdqu    %%xmm1," MEMACCESS2(0x8,[dst_raw]) "\n"
2316    "lea       " MEMLEA(0x18,[dst_raw]) ",%[dst_raw] \n"
2317    "sub       $0x8,%[width]                   \n"
2318    "jg        1b                              \n"
2319  : [y_buf]"+r"(y_buf),    // %[y_buf]
2320    [u_buf]"+r"(u_buf),    // %[u_buf]
2321    [v_buf]"+r"(v_buf),    // %[v_buf]
2322    [dst_raw]"+r"(dst_raw),  // %[dst_raw]
2323    [width]"+rm"(width)    // %[width]
2324  : [kYuvConstants]"r"(&kYuvConstants.kUVToB)
2325#if !defined(__i386__)
2326    , [kShuffleMaskARGBToRAW_0]"m"(kShuffleMaskARGBToRAW_0),
2327    [kShuffleMaskARGBToRAW]"m"(kShuffleMaskARGBToRAW)
2328#endif
2329  : "memory", "cc"
2330#if defined(__native_client__) && defined(__x86_64__)
2331    , "r14"
2332#endif
2333#if defined(__SSE2__)
2334    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
2335#endif
2336  );
2337}
2338
// Converts one row of I422 (4:2:2, one U/V pair per 2 pixels) YUV to ARGB
// with opaque alpha, 8 pixels per loop iteration.
// dst_argb is stored with movdqa and must be 16-byte aligned.
// width must be > 0; whole groups of 8 pixels are processed per iteration.
void OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    // v_buf becomes an offset from u_buf so READYUV422 uses one register.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = zero for unpacks
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB
    // Interleave the three channel registers with alpha into 8 ARGB pixels.
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2377
// Converts one row of I411 (4:1:1, one U/V pair per 4 pixels) YUV to ARGB
// with opaque alpha, 8 pixels per loop iteration.
// dst_argb is stored with movdqa and must be 16-byte aligned.
// width must be > 0; whole groups of 8 pixels are processed per iteration.
void OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    // v_buf becomes an offset from u_buf so READYUV411 uses one register.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = zero for unpacks
    LABELALIGN
  "1:                                          \n"
    READYUV411
    YUVTORGB
    // Interleave the three channel registers with alpha into 8 ARGB pixels.
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2416
// Converts one row of NV12 (biplanar: Y plane + interleaved UV plane) to
// ARGB with opaque alpha, 8 pixels per loop iteration.
// dst_argb is stored with movdqa and must be 16-byte aligned.
// width must be > 0; whole groups of 8 pixels are processed per iteration.
void OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = zero for unpacks
    LABELALIGN
  "1:                                          \n"
    READNV12
    YUVTORGB
    // Interleave the three channel registers with alpha into 8 ARGB pixels.
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
  // Does not use r14.
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2450
// Converts one row of NV21 (biplanar: Y plane + interleaved VU plane) to
// ARGB with opaque alpha, 8 pixels per loop iteration. Uses YVUTORGB, whose
// coefficient rows are laid out for V,U byte order.
// dst_argb is stored with movdqa and must be 16-byte aligned.
// width must be > 0; whole groups of 8 pixels are processed per iteration.
void OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
                                const uint8* uv_buf,
                                uint8* dst_argb,
                                int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = zero for unpacks
    LABELALIGN
  "1:                                          \n"
    READNV12
    YVUTORGB
    // Interleave the three channel registers with alpha into 8 ARGB pixels.
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqa    %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
  // Does not use r14.
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2484
// Same as I444ToARGBRow_SSSE3, but stores with movdqu so dst_argb may be
// unaligned. 8 pixels per loop iteration; width must be > 0.
void OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    // v_buf becomes an offset from u_buf so READYUV444 uses one register.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = zero for unpacks
    LABELALIGN
  "1:                                          \n"
    READYUV444
    YUVTORGB
    // Interleave the three channel registers with alpha into 8 ARGB pixels.
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2523
// Same as I422ToARGBRow_SSSE3, but stores with movdqu so dst_argb may be
// unaligned. 8 pixels per loop iteration; width must be > 0.
void OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    // v_buf becomes an offset from u_buf so READYUV422 uses one register.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = zero for unpacks
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB
    // Interleave the three channel registers with alpha into 8 ARGB pixels.
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2562
// Same as I411ToARGBRow_SSSE3, but stores with movdqu so dst_argb may be
// unaligned. 8 pixels per loop iteration; width must be > 0.
void OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    // v_buf becomes an offset from u_buf so READYUV411 uses one register.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = zero for unpacks
    LABELALIGN
  "1:                                          \n"
    READYUV411
    YUVTORGB
    // Interleave the three channel registers with alpha into 8 ARGB pixels.
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2601
// Same as NV12ToARGBRow_SSSE3, but stores with movdqu so dst_argb may be
// unaligned. 8 pixels per loop iteration; width must be > 0.
void OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* uv_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = zero for unpacks
    LABELALIGN
  "1:                                          \n"
    READNV12
    YUVTORGB
    // Interleave the three channel registers with alpha into 8 ARGB pixels.
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
  // Does not use r14.
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2635
// Same as NV21ToARGBRow_SSSE3 (VU-interleaved chroma, via YVUTORGB), but
// stores with movdqu so dst_argb may be unaligned. 8 pixels per loop
// iteration; width must be > 0.
void OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* uv_buf,
                                          uint8* dst_argb,
                                          int width) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = zero for unpacks
    LABELALIGN
  "1:                                          \n"
    READNV12
    YVUTORGB
    // Interleave the three channel registers with alpha into 8 ARGB pixels.
    "punpcklbw %%xmm1,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm0                   \n"
    "punpckhwd %%xmm2,%%xmm1                   \n"
    "movdqu    %%xmm0," MEMACCESS([dst_argb]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_argb]) "\n"
    "lea       " MEMLEA(0x20,[dst_argb]) ",%[dst_argb] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
    [dst_argb]"+r"(dst_argb),  // %[dst_argb]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
  // Does not use r14.
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2669
// Converts one row of I422 YUV to BGRA byte order, 8 pixels per loop
// iteration. Same math as I422ToARGBRow_SSSE3; only the punpck interleave
// order differs to produce the BGRA byte layout. xmm5 is reloaded with 0xFF
// inside the loop because YUVTORGB does not preserve a free register for it.
// dst_bgra is stored with movdqa and must be 16-byte aligned; width > 0.
void OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_bgra,
                                int width) {
  asm volatile (
    // v_buf becomes an offset from u_buf so READYUV422 uses one register.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = zero for unpacks
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm5                   \n"
    "movdqa    %%xmm5,%%xmm0                   \n"
    "punpcklwd %%xmm1,%%xmm5                   \n"
    "punpckhwd %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm5," MEMACCESS([dst_bgra]) "\n"
    "movdqa    %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
    "lea       " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2709
// Converts one row of I422 YUV to ABGR byte order, 8 pixels per loop
// iteration. Same math as I422ToARGBRow_SSSE3; only the punpck interleave
// order differs to produce the ABGR byte layout.
// dst_abgr is stored with movdqa and must be 16-byte aligned; width > 0.
void OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_abgr,
                                int width) {
  asm volatile (
    // v_buf becomes an offset from u_buf so READYUV422 uses one register.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = zero for unpacks
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm2                   \n"
    "punpckhwd %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2," MEMACCESS([dst_abgr]) "\n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
    "lea       " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2748
// Converts one row of I422 YUV to RGBA byte order, 8 pixels per loop
// iteration. Same math as I422ToARGBRow_SSSE3; only the punpck interleave
// order differs to produce the RGBA byte layout. xmm5 is reloaded with 0xFF
// inside the loop because YUVTORGB does not preserve a free register for it.
// dst_rgba is stored with movdqa and must be 16-byte aligned; width > 0.
void OMITFP I422ToRGBARow_SSSE3(const uint8* y_buf,
                                const uint8* u_buf,
                                const uint8* v_buf,
                                uint8* dst_rgba,
                                int width) {
  asm volatile (
    // v_buf becomes an offset from u_buf so READYUV422 uses one register.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = zero for unpacks
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm2,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "movdqa    %%xmm5,%%xmm0                   \n"
    "punpcklwd %%xmm1,%%xmm5                   \n"
    "punpckhwd %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm5," MEMACCESS([dst_rgba]) "\n"
    "movdqa    %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
    "lea       " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2788
// Same as I422ToBGRARow_SSSE3, but stores with movdqu so dst_bgra may be
// unaligned. 8 pixels per loop iteration; width must be > 0.
void OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_bgra,
                                          int width) {
  asm volatile (
    // v_buf becomes an offset from u_buf so READYUV422 uses one register.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = zero for unpacks
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // reload alpha after YUVTORGB
    "punpcklbw %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm5                   \n"
    "movdqa    %%xmm5,%%xmm0                   \n"
    "punpcklwd %%xmm1,%%xmm5                   \n"
    "punpckhwd %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm5," MEMACCESS([dst_bgra]) "\n"
    "movdqu    %%xmm0," MEMACCESS2(0x10,[dst_bgra]) "\n"
    "lea       " MEMLEA(0x20,[dst_bgra]) ",%[dst_bgra] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_bgra]"+r"(dst_bgra),  // %[dst_bgra]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2828
// Same as I422ToABGRRow_SSSE3, but stores with movdqu so dst_abgr may be
// unaligned. 8 pixels per loop iteration; width must be > 0.
void OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_abgr,
                                          int width) {
  asm volatile (
    // v_buf becomes an offset from u_buf so READYUV422 uses one register.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = zero for unpacks
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "punpcklwd %%xmm0,%%xmm2                   \n"
    "punpckhwd %%xmm0,%%xmm1                   \n"
    "movdqu    %%xmm2," MEMACCESS([dst_abgr]) "\n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,[dst_abgr]) "\n"
    "lea       " MEMLEA(0x20,[dst_abgr]) ",%[dst_abgr] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_abgr]"+r"(dst_abgr),  // %[dst_abgr]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2867
// Same as I422ToRGBARow_SSSE3, but stores with movdqu so dst_rgba may be
// unaligned. 8 pixels per loop iteration; width must be > 0.
void OMITFP I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
                                          const uint8* u_buf,
                                          const uint8* v_buf,
                                          uint8* dst_rgba,
                                          int width) {
  asm volatile (
    // v_buf becomes an offset from u_buf so READYUV422 uses one register.
    "sub       %[u_buf],%[v_buf]               \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // xmm5 = 0xFF (alpha)
    "pxor      %%xmm4,%%xmm4                   \n"  // xmm4 = zero for unpacks
    LABELALIGN
  "1:                                          \n"
    READYUV422
    YUVTORGB
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // reload alpha after YUVTORGB
    "punpcklbw %%xmm2,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "movdqa    %%xmm5,%%xmm0                   \n"
    "punpcklwd %%xmm1,%%xmm5                   \n"
    "punpckhwd %%xmm1,%%xmm0                   \n"
    "movdqu    %%xmm5," MEMACCESS([dst_rgba]) "\n"
    "movdqu    %%xmm0," MEMACCESS2(0x10,[dst_rgba]) "\n"
    "lea       " MEMLEA(0x20,[dst_rgba]) ",%[dst_rgba] \n"
    "sub       $0x8,%[width]                   \n"
    "jg        1b                              \n"
  : [y_buf]"+r"(y_buf),    // %[y_buf]
    [u_buf]"+r"(u_buf),    // %[u_buf]
    [v_buf]"+r"(v_buf),    // %[v_buf]
    [dst_rgba]"+r"(dst_rgba),  // %[dst_rgba]
    [width]"+rm"(width)    // %[width]
  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
2907
2908#endif  // HAS_I422TOARGBROW_SSSE3
2909
2910#ifdef HAS_YTOARGBROW_SSE2
2911void YToARGBRow_SSE2(const uint8* y_buf,
2912                     uint8* dst_argb,
2913                     int width) {
2914  asm volatile (
2915    "pxor      %%xmm5,%%xmm5                   \n"
2916    "pcmpeqb   %%xmm4,%%xmm4                   \n"
2917    "pslld     $0x18,%%xmm4                    \n"
2918    "mov       $0x00100010,%%eax               \n"
2919    "movd      %%eax,%%xmm3                    \n"
2920    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
2921    "mov       $0x004a004a,%%eax               \n"
2922    "movd      %%eax,%%xmm2                    \n"
2923    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
2924    LABELALIGN
2925  "1:                                          \n"
2926    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
2927    "movq      " MEMACCESS(0) ",%%xmm0         \n"
2928    "lea       " MEMLEA(0x8,0) ",%0            \n"
2929    "punpcklbw %%xmm5,%%xmm0                   \n"
2930    "psubusw   %%xmm3,%%xmm0                   \n"
2931    "pmullw    %%xmm2,%%xmm0                   \n"
2932    "psrlw     $6, %%xmm0                      \n"
2933    "packuswb  %%xmm0,%%xmm0                   \n"
2934
2935    // Step 2: Weave into ARGB
2936    "punpcklbw %%xmm0,%%xmm0                   \n"
2937    "movdqa    %%xmm0,%%xmm1                   \n"
2938    "punpcklwd %%xmm0,%%xmm0                   \n"
2939    "punpckhwd %%xmm1,%%xmm1                   \n"
2940    "por       %%xmm4,%%xmm0                   \n"
2941    "por       %%xmm4,%%xmm1                   \n"
2942    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
2943    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
2944    "lea       " MEMLEA(0x20,1) ",%1           \n"
2945
2946    "sub       $0x8,%2                         \n"
2947    "jg        1b                              \n"
2948  : "+r"(y_buf),     // %0
2949    "+r"(dst_argb),  // %1
2950    "+rm"(width)     // %2
2951  :
2952  : "memory", "cc", "eax"
2953#if defined(__SSE2__)
2954    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
2955#endif
2956  );
2957}
2958#endif  // HAS_YTOARGBROW_SSE2
2959
2960#ifdef HAS_MIRRORROW_SSSE3
// pshufb control vector that reverses the order of all 16 bytes in an
// XMM register (byte 15 becomes byte 0, etc.).
static uvec8 kShuffleMirror = {
  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
};
2965
// Copies 'width' bytes from src to dst in reversed byte order, one pshufb
// per 16 bytes.  Loads and stores use movdqa, so both row pointers must be
// 16-byte aligned; width is consumed 16 bytes at a time.
void MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "movdqa    %3,%%xmm5                       \n"
    // Point src 16 bytes before its nominal position so that
    // src + temp_width walks backward through the row as temp_width shrinks.
    "lea       " MEMLEA(-0x10,0) ",%0          \n"
    LABELALIGN
  "1:                                          \n"
    MEMOPREG(movdqa,0x00,0,2,1,xmm0)           //  movdqa  (%0,%2),%%xmm0
    "pshufb    %%xmm5,%%xmm0                   \n"
    // The subtract is placed before the store so its flags are still live
    // for the jg at the bottom of the loop.
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kShuffleMirror) // %3
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
2992#endif  // HAS_MIRRORROW_SSSE3
2993
2994#ifdef HAS_MIRRORROW_SSE2
// SSE2 fallback for MirrorRow: reverses 'width' bytes without pshufb by
// swapping bytes within words (shift/or), then reversing words within each
// half (pshuflw/pshufhw) and swapping the two 64-bit halves (pshufd).
// Uses movdqu, so no alignment is required; width is consumed 16 bytes at
// a time.
void MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    // Bias src back 16 bytes; src + temp_width then walks backward.
    "lea       " MEMLEA(-0x10,0) ",%0          \n"
    LABELALIGN
  "1:                                          \n"
    MEMOPREG(movdqu,0x00,0,2,1,xmm0)           //  movdqu  (%0,%2),%%xmm0
    "movdqa    %%xmm0,%%xmm1                   \n"
    "psllw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm1,%%xmm0                   \n"
    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshufd    $0x4e,%%xmm0,%%xmm0             \n"
    // sub before the store keeps the flags live for the jg below.
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1)",%1            \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
3026#endif  // HAS_MIRRORROW_SSE2
3027
3028#ifdef HAS_MIRRORROW_UV_SSSE3
// pshufb control vector that simultaneously reverses the order of 8 UV
// byte pairs and deinterleaves them: reversed U bytes land in the low
// 8 bytes of the register, reversed V bytes in the high 8 bytes.
static uvec8 kShuffleMirrorUV = {
  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
};
// Reverses an interleaved UV row while splitting it into separate U and V
// rows: each iteration reads 8 UV pairs (16 bytes, movdqa - src must be
// 16-byte aligned) and writes 8 reversed U bytes to dst_u and 8 reversed
// V bytes to dst_v.  'width' counts UV pairs (src spans 2*width bytes, per
// the initial lea src + width*2 - 16), consumed 8 pairs at a time.
void MirrorUVRow_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
                       int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    "movdqa    %4,%%xmm1                       \n"
    // Start at the last 16-byte chunk of the UV row and walk backward.
    "lea       " MEMLEA4(-0x10,0,3,2) ",%0       \n"
    // dst_v is addressed as dst_u + (dst_v - dst_u) so one register can
    // index both output planes.
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(-0x10,0) ",%0            \n"
    // One shuffle both mirrors and deinterleaves: U bytes to the low half,
    // V bytes to the high half of xmm0.
    "pshufb    %%xmm1,%%xmm0                   \n"
    "sub       $8,%3                           \n"
    "movlpd    %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movhpd,xmm0,0x00,1,2,1)           //  movhpd    %%xmm0,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src),      // %0
    "+r"(dst_u),    // %1
    "+r"(dst_v),    // %2
    "+r"(temp_width)  // %3
  : "m"(kShuffleMirrorUV)  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
3065#endif  // HAS_MIRRORROW_UV_SSSE3
3066
3067#ifdef HAS_ARGBMIRRORROW_SSSE3
// pshufb control vector that reverses the order of four 32-bit pixels
// within an XMM register while keeping the byte order inside each pixel.
static uvec8 kARGBShuffleMirror = {
  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
};
3072
// Reverses a row of 32-bit ARGB pixels (pixel order flips; bytes within a
// pixel are preserved).  'width' counts pixels, consumed 4 per iteration.
// Loads and stores use movdqa, so both rows must be 16-byte aligned.
void ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
  intptr_t temp_width = (intptr_t)(width);
  asm volatile (
    // Start at the last 4-pixel chunk: src + width*4 - 16.
    "lea       " MEMLEA4(-0x10,0,2,4) ",%0     \n"
    "movdqa    %3,%%xmm5                       \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "lea       " MEMLEA(-0x10,0) ",%0          \n"
    // sub before the store keeps the flags live for the jg below.
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src),  // %0
    "+r"(dst),  // %1
    "+r"(temp_width)  // %2
  : "m"(kARGBShuffleMirror)  // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm5"
#endif
  );
}
3097#endif  // HAS_ARGBMIRRORROW_SSSE3
3098
3099#ifdef HAS_SPLITUVROW_SSE2
// Deinterleaves an interleaved UV row into separate U and V planes.
// Each iteration reads 32 bytes (16 UV pairs) and writes 16 U bytes and
// 16 V bytes.  All accesses use movdqa, so src and both dsts must be
// 16-byte aligned; 'pix' is consumed 16 at a time.
void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff per word: masks the even (U) bytes.
    "pcmpeqb    %%xmm5,%%xmm5                    \n"
    "psrlw      $0x8,%%xmm5                      \n"
    // dst_v is addressed as dst_u + (dst_v - dst_u).
    "sub        %1,%2                            \n"
    LABELALIGN
  "1:                                            \n"
    "movdqa     " MEMACCESS(0) ",%%xmm0          \n"
    "movdqa     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
    "lea        " MEMLEA(0x20,0) ",%0            \n"
    "movdqa     %%xmm0,%%xmm2                    \n"
    "movdqa     %%xmm1,%%xmm3                    \n"
    // Even bytes (mask + pack) -> U; odd bytes (shift + pack) -> V.
    "pand       %%xmm5,%%xmm0                    \n"
    "pand       %%xmm5,%%xmm1                    \n"
    "packuswb   %%xmm1,%%xmm0                    \n"
    "psrlw      $0x8,%%xmm2                      \n"
    "psrlw      $0x8,%%xmm3                      \n"
    "packuswb   %%xmm3,%%xmm2                    \n"
    "movdqa     %%xmm0," MEMACCESS(1) "          \n"
    MEMOPMEM(movdqa,xmm2,0x00,1,2,1)             // movdqa     %%xmm2,(%1,%2)
    "lea        " MEMLEA(0x10,1) ",%1            \n"
    "sub        $0x10,%3                         \n"
    "jg         1b                               \n"
  : "+r"(src_uv),     // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+r"(pix)         // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
3137
// Same deinterleave as SplitUVRow_SSE2 (interleaved UV row -> separate U
// and V planes, 16 pairs per iteration) but with unaligned movdqu loads
// and stores, so no pointer alignment is required.
void SplitUVRow_Unaligned_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
                               int pix) {
  asm volatile (
    // xmm5 = 0x00ff per word: masks the even (U) bytes.
    "pcmpeqb    %%xmm5,%%xmm5                    \n"
    "psrlw      $0x8,%%xmm5                      \n"
    // dst_v is addressed as dst_u + (dst_v - dst_u).
    "sub        %1,%2                            \n"
    LABELALIGN
  "1:                                            \n"
    "movdqu     " MEMACCESS(0) ",%%xmm0          \n"
    "movdqu     " MEMACCESS2(0x10,0) ",%%xmm1    \n"
    "lea        " MEMLEA(0x20,0) ",%0            \n"
    "movdqa     %%xmm0,%%xmm2                    \n"
    "movdqa     %%xmm1,%%xmm3                    \n"
    // Even bytes (mask + pack) -> U; odd bytes (shift + pack) -> V.
    "pand       %%xmm5,%%xmm0                    \n"
    "pand       %%xmm5,%%xmm1                    \n"
    "packuswb   %%xmm1,%%xmm0                    \n"
    "psrlw      $0x8,%%xmm2                      \n"
    "psrlw      $0x8,%%xmm3                      \n"
    "packuswb   %%xmm3,%%xmm2                    \n"
    "movdqu     %%xmm0," MEMACCESS(1) "          \n"
    MEMOPMEM(movdqu,xmm2,0x00,1,2,1)             //  movdqu     %%xmm2,(%1,%2)
    "lea        " MEMLEA(0x10,1) ",%1            \n"
    "sub        $0x10,%3                         \n"
    "jg         1b                               \n"
  : "+r"(src_uv),     // %0
    "+r"(dst_u),      // %1
    "+r"(dst_v),      // %2
    "+r"(pix)         // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
3176#endif  // HAS_SPLITUVROW_SSE2
3177
3178#ifdef HAS_MERGEUVROW_SSE2
// Interleaves separate U and V planes into a single UV row: each iteration
// reads 16 U bytes and 16 V bytes and writes 32 interleaved bytes.  All
// accesses use movdqa, so all three pointers must be 16-byte aligned;
// 'width' is consumed 16 at a time.
void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
                     int width) {
  asm volatile (
    // src_v is addressed as src_u + (src_v - src_u).
    "sub       %0,%1                             \n"
    LABELALIGN
  "1:                                            \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0           \n"
    MEMOPREG(movdqa,0x00,0,1,1,xmm1)             //  movdqa    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0             \n"
    // Byte-interleave U (xmm0) with V (xmm1) into two output vectors.
    "movdqa    %%xmm0,%%xmm2                     \n"
    "punpcklbw %%xmm1,%%xmm0                     \n"
    "punpckhbw %%xmm1,%%xmm2                     \n"
    "movdqa    %%xmm0," MEMACCESS(2) "           \n"
    "movdqa    %%xmm2," MEMACCESS2(0x10,2) "     \n"
    "lea       " MEMLEA(0x20,2) ",%2             \n"
    "sub       $0x10,%3                          \n"
    "jg        1b                                \n"
  : "+r"(src_u),     // %0
    "+r"(src_v),     // %1
    "+r"(dst_uv),    // %2
    "+r"(width)      // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
3210
// Same interleave as MergeUVRow_SSE2 (separate U and V planes -> one UV
// row, 16 samples per iteration) but with unaligned movdqu loads and
// stores, so no pointer alignment is required.
void MergeUVRow_Unaligned_SSE2(const uint8* src_u, const uint8* src_v,
                               uint8* dst_uv, int width) {
  asm volatile (
    // src_v is addressed as src_u + (src_v - src_u).
    "sub       %0,%1                             \n"
    LABELALIGN
  "1:                                            \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"
    MEMOPREG(movdqu,0x00,0,1,1,xmm1)             //  movdqu    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0             \n"
    // Byte-interleave U (xmm0) with V (xmm1) into two output vectors.
    "movdqa    %%xmm0,%%xmm2                     \n"
    "punpcklbw %%xmm1,%%xmm0                     \n"
    "punpckhbw %%xmm1,%%xmm2                     \n"
    "movdqu    %%xmm0," MEMACCESS(2) "           \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,2) "     \n"
    "lea       " MEMLEA(0x20,2) ",%2             \n"
    "sub       $0x10,%3                          \n"
    "jg        1b                                \n"
  : "+r"(src_u),     // %0
    "+r"(src_v),     // %1
    "+r"(dst_uv),    // %2
    "+r"(width)      // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
3242#endif  // HAS_MERGEUVROW_SSE2
3243
3244#ifdef HAS_COPYROW_SSE2
// Copies 'count' bytes with aligned SSE2 loads/stores, 32 bytes per loop
// iteration.  src and dst must be 16-byte aligned; count is consumed
// 32 bytes at a time.
void CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x20,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(count)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
3267#endif  // HAS_COPYROW_SSE2
3268
3269#ifdef HAS_COPYROW_X86
// Copies a row with a single "rep movsl" string instruction.  width is in
// bytes and is divided by 4, so only width/4 dwords are copied; a width
// that is not a multiple of 4 leaves the trailing bytes uncopied.
void CopyRow_X86(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "shr       $0x2,%2                         \n"
    "rep movsl " MEMMOVESTRING(0,1) "          \n"
  : "+S"(src),  // %0
    "+D"(dst),  // %1
    "+c"(width_tmp) // %2
  :
  : "memory", "cc"
  );
}
3282#endif  // HAS_COPYROW_X86
3283
3284#ifdef HAS_COPYROW_ERMS
3285// Unaligned Multiple of 1.
// Copies 'width' bytes with "rep movsb".  On CPUs with Enhanced Rep
// MOVSB (ERMS) this is fast for arbitrary, unaligned lengths; any byte
// count is handled exactly.
void CopyRow_ERMS(const uint8* src, uint8* dst, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "rep movsb " MEMMOVESTRING(0,1) "          \n"
  : "+S"(src),  // %0
    "+D"(dst),  // %1
    "+c"(width_tmp) // %2
  :
  : "memory", "cc"
  );
}
3297#endif  // HAS_COPYROW_ERMS
3298
3299#ifdef HAS_ARGBCOPYALPHAROW_SSE2
3300// width in pixels
// Copies only the alpha channel of src ARGB pixels into dst ARGB pixels,
// preserving dst's existing RGB bytes.  8 pixels per iteration; all
// accesses use movdqa so both rows must be 16-byte aligned.  Note dst is
// both read and written each pass (read-modify-write blend via masks).
void ARGBCopyAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // xmm0 = 0xff000000 per pixel (keep src alpha);
    // xmm1 = 0x00ffffff per pixel (keep dst RGB).
    "pcmpeqb   %%xmm0,%%xmm0                   \n"
    "pslld     $0x18,%%xmm0                    \n"
    "pcmpeqb   %%xmm1,%%xmm1                   \n"
    "psrld     $0x8,%%xmm1                     \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "movdqa    " MEMACCESS(1) ",%%xmm4         \n"
    "movdqa    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
    // Combine src alpha with dst RGB.
    "pand      %%xmm0,%%xmm2                   \n"
    "pand      %%xmm0,%%xmm3                   \n"
    "pand      %%xmm1,%%xmm4                   \n"
    "pand      %%xmm1,%%xmm5                   \n"
    "por       %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqa    %%xmm2," MEMACCESS(1) "         \n"
    "movdqa    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
3335#endif  // HAS_ARGBCOPYALPHAROW_SSE2
3336
3337#ifdef HAS_ARGBCOPYALPHAROW_AVX2
3338// width in pixels
// AVX2 variant of ARGBCopyAlphaRow: copies the alpha channel of src ARGB
// pixels into dst ARGB pixels, preserving dst RGB.  16 pixels per
// iteration via a single vpblendvb per 32 bytes (blend mask 0x00ffffff
// selects dst's RGB bytes, leaving src's alpha byte).  Unaligned loads
// and stores; vzeroupper avoids AVX-to-SSE transition penalties on exit.
void ARGBCopyAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // ymm0 = 0x00ffffff per pixel: high bit set on the RGB bytes only.
    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
    LABELALIGN
  "1:                                          \n"
    "vmovdqu   " MEMACCESS(0) ",%%ymm1         \n"
    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm2   \n"
    "lea       " MEMLEA(0x40,0) ",%0           \n"
    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
3366#endif  // HAS_ARGBCOPYALPHAROW_AVX2
3367
3368#ifdef HAS_ARGBCOPYYTOALPHAROW_SSE2
3369// width in pixels
// Writes 8 Y (luma) bytes from src into the alpha channel of 8 dst ARGB
// pixels, preserving dst's RGB bytes.  dst accesses use movdqa, so dst
// must be 16-byte aligned; width is consumed 8 pixels at a time.
void ARGBCopyYToAlphaRow_SSE2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // xmm0 = 0xff000000 per pixel (alpha mask);
    // xmm1 = 0x00ffffff per pixel (RGB mask).
    "pcmpeqb   %%xmm0,%%xmm0                   \n"
    "pslld     $0x18,%%xmm0                    \n"
    "pcmpeqb   %%xmm1,%%xmm1                   \n"
    "psrld     $0x8,%%xmm1                     \n"
    LABELALIGN
  "1:                                          \n"
    // Expand 8 Y bytes so each lands in the top byte of a dword.  The
    // punpckhwd mixes in stale low words from xmm3, but those bytes are
    // removed by the 0xff000000 mask below.
    "movq      " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpckhwd %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm2,%%xmm2                   \n"
    "movdqa    " MEMACCESS(1) ",%%xmm4         \n"
    "movdqa    " MEMACCESS2(0x10,1) ",%%xmm5   \n"
    // Combine Y-derived alpha with dst RGB.
    "pand      %%xmm0,%%xmm2                   \n"
    "pand      %%xmm0,%%xmm3                   \n"
    "pand      %%xmm1,%%xmm4                   \n"
    "pand      %%xmm1,%%xmm5                   \n"
    "por       %%xmm4,%%xmm2                   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "movdqa    %%xmm2," MEMACCESS(1) "         \n"
    "movdqa    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "sub       $0x8,%2                         \n"
    "jg        1b                              \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
3406#endif  // HAS_ARGBCOPYYTOALPHAROW_SSE2
3407
3408#ifdef HAS_ARGBCOPYYTOALPHAROW_AVX2
3409// width in pixels
// AVX2 variant: writes 16 Y bytes from src into the alpha channel of 16
// dst ARGB pixels, preserving dst RGB.  Each Y byte is zero-extended to a
// dword (vpmovzxbd) and shifted into the alpha position, then blended over
// dst with a 0x00ffffff mask (vpblendvb keeps dst's RGB bytes).  Unaligned
// stores; vzeroupper avoids AVX-to-SSE transition penalties on exit.
void ARGBCopyYToAlphaRow_AVX2(const uint8* src, uint8* dst, int width) {
  asm volatile (
    // ymm0 = 0x00ffffff per pixel: high bit set on the RGB bytes only.
    "vpcmpeqb  %%ymm0,%%ymm0,%%ymm0            \n"
    "vpsrld    $0x8,%%ymm0,%%ymm0              \n"
    LABELALIGN
  "1:                                          \n"
    "vpmovzxbd " MEMACCESS(0) ",%%ymm1         \n"
    "vpmovzxbd " MEMACCESS2(0x8,0) ",%%ymm2    \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "vpslld    $0x18,%%ymm1,%%ymm1             \n"
    "vpslld    $0x18,%%ymm2,%%ymm2             \n"
    "vpblendvb %%ymm0," MEMACCESS(1) ",%%ymm1,%%ymm1        \n"
    "vpblendvb %%ymm0," MEMACCESS2(0x20,1) ",%%ymm2,%%ymm2  \n"
    "vmovdqu   %%ymm1," MEMACCESS(1) "         \n"
    "vmovdqu   %%ymm2," MEMACCESS2(0x20,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
    "vzeroupper                                \n"
  : "+r"(src),   // %0
    "+r"(dst),   // %1
    "+r"(width)  // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
3439#endif  // HAS_ARGBCOPYYTOALPHAROW_AVX2
3440
3441#ifdef HAS_SETROW_X86
// Fills a row with the 32-bit value v32 using "rep stosl".  width is in
// bytes and is divided by 4, so width/4 dwords are stored; a width that
// is not a multiple of 4 leaves the trailing bytes unwritten.
void SetRow_X86(uint8* dst, uint32 v32, int width) {
  size_t width_tmp = (size_t)(width);
  asm volatile (
    "shr       $0x2,%1                         \n"
    "rep stosl " MEMSTORESTRING(eax,0) "       \n"
    : "+D"(dst),       // %0
      "+c"(width_tmp)  // %1
    : "a"(v32)         // %2
    : "memory", "cc");
}
3452
3453void ARGBSetRows_X86(uint8* dst, uint32 v32, int width,
3454                   int dst_stride, int height) {
3455  for (int y = 0; y < height; ++y) {
3456    size_t width_tmp = (size_t)(width);
3457    uint32* d = (uint32*)(dst);
3458    asm volatile (
3459      "rep stosl " MEMSTORESTRING(eax,0) "     \n"
3460      : "+D"(d),         // %0
3461        "+c"(width_tmp)  // %1
3462      : "a"(v32)         // %2
3463      : "memory", "cc");
3464    dst += dst_stride;
3465  }
3466}
3467#endif  // HAS_SETROW_X86
3468
3469#ifdef HAS_YUY2TOYROW_SSE2
// Extracts the Y plane from a YUY2 row (Y is in the even bytes of the
// Y0 U0 Y1 V0 pattern): reads 32 bytes (16 pixels) per iteration and
// writes 16 Y bytes.  movdqa accesses require 16-byte aligned pointers;
// 'pix' is consumed 16 at a time.
void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
  asm volatile (
    // xmm5 = 0x00ff per word: keeps the even (Y) bytes.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x10,%2                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
3496
// Extracts U and V planes from two adjacent YUY2 rows, averaging the rows
// vertically (pavgb with the row at src + stride_yuy2).  Each iteration
// reads 32 bytes from each row (16 pixels) and writes 8 U and 8 V bytes.
// movdqa accesses require 16-byte alignment; 'pix' is consumed 16 at a
// time.
void YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff per word mask.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    // dst_v is addressed as dst_u + (dst_v - dst_u).
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
    MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Average with the next row, then drop Y bytes and deinterleave
    // chroma: even chroma bytes -> U, odd chroma bytes -> V.
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"((intptr_t)(stride_yuy2))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
3541
// Extracts U and V planes from a single YUY2 row (no vertical averaging,
// for 4:2:2 output).  Each iteration reads 32 bytes (16 pixels) and
// writes 8 U and 8 V bytes.  movdqa loads require 16-byte alignment;
// 'pix' is consumed 16 at a time.
void YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff per word mask.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    // dst_v is addressed as dst_u + (dst_v - dst_u).
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Drop the Y bytes, then split chroma: even bytes -> U, odd -> V.
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
3581
// Same Y extraction as YUY2ToYRow_SSE2 (even bytes of a YUY2 row) but
// with unaligned movdqu loads and stores, so no pointer alignment is
// required; 'pix' is consumed 16 at a time.
void YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
                               uint8* dst_y, int pix) {
  asm volatile (
    // xmm5 = 0x00ff per word: keeps the even (Y) bytes.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    // sub before the store keeps the flags live for the jg below.
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
3609
// Same U/V extraction as YUY2ToUVRow_SSE2 (two-row vertical average via
// pavgb, then chroma deinterleave) but with unaligned movdqu loads, so no
// pointer alignment is required; 'pix' is consumed 16 at a time.
void YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
                                int stride_yuy2,
                                uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff per word mask.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    // dst_v is addressed as dst_u + (dst_v - dst_u).
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    BUNDLEALIGN
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Average with the next row, drop Y bytes, then split chroma:
    // even chroma bytes -> U, odd chroma bytes -> V.
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"((intptr_t)(stride_yuy2))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
3655
// Same single-row U/V extraction as YUY2ToUV422Row_SSE2 (no vertical
// averaging) but with unaligned movdqu loads, so no pointer alignment is
// required; 'pix' is consumed 16 at a time.
void YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff per word mask.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    // dst_v is addressed as dst_u + (dst_v - dst_u).
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Drop the Y bytes, then split chroma: even bytes -> U, odd -> V.
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_yuy2),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
3695
// Extract the Y (luma) plane from a row of UYVY (byte order U Y0 V Y1).
// Processes 16 pixels per iteration: 32 bytes in, 16 bytes out.  Uses
// aligned loads/stores (movdqa), so src_uyvy and dst_y must be 16-byte
// aligned.
void UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // In UYVY the luma byte is the high byte of each 16-bit word.
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
3720
// Extract U and V from UYVY, vertically averaging this row with the row
// stride_uyvy bytes below it (2x2 chroma subsampling).  Consumes 16 pixels
// per iteration and writes 8 bytes to each of dst_u and dst_v.  Aligned
// (movdqa) variant: both rows of src_uyvy must be 16-byte aligned.
void UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
                      uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff in every 16-bit lane: mask for the low (chroma) byte.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    // %2 = dst_v - dst_u so V is stored at (%1,%2) as dst_u advances.
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    BUNDLEALIGN
    MEMOPREG(movdqa,0x00,0,4,1,xmm2)           //  movdqa  (%0,%4,1),%%xmm2
    MEMOPREG(movdqa,0x10,0,4,1,xmm3)           //  movdqa  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Average the two rows byte-wise, then keep only the chroma bytes
    // (low byte of each word in UYVY).
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    // xmm0 = interleaved U V U V ...; separate into U (even) and V (odd).
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"((intptr_t)(stride_uyvy))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
3765
// Extract U and V from a single row of UYVY with no vertical averaging
// (4:2:2 output).  Consumes 16 pixels per iteration and writes 8 bytes to
// each of dst_u and dst_v.  Aligned (movdqa) variant.
void UYVYToUV422Row_SSE2(const uint8* src_uyvy,
                         uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff in every 16-bit lane: mask for the low (chroma) byte.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    // %2 = dst_v - dst_u so V is stored at (%1,%2) as dst_u advances.
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Keep only the chroma bytes (low byte of each word in UYVY).
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    // xmm0 = interleaved U V U V ...; separate into U (even) and V (odd).
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
3805
// Extract the Y (luma) plane from a row of UYVY (byte order U Y0 V Y1).
// Same as UYVYToYRow_SSE2 but uses unaligned loads/stores (movdqu), so no
// pointer alignment is required.  16 pixels per iteration.
void UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
                               uint8* dst_y, int pix) {
  asm volatile (
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Luma is the high byte of each 16-bit word in UYVY.
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),  // %0
    "+r"(dst_y),     // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
3831
// Extract U and V from UYVY, vertically averaging this row with the row
// stride_uyvy bytes below it.  Same as UYVYToUVRow_SSE2 but uses unaligned
// loads (movdqu).  16 pixels per iteration; 8 bytes each to dst_u/dst_v.
void UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
                                uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff in every 16-bit lane: mask for the low (chroma) byte.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    // %2 = dst_v - dst_u so V is stored at (%1,%2) as dst_u advances.
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    BUNDLEALIGN
    MEMOPREG(movdqu,0x00,0,4,1,xmm2)           //  movdqu  (%0,%4,1),%%xmm2
    MEMOPREG(movdqu,0x10,0,4,1,xmm3)           //  movdqu  0x10(%0,%4,1),%%xmm3
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Average the two rows, then keep only the chroma bytes.
    "pavgb     %%xmm2,%%xmm0                   \n"
    "pavgb     %%xmm3,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    // xmm0 = interleaved U V U V ...; separate into U (even) and V (odd).
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  : "r"((intptr_t)(stride_uyvy))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
3876
// Extract U and V from a single row of UYVY with no vertical averaging.
// Same as UYVYToUV422Row_SSE2 but uses unaligned loads (movdqu).
// 16 pixels per iteration; 8 bytes each to dst_u and dst_v.
void UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
                                   uint8* dst_u, uint8* dst_v, int pix) {
  asm volatile (
    // xmm5 = 0x00ff in every 16-bit lane: mask for the low (chroma) byte.
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrlw     $0x8,%%xmm5                     \n"
    // %2 = dst_v - dst_u so V is stored at (%1,%2) as dst_u advances.
    "sub       %1,%2                           \n"
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    // Keep only the chroma bytes (low byte of each word in UYVY).
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    // xmm0 = interleaved U V U V ...; separate into U (even) and V (odd).
    "movdqa    %%xmm0,%%xmm1                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm1,0x00,1,2,1)             //  movq    %%xmm1,(%1,%2)
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "sub       $0x10,%3                        \n"
    "jg        1b                              \n"
  : "+r"(src_uyvy),    // %0
    "+r"(dst_u),       // %1
    "+r"(dst_v),       // %2
    "+r"(pix)          // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
3916#endif  // HAS_YUY2TOYROW_SSE2
3917
3918#ifdef HAS_ARGBBLENDROW_SSE2
// Alpha blend a row of ARGB: dst = src0 over src1.  Per color channel this
// computes approximately dst = src0 + src1 * (256 - a) >> 8 (saturating),
// where a is src0's alpha; the destination alpha byte is forced to 0xff.
// Main loop handles 4 pixels at a time (the header comment previously said
// 8; the movdqu loop below consumes 16 bytes = 4 ARGB pixels), with
// 1-pixel loops before (until dst is 16-byte aligned) and after.
void ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                       uint8* dst_argb, int width) {
  asm volatile (
    // xmm7 = 0x0001 per word: bias turning (255 - a) into (256 - a).
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0xf,%%xmm7                     \n"
    // xmm6 = 0x00ff per word: selects the low byte of each word (B, R).
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x8,%%xmm6                     \n"
    // xmm5 = 0xff00 per word: selects the high byte of each word (G, A).
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psllw     $0x8,%%xmm5                     \n"
    // xmm4 = 0xff000000 per dword: the alpha byte of each ARGB pixel.
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "sub       $0x1,%3                         \n"
    "je        91f                             \n"
    "jl        99f                             \n"

    // 1 pixel loop until destination pointer is aligned.
  "10:                                         \n"
    "test      $0xf,%2                         \n"
    "je        19f                             \n"
    "movd      " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    // Invert the alpha byte (255 - a), then broadcast it to all words.
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      " MEMACCESS(1) ",%%xmm2         \n"
    "psrlw     $0x8,%%xmm3                     \n"
    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
    // Scale the background's low-byte channels by (256 - a).
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    // Scale the background's high-byte channels the same way.
    "movd      " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"  // force dst alpha = 0xff
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x1,%3                         \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x4,2) ",%2            \n"
    "jge       10b                             \n"

  "19:                                         \n"
    "add       $1-4,%3                         \n"
    "jl        49f                             \n"

    // 4 pixel loop.
    LABELALIGN
  "41:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    // Same per-pixel math as the 1-pixel loop, for 4 pixels at once.
    "pxor      %%xmm4,%%xmm3                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
    "psrlw     $0x8,%%xmm3                     \n"
    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    // Aligned store: the pre-loop above guaranteed dst alignment.
    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jge       41b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"
    "jl        99f                             \n"

    // 1 pixel loop.
  "91:                                         \n"
    "movd      " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      " MEMACCESS(1) ",%%xmm2         \n"
    "psrlw     $0x8,%%xmm3                     \n"
    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x1,%3                         \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x4,2) ",%2            \n"
    "jge       91b                             \n"
  "99:                                         \n"
  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
4038#endif  // HAS_ARGBBLENDROW_SSE2
4039
4040#ifdef HAS_ARGBBLENDROW_SSSE3
// Shuffle table for isolating alpha.
// pshufb with this table replicates each pixel's alpha byte (source offsets
// 3/7/11/15) into both 16-bit lanes of that pixel; 0x80 entries write zero,
// so each word becomes the alpha value zero-extended to 16 bits.
static uvec8 kShuffleAlpha = {
  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
};
4046
4047// Blend 8 pixels at a time
4048// Shuffle table for reversing the bytes.
4049
4050// Same as SSE2, but replaces
4051//    psrlw      xmm3, 8          // alpha
4052//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
4053//    pshuflw    xmm3, xmm3,0F5h
4054// with..
4055//    pshufb     xmm3, kShuffleAlpha // alpha
4056
// Alpha blend a row of ARGB: dst = src0 over src1, SSSE3 variant.
// Identical math to ARGBBlendRow_SSE2 (dst = src0 + src1 * (256 - a) >> 8,
// dst alpha forced to 0xff) but broadcasts the inverted alpha with a single
// pshufb (kShuffleAlpha) instead of psrlw + two pshuf*w.  Has both an
// aligned (label 40) and an unaligned (label 41) 4-pixel main loop.
void ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
                        uint8* dst_argb, int width) {
  asm volatile (
    // xmm7 = 0x0001 per word: bias turning (255 - a) into (256 - a).
    "pcmpeqb   %%xmm7,%%xmm7                   \n"
    "psrlw     $0xf,%%xmm7                     \n"
    // xmm6 = 0x00ff per word; xmm5 = 0xff00 per word; xmm4 = alpha mask.
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "psrlw     $0x8,%%xmm6                     \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psllw     $0x8,%%xmm5                     \n"
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "sub       $0x1,%3                         \n"
    "je        91f                             \n"
    "jl        99f                             \n"

    // 1 pixel loop until destination pointer is aligned.
  "10:                                         \n"
    "test      $0xf,%2                         \n"
    "je        19f                             \n"
    "movd      " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    // Invert alpha (255 - a) and broadcast it to all words via pshufb.
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      " MEMACCESS(1) ",%%xmm2         \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"  // force dst alpha = 0xff
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x1,%3                         \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x4,2) ",%2            \n"
    "jge       10b                             \n"

  "19:                                         \n"
    "add       $1-4,%3                         \n"
    "jl        49f                             \n"
    // Pick the aligned loop only if both sources are 16-byte aligned.
    "test      $0xf,%0                         \n"
    "jne       41f                             \n"
    "test      $0xf,%1                         \n"
    "jne       41f                             \n"

    // 4 pixel loop.
    LABELALIGN
  "40:                                         \n"
    "movdqa    " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movdqa    " MEMACCESS(1) ",%%xmm2         \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movdqa    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jge       40b                             \n"
    "jmp       49f                             \n"

    // 4 pixel unaligned loop.
    LABELALIGN
  "41:                                         \n"
    "movdqu    " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    // dst store is movdqa: dst was aligned by the 1-pixel pre-loop.
    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jge       41b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"
    "jl        99f                             \n"

    // 1 pixel loop.
  "91:                                         \n"
    "movd      " MEMACCESS(0) ",%%xmm3         \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "movdqa    %%xmm3,%%xmm0                   \n"
    "pxor      %%xmm4,%%xmm3                   \n"
    "movd      " MEMACCESS(1) ",%%xmm2         \n"
    "pshufb    %4,%%xmm3                       \n"
    "pand      %%xmm6,%%xmm2                   \n"
    "paddw     %%xmm7,%%xmm3                   \n"
    "pmullw    %%xmm3,%%xmm2                   \n"
    "movd      " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "por       %%xmm4,%%xmm0                   \n"
    "pmullw    %%xmm3,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm2                     \n"
    "paddusb   %%xmm2,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "paddusb   %%xmm1,%%xmm0                   \n"
    "sub       $0x1,%3                         \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x4,2) ",%2            \n"
    "jge       91b                             \n"
  "99:                                         \n"
  : "+r"(src_argb0),    // %0
    "+r"(src_argb1),    // %1
    "+r"(dst_argb),     // %2
    "+r"(width)         // %3
  : "m"(kShuffleAlpha)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
4200#endif  // HAS_ARGBBLENDROW_SSSE3
4201
4202#ifdef HAS_ARGBATTENUATEROW_SSE2
// Attenuate 4 pixels at a time: premultiply B, G and R by the pixel's
// alpha, leaving the alpha channel unchanged.
// aligned to 16 bytes
void ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // xmm4 = 0xff000000 per dword (alpha byte mask);
    // xmm5 = 0x00ffffff per dword (color byte mask).
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "pslld     $0x18,%%xmm4                    \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x8,%%xmm5                     \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // punpcklbw x,x widens each byte c to the word c*0x0101; broadcasting
    // the alpha word and pmulhuw then approximates (c * a) / 255.
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    // Same for the high two pixels.
    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    // Re-read the source to recover the untouched alpha bytes.
    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "pand      %%xmm4,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"  // merge colors + alpha
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    // NOTE(review): xmm3 is declared clobbered but never used above.
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
4247#endif  // HAS_ARGBATTENUATEROW_SSE2
4248
4249#ifdef HAS_ARGBATTENUATEROW_SSSE3
// Shuffle tables duplicating alpha for ARGBAttenuateRow_SSSE3.
// kShuffleAlpha0 spreads the alpha bytes of pixels 0-1 (source offsets 3
// and 7) across the six color-byte positions of each 8-byte half; the 128u
// entries zero the alpha-word slots so alpha is not attenuated.
static uvec8 kShuffleAlpha0 = {
  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
};
// Same layout for pixels 2-3 (source offsets 11 and 15).
static uvec8 kShuffleAlpha1 = {
  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
};
// Attenuate 4 pixels at a time: premultiply B, G and R by the pixel's
// alpha, leaving the alpha channel unchanged.  SSSE3 variant using pshufb
// to broadcast alpha.  Uses unaligned loads/stores (movdqu), so no 16-byte
// alignment is required (the previous "aligned to 16 bytes" note was stale).
void ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    // xmm3 = 0xff000000 per dword: alpha byte mask.
    "pcmpeqb   %%xmm3,%%xmm3                   \n"
    "pslld     $0x18,%%xmm3                    \n"
    // xmm4/xmm5 = shuffle tables duplicating alpha for low/high pixels.
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // Low two pixels: alpha words (pshufb) * byte-duplicated colors.
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "pshufb    %%xmm4,%%xmm0                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "punpcklbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm1,%%xmm0                   \n"
    // High two pixels.
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "punpckhbw %%xmm2,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    // Recover the original alpha bytes and merge with attenuated colors.
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "pand      %%xmm3,%%xmm2                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm2,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width)        // %2
  : "m"(kShuffleAlpha0),  // %3
    "m"(kShuffleAlpha1)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
4302#endif  // HAS_ARGBATTENUATEROW_SSSE3
4303
4304#ifdef HAS_ARGBUNATTENUATEROW_SSE2
// Unattenuate 4 pixels at a time: divide B, G and R by the pixel's alpha
// using the fixed_invtbl8 reciprocal table, indexed by each alpha byte
// (loaded with movzb from offsets 3/7/11/15 of the 4-pixel group).
// Uses unaligned loads/stores (movdqu); no 16-byte alignment requirement
// (the previous "aligned to 16 bytes" note was stale).
void ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                             int width) {
  uintptr_t alpha = 0;  // scratch register for the per-pixel alpha index
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    // Pixel 0/1: fetch 1/alpha from the table and broadcast to B,G,R words.
    "movzb     " MEMACCESS2(0x03,0) ",%3       \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
    "movzb     " MEMACCESS2(0x07,0) ",%3       \n"
    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    // Pixel 2/3: same, on the high half.
    "movdqu    " MEMACCESS(0) ",%%xmm1         \n"
    "movzb     " MEMACCESS2(0x0b,0) ",%3       \n"
    "punpckhbw %%xmm1,%%xmm1                   \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,4,3,4,xmm2)             //  movd      0x0(%4,%3,4),%%xmm2
    "movzb     " MEMACCESS2(0x0f,0) ",%3       \n"
    MEMOPREG(movd,0x00,4,3,4,xmm3)             //  movd      0x0(%4,%3,4),%%xmm3
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "movlhps   %%xmm3,%%xmm2                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+r"(width),       // %2
    "+r"(alpha)        // %3
  : "r"(fixed_invtbl8)  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    // NOTE(review): xmm4 and xmm5 are declared clobbered but never used.
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
4355#endif  // HAS_ARGBUNATTENUATEROW_SSE2
4356
4357#ifdef HAS_ARGBGRAYROW_SSSE3
// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// Gray value = full-range (JPeg) luma: pmaddubsw with kARGBToYJ, rounded
// with kAddYJ64 and shifted right 7.  The gray value is replicated into
// B, G and R; the original alpha byte is preserved.
// Aligned (movdqa) variant: src and dst must be 16-byte aligned.
void ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
  asm volatile (
    "movdqa    %3,%%xmm4                       \n"
    "movdqa    %4,%%xmm5                       \n"

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // Compute 8 luma values: coeffs * bytes, pairwise add, round, >> 7.
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm4,%%xmm0                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm0                   \n"
    "paddw     %%xmm5,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"  // 8 gray bytes
    // Extract the 8 original alpha bytes (high byte of each dword).
    "movdqa    " MEMACCESS(0) ",%%xmm2         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm3   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrld     $0x18,%%xmm2                    \n"
    "psrld     $0x18,%%xmm3                    \n"
    "packuswb  %%xmm3,%%xmm2                   \n"
    "packuswb  %%xmm2,%%xmm2                   \n"
    // Interleave gray,gray and gray,alpha pairs back into ARGB dwords.
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"
    "punpcklbw %%xmm2,%%xmm3                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"
    "punpckhwd %%xmm3,%%xmm1                   \n"
    "sub       $0x8,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),   // %0
    "+r"(dst_argb),   // %1
    "+r"(width)       // %2
  : "m"(kARGBToYJ),   // %3
    "m"(kAddYJ64)     // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
4404#endif  // HAS_ARGBGRAYROW_SSSE3
4405
4406#ifdef HAS_ARGBSEPIAROW_SSSE3
4407//    b = (r * 35 + g * 68 + b * 17) >> 7
4408//    g = (r * 45 + g * 88 + b * 22) >> 7
4409//    r = (r * 50 + g * 98 + b * 24) >> 7
4410// Constant for ARGB color to sepia tone
// Weights are laid out {B, G, R, A} per pixel for pmaddubsw; the 0 in the
// alpha lane keeps alpha out of the weighted sum.  Sums are descaled with
// a >> 7 shift in ARGBSepiaRow_SSSE3 (7-bit fixed point coefficients).
static vec8 kARGBToSepiaB = {
  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
};

static vec8 kARGBToSepiaG = {
  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
};

static vec8 kARGBToSepiaR = {
  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
};
4422
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// In-place filter: dst_argb is both source and destination, and must be
// 16-byte aligned (movdqa loads/stores).  Alpha is preserved unchanged.
// NOTE(review): loop assumes width is a positive multiple of 8 — callers
// appear to handle any remainder; confirm before new uses.
void ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
  asm volatile (
    // Load the three sepia coefficient vectors (B, G, R rows) once.
    "movdqa    %2,%%xmm2                       \n"
    "movdqa    %3,%%xmm3                       \n"
    "movdqa    %4,%%xmm4                       \n"

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // New B channel: pmaddubsw + phaddw form the per-pixel weighted sum
    // of B,G,R; >> 7 descales the fixed-point coefficients.
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm6   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm6                   \n"
    "phaddw    %%xmm6,%%xmm0                   \n"
    "psrlw     $0x7,%%xmm0                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    // New G channel.
    "movdqa    " MEMACCESS(0) ",%%xmm5         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm3,%%xmm5                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"  // interleave B,G pairs
    // New R channel.
    "movdqa    " MEMACCESS(0) ",%%xmm5         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm4,%%xmm5                   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "phaddw    %%xmm1,%%xmm5                   \n"
    "psrlw     $0x7,%%xmm5                     \n"
    "packuswb  %%xmm5,%%xmm5                   \n"
    // Original alpha: shift each pixel's dword right 24 to isolate A.
    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "psrld     $0x18,%%xmm6                    \n"
    "psrld     $0x18,%%xmm1                    \n"
    "packuswb  %%xmm1,%%xmm6                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm5                   \n"  // interleave R,A pairs
    // Interleave BG with RA words to rebuild BGRA and store in place.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklwd %%xmm5,%%xmm0                   \n"
    "punpckhwd %%xmm5,%%xmm1                   \n"
    "sub       $0x8,%1                         \n"
    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,0) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),      // %0
    "+r"(width)          // %1
  : "m"(kARGBToSepiaB),  // %2
    "m"(kARGBToSepiaG),  // %3
    "m"(kARGBToSepiaR)   // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
4481#endif  // HAS_ARGBSEPIAROW_SSSE3
4482
4483#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
// Same as Sepia except matrix is provided.
// matrix_argb points to 16 signed bytes: four rows of {B,G,R,A} weights,
// one row per output channel; results are descaled by >> 6 (psraw), i.e.
// coefficients are in 6-bit fixed point.  src_argb/dst_argb must be
// 16-byte aligned (movdqa); matrix_argb may be unaligned (movdqu).
void ARGBColorMatrixRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                              const int8* matrix_argb, int width) {
  asm volatile (
    // Broadcast each 4-byte matrix row into its own register via pshufd.
    "movdqu    " MEMACCESS(3) ",%%xmm5         \n"
    "pshufd    $0x00,%%xmm5,%%xmm2             \n"
    "pshufd    $0x55,%%xmm5,%%xmm3             \n"
    "pshufd    $0xaa,%%xmm5,%%xmm4             \n"
    "pshufd    $0xff,%%xmm5,%%xmm5             \n"

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // New B (xmm0) and G (xmm6) channels via signed multiply-add with
    // saturating horizontal add (phaddsw).
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm2,%%xmm0                   \n"
    "pmaddubsw %%xmm2,%%xmm7                   \n"
    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "pmaddubsw %%xmm3,%%xmm6                   \n"
    "pmaddubsw %%xmm3,%%xmm1                   \n"
    "phaddsw   %%xmm7,%%xmm0                   \n"
    "phaddsw   %%xmm1,%%xmm6                   \n"
    "psraw     $0x6,%%xmm0                     \n"  // descale fixed point
    "psraw     $0x6,%%xmm6                     \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm0                   \n"  // interleave B,G pairs
    // New R (xmm1) and A (xmm6) channels.
    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm4,%%xmm1                   \n"
    "pmaddubsw %%xmm4,%%xmm7                   \n"
    "phaddsw   %%xmm7,%%xmm1                   \n"
    "movdqa    " MEMACCESS(0) ",%%xmm6         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm7   \n"
    "pmaddubsw %%xmm5,%%xmm6                   \n"
    "pmaddubsw %%xmm5,%%xmm7                   \n"
    "phaddsw   %%xmm7,%%xmm6                   \n"
    "psraw     $0x6,%%xmm1                     \n"
    "psraw     $0x6,%%xmm6                     \n"
    "packuswb  %%xmm1,%%xmm1                   \n"
    "packuswb  %%xmm6,%%xmm6                   \n"
    "punpcklbw %%xmm6,%%xmm1                   \n"  // interleave R,A pairs
    // Interleave BG with RA words to rebuild BGRA pixels and store.
    "movdqa    %%xmm0,%%xmm6                   \n"
    "punpcklwd %%xmm1,%%xmm0                   \n"
    "punpckhwd %%xmm1,%%xmm6                   \n"
    "sub       $0x8,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm6," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),      // %0
    "+r"(dst_argb),      // %1
    "+r"(width)          // %2
  : "r"(matrix_argb)     // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
4547#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
4548
4549#ifdef HAS_ARGBQUANTIZEROW_SSE2
// Quantize 4 ARGB pixels (16 bytes).
// aligned to 16 bytes
// Posterize: each color channel becomes
//   ((c * scale) >> 16) * interval_size + interval_offset
// while alpha is passed through untouched (re-inserted via the 0xff000000
// mask in xmm6).  In-place on dst_argb.
void ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
                          int interval_offset, int width) {
  asm volatile (
    // Broadcast scale/interval_size/interval_offset into words 0-2 of each
    // 64-bit half ($0x40 shuffle); word 3 stays 0 so alpha is not scaled.
    "movd      %2,%%xmm2                       \n"
    "movd      %3,%%xmm3                       \n"
    "movd      %4,%%xmm4                       \n"
    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for byte->word unpack
    "pcmpeqb   %%xmm6,%%xmm6                   \n"
    "pslld     $0x18,%%xmm6                    \n"  // alpha mask 0xff000000

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // Low 2 pixels to words, * scale (high 16 bits of product).
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"
    // High 2 pixels likewise.
    "movdqa    " MEMACCESS(0) ",%%xmm1         \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "pmullw    %%xmm3,%%xmm0                   \n"  // * interval_size
    "movdqa    " MEMACCESS(0) ",%%xmm7         \n"  // original pixels for alpha
    "pmullw    %%xmm3,%%xmm1                   \n"
    "pand      %%xmm6,%%xmm7                   \n"  // keep only alpha bytes
    "paddw     %%xmm4,%%xmm0                   \n"  // + interval_offset
    "paddw     %%xmm4,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "por       %%xmm7,%%xmm0                   \n"  // restore original alpha
    "sub       $0x4,%1                         \n"
    "movdqa    %%xmm0," MEMACCESS(0) "         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),       // %0
    "+r"(width)           // %1
  : "r"(scale),           // %2
    "r"(interval_size),   // %3
    "r"(interval_offset)  // %4
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
4600#endif  // HAS_ARGBQUANTIZEROW_SSE2
4601
4602#ifdef HAS_ARGBSHADEROW_SSE2
// Shade 4 pixels at a time by specified value.
// Aligned to 16 bytes.
// Each channel is multiplied by the matching channel of the 32-bit ARGB
// 'value'.  Both pixel and value bytes are duplicated into words (x * 257)
// before pmulhuw + >> 8, which approximates dst = p * v / 255 per channel.
void ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
                       uint32 value) {
  asm volatile (
    // Expand value's 4 bytes to 4 duplicated words and replicate the
    // quadword so the multiplier covers 2 pixels per register half.
    "movd      %3,%%xmm2                       \n"
    "punpcklbw %%xmm2,%%xmm2                   \n"
    "punpcklqdq %%xmm2,%%xmm2                  \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm0,%%xmm0                   \n"  // duplicate pixel bytes
    "punpckhbw %%xmm1,%%xmm1                   \n"
    "pmulhuw   %%xmm2,%%xmm0                   \n"  // high 16 bits of product
    "pmulhuw   %%xmm2,%%xmm1                   \n"
    "psrlw     $0x8,%%xmm0                     \n"
    "psrlw     $0x8,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(value)       // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2"
#endif
  );
}
4639#endif  // HAS_ARGBSHADEROW_SSE2
4640
4641#ifdef HAS_ARGBMULTIPLYROW_SSE2
4642// Multiply 2 rows of ARGB pixels together, 4 pixels at a time.
4643void ARGBMultiplyRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
4644                          uint8* dst_argb, int width) {
4645  asm volatile (
4646    "pxor      %%xmm5,%%xmm5                   \n"
4647
4648    // 4 pixel loop.
4649    LABELALIGN
4650  "1:                                          \n"
4651    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
4652    "lea       " MEMLEA(0x10,0) ",%0           \n"
4653    "movdqu    " MEMACCESS(1) ",%%xmm2         \n"
4654    "lea       " MEMLEA(0x10,1) ",%1           \n"
4655    "movdqu    %%xmm0,%%xmm1                   \n"
4656    "movdqu    %%xmm2,%%xmm3                   \n"
4657    "punpcklbw %%xmm0,%%xmm0                   \n"
4658    "punpckhbw %%xmm1,%%xmm1                   \n"
4659    "punpcklbw %%xmm5,%%xmm2                   \n"
4660    "punpckhbw %%xmm5,%%xmm3                   \n"
4661    "pmulhuw   %%xmm2,%%xmm0                   \n"
4662    "pmulhuw   %%xmm3,%%xmm1                   \n"
4663    "packuswb  %%xmm1,%%xmm0                   \n"
4664    "sub       $0x4,%3                         \n"
4665    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
4666    "lea       " MEMLEA(0x10,2) ",%2           \n"
4667    "jg        1b                              \n"
4668  : "+r"(src_argb0),  // %0
4669    "+r"(src_argb1),  // %1
4670    "+r"(dst_argb),   // %2
4671    "+r"(width)       // %3
4672  :
4673  : "memory", "cc"
4674#if defined(__SSE2__)
4675    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
4676#endif
4677  );
4678}
4679#endif  // HAS_ARGBMULTIPLYROW_SSE2
4680
4681#ifdef HAS_ARGBADDROW_SSE2
// Add 2 rows of ARGB pixels together, 4 pixels at a time.
// Per byte: dst = min(src0 + src1, 255) (paddusb saturating add).
// Unaligned pointers are OK (movdqu loads/stores).
void ARGBAddRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                     uint8* dst_argb, int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "paddusb   %%xmm1,%%xmm0                   \n"  // saturating byte add
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
4709#endif  // HAS_ARGBADDROW_SSE2
4710
4711#ifdef HAS_ARGBSUBTRACTROW_SSE2
// Subtract 2 rows of ARGB pixels, 4 pixels at a time.
// Per byte: dst = max(src0 - src1, 0) (psubusb saturating subtract).
// Unaligned pointers are OK (movdqu loads/stores).
void ARGBSubtractRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
                          uint8* dst_argb, int width) {
  asm volatile (
    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "psubusb   %%xmm1,%%xmm0                   \n"  // saturating byte subtract
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jg        1b                              \n"
  : "+r"(src_argb0),  // %0
    "+r"(src_argb1),  // %1
    "+r"(dst_argb),   // %2
    "+r"(width)       // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1"
#endif
  );
}
4739#endif  // HAS_ARGBSUBTRACTROW_SSE2
4740
4741#ifdef HAS_SOBELXROW_SSE2
// SobelX as a matrix is
// -1  0  1
// -2  0  2
// -1  0  1
// Produces 8 output bytes per iteration:
//   dst[i] = clamp255(|(y0[i]-y0[i+2]) + 2*(y1[i]-y1[i+2]) + (y2[i]-y2[i+2])|)
// where y0/y1/y2 are the three source rows.
void SobelXRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    const uint8* src_y2, uint8* dst_sobelx, int width) {
  asm volatile (
    // Convert the other pointers to offsets from src_y0 so only %0 advances.
    "sub       %0,%1                           \n"
    "sub       %0,%2                           \n"
    "sub       %0,%3                           \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for byte->word unpack

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // Top row: y0[i] - y0[i+2] as signed 16-bit words.
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    "movq      " MEMACCESS2(0x2,0) ",%%xmm1    \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "psubw     %%xmm1,%%xmm0                   \n"
    // Middle row difference (added twice below for the x2 weight).
    BUNDLEALIGN
    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
    MEMOPREG(movq,0x02,0,1,1,xmm2)             //  movq      0x2(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "psubw     %%xmm2,%%xmm1                   \n"
    // Bottom row difference.
    BUNDLEALIGN
    MEMOPREG(movq,0x00,0,2,1,xmm2)             //  movq      (%0,%2,1),%%xmm2
    MEMOPREG(movq,0x02,0,2,1,xmm3)             //  movq      0x2(%0,%2,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "psubw     %%xmm3,%%xmm2                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    // Absolute value via max(x, -x), then saturate words to bytes.
    "pxor      %%xmm1,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm1                   \n"
    "pmaxsw    %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "sub       $0x8,%4                         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm0,0x00,0,3,1)             //  movq      %%xmm0,(%0,%3,1)
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "jg        1b                              \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(src_y2),      // %2
    "+r"(dst_sobelx),  // %3
    "+r"(width)        // %4
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
4801#endif  // HAS_SOBELXROW_SSE2
4802
4803#ifdef HAS_SOBELYROW_SSE2
// SobelY as a matrix is
// -1 -2 -1
//  0  0  0
//  1  2  1
// Produces 8 output bytes per iteration:
//   dst[i] = clamp255(|(y0[i]-y1[i]) + 2*(y0[i+1]-y1[i+1]) + (y0[i+2]-y1[i+2])|)
// where y0/y1 are the two source rows.
void SobelYRow_SSE2(const uint8* src_y0, const uint8* src_y1,
                    uint8* dst_sobely, int width) {
  asm volatile (
    // Convert the other pointers to offsets from src_y0 so only %0 advances.
    "sub       %0,%1                           \n"
    "sub       %0,%2                           \n"
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for byte->word unpack

    // 8 pixel loop.
    LABELALIGN
  "1:                                          \n"
    // Column 0: y0[i] - y1[i] as signed 16-bit words.
    "movq      " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movq,0x00,0,1,1,xmm1)             //  movq      (%0,%1,1),%%xmm1
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "psubw     %%xmm1,%%xmm0                   \n"
    // Column 1 difference (added twice below for the x2 weight).
    BUNDLEALIGN
    "movq      " MEMACCESS2(0x1,0) ",%%xmm1    \n"
    MEMOPREG(movq,0x01,0,1,1,xmm2)             //  movq      0x1(%0,%1,1),%%xmm2
    "punpcklbw %%xmm5,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "psubw     %%xmm2,%%xmm1                   \n"
    // Column 2 difference.
    BUNDLEALIGN
    "movq      " MEMACCESS2(0x2,0) ",%%xmm2    \n"
    MEMOPREG(movq,0x02,0,1,1,xmm3)             //  movq      0x2(%0,%1,1),%%xmm3
    "punpcklbw %%xmm5,%%xmm2                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"
    "psubw     %%xmm3,%%xmm2                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    "paddw     %%xmm1,%%xmm0                   \n"
    // Absolute value via max(x, -x), then saturate words to bytes.
    "pxor      %%xmm1,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm1                   \n"
    "pmaxsw    %%xmm1,%%xmm0                   \n"
    "packuswb  %%xmm0,%%xmm0                   \n"
    "sub       $0x8,%3                         \n"
    BUNDLEALIGN
    MEMOPMEM(movq,xmm0,0x00,0,2,1)             //  movq      %%xmm0,(%0,%2,1)
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "jg        1b                              \n"
  : "+r"(src_y0),      // %0
    "+r"(src_y1),      // %1
    "+r"(dst_sobely),  // %2
    "+r"(width)        // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
4861#endif  // HAS_SOBELYROW_SSE2
4862
4863#ifdef HAS_SOBELROW_SSE2
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
// Processes 16 source pixels -> 64 bytes of ARGB per iteration; pointers
// must be 16-byte aligned (movdqa).
void SobelRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                   uint8* dst_argb, int width) {
  asm volatile (
    // Address src_sobely relative to src_sobelx so one pointer advances.
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "pslld     $0x18,%%xmm5                    \n"  // alpha mask 0xff000000

    // 16 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(movdqa,0x00,0,1,1,xmm1)           //  movdqa    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "paddusb   %%xmm1,%%xmm0                   \n"  // sobel = satadd(x, y)
    // Replicate each sobel byte into B,G,R positions and OR in alpha.
    "movdqa    %%xmm0,%%xmm2                   \n"
    "punpcklbw %%xmm0,%%xmm2                   \n"
    "punpckhbw %%xmm0,%%xmm0                   \n"
    "movdqa    %%xmm2,%%xmm1                   \n"
    "punpcklwd %%xmm2,%%xmm1                   \n"
    "punpckhwd %%xmm2,%%xmm2                   \n"
    "por       %%xmm5,%%xmm1                   \n"
    "por       %%xmm5,%%xmm2                   \n"
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklwd %%xmm0,%%xmm3                   \n"
    "punpckhwd %%xmm0,%%xmm0                   \n"
    "por       %%xmm5,%%xmm3                   \n"
    "por       %%xmm5,%%xmm0                   \n"
    "sub       $0x10,%3                        \n"
    "movdqa    %%xmm1," MEMACCESS(2) "         \n"
    "movdqa    %%xmm2," MEMACCESS2(0x10,2) "   \n"
    "movdqa    %%xmm3," MEMACCESS2(0x20,2) "   \n"
    "movdqa    %%xmm0," MEMACCESS2(0x30,2) "   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "jg        1b                              \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
#endif
  );
}
4917#endif  // HAS_SOBELROW_SSE2
4918
4919#ifdef HAS_SOBELTOPLANEROW_SSE2
4920// Adds Sobel X and Sobel Y and stores Sobel into a plane.
4921void SobelToPlaneRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
4922                          uint8* dst_y, int width) {
4923  asm volatile (
4924    "sub       %0,%1                           \n"
4925    "pcmpeqb   %%xmm5,%%xmm5                   \n"
4926    "pslld     $0x18,%%xmm5                    \n"
4927
4928    // 8 pixel loop.
4929    LABELALIGN
4930  "1:                                          \n"
4931    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
4932    MEMOPREG(movdqa,0x00,0,1,1,xmm1)           //  movdqa    (%0,%1,1),%%xmm1
4933    "lea       " MEMLEA(0x10,0) ",%0           \n"
4934    "paddusb   %%xmm1,%%xmm0                   \n"
4935    "sub       $0x10,%3                        \n"
4936    "movdqa    %%xmm0," MEMACCESS(2) "         \n"
4937    "lea       " MEMLEA(0x10,2) ",%2           \n"
4938    "jg        1b                              \n"
4939  : "+r"(src_sobelx),  // %0
4940    "+r"(src_sobely),  // %1
4941    "+r"(dst_y),       // %2
4942    "+r"(width)        // %3
4943  :
4944  : "memory", "cc"
4945#if defined(__native_client__) && defined(__x86_64__)
4946    , "r14"
4947#endif
4948#if defined(__SSE2__)
4949    , "xmm0", "xmm1"
4950#endif
4951  );
4952}
4953#endif  // HAS_SOBELTOPLANEROW_SSE2
4954
4955#ifdef HAS_SOBELXYROW_SSE2
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
// Processes 16 source pixels -> 64 bytes of ARGB per iteration; pointers
// must be 16-byte aligned (movdqa).
void SobelXYRow_SSE2(const uint8* src_sobelx, const uint8* src_sobely,
                     uint8* dst_argb, int width) {
  asm volatile (
    // Address src_sobely relative to src_sobelx so one pointer advances.
    "sub       %0,%1                           \n"
    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // 0xff bytes: alpha channel

    // 16 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"  // xmm0 = sobelx
    MEMOPREG(movdqa,0x00,0,1,1,xmm1)           //  movdqa    (%0,%1,1),%%xmm1
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm2                   \n"
    "paddusb   %%xmm1,%%xmm2                   \n"  // xmm2 = sobel = satadd(x, y)
    // Build {B=sobely, G=sobel, R=sobelx, A=255} by byte/word interleaves.
    "movdqa    %%xmm0,%%xmm3                   \n"
    "punpcklbw %%xmm5,%%xmm3                   \n"  // R,A pairs (low)
    "punpckhbw %%xmm5,%%xmm0                   \n"  // R,A pairs (high)
    "movdqa    %%xmm1,%%xmm4                   \n"
    "punpcklbw %%xmm2,%%xmm4                   \n"  // B,G pairs (low)
    "punpckhbw %%xmm2,%%xmm1                   \n"  // B,G pairs (high)
    "movdqa    %%xmm4,%%xmm6                   \n"
    "punpcklwd %%xmm3,%%xmm6                   \n"
    "punpckhwd %%xmm3,%%xmm4                   \n"
    "movdqa    %%xmm1,%%xmm7                   \n"
    "punpcklwd %%xmm0,%%xmm7                   \n"
    "punpckhwd %%xmm0,%%xmm1                   \n"
    "sub       $0x10,%3                        \n"
    "movdqa    %%xmm6," MEMACCESS(2) "         \n"
    "movdqa    %%xmm4," MEMACCESS2(0x10,2) "   \n"
    "movdqa    %%xmm7," MEMACCESS2(0x20,2) "   \n"
    "movdqa    %%xmm1," MEMACCESS2(0x30,2) "   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "jg        1b                              \n"
  : "+r"(src_sobelx),  // %0
    "+r"(src_sobely),  // %1
    "+r"(dst_argb),    // %2
    "+r"(width)        // %3
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
5008#endif  // HAS_SOBELXYROW_SSE2
5009
5010#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
// Creates a table of cumulative sums where each value is a sum of all values
// above and to the left of the value, inclusive of the value.
// row: width*4 input bytes (BGRA).  cumsum/previous_cumsum: width*4 int32
// entries (one int32 per channel per pixel).
// NOTE(review): the fast path checks only cumsum (%1) for 16-byte
// alignment, yet also reads previous_cumsum (%2) with movdqa; callers
// apparently guarantee both rows come from the same aligned allocation --
// verify before reusing this routine elsewhere.
void ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
                                  const int32* previous_cumsum, int width) {
  asm volatile (
    "pxor      %%xmm0,%%xmm0                   \n"  // running per-channel sum
    "pxor      %%xmm1,%%xmm1                   \n"  // zero for unpacking
    "sub       $0x4,%3                         \n"
    "jl        49f                             \n"  // fewer than 4 pixels
    "test      $0xf,%1                         \n"
    "jne       49f                             \n"  // cumsum unaligned

  // 4 pixel loop
    LABELALIGN
  "40:                                         \n"
    // Expand 16 bytes (4 BGRA pixels) to four vectors of 4 x int32.
    "movdqu    " MEMACCESS(0) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm2,%%xmm4                   \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "punpckhwd %%xmm1,%%xmm3                   \n"
    "punpckhbw %%xmm1,%%xmm4                   \n"
    "movdqa    %%xmm4,%%xmm5                   \n"
    "punpcklwd %%xmm1,%%xmm4                   \n"
    "punpckhwd %%xmm1,%%xmm5                   \n"
    // For each pixel: fold it into the running sum, then add the entry
    // from the previous row's cumulative sum.
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqa    " MEMACCESS(2) ",%%xmm2         \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "paddd     %%xmm3,%%xmm0                   \n"
    "movdqa    " MEMACCESS2(0x10,2) ",%%xmm3   \n"
    "paddd     %%xmm0,%%xmm3                   \n"
    "paddd     %%xmm4,%%xmm0                   \n"
    "movdqa    " MEMACCESS2(0x20,2) ",%%xmm4   \n"
    "paddd     %%xmm0,%%xmm4                   \n"
    "paddd     %%xmm5,%%xmm0                   \n"
    "movdqa    " MEMACCESS2(0x30,2) ",%%xmm5   \n"
    "lea       " MEMLEA(0x40,2) ",%2           \n"
    "paddd     %%xmm0,%%xmm5                   \n"
    "movdqa    %%xmm2," MEMACCESS(1) "         \n"
    "movdqa    %%xmm3," MEMACCESS2(0x10,1) "   \n"
    "movdqa    %%xmm4," MEMACCESS2(0x20,1) "   \n"
    "movdqa    %%xmm5," MEMACCESS2(0x30,1) "   \n"
    "lea       " MEMLEA(0x40,1) ",%1           \n"
    "sub       $0x4,%3                         \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%3                         \n"  // restore remainder count
    "jl        19f                             \n"

  // 1 pixel loop
    LABELALIGN
  "10:                                         \n"
    "movd      " MEMACCESS(0) ",%%xmm2         \n"  // one BGRA pixel
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "punpcklbw %%xmm1,%%xmm2                   \n"
    "punpcklwd %%xmm1,%%xmm2                   \n"
    "paddd     %%xmm2,%%xmm0                   \n"
    "movdqu    " MEMACCESS(2) ",%%xmm2         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "paddd     %%xmm0,%%xmm2                   \n"
    "movdqu    %%xmm2," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "sub       $0x1,%3                         \n"
    "jge       10b                             \n"

  "19:                                         \n"
  : "+r"(row),  // %0
    "+r"(cumsum),  // %1
    "+r"(previous_cumsum),  // %2
    "+r"(width)  // %3
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
5090#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
5091
5092#ifdef HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5093void CumulativeSumToAverageRow_SSE2(const int32* topleft, const int32* botleft,
5094                                    int width, int area, uint8* dst,
5095                                    int count) {
5096  asm volatile (
5097    "movd      %5,%%xmm5                       \n"
5098    "cvtdq2ps  %%xmm5,%%xmm5                   \n"
5099    "rcpss     %%xmm5,%%xmm4                   \n"
5100    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
5101    "sub       $0x4,%3                         \n"
5102    "jl        49f                             \n"
5103    "cmpl      $0x80,%5                        \n"
5104    "ja        40f                             \n"
5105
5106    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
5107    "pcmpeqb   %%xmm6,%%xmm6                   \n"
5108    "psrld     $0x10,%%xmm6                    \n"
5109    "cvtdq2ps  %%xmm6,%%xmm6                   \n"
5110    "addps     %%xmm6,%%xmm5                   \n"
5111    "mulps     %%xmm4,%%xmm5                   \n"
5112    "cvtps2dq  %%xmm5,%%xmm5                   \n"
5113    "packssdw  %%xmm5,%%xmm5                   \n"
5114
5115  // 4 pixel small loop                        \n"
5116    LABELALIGN
5117  "4:                                         \n"
5118    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
5119    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
5120    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
5121    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
5122    BUNDLEALIGN
5123    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
5124    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
5125    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
5126    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
5127    "lea       " MEMLEA(0x40,0) ",%0           \n"
5128    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
5129    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
5130    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
5131    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
5132    BUNDLEALIGN
5133    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
5134    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
5135    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
5136    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
5137    "lea       " MEMLEA(0x40,1) ",%1           \n"
5138    "packssdw  %%xmm1,%%xmm0                   \n"
5139    "packssdw  %%xmm3,%%xmm2                   \n"
5140    "pmulhuw   %%xmm5,%%xmm0                   \n"
5141    "pmulhuw   %%xmm5,%%xmm2                   \n"
5142    "packuswb  %%xmm2,%%xmm0                   \n"
5143    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
5144    "lea       " MEMLEA(0x10,2) ",%2           \n"
5145    "sub       $0x4,%3                         \n"
5146    "jge       4b                              \n"
5147    "jmp       49f                             \n"
5148
  // 4 pixel loop.
5150    LABELALIGN
5151  "40:                                         \n"
5152    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
5153    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
5154    "movdqa    " MEMACCESS2(0x20,0) ",%%xmm2   \n"
5155    "movdqa    " MEMACCESS2(0x30,0) ",%%xmm3   \n"
5156    BUNDLEALIGN
5157    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
5158    MEMOPREG(psubd,0x10,0,4,4,xmm1)            // psubd    0x10(%0,%4,4),%%xmm1
5159    MEMOPREG(psubd,0x20,0,4,4,xmm2)            // psubd    0x20(%0,%4,4),%%xmm2
5160    MEMOPREG(psubd,0x30,0,4,4,xmm3)            // psubd    0x30(%0,%4,4),%%xmm3
5161    "lea       " MEMLEA(0x40,0) ",%0           \n"
5162    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
5163    "psubd     " MEMACCESS2(0x10,1) ",%%xmm1   \n"
5164    "psubd     " MEMACCESS2(0x20,1) ",%%xmm2   \n"
5165    "psubd     " MEMACCESS2(0x30,1) ",%%xmm3   \n"
5166    BUNDLEALIGN
5167    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
5168    MEMOPREG(paddd,0x10,1,4,4,xmm1)            // paddd    0x10(%1,%4,4),%%xmm1
5169    MEMOPREG(paddd,0x20,1,4,4,xmm2)            // paddd    0x20(%1,%4,4),%%xmm2
5170    MEMOPREG(paddd,0x30,1,4,4,xmm3)            // paddd    0x30(%1,%4,4),%%xmm3
5171    "lea       " MEMLEA(0x40,1) ",%1           \n"
5172    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
5173    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
5174    "mulps     %%xmm4,%%xmm0                   \n"
5175    "mulps     %%xmm4,%%xmm1                   \n"
5176    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
5177    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
5178    "mulps     %%xmm4,%%xmm2                   \n"
5179    "mulps     %%xmm4,%%xmm3                   \n"
5180    "cvtps2dq  %%xmm0,%%xmm0                   \n"
5181    "cvtps2dq  %%xmm1,%%xmm1                   \n"
5182    "cvtps2dq  %%xmm2,%%xmm2                   \n"
5183    "cvtps2dq  %%xmm3,%%xmm3                   \n"
5184    "packssdw  %%xmm1,%%xmm0                   \n"
5185    "packssdw  %%xmm3,%%xmm2                   \n"
5186    "packuswb  %%xmm2,%%xmm0                   \n"
5187    "movdqu    %%xmm0," MEMACCESS(2) "         \n"
5188    "lea       " MEMLEA(0x10,2) ",%2           \n"
5189    "sub       $0x4,%3                         \n"
5190    "jge       40b                             \n"
5191
5192  "49:                                         \n"
5193    "add       $0x3,%3                         \n"
5194    "jl        19f                             \n"
5195
  // 1 pixel loop.
5197    LABELALIGN
5198  "10:                                         \n"
5199    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
5200    MEMOPREG(psubd,0x00,0,4,4,xmm0)            // psubd    0x00(%0,%4,4),%%xmm0
5201    "lea       " MEMLEA(0x10,0) ",%0           \n"
5202    "psubd     " MEMACCESS(1) ",%%xmm0         \n"
5203    BUNDLEALIGN
5204    MEMOPREG(paddd,0x00,1,4,4,xmm0)            // paddd    0x00(%1,%4,4),%%xmm0
5205    "lea       " MEMLEA(0x10,1) ",%1           \n"
5206    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
5207    "mulps     %%xmm4,%%xmm0                   \n"
5208    "cvtps2dq  %%xmm0,%%xmm0                   \n"
5209    "packssdw  %%xmm0,%%xmm0                   \n"
5210    "packuswb  %%xmm0,%%xmm0                   \n"
5211    "movd      %%xmm0," MEMACCESS(2) "         \n"
5212    "lea       " MEMLEA(0x4,2) ",%2            \n"
5213    "sub       $0x1,%3                         \n"
5214    "jge       10b                             \n"
5215  "19:                                         \n"
5216  : "+r"(topleft),  // %0
5217    "+r"(botleft),  // %1
5218    "+r"(dst),      // %2
5219    "+rm"(count)    // %3
5220  : "r"((intptr_t)(width)),  // %4
5221    "rm"(area)     // %5
5222  : "memory", "cc"
5223#if defined(__native_client__) && defined(__x86_64__)
5224    , "r14"
5225#endif
5226#if defined(__SSE2__)
5227    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
5228#endif
5229  );
5230}
5231#endif  // HAS_CUMULATIVESUMTOAVERAGEROW_SSE2
5232
5233#ifdef HAS_ARGBAFFINEROW_SSE2
// Copy a row of ARGB pixels from a source image sampled along an affine
// path. src_dudv points to 4 floats {u, v, du, dv}: the starting source
// position (in pixels) and the per-output-pixel step. For each of 'width'
// output pixels, the source pixel at the truncated integer (u, v) is
// fetched and the position advanced by (du, dv). No clipping is performed
// here; the caller must keep the path inside the source image.
LIBYUV_API
void ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
                        uint8* dst_argb, const float* src_dudv, int width) {
  intptr_t src_argb_stride_temp = src_argb_stride;  // stride widened for asm use.
  intptr_t temp = 0;  // scratch register for the second gather offset (%5).
  asm volatile (
    "movq      " MEMACCESS(3) ",%%xmm2         \n"  // xmm2 = start {u, v}.
    "movq      " MEMACCESS2(0x08,3) ",%%xmm7   \n"  // xmm7 = step {du, dv}.
    // Build xmm5 words = {4, stride} so pmaddwd on packed {x, y} shorts
    // yields the source byte offset x * 4 + y * stride.
    "shl       $0x10,%1                        \n"
    "add       $0x4,%1                         \n"
    "movd      %1,%%xmm5                       \n"
    "sub       $0x4,%4                         \n"
    "jl        49f                             \n"  // fewer than 4 pixels left.

    "pshufd    $0x44,%%xmm7,%%xmm7             \n"  // xmm7 = {du, dv, du, dv}.
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "movdqa    %%xmm2,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm0                   \n"
    "movlhps   %%xmm0,%%xmm2                   \n"  // xmm2 = uv of pixels 0, 1.
    "movdqa    %%xmm7,%%xmm4                   \n"
    "addps     %%xmm4,%%xmm4                   \n"  // xmm4 = 2 * {du, dv}.
    "movdqa    %%xmm2,%%xmm3                   \n"
    "addps     %%xmm4,%%xmm3                   \n"  // xmm3 = uv of pixels 2, 3.
    "addps     %%xmm4,%%xmm4                   \n"  // xmm4 = 4 * {du, dv}.

  // 4 pixel loop.
    LABELALIGN
  "40:                                         \n"
    "cvttps2dq %%xmm2,%%xmm0                   \n"  // x, y float to int first 2
    "cvttps2dq %%xmm3,%%xmm1                   \n"  // x, y float to int next 2
    "packssdw  %%xmm1,%%xmm0                   \n"  // x, y as 8 shorts
    "pmaddwd   %%xmm5,%%xmm0                   \n"  // off = x * 4 + y * stride
    "movd      %%xmm0,%k1                      \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    "movd      %%xmm0,%k5                      \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,0,1,1,xmm1)             //  movd      (%0,%1,1),%%xmm1
    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm1                   \n"  // combine pixels 0 and 1.
    "addps     %%xmm4,%%xmm2                   \n"  // advance uv by 4 steps.
    "movq      %%xmm1," MEMACCESS(2) "         \n"
    "movd      %%xmm0,%k1                      \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
    "movd      %%xmm0,%k5                      \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
    MEMOPREG(movd,0x00,0,5,1,xmm6)             //  movd      (%0,%5,1),%%xmm6
    "punpckldq %%xmm6,%%xmm0                   \n"  // combine pixels 2 and 3.
    "addps     %%xmm4,%%xmm3                   \n"
    "sub       $0x4,%4                         \n"
    "movq      %%xmm0," MEMACCESS2(0x08,2) "   \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "jge       40b                             \n"

  "49:                                         \n"
    "add       $0x3,%4                         \n"  // undo bias; remaining 0..3.
    "jl        19f                             \n"

  // 1 pixel loop.
    LABELALIGN
  "10:                                         \n"
    "cvttps2dq %%xmm2,%%xmm0                   \n"
    "packssdw  %%xmm0,%%xmm0                   \n"
    "pmaddwd   %%xmm5,%%xmm0                   \n"
    "addps     %%xmm7,%%xmm2                   \n"  // advance uv by one step.
    "movd      %%xmm0,%k1                      \n"
    BUNDLEALIGN
    MEMOPREG(movd,0x00,0,1,1,xmm0)             //  movd      (%0,%1,1),%%xmm0
    "sub       $0x1,%4                         \n"
    "movd      %%xmm0," MEMACCESS(2) "         \n"
    "lea       " MEMLEA(0x04,2) ",%2           \n"
    "jg        10b                             \n"
  "19:                                         \n"
  : "+r"(src_argb),  // %0
    "+r"(src_argb_stride_temp),  // %1
    "+r"(dst_argb),  // %2
    "+r"(src_dudv),  // %3
    "+rm"(width),    // %4
    "+r"(temp)   // %5
  :
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
5325#endif  // HAS_ARGBAFFINEROW_SSE2
5326
5327#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
// Blends two rows of bytes: dst = src * (1 - f) + src[src_stride] * f with
// f = source_y_fraction / 256. Steps 16 bytes per iteration with aligned
// loads/stores (movdqa), so dst_width is assumed to be a multiple of 16 and
// pointers 16-byte aligned -- TODO confirm callers guarantee this.
// Common fractions (0, 1/4, 1/2, 3/4) take pavgb fast paths.
void InterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                          ptrdiff_t src_stride, int dst_width,
                          int source_y_fraction) {
  asm volatile (
    "sub       %1,%0                           \n"  // %0 = dst - src; stores use (%1,%0,1).
    "shr       %3                              \n"  // halve fraction to 0..128.
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"  // 0/256: plain copy.
    "cmp       $0x20,%3                        \n"
    "je        75f                             \n"  // 64/256: blend 75 / 25.
    "cmp       $0x40,%3                        \n"
    "je        50f                             \n"  // 128/256: blend 50 / 50.
    "cmp       $0x60,%3                        \n"
    "je        25f                             \n"  // 192/256: blend 25 / 75.

    // Build xmm5 = interleaved byte pairs {128 - f, f} so pmaddubsw computes
    // src * (128 - f) + next * f for each output byte.
    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
    "add       $0x80,%3                        \n"
    "movd      %3,%%xmm5                       \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"

    // General purpose row blend.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm2)           // xmm2 = row at src + stride.
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm0                   \n"
    "punpckhbw %%xmm2,%%xmm1                   \n"
    "pmaddubsw %%xmm5,%%xmm0                   \n"
    "pmaddubsw %%xmm5,%%xmm1                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // /128: weights sum to 128.
    "psrlw     $0x7,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           // store to dst.
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Blend 25 / 75.
    LABELALIGN
  "25:                                         \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm1)
    "pavgb     %%xmm1,%%xmm0                   \n"  // two averages toward next row.
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        25b                             \n"
    "jmp       99f                             \n"

    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm1)
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"

    // Blend 75 / 25.
    LABELALIGN
  "75:                                         \n"
    "movdqa    " MEMACCESS(1) ",%%xmm1         \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm0)
    "pavgb     %%xmm1,%%xmm0                   \n"  // two averages toward src row.
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        75b                             \n"
    "jmp       99f                             \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    "sub       $0x10,%2                        \n"
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        100b                            \n"

  "99:                                         \n"
  : "+r"(dst_ptr),    // %0
    "+r"(src_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm5"
#endif
  );
}
5437#endif  // HAS_INTERPOLATEROW_SSSE3
5438
5439#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
// SSE2 (no pmaddubsw) variant: dst = src + (next - src) * f / 128 computed
// in 16-bit lanes, with f = source_y_fraction / 2 (0..128). Steps 16 bytes
// per iteration with aligned loads/stores (movdqa), so dst_width is assumed
// to be a multiple of 16 and pointers 16-byte aligned -- TODO confirm.
// Common fractions (0, 1/4, 1/2, 3/4) take pavgb fast paths.
void InterpolateRow_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                         ptrdiff_t src_stride, int dst_width,
                         int source_y_fraction) {
  asm volatile (
    "sub       %1,%0                           \n"  // %0 = dst - src; stores use (%1,%0,1).
    "shr       %3                              \n"  // halve fraction to 0..128.
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"  // 0/256: plain copy.
    "cmp       $0x20,%3                        \n"
    "je        75f                             \n"  // 64/256: blend 75 / 25.
    "cmp       $0x40,%3                        \n"
    "je        50f                             \n"  // 128/256: blend 50 / 50.
    "cmp       $0x60,%3                        \n"
    "je        25f                             \n"  // 192/256: blend 25 / 75.

    // xmm5 words = f * 256 + (128 - f); pmulhw of the doubled difference by
    // this value approximates (next - src) * f / 128.
    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
    "add       $0x80,%3                        \n"
    "movd      %3,%%xmm5                       \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "pxor      %%xmm4,%%xmm4                   \n"  // zero for byte->word unpack.

    // General purpose row blend.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm2)           //  movdqa    (%1,%4,1),%%xmm2
    "movdqa    %%xmm0,%%xmm1                   \n"
    "movdqa    %%xmm2,%%xmm3                   \n"
    "punpcklbw %%xmm4,%%xmm2                   \n"  // next row as words.
    "punpckhbw %%xmm4,%%xmm3                   \n"
    "punpcklbw %%xmm4,%%xmm0                   \n"  // src row as words.
    "punpckhbw %%xmm4,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm2                   \n"  // diff = next - src.
    "psubw     %%xmm1,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm2                   \n"  // 2 * diff for pmulhw scale.
    "paddw     %%xmm3,%%xmm3                   \n"
    "pmulhw    %%xmm5,%%xmm2                   \n"
    "pmulhw    %%xmm5,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"  // src + scaled diff.
    "paddw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Blend 25 / 75.
    LABELALIGN
  "25:                                         \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm1)           //  movdqa    (%1,%4,1),%%xmm1
    "pavgb     %%xmm1,%%xmm0                   \n"  // two averages toward next row.
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        25b                             \n"
    "jmp       99f                             \n"

    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm1)           //  movdqa    (%1,%4,1),%%xmm1
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"

    // Blend 75 / 25.
    LABELALIGN
  "75:                                         \n"
    "movdqa    " MEMACCESS(1) ",%%xmm1         \n"
    MEMOPREG(movdqa,0x00,1,4,1,xmm0)           //  movdqa    (%1,%4,1),%%xmm0
    "pavgb     %%xmm1,%%xmm0                   \n"  // two averages toward src row.
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        75b                             \n"
    "jmp       99f                             \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    "movdqa    " MEMACCESS(1) ",%%xmm0         \n"
    "sub       $0x10,%2                        \n"
    MEMOPMEM(movdqa,xmm0,0x00,1,0,1)           //  movdqa    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        100b                            \n"

  "99:                                         \n"
  : "+r"(dst_ptr),    // %0
    "+r"(src_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
5557#endif  // HAS_INTERPOLATEROW_SSE2
5558
5559#ifdef HAS_INTERPOLATEROW_SSSE3
// Bilinear filter 16x2 -> 16x1
// Unaligned (movdqu) variant of InterpolateRow_SSSE3: blends two rows with
// f = source_y_fraction / 256; pointers need not be 16-byte aligned. Steps
// 16 bytes per iteration, so dst_width is assumed to be a multiple of 16
// -- TODO confirm callers guarantee this.
void InterpolateRow_Unaligned_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
                                    ptrdiff_t src_stride, int dst_width,
                                    int source_y_fraction) {
  asm volatile (
    "sub       %1,%0                           \n"  // %0 = dst - src; stores use (%1,%0,1).
    "shr       %3                              \n"  // halve fraction to 0..128.
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"  // 0/256: plain copy.
    "cmp       $0x20,%3                        \n"
    "je        75f                             \n"  // 64/256: blend 75 / 25.
    "cmp       $0x40,%3                        \n"
    "je        50f                             \n"  // 128/256: blend 50 / 50.
    "cmp       $0x60,%3                        \n"
    "je        25f                             \n"  // 192/256: blend 25 / 75.

    // Build xmm5 = interleaved byte pairs {128 - f, f} so pmaddubsw computes
    // src * (128 - f) + next * f for each output byte.
    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
    "add       $0x80,%3                        \n"
    "movd      %3,%%xmm5                       \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"

    // General purpose row blend.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm2)           // xmm2 = row at src + stride.
    "movdqu    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm2,%%xmm0                   \n"
    "punpckhbw %%xmm2,%%xmm1                   \n"
    "pmaddubsw %%xmm5,%%xmm0                   \n"
    "pmaddubsw %%xmm5,%%xmm1                   \n"
    "psrlw     $0x7,%%xmm0                     \n"  // /128: weights sum to 128.
    "psrlw     $0x7,%%xmm1                     \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           // store to dst.
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Blend 25 / 75.
    LABELALIGN
  "25:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
    "pavgb     %%xmm1,%%xmm0                   \n"  // two averages toward next row.
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        25b                             \n"
    "jmp       99f                             \n"

    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"

    // Blend 75 / 25.
    LABELALIGN
  "75:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm0)
    "pavgb     %%xmm1,%%xmm0                   \n"  // two averages toward src row.
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        75b                             \n"
    "jmp       99f                             \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "sub       $0x10,%2                        \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        100b                            \n"

  "99:                                         \n"
  : "+r"(dst_ptr),    // %0
    "+r"(src_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm5"
#endif
  );
}
5669#endif   // HAS_INTERPOLATEROW_SSSE3
5670
5671#ifdef HAS_INTERPOLATEROW_SSE2
// Bilinear filter 16x2 -> 16x1
// Unaligned (movdqu) variant of InterpolateRow_SSE2: dst = src +
// (next - src) * f / 128 in 16-bit lanes, f = source_y_fraction / 2.
// Pointers need not be 16-byte aligned. Steps 16 bytes per iteration, so
// dst_width is assumed to be a multiple of 16 -- TODO confirm.
void InterpolateRow_Unaligned_SSE2(uint8* dst_ptr, const uint8* src_ptr,
                                   ptrdiff_t src_stride, int dst_width,
                                   int source_y_fraction) {
  asm volatile (
    "sub       %1,%0                           \n"  // %0 = dst - src; stores use (%1,%0,1).
    "shr       %3                              \n"  // halve fraction to 0..128.
    "cmp       $0x0,%3                         \n"
    "je        100f                            \n"  // 0/256: plain copy.
    "cmp       $0x20,%3                        \n"
    "je        75f                             \n"  // 64/256: blend 75 / 25.
    "cmp       $0x40,%3                        \n"
    "je        50f                             \n"  // 128/256: blend 50 / 50.
    "cmp       $0x60,%3                        \n"
    "je        25f                             \n"  // 192/256: blend 25 / 75.

    // xmm5 words = f * 256 + (128 - f); pmulhw of the doubled difference by
    // this value approximates (next - src) * f / 128.
    "movd      %3,%%xmm0                       \n"
    "neg       %3                              \n"
    "add       $0x80,%3                        \n"
    "movd      %3,%%xmm5                       \n"
    "punpcklbw %%xmm0,%%xmm5                   \n"
    "punpcklwd %%xmm5,%%xmm5                   \n"
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    "pxor      %%xmm4,%%xmm4                   \n"  // zero for byte->word unpack.

    // General purpose row blend.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm2)           //  movdqu    (%1,%4,1),%%xmm2
    "movdqu    %%xmm0,%%xmm1                   \n"
    "movdqu    %%xmm2,%%xmm3                   \n"
    "punpcklbw %%xmm4,%%xmm2                   \n"  // next row as words.
    "punpckhbw %%xmm4,%%xmm3                   \n"
    "punpcklbw %%xmm4,%%xmm0                   \n"  // src row as words.
    "punpckhbw %%xmm4,%%xmm1                   \n"
    "psubw     %%xmm0,%%xmm2                   \n"  // diff = next - src.
    "psubw     %%xmm1,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm2                   \n"  // 2 * diff for pmulhw scale.
    "paddw     %%xmm3,%%xmm3                   \n"
    "pmulhw    %%xmm5,%%xmm2                   \n"
    "pmulhw    %%xmm5,%%xmm3                   \n"
    "paddw     %%xmm2,%%xmm0                   \n"  // src + scaled diff.
    "paddw     %%xmm3,%%xmm1                   \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Blend 25 / 75.
    LABELALIGN
  "25:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
    "pavgb     %%xmm1,%%xmm0                   \n"  // two averages toward next row.
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        25b                             \n"
    "jmp       99f                             \n"

    // Blend 50 / 50.
    LABELALIGN
  "50:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm1)           //  movdqu    (%1,%4,1),%%xmm1
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        50b                             \n"
    "jmp       99f                             \n"

    // Blend 75 / 25.
    LABELALIGN
  "75:                                         \n"
    "movdqu    " MEMACCESS(1) ",%%xmm1         \n"
    MEMOPREG(movdqu,0x00,1,4,1,xmm0)           //  movdqu    (%1,%4,1),%%xmm0
    "pavgb     %%xmm1,%%xmm0                   \n"  // two averages toward src row.
    "pavgb     %%xmm1,%%xmm0                   \n"
    "sub       $0x10,%2                        \n"
    BUNDLEALIGN
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        75b                             \n"
    "jmp       99f                             \n"

    // Blend 100 / 0 - Copy row unchanged.
    LABELALIGN
  "100:                                        \n"
    "movdqu    " MEMACCESS(1) ",%%xmm0         \n"
    "sub       $0x10,%2                        \n"
    MEMOPMEM(movdqu,xmm0,0x00,1,0,1)           //  movdqu    %%xmm0,(%1,%0,1)
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        100b                            \n"

  "99:                                         \n"
  : "+r"(dst_ptr),    // %0
    "+r"(src_ptr),    // %1
    "+r"(dst_width),  // %2
    "+r"(source_y_fraction)  // %3
  : "r"((intptr_t)(src_stride))  // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
#endif
  );
}
5789#endif  // HAS_INTERPOLATEROW_SSE2
5790
5791#ifdef HAS_HALFROW_SSE2
// Vertically average two rows: for each of 'pix' bytes,
// dst_uv[i] = rounded average of src_uv[i] and src_uv[i + src_uv_stride]
// (pavgb computes (a + b + 1) >> 1). Processes 16 bytes per iteration with
// aligned loads/stores (movdqa), so pix is assumed to be a multiple of 16
// and pointers 16-byte aligned -- TODO confirm callers guarantee this.
void HalfRow_SSE2(const uint8* src_uv, int src_uv_stride,
                  uint8* dst_uv, int pix) {
  asm volatile (
    "sub       %0,%1                           \n"  // %1 = dst - src; store uses (%0,%1).
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    MEMOPREG(pavgb,0x00,0,3,1,xmm0)            //  pavgb     (%0,%3),%%xmm0
    "sub       $0x10,%2                        \n"
    MEMOPMEM(movdqa,xmm0,0x00,0,1,1)           //  movdqa    %%xmm0,(%0,%1)
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "jg        1b                              \n"
  : "+r"(src_uv),  // %0
    "+r"(dst_uv),  // %1
    "+r"(pix)      // %2
  : "r"((intptr_t)(src_uv_stride))  // %3
  : "memory", "cc"
#if defined(__SSE2__)
      , "xmm0"
#endif
  );
}
5814#endif  // HAS_HALFROW_SSE2
5815
5816#ifdef HAS_ARGBTOBAYERROW_SSSE3
// Select one byte from each ARGB pixel using 'selector', a packed 4-byte
// pshufb index pattern that is broadcast across the whole register, so 8
// input pixels (32 bytes) yield 8 output bytes per iteration. Processes 8
// pixels per loop; aligned (movdqa) source reads.
void ARGBToBayerRow_SSSE3(const uint8* src_argb, uint8* dst_bayer,
                          uint32 selector, int pix) {
  asm volatile (
    // NaCL caveat - assumes movd is from GPR
    "movd      %3,%%xmm5                       \n"  // xmm5 = selector pattern.
    "pshufd    $0x0,%%xmm5,%%xmm5              \n"  // broadcast to all 4 lanes.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm5,%%xmm0                   \n"  // gather selected bytes.
    "pshufb    %%xmm5,%%xmm1                   \n"
    "punpckldq %%xmm1,%%xmm0                   \n"  // combine 8 result bytes.
    "sub       $0x8,%2                         \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_bayer), // %1
    "+r"(pix)        // %2
  : "g"(selector)    // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
5845#endif  // HAS_ARGBTOBAYERROW_SSSE3
5846
5847#ifdef HAS_ARGBTOBAYERGGROW_SSE2
// Extract the G channel (second byte of each little-endian ARGB pixel) into
// dst_bayer. Each 32-bit pixel is shifted right 8 so G lands in the low
// byte, masked with 0x000000FF, then packed down to bytes. Processes 8
// pixels per iteration; aligned (movdqa) source reads.
void ARGBToBayerGGRow_SSE2(const uint8* src_argb, uint8* dst_bayer,
                           uint32 selector, int pix) {
  asm volatile (
    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    "psrld     $0x18,%%xmm5                    \n"  // xmm5 = 0x000000FF per lane.
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "psrld     $0x8,%%xmm0                     \n"  // move G into the low byte.
    "psrld     $0x8,%%xmm1                     \n"
    "pand      %%xmm5,%%xmm0                   \n"
    "pand      %%xmm5,%%xmm1                   \n"
    "packssdw  %%xmm1,%%xmm0                   \n"  // 8 G values as words.
    "packuswb  %%xmm1,%%xmm0                   \n"  // low 8 bytes valid; high half discarded by movq.
    "sub       $0x8,%2                         \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_bayer), // %1
    "+r"(pix)        // %2
  :
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
5878#endif  // HAS_ARGBTOBAYERGGROW_SSE2
5879
5880#ifdef HAS_ARGBSHUFFLEROW_SSSE3
5881// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// Reorders the 4 channels of each ARGB pixel using the 16-byte pshufb
// mask pointed to by 'shuffler'.  Processes 8 pixels per iteration with
// aligned loads/stores (movdqa), so src/dst must be 16-byte aligned and
// pix a multiple of 8.
void ARGBShuffleRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                          const uint8* shuffler, int pix) {
  asm volatile (
    "movdqa    " MEMACCESS(3) ",%%xmm5         \n"  // load shuffle mask
    LABELALIGN
  "1:                                          \n"
    "movdqa    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqa    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "sub       $0x8,%2                         \n"
    "movdqa    %%xmm0," MEMACCESS(1) "         \n"
    "movdqa    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
5908
// Same as ARGBShuffleRow_SSSE3 but uses unaligned loads/stores (movdqu),
// so src/dst need no 16-byte alignment.  8 pixels per iteration.
void ARGBShuffleRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                    const uint8* shuffler, int pix) {
  asm volatile (
    "movdqa    " MEMACCESS(3) ",%%xmm5         \n"  // load shuffle mask
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "movdqu    " MEMACCESS2(0x10,0) ",%%xmm1   \n"
    "lea       " MEMLEA(0x20,0) ",%0           \n"
    "pshufb    %%xmm5,%%xmm0                   \n"
    "pshufb    %%xmm5,%%xmm1                   \n"
    "sub       $0x8,%2                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,1) "   \n"
    "lea       " MEMLEA(0x20,1) ",%1           \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(pix)        // %2
  : "r"(shuffler)    // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
5935#endif  // HAS_ARGBSHUFFLEROW_SSSE3
5936
5937#ifdef HAS_ARGBSHUFFLEROW_AVX2
5938// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
5939void ARGBShuffleRow_AVX2(const uint8* src_argb, uint8* dst_argb,
5940                         const uint8* shuffler, int pix) {
5941  asm volatile (
5942    "vbroadcastf128 " MEMACCESS(3) ",%%ymm5    \n"
5943    LABELALIGN
5944  "1:                                          \n"
5945    "vmovdqu   " MEMACCESS(0) ",%%ymm0         \n"
5946    "vmovdqu   " MEMACCESS2(0x20,0) ",%%ymm1   \n"
5947    "lea       " MEMLEA(0x40,0) ",%0           \n"
5948    "vpshufb   %%ymm5,%%ymm0,%%ymm0            \n"
5949    "vpshufb   %%ymm5,%%ymm1,%%ymm1            \n"
5950    "sub       $0x10,%2                        \n"
5951    "vmovdqu   %%ymm0," MEMACCESS(1) "         \n"
5952    "vmovdqu   %%ymm1," MEMACCESS2(0x20,1) "   \n"
5953    "lea       " MEMLEA(0x40,1) ",%1           \n"
5954    "jg        1b                              \n"
5955  : "+r"(src_argb),  // %0
5956    "+r"(dst_argb),  // %1
5957    "+r"(pix)        // %2
5958  : "r"(shuffler)    // %3
5959  : "memory", "cc"
5960#if defined(__SSE2__)
5961    , "xmm0", "xmm1", "xmm5"
5962#endif
5963  );
5964}
5965#endif  // HAS_ARGBSHUFFLEROW_AVX2
5966
5967#ifdef HAS_ARGBSHUFFLEROW_SSE2
5968// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
// SSE2 (no pshufb) channel shuffle.  Reads the first 4 bytes of the
// shuffler and compares against four common patterns; each match jumps
// to a fast 4-pixels-per-iteration path that unpacks bytes to words and
// reorders with pshufhw/pshuflw before repacking.  Any other shuffler
// falls through to a scalar byte-by-byte loop (1 pixel per iteration).
void ARGBShuffleRow_SSE2(const uint8* src_argb, uint8* dst_argb,
                         const uint8* shuffler, int pix) {
  uintptr_t pixel_temp = 0u;  // scratch byte register (%2, uses %b2)
  asm volatile (
    "pxor      %%xmm5,%%xmm5                   \n"  // zero for unpack
    "mov       " MEMACCESS(4) ",%k2            \n"  // first 4 shuffler bytes
    "cmp       $0x3000102,%k2                  \n"
    "je        3012f                           \n"
    "cmp       $0x10203,%k2                    \n"
    "je        123f                            \n"
    "cmp       $0x30201,%k2                    \n"
    "je        321f                            \n"
    "cmp       $0x2010003,%k2                  \n"
    "je        2103f                           \n"

    // Generic scalar fallback: 4 table-driven byte moves per pixel.
    LABELALIGN
  "1:                                          \n"
    "movzb     " MEMACCESS(4) ",%2             \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS(1) "            \n"
    "movzb     " MEMACCESS2(0x1,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x1,1) "       \n"
    BUNDLEALIGN
    "movzb     " MEMACCESS2(0x2,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x2,1) "       \n"
    "movzb     " MEMACCESS2(0x3,4) ",%2        \n"
    MEMOPARG(movzb,0x00,0,2,1,2) "             \n"  //  movzb     (%0,%2,1),%2
    "mov       %b2," MEMACCESS2(0x3,1) "       \n"
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    "lea       " MEMLEA(0x4,1) ",%1            \n"
    "sub       $0x1,%3                         \n"
    "jg        1b                              \n"
    "jmp       99f                             \n"

    // Fast path: shuffler 0,1,2,3 pattern (pshuf immediate 0x1b).
    LABELALIGN
  "123:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x1b,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x1b,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        123b                            \n"
    "jmp       99f                             \n"

    // Fast path: shuffler 1,2,3,0 pattern (pshuf immediate 0x39).
    LABELALIGN
  "321:                                        \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x39,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x39,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x39,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x39,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        321b                            \n"
    "jmp       99f                             \n"

    // Fast path: shuffler 3,0,1,2 pattern (pshuf immediate 0x93).
    LABELALIGN
  "2103:                                       \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0x93,%%xmm0,%%xmm0             \n"
    "pshuflw   $0x93,%%xmm0,%%xmm0             \n"
    "pshufhw   $0x93,%%xmm1,%%xmm1             \n"
    "pshuflw   $0x93,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        2103b                           \n"
    "jmp       99f                             \n"

    // Fast path: shuffler 2,1,0,3 pattern (pshuf immediate 0xc6).
    LABELALIGN
  "3012:                                       \n"
    "movdqu    " MEMACCESS(0) ",%%xmm0         \n"
    "lea       " MEMLEA(0x10,0) ",%0           \n"
    "movdqa    %%xmm0,%%xmm1                   \n"
    "punpcklbw %%xmm5,%%xmm0                   \n"
    "punpckhbw %%xmm5,%%xmm1                   \n"
    "pshufhw   $0xc6,%%xmm0,%%xmm0             \n"
    "pshuflw   $0xc6,%%xmm0,%%xmm0             \n"
    "pshufhw   $0xc6,%%xmm1,%%xmm1             \n"
    "pshuflw   $0xc6,%%xmm1,%%xmm1             \n"
    "packuswb  %%xmm1,%%xmm0                   \n"
    "sub       $0x4,%3                         \n"
    "movdqu    %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x10,1) ",%1           \n"
    "jg        3012b                           \n"

  "99:                                         \n"
  : "+r"(src_argb),    // %0
    "+r"(dst_argb),    // %1
    "+d"(pixel_temp),  // %2
    "+r"(pix)         // %3
  : "r"(shuffler)      // %4
  : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm5"
#endif
  );
}
6091#endif  // HAS_ARGBSHUFFLEROW_SSE2
6092
6093#ifdef HAS_I422TOYUY2ROW_SSE2
// Packs planar I422 (Y plus half-width U and V) into interleaved YUY2
// (Y0 U0 Y1 V0 ...).  src_v is addressed as an offset from src_u
// (the initial "sub %1,%2" turns %2 into src_v - src_u).  Processes
// 16 Y pixels (8 U/V pairs, 32 output bytes) per iteration.
void I422ToYUY2Row_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
 asm volatile (
    "sub       %1,%2                             \n"  // %2 = src_v - src_u
    LABELALIGN
  "1:                                            \n"
    "movq      " MEMACCESS(1) ",%%xmm2           \n"  // 8 U
    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
    "lea       " MEMLEA(0x8,1) ",%1              \n"
    "punpcklbw %%xmm3,%%xmm2                     \n"  // interleave U,V
    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"  // 16 Y
    "lea       " MEMLEA(0x10,0) ",%0             \n"
    "movdqa    %%xmm0,%%xmm1                     \n"
    "punpcklbw %%xmm2,%%xmm0                     \n"  // Y,U,Y,V low half
    "punpckhbw %%xmm2,%%xmm1                     \n"  // Y,U,Y,V high half
    "movdqu    %%xmm0," MEMACCESS(3) "           \n"
    "movdqu    %%xmm1," MEMACCESS2(0x10,3) "     \n"
    "lea       " MEMLEA(0x20,3) ",%3             \n"
    "sub       $0x10,%4                          \n"
    "jg         1b                               \n"
    : "+r"(src_y),  // %0
      "+r"(src_u),  // %1
      "+r"(src_v),  // %2
      "+r"(dst_frame),  // %3
      "+rm"(width)  // %4
    :
    : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}
6131#endif  // HAS_I422TOYUY2ROW_SSE2
6132
6133#ifdef HAS_I422TOUYVYROW_SSE2
// Packs planar I422 into interleaved UYVY (U0 Y0 V0 Y1 ...).  Same
// structure as I422ToYUY2Row_SSE2, but the UV bytes lead each pair:
// the interleaved UV register is unpacked with Y in the opposite order.
// Processes 16 Y pixels (32 output bytes) per iteration.
void I422ToUYVYRow_SSE2(const uint8* src_y,
                        const uint8* src_u,
                        const uint8* src_v,
                        uint8* dst_frame, int width) {
 asm volatile (
    "sub        %1,%2                            \n"  // %2 = src_v - src_u
    LABELALIGN
  "1:                                            \n"
    "movq      " MEMACCESS(1) ",%%xmm2           \n"  // 8 U
    MEMOPREG(movq,0x00,1,2,1,xmm3)               //  movq    (%1,%2,1),%%xmm3
    "lea       " MEMLEA(0x8,1) ",%1              \n"
    "punpcklbw %%xmm3,%%xmm2                     \n"  // interleave U,V
    "movdqu    " MEMACCESS(0) ",%%xmm0           \n"  // 16 Y
    "movdqa    %%xmm2,%%xmm1                     \n"
    "lea       " MEMLEA(0x10,0) ",%0             \n"
    "punpcklbw %%xmm0,%%xmm1                     \n"  // U,Y,V,Y low half
    "punpckhbw %%xmm0,%%xmm2                     \n"  // U,Y,V,Y high half
    "movdqu    %%xmm1," MEMACCESS(3) "           \n"
    "movdqu    %%xmm2," MEMACCESS2(0x10,3) "     \n"
    "lea       " MEMLEA(0x20,3) ",%3             \n"
    "sub       $0x10,%4                          \n"
    "jg         1b                               \n"
    : "+r"(src_y),  // %0
      "+r"(src_u),  // %1
      "+r"(src_v),  // %2
      "+r"(dst_frame),  // %3
      "+rm"(width)  // %4
    :
    : "memory", "cc"
#if defined(__native_client__) && defined(__x86_64__)
    , "r14"
#endif
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3"
#endif
  );
}
6171#endif  // HAS_I422TOUYVYROW_SSE2
6172
6173#ifdef HAS_ARGBPOLYNOMIALROW_SSE2
// Applies a per-channel cubic polynomial
//   out = C0 + C1*x + C2*x^2 + C3*x^3
// to each ARGB byte.  'poly' holds 4 coefficient vectors of 4 floats
// each (C0 at +0x00, C1 at +0x10, C2 at +0x20, C3 at +0x30), one float
// per channel.  Processes 2 pixels per iteration; results are converted
// back to bytes with unsigned saturation.
void ARGBPolynomialRow_SSE2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "pxor      %%xmm3,%%xmm3                   \n"  // zero for unpack

    // 2 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movq      " MEMACCESS(0) ",%%xmm0         \n"  // 2 ARGB pixels
    "lea       " MEMLEA(0x8,0) ",%0            \n"
    "punpcklbw %%xmm3,%%xmm0                   \n"  // bytes -> words
    "movdqa    %%xmm0,%%xmm4                   \n"
    "punpcklwd %%xmm3,%%xmm0                   \n"  // words -> dwords (pixel 0)
    "punpckhwd %%xmm3,%%xmm4                   \n"  // words -> dwords (pixel 1)
    "cvtdq2ps  %%xmm0,%%xmm0                   \n"  // x as floats
    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
    "movdqa    %%xmm0,%%xmm1                   \n"  // keep x for powers
    "movdqa    %%xmm4,%%xmm5                   \n"
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm0   \n"  // C1 * x
    "mulps     " MEMACCESS2(0x10,3) ",%%xmm4   \n"
    "addps     " MEMACCESS(3) ",%%xmm0         \n"  // + C0
    "addps     " MEMACCESS(3) ",%%xmm4         \n"
    "movdqa    %%xmm1,%%xmm2                   \n"
    "movdqa    %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm1,%%xmm2                   \n"  // x^2
    "mulps     %%xmm5,%%xmm6                   \n"
    "mulps     %%xmm2,%%xmm1                   \n"  // x^3
    "mulps     %%xmm6,%%xmm5                   \n"
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm2   \n"  // C2 * x^2
    "mulps     " MEMACCESS2(0x20,3) ",%%xmm6   \n"
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm1   \n"  // C3 * x^3
    "mulps     " MEMACCESS2(0x30,3) ",%%xmm5   \n"
    "addps     %%xmm2,%%xmm0                   \n"
    "addps     %%xmm6,%%xmm4                   \n"
    "addps     %%xmm1,%%xmm0                   \n"
    "addps     %%xmm5,%%xmm4                   \n"
    "cvttps2dq %%xmm0,%%xmm0                   \n"  // truncate to ints
    "cvttps2dq %%xmm4,%%xmm4                   \n"
    "packuswb  %%xmm4,%%xmm0                   \n"  // saturate to bytes
    "packuswb  %%xmm0,%%xmm0                   \n"
    "sub       $0x2,%2                         \n"
    "movq      %%xmm0," MEMACCESS(1) "         \n"
    "lea       " MEMLEA(0x8,1) ",%1            \n"
    "jg        1b                              \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
#endif
  );
}
6229#endif  // HAS_ARGBPOLYNOMIALROW_SSE2
6230
6231#ifdef HAS_ARGBPOLYNOMIALROW_AVX2
// AVX2/FMA3 version of ARGBPolynomialRow: evaluates the per-channel
// cubic C0 + C1*x + C2*x^2 + C3*x^3 with fused multiply-adds.
// Coefficient layout matches the SSE2 version (4 floats per vector at
// poly + 0x00/0x10/0x20/0x30, broadcast to both ymm lanes).
// Processes 2 pixels per iteration; ends with vzeroupper to avoid
// AVX->SSE transition penalties.
void ARGBPolynomialRow_AVX2(const uint8* src_argb,
                            uint8* dst_argb, const float* poly,
                            int width) {
  asm volatile (
    "vbroadcastf128 " MEMACCESS(3) ",%%ymm4     \n"  // C0
    "vbroadcastf128 " MEMACCESS2(0x10,3) ",%%ymm5 \n"  // C1
    "vbroadcastf128 " MEMACCESS2(0x20,3) ",%%ymm6 \n"  // C2
    "vbroadcastf128 " MEMACCESS2(0x30,3) ",%%ymm7 \n"  // C3

    // 2 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "vpmovzxbd   " MEMACCESS(0) ",%%ymm0       \n"  // 2 ARGB pixels
    "lea         " MEMLEA(0x8,0) ",%0          \n"
    "vcvtdq2ps   %%ymm0,%%ymm0                 \n"  // X 8 floats
    "vmulps      %%ymm0,%%ymm0,%%ymm2          \n"  // X * X
    "vmulps      %%ymm7,%%ymm0,%%ymm3          \n"  // C3 * X
    "vfmadd132ps %%ymm5,%%ymm4,%%ymm0          \n"  // result = C0 + C1 * X
    "vfmadd231ps %%ymm6,%%ymm2,%%ymm0          \n"  // result += C2 * X * X
    "vfmadd231ps %%ymm3,%%ymm2,%%ymm0          \n"  // result += C3 * X * X * X
    "vcvttps2dq  %%ymm0,%%ymm0                 \n"
    "vpackusdw   %%ymm0,%%ymm0,%%ymm0          \n"
    "vpermq      $0xd8,%%ymm0,%%ymm0           \n"
    "vpackuswb   %%xmm0,%%xmm0,%%xmm0          \n"
    "sub         $0x2,%2                       \n"
    "vmovq       %%xmm0," MEMACCESS(1) "       \n"
    "lea         " MEMLEA(0x8,1) ",%1          \n"
    "jg          1b                            \n"
    "vzeroupper                                \n"
  : "+r"(src_argb),  // %0
    "+r"(dst_argb),  // %1
    "+r"(width)      // %2
  : "r"(poly)        // %3
  : "memory", "cc"
#if defined(__SSE2__)
// TODO(fbarchard): declare ymm usage when applicable.
    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
#endif
  );
}
6272#endif  // HAS_ARGBPOLYNOMIALROW_AVX2
6273
6274#ifdef HAS_ARGBCOLORTABLEROW_X86
6275// Tranform ARGB pixels with color table.
// Tranform ARGB pixels with color table.
// In-place, scalar: for each pixel, each of the 4 channel bytes indexes
// a per-channel table entry at table_argb[value * 4 + channel].
// 1 pixel per iteration; pixel_temp is the byte-addressable scratch
// register (%1, referenced as %b1).
void ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
                           int width) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movzb     " MEMACCESS(0) ",%1             \n"  // channel 0 (B)
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"  // channel 1 (G)
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"  // channel 2 (R)
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    "movzb     " MEMACCESS2(-0x1,0) ",%1       \n"  // channel 3 (A)
    MEMOPARG(movzb,0x03,3,1,4,1) "             \n"  // movzb 0x3(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x1,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),   // %0
    "+d"(pixel_temp), // %1
    "+r"(width)       // %2
  : "r"(table_argb)   // %3
  : "memory", "cc");
}
6304#endif  // HAS_ARGBCOLORTABLEROW_X86
6305
6306#ifdef HAS_RGBCOLORTABLEROW_X86
6307// Tranform RGB pixels with color table.
// Tranform RGB pixels with color table.
// Same as ARGBColorTableRow_X86 but only the first 3 channel bytes are
// mapped through the table; the 4th (alpha) byte is left untouched.
// 1 pixel per iteration, in place.
void RGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, int width) {
  uintptr_t pixel_temp = 0u;
  asm volatile (
    // 1 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movzb     " MEMACCESS(0) ",%1             \n"  // channel 0 (B)
    "lea       " MEMLEA(0x4,0) ",%0            \n"
    MEMOPARG(movzb,0x00,3,1,4,1) "             \n"  // movzb (%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x4,0) "      \n"
    "movzb     " MEMACCESS2(-0x3,0) ",%1       \n"  // channel 1 (G)
    MEMOPARG(movzb,0x01,3,1,4,1) "             \n"  // movzb 0x1(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x3,0) "      \n"
    "movzb     " MEMACCESS2(-0x2,0) ",%1       \n"  // channel 2 (R); alpha skipped
    MEMOPARG(movzb,0x02,3,1,4,1) "             \n"  // movzb 0x2(%3,%1,4),%1
    "mov       %b1," MEMACCESS2(-0x2,0) "      \n"
    "dec       %2                              \n"
    "jg        1b                              \n"
  : "+r"(dst_argb),   // %0
    "+d"(pixel_temp), // %1
    "+r"(width)       // %2
  : "r"(table_argb)   // %3
  : "memory", "cc");
}
6332#endif  // HAS_RGBCOLORTABLEROW_X86
6333
6334#ifdef HAS_ARGBLUMACOLORTABLEROW_SSSE3
6335// Tranform RGB pixels with luma table.
// Tranform RGB pixels with luma table.
// For each pixel, a weighted luma is computed from its bytes
// (pmaddubsw with the broadcast 'lumacoeff' weights, then phaddw) and
// masked to a multiple of 256 (xmm4 = 0xFF00 per word), selecting a
// 256-byte page of the 'luma' table.  B, G and R of the pixel are each
// mapped through that page; alpha is copied through unchanged.
// Processes 4 pixels per iteration; xmm0 is rotated with pshufd to
// expose each pixel's 32-bit page offset in turn.
void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
                                 int width,
                                 const uint8* luma, uint32 lumacoeff) {
  uintptr_t pixel_temp = 0u;  // byte-addressable scratch (%0, as %b0)
  uintptr_t table_temp = 0u;  // per-pixel luma table page pointer (%1)
  asm volatile (
    "movd      %6,%%xmm3                       \n"  // lumacoeff
    "pshufd    $0x0,%%xmm3,%%xmm3              \n"  // broadcast weights
    "pcmpeqb   %%xmm4,%%xmm4                   \n"
    "psllw     $0x8,%%xmm4                     \n"  // 0xFF00 word mask
    "pxor      %%xmm5,%%xmm5                   \n"

    // 4 pixel loop.
    LABELALIGN
  "1:                                          \n"
    "movdqu    " MEMACCESS(2) ",%%xmm0         \n"  // 4 ARGB pixels
    "pmaddubsw %%xmm3,%%xmm0                   \n"  // weighted sums
    "phaddw    %%xmm0,%%xmm0                   \n"  // luma per pixel
    "pand      %%xmm4,%%xmm0                   \n"  // round down to 256 page
    "punpcklwd %%xmm5,%%xmm0                   \n"  // words -> dwords
    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"  // + luma table base
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"  // rotate to next pixel

    // Pixel 0: map B, G, R through the page; copy A.
    "movzb     " MEMACCESS(2) ",%0             \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS(3) "            \n"
    "movzb     " MEMACCESS2(0x1,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x1,3) "       \n"
    "movzb     " MEMACCESS2(0x2,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x2,3) "       \n"
    "movzb     " MEMACCESS2(0x3,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0x3,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    // Pixel 1.
    "movzb     " MEMACCESS2(0x4,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x4,3) "       \n"
    BUNDLEALIGN
    "movzb     " MEMACCESS2(0x5,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x5,3) "       \n"
    "movzb     " MEMACCESS2(0x6,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x6,3) "       \n"
    "movzb     " MEMACCESS2(0x7,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0x7,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"
    "pshufd    $0x39,%%xmm0,%%xmm0             \n"

    // Pixel 2.
    "movzb     " MEMACCESS2(0x8,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x8,3) "       \n"
    "movzb     " MEMACCESS2(0x9,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0x9,3) "       \n"
    "movzb     " MEMACCESS2(0xa,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xa,3) "       \n"
    "movzb     " MEMACCESS2(0xb,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0xb,3) "       \n"

    "movd      %%xmm0,%k1                      \n"  // 32 bit offset
    "add       %5,%1                           \n"

    // Pixel 3.
    "movzb     " MEMACCESS2(0xc,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xc,3) "       \n"
    "movzb     " MEMACCESS2(0xd,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xd,3) "       \n"
    "movzb     " MEMACCESS2(0xe,2) ",%0        \n"
    MEMOPARG(movzb,0x00,1,0,1,0) "             \n"  // movzb     (%1,%0,1),%0
    "mov       %b0," MEMACCESS2(0xe,3) "       \n"
    "movzb     " MEMACCESS2(0xf,2) ",%0        \n"
    "mov       %b0," MEMACCESS2(0xf,3) "       \n"
    "sub       $0x4,%4                         \n"
    "lea       " MEMLEA(0x10,2) ",%2           \n"
    "lea       " MEMLEA(0x10,3) ",%3           \n"
    "jg        1b                              \n"
  : "+d"(pixel_temp),  // %0
    "+a"(table_temp),  // %1
    "+r"(src_argb),    // %2
    "+r"(dst_argb),    // %3
    "+rm"(width)       // %4
  : "r"(luma),         // %5
    "rm"(lumacoeff)    // %6
  : "memory", "cc"
#if defined(__SSE2__)
    , "xmm0", "xmm3", "xmm4", "xmm5"
#endif
  );
}
6436#endif  // HAS_ARGBLUMACOLORTABLEROW_SSSE3
6437
6438#endif  // defined(__x86_64__) || defined(__i386__)
6439
6440#ifdef __cplusplus
6441}  // extern "C"
6442}  // namespace libyuv
6443#endif
6444