/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
1133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include "libyuv/row.h"
127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
1333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include "libyuv/basic_types.h"
1433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
1533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef __cplusplus
1633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampnamespace libyuv {
177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" {
1833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
1933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// This module is for GCC x86 and x64
2133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if !defined(YUV_DISABLE_ASM) && (defined(__x86_64__) || defined(__i386__))
2233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Storage qualifier applied to every constant table in this file.
// GCC 4.2 on OSX has link error when passing static or const to inline.
// On Apple builds the qualifier is therefore empty; elsewhere the tables are
// declared `static const`.
// TODO(fbarchard): Use static const when gcc 4.2 support is dropped.
#ifdef __APPLE__
#define CONST
#else
#define CONST static const
#endif
307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
317cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#ifdef HAS_ARGBTOYROW_SSSE3
327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
3333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Constants for ARGB
3433cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST vec8 kARGBToY = {
3533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
3633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
3733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
3833cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST vec8 kARGBToU = {
3933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
4033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
4133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
4233cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST vec8 kARGBToV = {
4333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
4433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
4533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
4633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Constants for BGRA
4733cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST vec8 kBGRAToY = {
4833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
4933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
5033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
5133cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST vec8 kBGRAToU = {
5233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
5333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
5433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
5533cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST vec8 kBGRAToV = {
5633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
5733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
5833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
5933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Constants for ABGR
6033cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST vec8 kABGRToY = {
6133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
6233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
6333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
6433cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST vec8 kABGRToU = {
6533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
6633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
6733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
6833cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST vec8 kABGRToV = {
6933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
7033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
7133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
7233cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec8 kAddY16 = {
7333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
7633cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec8 kAddUV128 = {
7733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
7833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
8133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting RGB24 to ARGB.
8233cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec8 kShuffleMaskRGB24ToARGB = {
837cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde// Shuffle table for converting RAW to ARGB.
8733cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec8 kShuffleMaskRAWToARGB = {
887cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
9133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting ABGR to ARGB.
9233cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec8 kShuffleMaskABGRToARGB = {
9333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
9433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
9533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
9633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting BGRA to ARGB.
9733cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec8 kShuffleMaskBGRAToARGB = {
9833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
9933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
10033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
10133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting RGBA to ARGB.
10233cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec8 kShuffleMaskRGBAToARGB = {
10333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
10433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
10533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
10633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting ARGB to RGBA.
10733cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec8 kShuffleMaskARGBToRGBA = {
10833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
10933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
11033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
11133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting ARGB to RGB24.
11233cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec8 kShuffleMaskARGBToRGB24 = {
11333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
11433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
11533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
11633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting ARGB to RAW.
11733cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec8 kShuffleMaskARGBToRAW = {
11833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
11933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
12033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
12133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
12233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
12333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
12433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslld     $0x18,%%xmm5                    \n"
12533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
12633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
12733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      (%0),%%xmm0                     \n"
12833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x8(%0),%0                      \n"
12933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm0,%%xmm0                   \n"
13033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
13133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm0,%%xmm0                   \n"
13233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm1,%%xmm1                   \n"
13333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm5,%%xmm0                   \n"
13433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm5,%%xmm1                   \n"
13533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%1)                     \n"
13633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,0x10(%1)                 \n"
13733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%1),%1                     \n"
13833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%2                         \n"
13933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
14033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_y),     // %0
1417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    "+r"(dst_argb),  // %1
1427cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    "+r"(pix)        // %2
14333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
14433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
14533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
14633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm5"
14733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
14833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
14933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
15033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
15133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
15233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
15333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %3,%%xmm5                       \n"
15433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %0,%1                           \n"
15533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
15633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
15733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
15833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm5,%%xmm0                   \n"
15933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%2                         \n"
16033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%0,%1,1)                \n"
16133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
16233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
16333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
16433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_abgr),  // %0
16533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_argb),  // %1
16633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)        // %2
16733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kShuffleMaskABGRToARGB)  // %3
16833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
16933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
17033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm5"
17133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
17233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
17333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
17433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
17533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
17633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
17733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %3,%%xmm5                       \n"
17833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %0,%1                           \n"
17933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
18033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
18133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
18233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm5,%%xmm0                   \n"
18333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%2                         \n"
18433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%0,%1,1)                \n"
18533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
18633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
18733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_bgra),  // %0
18833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_argb),  // %1
18933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)        // %2
19033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kShuffleMaskBGRAToARGB)  // %3
19133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
19233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
19333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm5"
19433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
19533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
19633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
19733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
19833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
19933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
20033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %3,%%xmm5                       \n"
20133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %0,%1                           \n"
20233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
20333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
20433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
20533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm5,%%xmm0                   \n"
20633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%2                         \n"
20733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%0,%1,1)                \n"
20833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
20933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
21033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
21133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_rgba),  // %0
21233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_argb),  // %1
21333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)        // %2
21433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kShuffleMaskRGBAToARGB)  // %3
21533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
21633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
21733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm5"
21833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
21933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
22033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
22133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
22233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
22333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
22433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %3,%%xmm5                       \n"
22533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %0,%1                           \n"
22633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
22733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
22833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
22933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm5,%%xmm0                   \n"
23033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%2                         \n"
23133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%0,%1,1)                \n"
23233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
23333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
23433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
23533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_argb),  // %0
23633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_rgba),  // %1
23733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)        // %2
23833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kShuffleMaskARGBToRGBA)  // %3
23933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
24033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
24133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm5"
24233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
24333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
24433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
24533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
24633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
24733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
24833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
24933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslld     $0x18,%%xmm5                    \n"
25033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %3,%%xmm4                       \n"
25133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
25233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
25333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm0                     \n"
25433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x10(%0),%%xmm1                 \n"
25533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x20(%0),%%xmm3                 \n"
25633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x30(%0),%0                     \n"
25733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm3,%%xmm2                   \n"
25833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "palignr   $0x8,%%xmm1,%%xmm2              \n"
25933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm4,%%xmm2                   \n"
26033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm5,%%xmm2                   \n"
26133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "palignr   $0xc,%%xmm0,%%xmm1              \n"
26233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm4,%%xmm0                   \n"
26333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,0x20(%1)                 \n"
26433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm5,%%xmm0                   \n"
26533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm4,%%xmm1                   \n"
26633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%1)                     \n"
26733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm5,%%xmm1                   \n"
26833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "palignr   $0x4,%%xmm3,%%xmm3              \n"
26933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm4,%%xmm3                   \n"
27033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,0x10(%1)                 \n"
27133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm5,%%xmm3                   \n"
27233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%2                        \n"
27333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm3,0x30(%1)                 \n"
27433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x40(%1),%1                     \n"
27533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
27633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_rgb24),  // %0
27733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_argb),  // %1
27833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)        // %2
27933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kShuffleMaskRGB24ToARGB)  // %3
28033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
28133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
28233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
28333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
28433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
2857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
2867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
2877cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, int pix) {
28833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
28933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"  // generate mask 0xff000000
29033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslld     $0x18,%%xmm5                    \n"
29133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %3,%%xmm4                       \n"
29233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
29333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
29433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm0                     \n"
29533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x10(%0),%%xmm1                 \n"
29633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x20(%0),%%xmm3                 \n"
29733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x30(%0),%0                     \n"
29833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm3,%%xmm2                   \n"
29933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "palignr   $0x8,%%xmm1,%%xmm2              \n"
30033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm4,%%xmm2                   \n"
30133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm5,%%xmm2                   \n"
30233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "palignr   $0xc,%%xmm0,%%xmm1              \n"
30333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm4,%%xmm0                   \n"
30433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,0x20(%1)                 \n"
30533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm5,%%xmm0                   \n"
30633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm4,%%xmm1                   \n"
30733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%1)                     \n"
30833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm5,%%xmm1                   \n"
30933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "palignr   $0x4,%%xmm3,%%xmm3              \n"
31033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm4,%%xmm3                   \n"
31133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,0x10(%1)                 \n"
31233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm5,%%xmm3                   \n"
31333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%2                        \n"
31433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm3,0x30(%1)                 \n"
31533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x40(%1),%1                     \n"
31633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
3177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  : "+r"(src_raw),   // %0
3187cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    "+r"(dst_argb),  // %1
3197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    "+r"(pix)        // %2
32033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kShuffleMaskRAWToARGB)  // %3
32133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
32233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
32333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
32433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
32533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
3267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
32733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
32833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RGB565ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
32933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
33033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov       $0x1080108,%%eax                \n"
33133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%eax,%%xmm5                    \n"
33233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
33333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov       $0x20802080,%%eax               \n"
33433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%eax,%%xmm6                    \n"
33533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
33633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm3,%%xmm3                   \n"
33733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psllw     $0xb,%%xmm3                     \n"
33833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm4,%%xmm4                   \n"
33933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psllw     $0xa,%%xmm4                     \n"
34033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x5,%%xmm4                     \n"
34133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm7,%%xmm7                   \n"
34233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psllw     $0x8,%%xmm7                     \n"
34333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %0,%1                           \n"
34433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %0,%1                           \n"
34533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
34633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
34733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm0                     \n"
34833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
34933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm2                   \n"
35033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm3,%%xmm1                   \n"
35133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psllw     $0xb,%%xmm2                     \n"
35233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulhuw   %%xmm5,%%xmm1                   \n"
35333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulhuw   %%xmm5,%%xmm2                   \n"
35433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psllw     $0x8,%%xmm1                     \n"
35533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm2,%%xmm1                   \n"
35633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm4,%%xmm0                   \n"
35733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulhuw   %%xmm6,%%xmm0                   \n"
35833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm7,%%xmm0                   \n"
35933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,%%xmm2                   \n"
36033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm0,%%xmm1                   \n"
36133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhbw %%xmm0,%%xmm2                   \n"
36233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,(%1,%0,2)                \n"
36333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,0x10(%1,%0,2)            \n"
36433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
36533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%2                         \n"
36633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
36733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src),  // %0
36833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst),  // %1
36933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)   // %2
37033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
37133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc", "eax"
37233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
37333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
3747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#endif
37533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
37633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
3777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
37833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGB1555ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
37933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
38033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov       $0x1080108,%%eax                \n"
38133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%eax,%%xmm5                    \n"
38233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
38333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov       $0x42004200,%%eax               \n"
38433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%eax,%%xmm6                    \n"
38533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x0,%%xmm6,%%xmm6              \n"
38633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm3,%%xmm3                   \n"
38733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psllw     $0xb,%%xmm3                     \n"
38833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm3,%%xmm4                   \n"
38933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x6,%%xmm4                     \n"
39033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm7,%%xmm7                   \n"
39133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psllw     $0x8,%%xmm7                     \n"
39233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %0,%1                           \n"
39333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %0,%1                           \n"
39433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
39533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
39633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm0                     \n"
39733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
39833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm2                   \n"
39933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psllw     $0x1,%%xmm1                     \n"
40033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psllw     $0xb,%%xmm2                     \n"
40133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm3,%%xmm1                   \n"
40233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulhuw   %%xmm5,%%xmm2                   \n"
40333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulhuw   %%xmm5,%%xmm1                   \n"
40433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psllw     $0x8,%%xmm1                     \n"
40533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm2,%%xmm1                   \n"
40633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm2                   \n"
40733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm4,%%xmm0                   \n"
40833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw     $0x8,%%xmm2                     \n"
40933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulhuw   %%xmm6,%%xmm0                   \n"
41033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm7,%%xmm2                   \n"
41133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm2,%%xmm0                   \n"
41233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,%%xmm2                   \n"
41333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm0,%%xmm1                   \n"
41433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhbw %%xmm0,%%xmm2                   \n"
41533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,(%1,%0,2)                \n"
41633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,0x10(%1,%0,2)            \n"
41733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
41833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%2                         \n"
41933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
42033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src),  // %0
42133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst),  // %1
42233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)   // %2
42333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
42433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc", "eax"
42533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
42633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
42733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
42833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
42933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
4307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
43133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGB4444ToARGBRow_SSE2(const uint8* src, uint8* dst, int pix) {
43233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
43333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov       $0xf0f0f0f,%%eax                \n"
43433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%eax,%%xmm4                    \n"
43533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
43633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm4,%%xmm5                   \n"
43733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslld     $0x4,%%xmm5                     \n"
43833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %0,%1                           \n"
43933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %0,%1                           \n"
44033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
44133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
44233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm0                     \n"
44333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm2                   \n"
44433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm4,%%xmm0                   \n"
44533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm2                   \n"
44633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
44733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm3                   \n"
44833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psllw     $0x4,%%xmm1                     \n"
44933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x4,%%xmm3                     \n"
45033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm1,%%xmm0                   \n"
45133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm3,%%xmm2                   \n"
45233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
45333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm2,%%xmm0                   \n"
45433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhbw %%xmm2,%%xmm1                   \n"
45533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%1,%0,2)                \n"
45633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,0x10(%1,%0,2)            \n"
45733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
45833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%2                         \n"
45933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
46033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src),  // %0
46133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst),  // %1
46233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)   // %2
46333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
46433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc", "eax"
46533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
46633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
46733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
46833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
46933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
47033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
47133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToRGB24Row_SSSE3(const uint8* src, uint8* dst, int pix) {
47233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
47333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %3,%%xmm6                       \n"
47433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
47533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
47633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
47733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
47833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x20(%0),%%xmm2                 \n"
47933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x30(%0),%%xmm3                 \n"
48033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x40(%0),%0                     \n"
48133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm6,%%xmm0                   \n"
48233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm6,%%xmm1                   \n"
48333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm6,%%xmm2                   \n"
48433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm6,%%xmm3                   \n"
48533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,%%xmm4                   \n"
48633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrldq    $0x4,%%xmm1                     \n"
48733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslldq    $0xc,%%xmm4                     \n"
48833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm5                   \n"
48933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm4,%%xmm0                   \n"
49033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslldq    $0x8,%%xmm5                     \n"
49133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%1)                     \n"
49233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm5,%%xmm1                   \n"
49333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrldq    $0x8,%%xmm2                     \n"
49433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslldq    $0x4,%%xmm3                     \n"
49533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm3,%%xmm2                   \n"
49633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,0x10(%1)                 \n"
49733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,0x20(%1)                 \n"
49833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x30(%1),%1                     \n"
49933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%2                        \n"
50033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
50133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src),  // %0
50233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst),  // %1
50333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)   // %2
50433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kShuffleMaskARGBToRGB24)  // %3
50533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
50633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
50733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
50833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
50933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
51033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
51133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
51233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToRAWRow_SSSE3(const uint8* src, uint8* dst, int pix) {
51333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
51433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %3,%%xmm6                       \n"
51533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
51633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
51733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
51833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
51933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x20(%0),%%xmm2                 \n"
52033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x30(%0),%%xmm3                 \n"
52133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x40(%0),%0                     \n"
52233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm6,%%xmm0                   \n"
52333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm6,%%xmm1                   \n"
52433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm6,%%xmm2                   \n"
52533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm6,%%xmm3                   \n"
52633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,%%xmm4                   \n"
52733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrldq    $0x4,%%xmm1                     \n"
52833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslldq    $0xc,%%xmm4                     \n"
52933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm5                   \n"
53033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm4,%%xmm0                   \n"
53133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslldq    $0x8,%%xmm5                     \n"
53233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%1)                     \n"
53333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm5,%%xmm1                   \n"
53433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrldq    $0x8,%%xmm2                     \n"
53533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslldq    $0x4,%%xmm3                     \n"
53633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm3,%%xmm2                   \n"
53733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,0x10(%1)                 \n"
53833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,0x20(%1)                 \n"
53933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x30(%1),%1                     \n"
54033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%2                        \n"
54133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
54233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src),  // %0
54333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst),  // %1
54433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)   // %2
54533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kShuffleMaskARGBToRAW)  // %3
54633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
54733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
54833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
54933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
55033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
55133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
55233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
55333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToRGB565Row_SSE2(const uint8* src, uint8* dst, int pix) {
55433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
55533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm3,%%xmm3                   \n"
55633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrld     $0x1b,%%xmm3                    \n"
55733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm4,%%xmm4                   \n"
55833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrld     $0x1a,%%xmm4                    \n"
55933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslld     $0x5,%%xmm4                     \n"
56033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
56133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslld     $0xb,%%xmm5                     \n"
56233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
56333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
56433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
56533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
56633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm2                   \n"
56733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslld     $0x8,%%xmm0                     \n"
56833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrld     $0x3,%%xmm1                     \n"
56933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrld     $0x5,%%xmm2                     \n"
57033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrad     $0x10,%%xmm0                    \n"
57133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm3,%%xmm1                   \n"
57233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm4,%%xmm2                   \n"
57333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm0                   \n"
57433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm2,%%xmm1                   \n"
57533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm1,%%xmm0                   \n"
57633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packssdw  %%xmm0,%%xmm0                   \n"
57733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
57833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm0,(%1)                     \n"
57933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x8(%1),%1                      \n"
58033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%2                         \n"
58133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
58233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src),  // %0
58333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst),  // %1
58433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)   // %2
58533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
58633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
58733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
58833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
58933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
59033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
59133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
59233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
59333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToARGB1555Row_SSE2(const uint8* src, uint8* dst, int pix) {
59433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
59533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm4,%%xmm4                   \n"
59633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrld     $0x1b,%%xmm4                    \n"
59733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm4,%%xmm5                   \n"
59833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslld     $0x5,%%xmm5                     \n"
59933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm4,%%xmm6                   \n"
60033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslld     $0xa,%%xmm6                     \n"
60133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm7,%%xmm7                   \n"
60233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslld     $0xf,%%xmm7                     \n"
60333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
60433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
60533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
60633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
60733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm2                   \n"
60833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm3                   \n"
60933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrad     $0x10,%%xmm0                    \n"
61033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrld     $0x3,%%xmm1                     \n"
61133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrld     $0x6,%%xmm2                     \n"
61233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrld     $0x9,%%xmm3                     \n"
61333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm7,%%xmm0                   \n"
61433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm4,%%xmm1                   \n"
61533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm2                   \n"
61633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm6,%%xmm3                   \n"
61733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm1,%%xmm0                   \n"
61833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm3,%%xmm2                   \n"
61933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm2,%%xmm0                   \n"
62033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packssdw  %%xmm0,%%xmm0                   \n"
62133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
62233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm0,(%1)                     \n"
62333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x8(%1),%1                      \n"
62433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%2                         \n"
62533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
62633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src),  // %0
62733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst),  // %1
62833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)   // %2
62933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
63033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
63133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
63233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
63333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
63433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
63533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
63633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
63733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToARGB4444Row_SSE2(const uint8* src, uint8* dst, int pix) {
63833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
63933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm4,%%xmm4                   \n"
64033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psllw     $0xc,%%xmm4                     \n"
64133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm4,%%xmm3                   \n"
64233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm3                     \n"
64333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
64433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
64533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
64633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
64733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm3,%%xmm0                   \n"
64833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm4,%%xmm1                   \n"
64933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlq     $0x4,%%xmm0                     \n"
65033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlq     $0x8,%%xmm1                     \n"
65133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm1,%%xmm0                   \n"
65233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm0,%%xmm0                   \n"
65333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
65433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm0,(%1)                     \n"
65533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x8(%1),%1                      \n"
65633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%2                         \n"
65733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
65833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src),  // %0
65933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst),  // %1
66033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)   // %2
66133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
66233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
66333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
66433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
66533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
66633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
66733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
66833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// ARGBToYRow_SSSE3: writes one byte to dst_y for every 4-byte source pixel in
// src_argb, handling 16 pixels (64 source bytes) per loop iteration.
//
// Per pixel: pmaddubsw multiplies the pixel bytes (treated as unsigned) by the
// signed byte coefficients loaded from kARGBToY and sums adjacent pairs;
// phaddw adds the two pair sums of each pixel into one 16-bit value. That
// value is shifted right by 7, packed to an unsigned byte with saturation,
// and the bytes loaded from kAddY16 are added. kARGBToY and kAddY16 are
// defined elsewhere in this file; their values are not visible here.
//
// Requirements visible in the code:
//  - src_argb and dst_y are accessed with movdqa, so both must be 16-byte
//    aligned (ARGBToYRow_Unaligned_SSSE3 is the movdqu variant).
//  - The loop body runs before pix is tested and pix drops by 16 per pass, so
//    at least 16 pixels are always processed and a pix that is not a multiple
//    of 16 is effectively rounded up.
66933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
67033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // xmm5 = kAddY16, xmm4 = kARGBToY; neither is modified inside the loop.
67133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %4,%%xmm5                       \n"
67233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %3,%%xmm4                       \n"
67333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
67433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
    // Load 16 source pixels (4 registers x 16 bytes).
67533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
67633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
67733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x20(%0),%%xmm2                 \n"
67833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x30(%0),%%xmm3                 \n"
    // Weighted sum of the 4 bytes of every pixel -> one 16-bit word per pixel.
67933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm0                   \n"
68033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm1                   \n"
68133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm2                   \n"
68233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm3                   \n"
68333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x40(%0),%0                     \n"
68433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm1,%%xmm0                   \n"
68533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm3,%%xmm2                   \n"
    // Scale down by 7 bits, narrow to 16 unsigned bytes, then add the offset.
68633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x7,%%xmm0                     \n"
68733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x7,%%xmm2                     \n"
68833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm2,%%xmm0                   \n"
68933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddb     %%xmm5,%%xmm0                   \n"
    // sub sets the flags consumed by jg; the movdqa and lea in between do not
    // touch the flags.
69033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%2                        \n"
69133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%1)                     \n"
69233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
69333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
69433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_argb),  // %0
69533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_y),     // %1
69633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)        // %2
69733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kARGBToY),   // %3
69833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kAddY16)     // %4
69933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
70033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
70133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
70233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
70333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
70433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
70533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// ARGBToYRow_Unaligned_SSSE3: same computation as ARGBToYRow_SSSE3 (one
// dst_y byte per 4-byte pixel of src_argb, 16 pixels per iteration), but the
// pixel loads and the result store use movdqu, so src_argb and dst_y need no
// particular alignment.
//
// The constants kARGBToY and kAddY16 are still loaded with movdqa and so must
// themselves be 16-byte aligned; they are defined elsewhere in this file.
//
// The loop body runs before pix is tested and pix drops by 16 per pass, so at
// least 16 pixels are always processed and a pix that is not a multiple of 16
// is effectively rounded up.
70633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
70733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // xmm5 = kAddY16, xmm4 = kARGBToY; neither is modified inside the loop.
70833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %4,%%xmm5                       \n"
70933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %3,%%xmm4                       \n"
71033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
71133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
    // Unaligned load of 16 source pixels (4 registers x 16 bytes).
71233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm0                     \n"
71333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x10(%0),%%xmm1                 \n"
71433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x20(%0),%%xmm2                 \n"
71533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x30(%0),%%xmm3                 \n"
    // Weighted sum of the 4 bytes of every pixel -> one 16-bit word per pixel.
71633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm0                   \n"
71733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm1                   \n"
71833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm2                   \n"
71933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm3                   \n"
72033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x40(%0),%0                     \n"
72133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm1,%%xmm0                   \n"
72233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm3,%%xmm2                   \n"
    // Scale down by 7 bits, narrow to 16 unsigned bytes, then add the offset.
72333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x7,%%xmm0                     \n"
72433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x7,%%xmm2                     \n"
72533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm2,%%xmm0                   \n"
72633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddb     %%xmm5,%%xmm0                   \n"
    // sub sets the flags consumed by jg; the movdqu and lea in between do not
    // touch the flags.
72733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%2                        \n"
72833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm0,(%1)                     \n"
72933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
73033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
73133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_argb),  // %0
73233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_y),     // %1
73333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)        // %2
73433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kARGBToY),   // %3
73533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kAddY16)     // %4
73633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
73733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
73833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
73933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
74033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
74133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
74233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// ARGBToUVRow_SSSE3: produces subsampled U and V bytes from two rows of
// 4-byte pixels. Each loop iteration reads 16 pixels (64 bytes) from the row
// at src_argb0 and 16 pixels from the row src_stride_argb bytes after it,
// averages them 2x2 with pavgb (first vertically, then horizontally), and
// writes 8 bytes to dst_u and 8 bytes to dst_v.
//
// Requirements visible in the code:
//  - Both source rows are accessed with movdqa / pavgb memory operands, so
//    src_argb0 and src_argb0 + src_stride_argb must be 16-byte aligned.
//  - The loop body runs before width is tested and width drops by 16 per
//    pass, so at least 16 source pixels per row are always processed.
//
// NOTE(review): the first asm statement writes xmm3, xmm4 and xmm5 but has no
// clobber list, and the second statement reads those registers without
// declaring them as operands. Nothing tells the compiler to keep them intact
// between the two statements; this is the "unsafe" workaround described in
// the TODO below.
74333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// TODO(fbarchard): pass xmm constants to single block of assembly.
74433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// fpic on GCC 4.2 for OSX runs out of GPR registers. "m" effectively takes
74533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 3 registers - ebx, ebp and eax. "m" can be passed with 3 normal registers,
74633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// or 4 if stack frame is disabled. Doing 2 assembly blocks is a work around
74733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// and considered unsafe.
74833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
74933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                       uint8* dst_u, uint8* dst_v, int width) {
  // Statement 1: preload the constants. xmm4 = kARGBToU, xmm3 = kARGBToV,
  // xmm5 = kAddUV128 (all defined elsewhere in this file).
75033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
75133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %0,%%xmm4                       \n"
75233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %1,%%xmm3                       \n"
75333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %2,%%xmm5                       \n"
75433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
75533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kARGBToU),  // %0
75633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kARGBToV),  // %1
75733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kAddUV128)  // %2
75833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
  // Statement 2: the conversion loop. Operand %4 is src_stride_argb widened
  // to intptr_t so it can be used as an index register.
75933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // Turn dst_v into the offset (dst_v - dst_u) so that (%1,%2,1) addresses
    // the V output while only %1 has to be advanced.
76033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %1,%2                           \n"
76133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
76233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
    // Load 16 pixels of the first row and average them byte-wise with the
    // 16 pixels at the same position in the second row.
76333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
76433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
76533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x20(%0),%%xmm2                 \n"
76633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x30(%0),%%xmm6                 \n"
76733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     (%0,%4,1),%%xmm0                \n"
76833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     0x10(%0,%4,1),%%xmm1            \n"
76933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     0x20(%0,%4,1),%%xmm2            \n"
77033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     0x30(%0,%4,1),%%xmm6            \n"
77133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x40(%0),%0                     \n"
    // shufps 0x88 gathers the even 32-bit pixels and 0xdd the odd ones;
    // averaging the two halves each horizontally adjacent pair.
77233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm7                   \n"
77333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0x88,%%xmm1,%%xmm0             \n"
77433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
77533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm0                   \n"
77633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm7                   \n"
77733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0x88,%%xmm6,%%xmm2             \n"
77833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
77933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm2                   \n"
    // Duplicate the 8 averaged pixels: xmm0/xmm2 are weighted with xmm4 (U),
    // xmm1/xmm6 with xmm3 (V).
78033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
78133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm6                   \n"
78233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm0                   \n"
78333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm2                   \n"
78433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm3,%%xmm1                   \n"
78533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm3,%%xmm6                   \n"
78633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm2,%%xmm0                   \n"
78733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm6,%%xmm1                   \n"
    // Arithmetic shift keeps the sign; packsswb leaves the 8 U bytes in the
    // low half of xmm0 and the 8 V bytes in the high half, then xmm5 is added.
78833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw     $0x8,%%xmm0                     \n"
78933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw     $0x8,%%xmm1                     \n"
79033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packsswb  %%xmm1,%%xmm0                   \n"
79133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddb     %%xmm5,%%xmm0                   \n"
    // sub sets the flags consumed by jg; the stores and lea in between do not
    // touch the flags.
79233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%3                        \n"
    // Low 8 bytes -> dst_u, high 8 bytes -> dst_v.
79333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movlps    %%xmm0,(%1)                     \n"
79433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movhps    %%xmm0,(%1,%2,1)                \n"
79533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x8(%1),%1                      \n"
79633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
79733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_argb0),       // %0
79833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_u),           // %1
79933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_v),           // %2
80033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+rm"(width)           // %3
80133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "r"(static_cast<intptr_t>(src_stride_argb))
80233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
80333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
80433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
80533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
80633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
80733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
80833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// ARGBToUVRow_Unaligned_SSSE3: produces subsampled U and V bytes from two
// rows of 4-byte pixels without requiring aligned source rows. Each loop
// iteration reads 16 pixels (64 bytes) from the row at src_argb0 and 16
// pixels from the row src_stride_argb bytes after it, averages them 2x2 with
// pavgb (first vertically, then horizontally), and writes 8 bytes to dst_u
// and 8 bytes to dst_v.
//
// All pixel loads use movdqu; the second row is staged through xmm7 so that
// pavgb is done register-to-register instead of with a memory operand. The
// constants are still loaded with movdqa and so must be 16-byte aligned.
//
// The loop body runs before width is tested and width drops by 16 per pass,
// so at least 16 source pixels per row are always processed.
//
// NOTE(review): the first asm statement writes xmm3, xmm4 and xmm5 but has no
// clobber list, and the second statement reads those registers without
// declaring them as operands. Nothing tells the compiler to keep them intact
// between the two statements.
80933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
81033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                 uint8* dst_u, uint8* dst_v, int width) {
  // Statement 1: preload the constants. xmm4 = kARGBToU, xmm3 = kARGBToV,
  // xmm5 = kAddUV128 (all defined elsewhere in this file).
81133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
81233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %0,%%xmm4                       \n"
81333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %1,%%xmm3                       \n"
81433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %2,%%xmm5                       \n"
81533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
81633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kARGBToU),         // %0
81733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kARGBToV),         // %1
81833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kAddUV128)         // %2
81933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
  // Statement 2: the conversion loop. Operand %4 is src_stride_argb widened
  // to intptr_t so it can be used as an index register.
82033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // Turn dst_v into the offset (dst_v - dst_u) so that (%1,%2,1) addresses
    // the V output while only %1 has to be advanced.
82133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %1,%2                           \n"
82233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
82333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
    // Unaligned load of 16 pixels of the first row.
82433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm0                     \n"
82533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x10(%0),%%xmm1                 \n"
82633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x20(%0),%%xmm2                 \n"
82733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x30(%0),%%xmm6                 \n"
    // Load the matching 16 bytes of the second row into xmm7, one register
    // at a time, and average them byte-wise into the first row.
82833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0,%4,1),%%xmm7                \n"
82933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm0                   \n"
83033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x10(%0,%4,1),%%xmm7            \n"
83133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm1                   \n"
83233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x20(%0,%4,1),%%xmm7            \n"
83333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm2                   \n"
83433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x30(%0,%4,1),%%xmm7            \n"
83533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm6                   \n"
83633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x40(%0),%0                     \n"
    // shufps 0x88 gathers the even 32-bit pixels and 0xdd the odd ones;
    // averaging the two halves each horizontally adjacent pair.
83733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm7                   \n"
83833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0x88,%%xmm1,%%xmm0             \n"
83933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
84033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm0                   \n"
84133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm7                   \n"
84233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0x88,%%xmm6,%%xmm2             \n"
84333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
84433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm2                   \n"
    // Duplicate the 8 averaged pixels: xmm0/xmm2 are weighted with xmm4 (U),
    // xmm1/xmm6 with xmm3 (V).
84533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
84633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm6                   \n"
84733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm0                   \n"
84833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm2                   \n"
84933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm3,%%xmm1                   \n"
85033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm3,%%xmm6                   \n"
85133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm2,%%xmm0                   \n"
85233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm6,%%xmm1                   \n"
    // Arithmetic shift keeps the sign; packsswb leaves the 8 U bytes in the
    // low half of xmm0 and the 8 V bytes in the high half, then xmm5 is added.
85333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw     $0x8,%%xmm0                     \n"
85433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw     $0x8,%%xmm1                     \n"
85533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packsswb  %%xmm1,%%xmm0                   \n"
85633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddb     %%xmm5,%%xmm0                   \n"
    // sub sets the flags consumed by jg; the stores and lea in between do not
    // touch the flags.
85733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%3                        \n"
    // Low 8 bytes -> dst_u, high 8 bytes -> dst_v.
85833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movlps    %%xmm0,(%1)                     \n"
85933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movhps    %%xmm0,(%1,%2,1)                \n"
86033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x8(%1),%1                      \n"
86133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
86233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_argb0),       // %0
86333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_u),           // %1
86433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_v),           // %2
86533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+rm"(width)           // %3
86633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "r"(static_cast<intptr_t>(src_stride_argb))
86733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
86833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
86933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
87033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
87133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
87233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
87333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// BGRAToYRow_SSSE3: writes one byte to dst_y for every 4-byte source pixel in
// src_bgra, handling 16 pixels (64 source bytes) per loop iteration. The
// instruction sequence is the same as ARGBToYRow_SSSE3; only the coefficient
// constant differs (kBGRAToY instead of kARGBToY).
//
// Per pixel: pmaddubsw multiplies the pixel bytes (treated as unsigned) by the
// signed byte coefficients loaded from kBGRAToY and sums adjacent pairs;
// phaddw adds the two pair sums of each pixel into one 16-bit value. That
// value is shifted right by 7, packed to an unsigned byte with saturation,
// and the bytes loaded from kAddY16 are added. kBGRAToY and kAddY16 are
// defined elsewhere in this file; their values are not visible here.
//
// Requirements visible in the code:
//  - src_bgra and dst_y are accessed with movdqa, so both must be 16-byte
//    aligned (BGRAToYRow_Unaligned_SSSE3 is the movdqu variant).
//  - The loop body runs before pix is tested and pix drops by 16 per pass, so
//    at least 16 pixels are always processed and a pix that is not a multiple
//    of 16 is effectively rounded up.
87433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid BGRAToYRow_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
87533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // xmm5 = kAddY16, xmm4 = kBGRAToY; neither is modified inside the loop.
87633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %4,%%xmm5                       \n"
87733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %3,%%xmm4                       \n"
87833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
87933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
    // Load 16 source pixels (4 registers x 16 bytes).
88033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
88133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
88233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x20(%0),%%xmm2                 \n"
88333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x30(%0),%%xmm3                 \n"
    // Weighted sum of the 4 bytes of every pixel -> one 16-bit word per pixel.
88433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm0                   \n"
88533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm1                   \n"
88633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm2                   \n"
88733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm3                   \n"
88833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x40(%0),%0                     \n"
88933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm1,%%xmm0                   \n"
89033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm3,%%xmm2                   \n"
    // Scale down by 7 bits, narrow to 16 unsigned bytes, then add the offset.
89133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x7,%%xmm0                     \n"
89233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x7,%%xmm2                     \n"
89333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm2,%%xmm0                   \n"
89433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddb     %%xmm5,%%xmm0                   \n"
    // sub sets the flags consumed by jg; the movdqa and lea in between do not
    // touch the flags.
89533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%2                        \n"
89633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%1)                     \n"
89733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
89833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
89933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_bgra),  // %0
90033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_y),     // %1
90133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)        // %2
90233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kBGRAToY),   // %3
90333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kAddY16)     // %4
90433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
90533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
90633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
90733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
90833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
90933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
91033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// BGRAToYRow_Unaligned_SSSE3: same computation as BGRAToYRow_SSSE3 (one dst_y
// byte per 4-byte pixel of src_bgra, 16 pixels per iteration), but the pixel
// loads and the result store use movdqu, so src_bgra and dst_y need no
// particular alignment.
//
// The constants kBGRAToY and kAddY16 are still loaded with movdqa and so must
// themselves be 16-byte aligned; they are defined elsewhere in this file.
//
// The loop body runs before pix is tested and pix drops by 16 per pass, so at
// least 16 pixels are always processed and a pix that is not a multiple of 16
// is effectively rounded up.
91133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid BGRAToYRow_Unaligned_SSSE3(const uint8* src_bgra, uint8* dst_y, int pix) {
91233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // xmm5 = kAddY16, xmm4 = kBGRAToY; neither is modified inside the loop.
91333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %4,%%xmm5                       \n"
91433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %3,%%xmm4                       \n"
91533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
91633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
    // Unaligned load of 16 source pixels (4 registers x 16 bytes).
91733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm0                     \n"
91833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x10(%0),%%xmm1                 \n"
91933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x20(%0),%%xmm2                 \n"
92033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x30(%0),%%xmm3                 \n"
    // Weighted sum of the 4 bytes of every pixel -> one 16-bit word per pixel.
92133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm0                   \n"
92233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm1                   \n"
92333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm2                   \n"
92433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm3                   \n"
92533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x40(%0),%0                     \n"
92633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm1,%%xmm0                   \n"
92733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm3,%%xmm2                   \n"
    // Scale down by 7 bits, narrow to 16 unsigned bytes, then add the offset.
92833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x7,%%xmm0                     \n"
92933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x7,%%xmm2                     \n"
93033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm2,%%xmm0                   \n"
93133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddb     %%xmm5,%%xmm0                   \n"
    // sub sets the flags consumed by jg; the movdqu and lea in between do not
    // touch the flags.
93233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%2                        \n"
93333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm0,(%1)                     \n"
93433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
93533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
93633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_bgra),  // %0
93733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_y),     // %1
93833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)        // %2
93933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kBGRAToY),   // %3
94033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kAddY16)     // %4
94133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
94233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
94333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
94433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
94533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
94633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
94733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
94833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid BGRAToUVRow_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
94933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                       uint8* dst_u, uint8* dst_v, int width) {
95033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
95133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %0,%%xmm4                       \n"
95233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %1,%%xmm3                       \n"
95333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %2,%%xmm5                       \n"
95433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
95533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kBGRAToU),         // %0
95633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kBGRAToV),         // %1
95733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kAddUV128)         // %2
95833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
95933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
96033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %1,%2                           \n"
96133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
96233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
96333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
96433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
96533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x20(%0),%%xmm2                 \n"
96633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x30(%0),%%xmm6                 \n"
96733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     (%0,%4,1),%%xmm0                \n"
96833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     0x10(%0,%4,1),%%xmm1            \n"
96933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     0x20(%0,%4,1),%%xmm2            \n"
97033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     0x30(%0,%4,1),%%xmm6            \n"
97133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x40(%0),%0                     \n"
97233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm7                   \n"
97333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0x88,%%xmm1,%%xmm0             \n"
97433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
97533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm0                   \n"
97633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm7                   \n"
97733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0x88,%%xmm6,%%xmm2             \n"
97833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
97933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm2                   \n"
98033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
98133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm6                   \n"
98233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm0                   \n"
98333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm2                   \n"
98433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm3,%%xmm1                   \n"
98533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm3,%%xmm6                   \n"
98633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm2,%%xmm0                   \n"
98733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm6,%%xmm1                   \n"
98833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw     $0x8,%%xmm0                     \n"
98933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw     $0x8,%%xmm1                     \n"
99033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packsswb  %%xmm1,%%xmm0                   \n"
99133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddb     %%xmm5,%%xmm0                   \n"
99233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%3                        \n"
99333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movlps    %%xmm0,(%1)                     \n"
99433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movhps    %%xmm0,(%1,%2,1)                \n"
99533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x8(%1),%1                      \n"
99633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
99733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_bgra0),       // %0
99833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_u),           // %1
99933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_v),           // %2
100033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+rm"(width)           // %3
100133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "r"(static_cast<intptr_t>(src_stride_bgra))
100233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
100333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
100433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
100533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
100633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
100733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
100833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
100933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid BGRAToUVRow_Unaligned_SSSE3(const uint8* src_bgra0, int src_stride_bgra,
101033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                 uint8* dst_u, uint8* dst_v, int width) {
101133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
101233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %0,%%xmm4                       \n"
101333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %1,%%xmm3                       \n"
101433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %2,%%xmm5                       \n"
101533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
101633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kBGRAToU),         // %0
101733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kBGRAToV),         // %1
101833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kAddUV128)         // %2
101933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
102033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
102133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %1,%2                           \n"
102233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
102333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
102433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm0                     \n"
102533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x10(%0),%%xmm1                 \n"
102633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x20(%0),%%xmm2                 \n"
102733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x30(%0),%%xmm6                 \n"
102833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0,%4,1),%%xmm7                \n"
102933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm0                   \n"
103033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x10(%0,%4,1),%%xmm7            \n"
103133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm1                   \n"
103233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x20(%0,%4,1),%%xmm7            \n"
103333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm2                   \n"
103433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x30(%0,%4,1),%%xmm7            \n"
103533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm6                   \n"
103633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x40(%0),%0                     \n"
103733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm7                   \n"
103833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0x88,%%xmm1,%%xmm0             \n"
103933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
104033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm0                   \n"
104133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm7                   \n"
104233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0x88,%%xmm6,%%xmm2             \n"
104333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
104433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm2                   \n"
104533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
104633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm6                   \n"
104733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm0                   \n"
104833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm2                   \n"
104933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm3,%%xmm1                   \n"
105033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm3,%%xmm6                   \n"
105133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm2,%%xmm0                   \n"
105233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm6,%%xmm1                   \n"
105333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw     $0x8,%%xmm0                     \n"
105433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw     $0x8,%%xmm1                     \n"
105533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packsswb  %%xmm1,%%xmm0                   \n"
105633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddb     %%xmm5,%%xmm0                   \n"
105733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%3                        \n"
105833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movlps    %%xmm0,(%1)                     \n"
105933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movhps    %%xmm0,(%1,%2,1)                \n"
106033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x8(%1),%1                      \n"
106133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
106233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_bgra0),       // %0
106333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_u),           // %1
106433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_v),           // %2
106533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+rm"(width)           // %3
106633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "r"(static_cast<intptr_t>(src_stride_bgra))
106733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
106833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
106933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
107033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
107133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
107233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
107333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
107433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ABGRToYRow_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
107533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
107633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %4,%%xmm5                       \n"
107733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %3,%%xmm4                       \n"
107833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
107933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
108033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
108133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
108233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x20(%0),%%xmm2                 \n"
108333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x30(%0),%%xmm3                 \n"
108433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm0                   \n"
108533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm1                   \n"
108633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm2                   \n"
108733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm3                   \n"
108833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x40(%0),%0                     \n"
108933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm1,%%xmm0                   \n"
109033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm3,%%xmm2                   \n"
109133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x7,%%xmm0                     \n"
109233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x7,%%xmm2                     \n"
109333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm2,%%xmm0                   \n"
109433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddb     %%xmm5,%%xmm0                   \n"
109533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%2                        \n"
109633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%1)                     \n"
109733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
109833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
109933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_abgr),  // %0
110033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_y),     // %1
110133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)        // %2
110233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kABGRToY),   // %3
110333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kAddY16)     // %4
110433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
110533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
110633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
110733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
110833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
110933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
111033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
111133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ABGRToYRow_Unaligned_SSSE3(const uint8* src_abgr, uint8* dst_y, int pix) {
111233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
111333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %4,%%xmm5                       \n"
111433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %3,%%xmm4                       \n"
111533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
111633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
111733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm0                     \n"
111833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x10(%0),%%xmm1                 \n"
111933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x20(%0),%%xmm2                 \n"
112033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x30(%0),%%xmm3                 \n"
112133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm0                   \n"
112233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm1                   \n"
112333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm2                   \n"
112433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm3                   \n"
112533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x40(%0),%0                     \n"
112633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm1,%%xmm0                   \n"
112733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm3,%%xmm2                   \n"
112833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x7,%%xmm0                     \n"
112933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x7,%%xmm2                     \n"
113033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm2,%%xmm0                   \n"
113133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddb     %%xmm5,%%xmm0                   \n"
113233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%2                        \n"
113333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm0,(%1)                     \n"
113433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
113533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
113633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_abgr),  // %0
113733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_y),     // %1
113833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)        // %2
113933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kABGRToY),   // %3
114033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kAddY16)     // %4
114133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
114233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
114333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
114433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
114533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
114633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
114733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
114833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ABGRToUVRow_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
114933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                       uint8* dst_u, uint8* dst_v, int width) {
115033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
115133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %0,%%xmm4                       \n"
115233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %1,%%xmm3                       \n"
115333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %2,%%xmm5                       \n"
115433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
115533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kABGRToU),         // %0
115633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kABGRToV),         // %1
115733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kAddUV128)         // %2
115833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
115933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
116033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %1,%2                           \n"
116133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
116233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
116333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
116433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
116533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x20(%0),%%xmm2                 \n"
116633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x30(%0),%%xmm6                 \n"
116733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     (%0,%4,1),%%xmm0                \n"
116833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     0x10(%0,%4,1),%%xmm1            \n"
116933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     0x20(%0,%4,1),%%xmm2            \n"
117033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     0x30(%0,%4,1),%%xmm6            \n"
117133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x40(%0),%0                     \n"
117233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm7                   \n"
117333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0x88,%%xmm1,%%xmm0             \n"
117433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
117533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm0                   \n"
117633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm7                   \n"
117733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0x88,%%xmm6,%%xmm2             \n"
117833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
117933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm2                   \n"
118033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
118133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm6                   \n"
118233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm0                   \n"
118333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm2                   \n"
118433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm3,%%xmm1                   \n"
118533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm3,%%xmm6                   \n"
118633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm2,%%xmm0                   \n"
118733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm6,%%xmm1                   \n"
118833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw     $0x8,%%xmm0                     \n"
118933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw     $0x8,%%xmm1                     \n"
119033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packsswb  %%xmm1,%%xmm0                   \n"
119133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddb     %%xmm5,%%xmm0                   \n"
119233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%3                        \n"
119333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movlps    %%xmm0,(%1)                     \n"
119433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movhps    %%xmm0,(%1,%2,1)                \n"
119533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x8(%1),%1                      \n"
119633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
119733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_abgr0),       // %0
119833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_u),           // %1
119933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_v),           // %2
120033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+rm"(width)           // %3
120133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "r"(static_cast<intptr_t>(src_stride_abgr))
120233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
120333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
120433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
120533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
120633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
120733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
120833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
120933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ABGRToUVRow_Unaligned_SSSE3(const uint8* src_abgr0, int src_stride_abgr,
121033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                 uint8* dst_u, uint8* dst_v, int width) {
121133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
121233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %0,%%xmm4                       \n"
121333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %1,%%xmm3                       \n"
121433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %2,%%xmm5                       \n"
121533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
121633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kABGRToU),         // %0
121733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kABGRToV),         // %1
121833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kAddUV128)         // %2
121933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
122033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
122133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %1,%2                           \n"
122233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
122333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
122433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm0                     \n"
122533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x10(%0),%%xmm1                 \n"
122633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x20(%0),%%xmm2                 \n"
122733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x30(%0),%%xmm6                 \n"
122833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0,%4,1),%%xmm7                \n"
122933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm0                   \n"
123033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x10(%0,%4,1),%%xmm7            \n"
123133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm1                   \n"
123233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x20(%0,%4,1),%%xmm7            \n"
123333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm2                   \n"
123433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x30(%0,%4,1),%%xmm7            \n"
123533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm6                   \n"
123633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x40(%0),%0                     \n"
123733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm7                   \n"
123833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0x88,%%xmm1,%%xmm0             \n"
123933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0xdd,%%xmm1,%%xmm7             \n"
124033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm0                   \n"
124133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm7                   \n"
124233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0x88,%%xmm6,%%xmm2             \n"
124333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shufps    $0xdd,%%xmm6,%%xmm7             \n"
124433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm7,%%xmm2                   \n"
124533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
124633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm6                   \n"
124733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm0                   \n"
124833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm2                   \n"
124933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm3,%%xmm1                   \n"
125033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm3,%%xmm6                   \n"
125133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm2,%%xmm0                   \n"
125233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm6,%%xmm1                   \n"
125333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw     $0x8,%%xmm0                     \n"
125433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw     $0x8,%%xmm1                     \n"
125533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packsswb  %%xmm1,%%xmm0                   \n"
125633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddb     %%xmm5,%%xmm0                   \n"
125733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%3                        \n"
125833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movlps    %%xmm0,(%1)                     \n"
125933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movhps    %%xmm0,(%1,%2,1)                \n"
126033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x8(%1),%1                      \n"
126133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
126233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_abgr0),       // %0
126333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_u),           // %1
126433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_v),           // %2
126533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+rm"(width)           // %3
126633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "r"(static_cast<intptr_t>(src_stride_abgr))
126733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
126833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
126933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm6", "xmm7"
127033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
127133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
127233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
127333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBTOYROW_SSSE3
127433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
127533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_I422TOARGBROW_SSSE3
// YUV to RGB coefficients in fixed point, scaled by 64.
// Negative values and compound expressions are parenthesized so the macros
// keep their value inside any larger expression.

// Contribution of U to blue, green and red.
#define UB 127 /* min(127, 2.018 * 64): 129 does not fit in an int8 */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

// Contribution of V to blue, green and red.
#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: the U and V coefficients of each channel, both multiplied by 128.
// NOTE(review): presumably this cancels the 128 offset carried by U and V;
// the code that consumes these values is outside this part of the file.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)

// Y scale factor.
#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */
129033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Constant table for the YUV to RGB inline assembly in this file.
// The asm receives the address of kUVToB and reaches every other field through
// a hard-coded byte displacement (the number in each trailing comment), so the
// order and size of the fields must not change without updating YUVTORGB and
// YVUTORGB. The kUV* rows hold coefficients for U,V-ordered chroma bytes; the
// kVU* rows hold the same coefficients swapped for V,U-ordered chroma bytes.
129133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstruct {
129233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  vec8 kUVToB;  // 0
129333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  vec8 kUVToG;  // 16
129433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  vec8 kUVToR;  // 32
129533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  vec16 kUVBiasB;  // 48
129633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  vec16 kUVBiasG;  // 64
129733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  vec16 kUVBiasR;  // 80
129833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  vec16 kYSub16;  // 96
129933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  vec16 kYToRgb;  // 112
130033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  vec8 kVUToB;  // 128
130133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  vec8 kVUToG;  // 144
130233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  vec8 kVUToR;  // 160
130333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} CONST SIMD_ALIGNED(kYuvConstants) = {
130433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  { UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB },
130533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  { UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG },
130633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  { UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR },
130733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  { BB, BB, BB, BB, BB, BB, BB, BB },
130833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  { BG, BG, BG, BG, BG, BG, BG, BG },
130933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  { BR, BR, BR, BR, BR, BR, BR, BR },
131033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  { 16, 16, 16, 16, 16, 16, 16, 16 },
131133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  { YG, YG, YG, YG, YG, YG, YG, YG },
131233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  { VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB },
131333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  { VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG },
131433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  { VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR }
131533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
131633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
131733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
131833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Read 8 UV from 444
// Loads 8 U bytes and 8 V bytes, advances u_buf by 8 and leaves the bytes
// interleaved as U0 V0 U1 V1 ... in the low half of xmm0. V is addressed as
// u_buf + v_buf, so the caller must already have replaced v_buf with the
// distance (v_buf - u_buf). Overwrites xmm1.
131933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define READYUV444                                                             \
132033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       (%[u_buf]),%%xmm0              \n"                             \
132133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
132233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea        0x8(%[u_buf]),%[u_buf]         \n"                             \
132333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
132433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
132533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Read 4 UV from 422, upsample to 8 UV
// Loads 4 U bytes and 4 V bytes, advances u_buf by 4, interleaves them into
// 4 UV pairs and then duplicates each 16-bit pair (punpcklwd with itself) so
// xmm0 ends up holding 8 UV pairs. V is addressed as u_buf + v_buf, so the
// caller must already have replaced v_buf with (v_buf - u_buf).
// Overwrites xmm1.
132633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define READYUV422                                                             \
132733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd       (%[u_buf]),%%xmm0              \n"                             \
132833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
132933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea        0x4(%[u_buf]),%[u_buf]         \n"                             \
133033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
133133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
133233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
133333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Read 2 UV from 411, upsample to 8 UV
// Interleaves U and V bytes, then duplicates 16-bit pairs (punpcklwd) and
// 32-bit groups (punpckldq) so that each of the first 2 UV pairs appears 4
// times in xmm0. V is addressed as u_buf + v_buf, so the caller must already
// have replaced v_buf with (v_buf - u_buf). Overwrites xmm1.
// NOTE(review): movd loads 4 bytes from each plane although only 2 are used
// and u_buf advances by 2, so the last iteration reads 2 bytes past the final
// chroma sample consumed - confirm the chroma rows are padded accordingly.
133433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define READYUV411                                                             \
133533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd       (%[u_buf]),%%xmm0              \n"                             \
133633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd       (%[u_buf],%[v_buf],1),%%xmm1   \n"                             \
133733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea        0x2(%[u_buf]),%[u_buf]         \n"                             \
133833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw  %%xmm1,%%xmm0                  \n"                             \
133933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
134033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckldq  %%xmm0,%%xmm0                  \n"                             \
134133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
134233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Read 4 UV from NV12, upsample to 8 UV
// Loads 8 bytes (4 already-interleaved chroma pairs) from uv_buf, advances
// uv_buf by 8 and duplicates each 16-bit pair so xmm0 holds 8 pairs.
// The byte order inside a pair is left as found, so the same macro is also
// used by NV21ToARGBRow_SSSE3 for V,U-ordered data.
134333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define READNV12                                                               \
134433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       (%[uv_buf]),%%xmm0             \n"                             \
134533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea        0x8(%[uv_buf]),%[uv_buf]       \n"                             \
134633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd  %%xmm0,%%xmm0                  \n"                             \
134733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
134833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Convert 8 pixels: 8 UV and 8 Y
// Inputs: xmm0 = 8 interleaved U,V byte pairs; xmm4 = zero (used to widen the
// Y bytes to 16 bits); %[kYuvConstants] = address of kYuvConstants.kUVToB.
// Steps: multiply-add the UV pairs with the B/G/R coefficient rows (offsets
// 0/16/32), subtract the per-channel bias (48/64/80), load 8 Y bytes and
// advance y_buf by 8, compute (Y - kYSub16) * kYToRgb (96/112), add that to
// each channel with signed saturation, shift right by 6 and pack to unsigned
// bytes with saturation.
// Outputs: low 8 bytes of xmm0 = B, xmm1 = G, xmm2 = R. Overwrites xmm3.
134933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define YUVTORGB                                                               \
135033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     %%xmm0,%%xmm1                  \n"                             \
135133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     %%xmm0,%%xmm2                  \n"                             \
135233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw  (%[kYuvConstants]),%%xmm0      \n"                             \
135333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw  16(%[kYuvConstants]),%%xmm1    \n"                             \
135433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw  32(%[kYuvConstants]),%%xmm2    \n"                             \
135533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubw      48(%[kYuvConstants]),%%xmm0    \n"                             \
135633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubw      64(%[kYuvConstants]),%%xmm1    \n"                             \
135733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubw      80(%[kYuvConstants]),%%xmm2    \n"                             \
135833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       (%[y_buf]),%%xmm3              \n"                             \
135933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea        0x8(%[y_buf]),%[y_buf]         \n"                             \
136033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
136133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubsw     96(%[kYuvConstants]),%%xmm3    \n"                             \
136233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmullw     112(%[kYuvConstants]),%%xmm3   \n"                             \
136333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddsw     %%xmm3,%%xmm0                  \n"                             \
136433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddsw     %%xmm3,%%xmm1                  \n"                             \
136533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddsw     %%xmm3,%%xmm2                  \n"                             \
136633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw      $0x6,%%xmm0                    \n"                             \
136733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw      $0x6,%%xmm1                    \n"                             \
136833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw      $0x6,%%xmm2                    \n"                             \
136933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb   %%xmm0,%%xmm0                  \n"                             \
137033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb   %%xmm1,%%xmm1                  \n"                             \
137133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb   %%xmm2,%%xmm2                  \n"                             \
137233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
137333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Convert 8 pixels: 8 VU and 8 Y
// Same sequence as YUVTORGB, except that the chroma multiply-add uses the
// swapped coefficient rows kVUToB/kVUToG/kVUToR (offsets 128/144/160) because
// xmm0 holds V,U-ordered byte pairs. Bias, luma handling, shift and packing
// use the same offsets as YUVTORGB.
// Inputs: xmm0 = 8 V,U byte pairs; xmm4 = zero; %[kYuvConstants] = address of
// kYuvConstants.kUVToB. Advances y_buf by 8.
// Outputs: low 8 bytes of xmm0 = B, xmm1 = G, xmm2 = R. Overwrites xmm3.
137433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define YVUTORGB                                                               \
137533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     %%xmm0,%%xmm1                  \n"                             \
137633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     %%xmm0,%%xmm2                  \n"                             \
137733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw  128(%[kYuvConstants]),%%xmm0   \n"                             \
137833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw  144(%[kYuvConstants]),%%xmm1   \n"                             \
137933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw  160(%[kYuvConstants]),%%xmm2   \n"                             \
138033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubw      48(%[kYuvConstants]),%%xmm0    \n"                             \
138133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubw      64(%[kYuvConstants]),%%xmm1    \n"                             \
138233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubw      80(%[kYuvConstants]),%%xmm2    \n"                             \
138333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq       (%[y_buf]),%%xmm3              \n"                             \
138433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea        0x8(%[y_buf]),%[y_buf]         \n"                             \
138533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw  %%xmm4,%%xmm3                  \n"                             \
138633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubsw     96(%[kYuvConstants]),%%xmm3    \n"                             \
138733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmullw     112(%[kYuvConstants]),%%xmm3   \n"                             \
138833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddsw     %%xmm3,%%xmm0                  \n"                             \
138933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddsw     %%xmm3,%%xmm1                  \n"                             \
139033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddsw     %%xmm3,%%xmm2                  \n"                             \
139133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw      $0x6,%%xmm0                    \n"                             \
139233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw      $0x6,%%xmm1                    \n"                             \
139333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw      $0x6,%%xmm2                    \n"                             \
139433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb   %%xmm0,%%xmm0                  \n"                             \
139533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb   %%xmm1,%%xmm1                  \n"                             \
139633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb   %%xmm2,%%xmm2                  \n"                             \
139733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of planar YUV with one U and one V sample per pixel into
// 4-byte pixels stored in memory as B, G, R, 0xFF. Handles 8 pixels per loop
// iteration.
//   y_buf, u_buf: source rows, each advanced by 8 bytes per iteration.
//   v_buf:        source V row; turned into an offset from u_buf on entry.
//   argb_buf:     destination; written with movdqa, so it must be 16-byte
//                 aligned. 32 bytes are stored per iteration.
//   width:        pixel count. The body runs once before the count is tested
//                 and repeats while (width -= 8) > 0, so a width that is not
//                 a multiple of 8 is processed as the next multiple of 8.
139833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid OMITFP I444ToARGBRow_SSSE3(const uint8* y_buf,
139933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                const uint8* u_buf,
140033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                const uint8* v_buf,
140133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                uint8* argb_buf,
140233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                int width) {
140333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // v_buf becomes (v_buf - u_buf) so READYUV444 can index V off u_buf.
140433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %[u_buf],%[v_buf]               \n"
    // xmm5 = all ones (source of the 0xFF alpha bytes); xmm4 = zero, which
    // YUVTORGB needs to widen Y bytes.
140533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
140633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm4                   \n"
140733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
140833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
140933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV444
141033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
    // Interleave B with G and R with 0xFF, then weave the 16-bit BG and RA
    // halves together: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
141133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm1,%%xmm0                   \n"
141233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm5,%%xmm2                   \n"
141333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
141433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm2,%%xmm0                   \n"
141533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm2,%%xmm1                   \n"
141633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%[argb_buf])            \n"
141733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
141833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
141933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%[width]                   \n"
142033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
142133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [y_buf]"+r"(y_buf),    // %[y_buf]
142233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [u_buf]"+r"(u_buf),    // %[u_buf]
142333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [v_buf]"+r"(v_buf),    // %[v_buf]
142433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
142533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [width]"+rm"(width)    // %[width]
142633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
142733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
  // NOTE(review): the xmm clobbers are only declared when __SSE2__ is
  // defined - presumably the compiler rejects them otherwise; confirm.
142833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
142933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
143033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
143133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
143233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
143333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of planar YUV with one U and one V sample per 2 pixels
// into 4-byte pixels stored in memory as B, G, R, 0xFF. Handles 8 pixels per
// loop iteration.
//   y_buf:    source Y row, advanced by 8 bytes per iteration.
//   u_buf:    source U row, advanced by 4 bytes per iteration.
//   v_buf:    source V row; turned into an offset from u_buf on entry.
//   argb_buf: destination; written with movdqa, so it must be 16-byte
//             aligned. 32 bytes are stored per iteration.
//   width:    pixel count. The body runs once before the count is tested and
//             repeats while (width -= 8) > 0, so a width that is not a
//             multiple of 8 is processed as the next multiple of 8.
143433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid OMITFP I422ToARGBRow_SSSE3(const uint8* y_buf,
143533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                const uint8* u_buf,
143633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                const uint8* v_buf,
143733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                uint8* argb_buf,
143833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                int width) {
143933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // v_buf becomes (v_buf - u_buf) so READYUV422 can index V off u_buf.
144033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %[u_buf],%[v_buf]               \n"
    // xmm5 = all ones (source of the 0xFF alpha bytes); xmm4 = zero, which
    // YUVTORGB needs to widen Y bytes.
144133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
144233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm4                   \n"
144333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
144433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
144533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV422
144633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
    // Interleave B with G and R with 0xFF, then weave the 16-bit BG and RA
    // halves together: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
144733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm1,%%xmm0                   \n"
144833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm5,%%xmm2                   \n"
144933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
145033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm2,%%xmm0                   \n"
145133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm2,%%xmm1                   \n"
145233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%[argb_buf])            \n"
145333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
145433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
145533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%[width]                   \n"
145633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
145733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [y_buf]"+r"(y_buf),    // %[y_buf]
145833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [u_buf]"+r"(u_buf),    // %[u_buf]
145933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [v_buf]"+r"(v_buf),    // %[v_buf]
146033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
146133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [width]"+rm"(width)    // %[width]
146233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
146333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
  // NOTE(review): the xmm clobbers are only declared when __SSE2__ is
  // defined - presumably the compiler rejects them otherwise; confirm.
146433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
146533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
146633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
146733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
146833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
146933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of planar YUV with one U and one V sample per 4 pixels
// into 4-byte pixels stored in memory as B, G, R, 0xFF. Handles 8 pixels per
// loop iteration.
//   y_buf:    source Y row, advanced by 8 bytes per iteration.
//   u_buf:    source U row, advanced by 2 bytes per iteration (READYUV411
//             nevertheless loads 4 bytes from each chroma row).
//   v_buf:    source V row; turned into an offset from u_buf on entry.
//   argb_buf: destination; written with movdqa, so it must be 16-byte
//             aligned. 32 bytes are stored per iteration.
//   width:    pixel count. The body runs once before the count is tested and
//             repeats while (width -= 8) > 0, so a width that is not a
//             multiple of 8 is processed as the next multiple of 8.
147033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid OMITFP I411ToARGBRow_SSSE3(const uint8* y_buf,
147133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                const uint8* u_buf,
147233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                const uint8* v_buf,
147333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                uint8* argb_buf,
147433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                int width) {
147533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // v_buf becomes (v_buf - u_buf) so READYUV411 can index V off u_buf.
147633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %[u_buf],%[v_buf]               \n"
    // xmm5 = all ones (source of the 0xFF alpha bytes); xmm4 = zero, which
    // YUVTORGB needs to widen Y bytes.
147733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
147833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm4                   \n"
147933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
148033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
148133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV411
148233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
    // Interleave B with G and R with 0xFF, then weave the 16-bit BG and RA
    // halves together: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
148333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm1,%%xmm0                   \n"
148433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm5,%%xmm2                   \n"
148533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
148633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm2,%%xmm0                   \n"
148733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm2,%%xmm1                   \n"
148833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%[argb_buf])            \n"
148933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
149033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
149133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%[width]                   \n"
149233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
149333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [y_buf]"+r"(y_buf),    // %[y_buf]
149433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [u_buf]"+r"(u_buf),    // %[u_buf]
149533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [v_buf]"+r"(v_buf),    // %[v_buf]
149633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
149733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [width]"+rm"(width)    // %[width]
149833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
149933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
  // NOTE(review): the xmm clobbers are only declared when __SSE2__ is
  // defined - presumably the compiler rejects them otherwise; confirm.
150033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
150133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
150233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
150333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
150433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
150533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of Y plus interleaved U,V chroma (one pair per 2 pixels)
// into 4-byte pixels stored in memory as B, G, R, 0xFF. Handles 8 pixels per
// loop iteration.
//   y_buf:    source Y row, advanced by 8 bytes per iteration.
//   uv_buf:   source interleaved chroma row, advanced by 8 bytes per
//             iteration.
//   argb_buf: destination; written with movdqa, so it must be 16-byte
//             aligned. 32 bytes are stored per iteration.
//   width:    pixel count. The body runs once before the count is tested and
//             repeats while (width -= 8) > 0, so a width that is not a
//             multiple of 8 is processed as the next multiple of 8.
150633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid OMITFP NV12ToARGBRow_SSSE3(const uint8* y_buf,
150733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                const uint8* uv_buf,
150833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                uint8* argb_buf,
150933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                int width) {
151033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // xmm5 = all ones (source of the 0xFF alpha bytes); xmm4 = zero, which
    // YUVTORGB needs to widen Y bytes.
151133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
151233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm4                   \n"
151333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
151433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
151533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READNV12
151633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
    // Interleave B with G and R with 0xFF, then weave the 16-bit BG and RA
    // halves together: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
151733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm1,%%xmm0                   \n"
151833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm5,%%xmm2                   \n"
151933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
152033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm2,%%xmm0                   \n"
152133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm2,%%xmm1                   \n"
152233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%[argb_buf])            \n"
152333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
152433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
152533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%[width]                   \n"
152633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
152733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [y_buf]"+r"(y_buf),    // %[y_buf]
152833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
152933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
153033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [width]"+rm"(width)    // %[width]
153133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
153233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
  // NOTE(review): the xmm clobbers are only declared when __SSE2__ is
  // defined - presumably the compiler rejects them otherwise; confirm.
153333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
153433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
153533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
153633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
153733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
153833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of Y plus interleaved V,U chroma (one pair per 2 pixels)
// into 4-byte pixels stored in memory as B, G, R, 0xFF. Handles 8 pixels per
// loop iteration. Identical to NV12ToARGBRow_SSSE3 except that YVUTORGB is
// used, which applies the V,U-ordered coefficient rows.
//   y_buf:    source Y row, advanced by 8 bytes per iteration.
//   vu_buf:   source interleaved chroma row, advanced by 8 bytes per
//             iteration.
//   argb_buf: destination; written with movdqa, so it must be 16-byte
//             aligned. 32 bytes are stored per iteration.
//   width:    pixel count. The body runs once before the count is tested and
//             repeats while (width -= 8) > 0, so a width that is not a
//             multiple of 8 is processed as the next multiple of 8.
153933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid OMITFP NV21ToARGBRow_SSSE3(const uint8* y_buf,
154033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                const uint8* vu_buf,
154133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                uint8* argb_buf,
154233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                int width) {
154333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // xmm5 = all ones (source of the 0xFF alpha bytes); xmm4 = zero, which
    // YVUTORGB needs to widen Y bytes.
154433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
154533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm4                   \n"
154633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
154733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
154833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READNV12
154933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YVUTORGB
    // Interleave B with G and R with 0xFF, then weave the 16-bit BG and RA
    // halves together: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
155033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm1,%%xmm0                   \n"
155133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm5,%%xmm2                   \n"
155233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
155333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm2,%%xmm0                   \n"
155433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm2,%%xmm1                   \n"
155533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%[argb_buf])            \n"
155633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
155733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
155833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%[width]                   \n"
155933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
156033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [y_buf]"+r"(y_buf),    // %[y_buf]
  // The operand keeps the name uv_buf because READNV12 refers to %[uv_buf];
  // it is bound to vu_buf here.
156133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [uv_buf]"+r"(vu_buf),    // %[uv_buf]
156233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
156333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [width]"+rm"(width)    // %[width]
156433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
156533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
  // NOTE(review): the xmm clobbers are only declared when __SSE2__ is
  // defined - presumably the compiler rejects them otherwise; confirm.
156633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
156733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
156833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
156933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
157033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
157133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Same conversion as I444ToARGBRow_SSSE3 (one U and one V sample per pixel,
// output stored as B, G, R, 0xFF, 8 pixels per loop iteration), but the
// destination is written with movdqu, so argb_buf does not have to be
// 16-byte aligned.
//   y_buf, u_buf: source rows, each advanced by 8 bytes per iteration.
//   v_buf:        source V row; turned into an offset from u_buf on entry.
//   argb_buf:     destination; 32 bytes are stored per iteration.
//   width:        pixel count. The body runs once before the count is tested
//                 and repeats while (width -= 8) > 0, so a width that is not
//                 a multiple of 8 is processed as the next multiple of 8.
157233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid OMITFP I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
157333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          const uint8* u_buf,
157433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          const uint8* v_buf,
157533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          uint8* argb_buf,
157633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          int width) {
157733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // v_buf becomes (v_buf - u_buf) so READYUV444 can index V off u_buf.
157833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %[u_buf],%[v_buf]               \n"
    // xmm5 = all ones (source of the 0xFF alpha bytes); xmm4 = zero, which
    // YUVTORGB needs to widen Y bytes.
157933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
158033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm4                   \n"
158133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
158233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
158333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV444
158433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
    // Interleave B with G and R with 0xFF, then weave the 16-bit BG and RA
    // halves together: xmm0 = pixels 0-3, xmm1 = pixels 4-7.
158533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm1,%%xmm0                   \n"
158633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm5,%%xmm2                   \n"
158733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
158833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm2,%%xmm0                   \n"
158933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm2,%%xmm1                   \n"
    // Unaligned stores: the only difference from the aligned variant.
159033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm0,(%[argb_buf])            \n"
159133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
159233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
159333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%[width]                   \n"
159433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
159533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [y_buf]"+r"(y_buf),    // %[y_buf]
159633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [u_buf]"+r"(u_buf),    // %[u_buf]
159733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [v_buf]"+r"(v_buf),    // %[v_buf]
159833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
159933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [width]"+rm"(width)    // %[width]
160033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
160133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
  // NOTE(review): the xmm clobbers are only declared when __SSE2__ is
  // defined - presumably the compiler rejects them otherwise; confirm.
160233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
160333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
160433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
160533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
160633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
160733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of planar Y/U/V input to 4-byte pixels with SSSE3 inline
// asm, using unaligned (movdqu) stores. Sample loading is delegated to
// READYUV422 and the colour math to YUVTORGB, both defined outside this
// function.
// NOTE(review): presumably YUVTORGB leaves B/G/R in xmm0/xmm1/xmm2 - confirm.
//   y_buf, u_buf, v_buf - source planes (read-write asm operands).
//   argb_buf            - destination; 32 bytes (8 pixels) stored per pass.
//   width               - pixel count; reduced by 8 on every pass.
// The loop is test-at-bottom and always stores 32 bytes, so output is written
// in whole 8-pixel groups regardless of width.
160833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid OMITFP I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
160933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          const uint8* u_buf,
161033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          const uint8* v_buf,
161133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          uint8* argb_buf,
161233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          int width) {
161333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // Turn v_buf into an offset relative to u_buf.
161433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %[u_buf],%[v_buf]               \n"
    // xmm5 = 0xff in every byte (fourth byte of each output pixel).
161533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    // xmm4 = 0; presumably used inside the macros - confirm.
161633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm4                   \n"
161733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
161833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
161933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV422
162033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
    // Weave the three channel registers plus 0xff into pixels laid out as
    // [xmm0, xmm1, xmm2, 0xff]; xmm0 gets pixels 0-3, xmm1 gets pixels 4-7.
162133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm1,%%xmm0                   \n"
162233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm5,%%xmm2                   \n"
162333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
162433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm2,%%xmm0                   \n"
162533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm2,%%xmm1                   \n"
    // Unaligned stores: no alignment requirement on argb_buf.
162633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm0,(%[argb_buf])            \n"
162733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
    // Next 8 pixels; loop while the remaining width is positive.
162833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
162933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%[width]                   \n"
163033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
163133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [y_buf]"+r"(y_buf),    // %[y_buf]
163233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [u_buf]"+r"(u_buf),    // %[u_buf]
163333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [v_buf]"+r"(v_buf),    // %[v_buf]
163433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
163533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [width]"+rm"(width)    // %[width]
163633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
163733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
163833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
163933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
164033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
164133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
164233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
164333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of planar Y/U/V input to 4-byte pixels with SSSE3 inline
// asm, using unaligned (movdqu) stores. It differs from the other planar
// *_Unaligned_SSSE3 ARGB row functions only in the load macro, READYUV411;
// that macro and YUVTORGB are defined outside this function.
// NOTE(review): presumably YUVTORGB leaves B/G/R in xmm0/xmm1/xmm2 - confirm.
//   y_buf, u_buf, v_buf - source planes (read-write asm operands).
//   argb_buf            - destination; 32 bytes (8 pixels) stored per pass.
//   width               - pixel count; reduced by 8 on every pass.
// At least one pass always executes, and each pass stores a full 32 bytes.
164433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid OMITFP I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
164533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          const uint8* u_buf,
164633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          const uint8* v_buf,
164733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          uint8* argb_buf,
164833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          int width) {
164933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // v_buf becomes (v_buf - u_buf), i.e. an offset from the U pointer.
165033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %[u_buf],%[v_buf]               \n"
    // xmm5 = all ones, used as the constant 0xff byte of each pixel.
165133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    // xmm4 = 0; presumably used inside the macros - confirm.
165233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm4                   \n"
165333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
165433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
165533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV411
165633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
    // Byte then word interleave produces pixels as [xmm0, xmm1, xmm2, 0xff];
    // pixels 0-3 land in xmm0 and pixels 4-7 in xmm1.
165733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm1,%%xmm0                   \n"
165833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm5,%%xmm2                   \n"
165933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
166033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm2,%%xmm0                   \n"
166133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm2,%%xmm1                   \n"
    // Unaligned 16-byte stores.
166233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm0,(%[argb_buf])            \n"
166333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
    // Step output by 32 bytes and width by 8; continue while width > 0.
166433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
166533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%[width]                   \n"
166633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
166733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [y_buf]"+r"(y_buf),    // %[y_buf]
166833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [u_buf]"+r"(u_buf),    // %[u_buf]
166933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [v_buf]"+r"(v_buf),    // %[v_buf]
167033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
167133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [width]"+rm"(width)    // %[width]
167233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
167333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
167433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
167533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
167633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
167733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
167833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
167933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of Y plus a single combined chroma buffer to 4-byte pixels
// with SSSE3 inline asm, using unaligned (movdqu) stores. READNV12 loads the
// samples and YUVTORGB does the colour math; both macros are defined outside
// this function.
// NOTE(review): presumably uv_buf holds interleaved U,V bytes and YUVTORGB
// leaves B/G/R in xmm0/xmm1/xmm2 - confirm against the macros.
//   y_buf, uv_buf - source buffers (read-write asm operands).
//   argb_buf      - destination; 32 bytes (8 pixels) stored per pass.
//   width         - pixel count; reduced by 8 on every pass.
// The loop tests width after the body and always stores 32 bytes per pass.
168033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid OMITFP NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
168133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          const uint8* uv_buf,
168233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          uint8* argb_buf,
168333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          int width) {
168433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // xmm5 = all ones (the 0xff byte of each pixel).
168533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    // xmm4 = 0; presumably used inside the macros - confirm.
168633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm4                   \n"
168733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
168833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
168933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READNV12
169033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
    // Interleave into pixels laid out as [xmm0, xmm1, xmm2, 0xff];
    // xmm0 receives pixels 0-3 and xmm1 pixels 4-7.
169133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm1,%%xmm0                   \n"
169233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm5,%%xmm2                   \n"
169333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
169433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm2,%%xmm0                   \n"
169533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm2,%%xmm1                   \n"
    // Unaligned 16-byte stores.
169633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm0,(%[argb_buf])            \n"
169733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
    // Advance output 32 bytes, consume 8 pixels, loop while width > 0.
169833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
169933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%[width]                   \n"
170033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
170133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [y_buf]"+r"(y_buf),    // %[y_buf]
170233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [uv_buf]"+r"(uv_buf),    // %[uv_buf]
170333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
170433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [width]"+rm"(width)    // %[width]
170533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
170633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
170733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
170833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
170933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
171033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
171133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
171233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of Y plus a single combined chroma buffer to 4-byte pixels
// with SSSE3 inline asm, using unaligned (movdqu) stores. The load macro is
// the same READNV12 that the NV12 row function uses; the colour math uses
// YVUTORGB instead of YUVTORGB. Both macros are defined outside this function.
// NOTE(review): presumably vu_buf holds interleaved V,U bytes and YVUTORGB
// compensates for the swapped chroma order while still leaving B/G/R in
// xmm0/xmm1/xmm2 - confirm against the macros.
//   y_buf, vu_buf - source buffers (read-write asm operands).
//   argb_buf      - destination; 32 bytes (8 pixels) stored per pass.
//   width         - pixel count; reduced by 8 on every pass.
// The loop tests width after the body and always stores 32 bytes per pass.
171333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid OMITFP NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
171433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          const uint8* vu_buf,
171533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          uint8* argb_buf,
171633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          int width) {
171733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // xmm5 = all ones (the 0xff byte of each pixel).
171833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    // xmm4 = 0; presumably used inside the macros - confirm.
171933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm4                   \n"
172033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
172133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
172233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READNV12
172333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YVUTORGB
    // Interleave into pixels laid out as [xmm0, xmm1, xmm2, 0xff];
    // xmm0 receives pixels 0-3 and xmm1 pixels 4-7.
172433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm1,%%xmm0                   \n"
172533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm5,%%xmm2                   \n"
172633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
172733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm2,%%xmm0                   \n"
172833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm2,%%xmm1                   \n"
    // Unaligned 16-byte stores.
172933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm0,(%[argb_buf])            \n"
173033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
    // Advance output 32 bytes, consume 8 pixels, loop while width > 0.
173133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
173233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%[width]                   \n"
173333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
173433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [y_buf]"+r"(y_buf),    // %[y_buf]
    // vu_buf is deliberately bound to the operand name uv_buf, the same name
    // the NV12 row function uses; presumably READNV12 refers to %[uv_buf].
173533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [uv_buf]"+r"(vu_buf),    // %[uv_buf]
173633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [argb_buf]"+r"(argb_buf),  // %[argb_buf]
173733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [width]"+rm"(width)    // %[width]
173833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
173933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
174033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
174133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
174233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
174333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
174433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
174533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of planar Y/U/V input to 4-byte pixels with SSSE3 inline
// asm, using aligned (movdqa) stores: bgra_buf must be 16-byte aligned or the
// stores fault. READYUV422 and YUVTORGB are defined outside this function.
// Each output pixel is laid out in memory as [0xff, xmm2, xmm1, xmm0].
// NOTE(review): presumably YUVTORGB leaves B/G/R in xmm0/xmm1/xmm2, which
// would make the byte order A,R,G,B - confirm against the macro.
//   y_buf, u_buf, v_buf - source planes (read-write asm operands).
//   bgra_buf            - destination (asm operand name is argb_buf);
//                         32 bytes (8 pixels) stored per pass.
//   width               - pixel count; reduced by 8 on every pass.
// The loop tests width after the body and always stores 32 bytes per pass.
174633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid OMITFP I422ToBGRARow_SSSE3(const uint8* y_buf,
174733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                const uint8* u_buf,
174833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                const uint8* v_buf,
174933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                uint8* bgra_buf,
175033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                int width) {
175133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // Turn v_buf into an offset relative to u_buf.
175233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %[u_buf],%[v_buf]               \n"
175333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    // xmm4 = 0; presumably used inside the macros - confirm.
175433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm4                   \n"
175533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
175633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
175733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV422
175833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
    // xmm5 is a destination in the interleave below, so it is reset to all
    // ones on every pass.
175933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    // Byte interleave: xmm1 <- (xmm1,xmm0) pairs, xmm5 <- (0xff,xmm2) pairs.
176033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm0,%%xmm1                   \n"
176133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm2,%%xmm5                   \n"
    // Word interleave: pixels 0-3 end up in xmm5, pixels 4-7 in xmm0.
176233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm5,%%xmm0                   \n"
176333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm1,%%xmm5                   \n"
176433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm1,%%xmm0                   \n"
    // Aligned stores: destination must be 16-byte aligned.
176533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm5,(%[argb_buf])            \n"
176633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,0x10(%[argb_buf])        \n"
    // Advance output 32 bytes, consume 8 pixels, loop while width > 0.
176733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
176833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%[width]                   \n"
176933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
177033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [y_buf]"+r"(y_buf),    // %[y_buf]
177133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [u_buf]"+r"(u_buf),    // %[u_buf]
177233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [v_buf]"+r"(v_buf),    // %[v_buf]
177333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [argb_buf]"+r"(bgra_buf),  // %[argb_buf]
177433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [width]"+rm"(width)    // %[width]
177533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
177633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
177733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
177833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
177933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
178033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
178133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
178233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of planar Y/U/V input to 4-byte pixels with SSSE3 inline
// asm, using aligned (movdqa) stores: abgr_buf must be 16-byte aligned or the
// stores fault. READYUV422 and YUVTORGB are defined outside this function.
// Each output pixel is laid out in memory as [xmm2, xmm1, xmm0, 0xff].
// NOTE(review): presumably YUVTORGB leaves B/G/R in xmm0/xmm1/xmm2, which
// would make the byte order R,G,B,A - confirm against the macro.
//   y_buf, u_buf, v_buf - source planes (read-write asm operands).
//   abgr_buf            - destination (asm operand name is argb_buf);
//                         32 bytes (8 pixels) stored per pass.
//   width               - pixel count; reduced by 8 on every pass.
// The loop tests width after the body and always stores 32 bytes per pass.
178333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid OMITFP I422ToABGRRow_SSSE3(const uint8* y_buf,
178433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                const uint8* u_buf,
178533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                const uint8* v_buf,
178633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                uint8* abgr_buf,
178733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                int width) {
178833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // Turn v_buf into an offset relative to u_buf.
178933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %[u_buf],%[v_buf]               \n"
    // xmm5 = all ones; only read (never written) inside the loop body below.
179033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    // xmm4 = 0; presumably used inside the macros - confirm.
179133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm4                   \n"
179233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
179333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
179433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV422
179533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
    // Byte interleave: xmm2 <- (xmm2,xmm1) pairs, xmm0 <- (xmm0,0xff) pairs.
179633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm1,%%xmm2                   \n"
179733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm5,%%xmm0                   \n"
    // Word interleave: pixels 0-3 end up in xmm2, pixels 4-7 in xmm1.
179833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm1                   \n"
179933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm0,%%xmm2                   \n"
180033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm0,%%xmm1                   \n"
    // Aligned stores: destination must be 16-byte aligned.
180133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,(%[argb_buf])            \n"
180233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,0x10(%[argb_buf])        \n"
    // Advance output 32 bytes, consume 8 pixels, loop while width > 0.
180333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
180433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%[width]                   \n"
180533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
180633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [y_buf]"+r"(y_buf),    // %[y_buf]
180733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [u_buf]"+r"(u_buf),    // %[u_buf]
180833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [v_buf]"+r"(v_buf),    // %[v_buf]
180933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [argb_buf]"+r"(abgr_buf),  // %[argb_buf]
181033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [width]"+rm"(width)    // %[width]
181133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
181233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
181333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
181433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
181533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
181633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
181733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
181833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of planar Y/U/V input to 4-byte pixels with SSSE3 inline
// asm, using unaligned (movdqu) stores, so bgra_buf needs no particular
// alignment. READYUV422 and YUVTORGB are defined outside this function.
// Each output pixel is laid out in memory as [0xff, xmm2, xmm1, xmm0].
// NOTE(review): presumably YUVTORGB leaves B/G/R in xmm0/xmm1/xmm2, which
// would make the byte order A,R,G,B - confirm against the macro.
//   y_buf, u_buf, v_buf - source planes (read-write asm operands).
//   bgra_buf            - destination (asm operand name is argb_buf);
//                         32 bytes (8 pixels) stored per pass.
//   width               - pixel count; reduced by 8 on every pass.
// The loop tests width after the body and always stores 32 bytes per pass.
181933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid OMITFP I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
182033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          const uint8* u_buf,
182133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          const uint8* v_buf,
182233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          uint8* bgra_buf,
182333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          int width) {
182433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // Turn v_buf into an offset relative to u_buf.
182533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %[u_buf],%[v_buf]               \n"
182633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    // xmm4 = 0; presumably used inside the macros - confirm.
182733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm4                   \n"
182833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
182933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
183033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV422
183133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
    // xmm5 is a destination in the interleave below, so it is reset to all
    // ones on every pass.
183233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    // Byte interleave: xmm1 <- (xmm1,xmm0) pairs, xmm5 <- (0xff,xmm2) pairs.
183333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm0,%%xmm1                   \n"
183433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm2,%%xmm5                   \n"
    // Word interleave: pixels 0-3 end up in xmm5, pixels 4-7 in xmm0.
183533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm5,%%xmm0                   \n"
183633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm1,%%xmm5                   \n"
183733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm1,%%xmm0                   \n"
    // Unaligned 16-byte stores.
183833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm5,(%[argb_buf])            \n"
183933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm0,0x10(%[argb_buf])        \n"
    // Advance output 32 bytes, consume 8 pixels, loop while width > 0.
184033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
184133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%[width]                   \n"
184233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
184333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [y_buf]"+r"(y_buf),    // %[y_buf]
184433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [u_buf]"+r"(u_buf),    // %[u_buf]
184533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [v_buf]"+r"(v_buf),    // %[v_buf]
184633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [argb_buf]"+r"(bgra_buf),  // %[argb_buf]
184733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [width]"+rm"(width)    // %[width]
184833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
184933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
185033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
185133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
185233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
185333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
185433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
185533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of planar Y/U/V input to 4-byte pixels with SSSE3 inline
// asm, using unaligned (movdqu) stores, so abgr_buf needs no particular
// alignment. READYUV422 and YUVTORGB are defined outside this function.
// Each output pixel is laid out in memory as [xmm2, xmm1, xmm0, 0xff].
// NOTE(review): presumably YUVTORGB leaves B/G/R in xmm0/xmm1/xmm2, which
// would make the byte order R,G,B,A - confirm against the macro.
//   y_buf, u_buf, v_buf - source planes (read-write asm operands).
//   abgr_buf            - destination (asm operand name is argb_buf);
//                         32 bytes (8 pixels) stored per pass.
//   width               - pixel count; reduced by 8 on every pass.
// The loop tests width after the body and always stores 32 bytes per pass.
185633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid OMITFP I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
185733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          const uint8* u_buf,
185833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          const uint8* v_buf,
185933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          uint8* abgr_buf,
186033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                          int width) {
186133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // Turn v_buf into an offset relative to u_buf.
186233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %[u_buf],%[v_buf]               \n"
    // xmm5 = all ones; only read (never written) inside the loop body below.
186333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
    // xmm4 = 0; presumably used inside the macros - confirm.
186433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm4                   \n"
186533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
186633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
186733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV422
186833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
    // Byte interleave: xmm2 <- (xmm2,xmm1) pairs, xmm0 <- (xmm0,0xff) pairs.
186933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm1,%%xmm2                   \n"
187033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm5,%%xmm0                   \n"
    // Word interleave: pixels 0-3 end up in xmm2, pixels 4-7 in xmm1.
187133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm1                   \n"
187233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm0,%%xmm2                   \n"
187333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm0,%%xmm1                   \n"
    // Unaligned 16-byte stores.
187433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm2,(%[argb_buf])            \n"
187533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm1,0x10(%[argb_buf])        \n"
    // Advance output 32 bytes, consume 8 pixels, loop while width > 0.
187633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%[argb_buf]),%[argb_buf]   \n"
187733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%[width]                   \n"
187833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
187933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [y_buf]"+r"(y_buf),    // %[y_buf]
188033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [u_buf]"+r"(u_buf),    // %[u_buf]
188133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [v_buf]"+r"(v_buf),    // %[v_buf]
188233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [argb_buf]"+r"(abgr_buf),  // %[argb_buf]
188333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    [width]"+rm"(width)    // %[width]
188433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : [kYuvConstants]"r"(&kYuvConstants.kUVToB) // %[kYuvConstants]
188533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
188633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
188733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
188833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
188933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
189033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
189133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_I422TOARGBROW_SSSE3
189233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
189333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_YTOARGBROW_SSE2
189433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YToARGBRow_SSE2(const uint8* y_buf,
189533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                     uint8* rgb_buf,
189633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                     int width) {
189733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
189833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm4,%%xmm4                   \n"
189933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslld     $0x18,%%xmm4                    \n"
190033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov       $0x10001000,%%eax               \n"
190133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%eax,%%xmm3                    \n"
190233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
190333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov       $0x012a012a,%%eax               \n"
190433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%eax,%%xmm2                    \n"
190533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
190633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
190733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
190833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
190933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      (%0),%%xmm0                     \n"
191033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x8(%0),%0                      \n"
191133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm0,%%xmm0                   \n"
191233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubusw   %%xmm3,%%xmm0                   \n"
191333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulhuw   %%xmm2,%%xmm0                   \n"
191433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm0,%%xmm0                   \n"
191533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
191633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Step 2: Weave into ARGB
191733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm0,%%xmm0                   \n"
191833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
191933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm0,%%xmm0                   \n"
192033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm1,%%xmm1                   \n"
192133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm4,%%xmm0                   \n"
192233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm4,%%xmm1                   \n"
192333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%1)                     \n"
192433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,16(%1)                   \n"
192533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       32(%1),%1                       \n"
192633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
192733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%2                         \n"
192833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
19297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  : "+r"(y_buf),    // %0
19307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    "+r"(rgb_buf),  // %1
193133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+rm"(width)    // %2
193233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
193333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc", "eax"
193433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
193533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
193633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
193733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
193833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
193933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_YTOARGBROW_SSE2
19407cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
194133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_MIRRORROW_SSSE3
194233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for reversing the bytes.
194333cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec8 kShuffleMirror = {
194433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
194533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
194633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
194733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
194833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  intptr_t temp_width = static_cast<intptr_t>(width);
194933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
195033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %3,%%xmm5                       \n"
195133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       -0x10(%0),%0                    \n"
195233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
195333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
195433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0,%2),%%xmm0                  \n"
195533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm5,%%xmm0                   \n"
195633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%2                        \n"
195733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%1)                     \n"
195833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
195933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
196033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src),  // %0
196133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst),  // %1
196233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(temp_width)  // %2
196333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kShuffleMirror) // %3
196433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
196533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
196633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm5"
196733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
196833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
19697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
197033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_MIRRORROW_SSSE3
19717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
197233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_MIRRORROW_SSE2
197333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
197433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  intptr_t temp_width = static_cast<intptr_t>(width);
197533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
197633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       -0x10(%0),%0                    \n"
197733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
197833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
197933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0,%2),%%xmm0                  \n"
198033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
198133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psllw     $0x8,%%xmm0                     \n"
198233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
198333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm1,%%xmm0                   \n"
198433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshuflw   $0x1b,%%xmm0,%%xmm0             \n"
198533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufhw   $0x1b,%%xmm0,%%xmm0             \n"
198633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x4e,%%xmm0,%%xmm0             \n"
198733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%2                        \n"
198833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm0,(%1)                     \n"
198933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
199033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
199133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src),  // %0
199233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst),  // %1
199333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(temp_width)  // %2
199433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
199533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
199633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
199733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1"
199833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
199933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
200033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
200133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_MIRRORROW_SSE2
200233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
200333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_MIRRORROW_UV_SSSE3
200433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for reversing the bytes of UV channels.
200533cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec8 kShuffleMirrorUV = {
200633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
200733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
200833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
200933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                       int width) {
201033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  intptr_t temp_width = static_cast<intptr_t>(width);
201133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
201233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %4,%%xmm1                       \n"
201333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       -16(%0,%3,2),%0                 \n"
201433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %1,%2                           \n"
201533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
201633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
201733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
201833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       -16(%0),%0                      \n"
201933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm1,%%xmm0                   \n"
202033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $8,%3                           \n"
202133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movlpd    %%xmm0,(%1)                     \n"
202233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movhpd    %%xmm0,(%1,%2)                  \n"
202333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       8(%1),%1                        \n"
202433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
202533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src),      // %0
202633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_u),    // %1
202733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_v),    // %2
202833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(temp_width)  // %3
202933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kShuffleMirrorUV)  // %4
203033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
203133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
203233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1"
203333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
203433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
203533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
203633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_MIRRORROW_UV_SSSE3
203733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
203833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBMIRRORROW_SSSE3
203933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for reversing the bytes.
204033cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec8 kARGBShuffleMirror = {
204133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
204233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
204333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
204433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
204533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  intptr_t temp_width = static_cast<intptr_t>(width);
204633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
204733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %3,%%xmm5                       \n"
204833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       -0x10(%0),%0                    \n"
204933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
205033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
205133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0,%2,4),%%xmm0                \n"
205233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm5,%%xmm0                   \n"
205333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%2                         \n"
205433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%1)                     \n"
205533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
205633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
205733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src),  // %0
205833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst),  // %1
205933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(temp_width)  // %2
206033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kARGBShuffleMirror)  // %3
206133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
206233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
206333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm5"
206433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
206533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
206633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
206733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBMIRRORROW_SSSE3
206833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
206933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_SPLITUV_SSE2
207033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
207133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
207233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb    %%xmm5,%%xmm5                    \n"
207333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw      $0x8,%%xmm5                      \n"
207433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub        %1,%2                            \n"
207533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
207633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                            \n"
207733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     (%0),%%xmm0                      \n"
207833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     0x10(%0),%%xmm1                  \n"
207933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea        0x20(%0),%0                      \n"
208033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     %%xmm0,%%xmm2                    \n"
208133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     %%xmm1,%%xmm3                    \n"
208233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand       %%xmm5,%%xmm0                    \n"
208333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand       %%xmm5,%%xmm1                    \n"
208433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb   %%xmm1,%%xmm0                    \n"
208533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw      $0x8,%%xmm2                      \n"
208633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw      $0x8,%%xmm3                      \n"
208733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb   %%xmm3,%%xmm2                    \n"
208833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     %%xmm0,(%1)                      \n"
208933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa     %%xmm2,(%1,%2)                   \n"
209033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea        0x10(%1),%1                      \n"
209133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub        $0x10,%3                         \n"
209233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg         1b                               \n"
209333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_uv),     // %0
209433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_u),      // %1
209533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_v),      // %2
209633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)         // %3
209733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
209833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
209933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
210033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
210133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
210233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
210333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
210433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_SPLITUV_SSE2
210533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
210633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_COPYROW_SSE2
210733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
210833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
210933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub        %0,%1                          \n"
211033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
211133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
211233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
211333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
211433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%0,%1)                  \n"
211533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,0x10(%0,%1)              \n"
211633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%0),%0                     \n"
211733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x20,%2                        \n"
211833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
211933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src),   // %0
212033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst),   // %1
212133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(count)  // %2
212233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
212333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
212433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
212533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1"
212633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
212733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
212833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
212933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_COPYROW_SSE2
213033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
213133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_COPYROW_X86
213233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid CopyRow_X86(const uint8* src, uint8* dst, int width) {
213333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  size_t width_tmp = static_cast<size_t>(width);
213433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
213533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shr       $0x2,%2                         \n"
213633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "rep movsl                                 \n"
213733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+S"(src),  // %0
213833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+D"(dst),  // %1
213933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+c"(width_tmp) // %2
214033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
214133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
214233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
214333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
214433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_COPYROW_X86
214533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
214633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_SETROW_X86
214733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid SetRow8_X86(uint8* dst, uint32 v32, int width) {
214833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  size_t width_tmp = static_cast<size_t>(width);
214933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
215033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shr       $0x2,%1                         \n"
215133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "rep stosl                                 \n"
215233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "+D"(dst),       // %0
215333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "+c"(width_tmp)  // %1
215433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "a"(v32)         // %2
215533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    : "memory", "cc");
215633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
215733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
215833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid SetRows32_X86(uint8* dst, uint32 v32, int width,
215933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                   int dst_stride, int height) {
216033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  for (int y = 0; y < height; ++y) {
216133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    size_t width_tmp = static_cast<size_t>(width);
216233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    uint32* d = reinterpret_cast<uint32*>(dst);
216333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    asm volatile (
216433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      "rep stosl                               \n"
216533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      : "+D"(d),         // %0
216633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp        "+c"(width_tmp)  // %1
216733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      : "a"(v32)         // %2
216833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp      : "memory", "cc");
216933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    dst += dst_stride;
21707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
21717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
217233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_SETROW_X86
217333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
217433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_YUY2TOYROW_SSE2
217533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix) {
217633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
217733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
217833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm5                     \n"
217933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
218033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
218133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
218233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
218333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%0),%0                     \n"
218433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm0                   \n"
218533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm1                   \n"
218633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm0                   \n"
218733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%1)                     \n"
218833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
218933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%2                        \n"
219033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
219133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_yuy2),  // %0
219233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_y),     // %1
219333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)        // %2
219433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
219533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
219633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
219733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm5"
219833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
219933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
220033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
220133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
220233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
220333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                      uint8* dst_u, uint8* dst_v, int pix) {
220433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
220533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
220633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm5                     \n"
220733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %1,%2                           \n"
220833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
220933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
221033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
221133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
221233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0,%4,1),%%xmm2                \n"
221333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0,%4,1),%%xmm3            \n"
221433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%0),%0                     \n"
221533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm2,%%xmm0                   \n"
221633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm3,%%xmm1                   \n"
221733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm0                     \n"
221833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
221933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm0                   \n"
222033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
222133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm0                   \n"
222233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm0,%%xmm0                   \n"
222333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
222433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm1                   \n"
222533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm0,(%1)                     \n"
222633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm1,(%1,%2)                  \n"
222733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x8(%1),%1                      \n"
222833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%3                        \n"
222933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
223033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_yuy2),    // %0
223133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_u),       // %1
223233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_v),       // %2
223333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)          // %3
223433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
223533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
223633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
223733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
223833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
223933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
224033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
224133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Splits the chroma bytes of a single packed row into separate dst_u and
// dst_v rows. Only src_yuy2 is read (no second row, no averaging).
// Each loop pass loads 32 source bytes with movdqa (which faults unless the
// address is 16-byte aligned), stores 8 bytes to dst_u and 8 bytes to dst_v,
// and subtracts 16 from pix. The body runs before pix is tested, so it always
// executes at least once and continues while pix remains positive.
// NOTE(review): byte order presumed Y0 U Y1 V from the YUY2 name - the code
// treats the odd bytes as interleaved chroma; confirm against callers.
224233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
224333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         uint8* dst_u, uint8* dst_v, int pix) {
224433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // xmm5 = 0x00ff in every 16-bit lane (all ones, then shifted right by 8).
224533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
224633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm5                     \n"
    // Turn dst_v into an offset from dst_u so that advancing %1 alone moves
    // both destinations; (%1,%2) then addresses the V output.
224733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %1,%2                           \n"
224833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
224933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
225033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
225133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
225233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%0),%0                     \n"
    // Keep the high (odd) byte of every word, then pack the 16 words into 16
    // bytes. Values are <= 0xff, so packuswb never saturates.
225333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm0                     \n"
225433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
225533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm0                   \n"
    // Split the packed bytes: even bytes (mask) go to dst_u, odd bytes
    // (shift) go to dst_v.
225633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
225733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm0                   \n"
225833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm0,%%xmm0                   \n"
225933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
226033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm1                   \n"
226133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm0,(%1)                     \n"
226233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm1,(%1,%2)                  \n"
226333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x8(%1),%1                      \n"
    // pix -= 16; loop again while the result is still positive.
226433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%3                        \n"
226533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
226633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_yuy2),    // %0
226733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_u),       // %1
226833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_v),       // %2
226933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)          // %3
227033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
227133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
227233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
227333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm5"
227433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
227533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
227633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
227733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Copies the even-numbered bytes of src_yuy2 into dst_y (the luma bytes,
// presuming the Y0 U Y1 V order implied by the YUY2 name).
// Loads and stores use movdqu, so neither pointer needs 16-byte alignment.
// Each loop pass reads 32 source bytes, writes 16 bytes to dst_y and subtracts
// 16 from pix; the body runs at least once and repeats while pix stays
// positive.
227833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
227933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                               uint8* dst_y, int pix) {
228033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // xmm5 = 0x00ff in every 16-bit lane.
228133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
228233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm5                     \n"
228333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
228433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
228533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm0                     \n"
228633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x10(%0),%%xmm1                 \n"
228733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%0),%0                     \n"
    // Keep the low (even) byte of each word and pack 16 words into 16 bytes.
228833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm0                   \n"
228933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm1                   \n"
229033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm0                   \n"
    // The store and lea that follow do not modify EFLAGS, so the jg below
    // still tests the result of this subtraction.
229133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%2                        \n"
229233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm0,(%1)                     \n"
229333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
229433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
229533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_yuy2),  // %0
229633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_y),     // %1
229733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)        // %2
229833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
229933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
230033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
230133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm5"
230233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
230333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
230433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
230533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Produces separate dst_u and dst_v rows from two packed source rows: the row
// at src_yuy2 and the row stride_yuy2 bytes after it. The two rows are
// averaged byte-wise with pavgb (rounded average) before the odd bytes are
// extracted as chroma.
// Loads use movdqu, so the source rows need not be 16-byte aligned.
// Each loop pass reads 32 bytes from each row, stores 8 bytes to dst_u and
// 8 bytes to dst_v, and subtracts 16 from pix; the body runs at least once and
// repeats while pix stays positive.
// NOTE(review): byte order presumed Y0 U Y1 V from the YUY2 name.
230633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2,
230733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                int stride_yuy2,
230833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                uint8* dst_u, uint8* dst_v, int pix) {
230933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // xmm5 = 0x00ff in every 16-bit lane.
231033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
231133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm5                     \n"
    // dst_v becomes an offset from dst_u; (%1,%2) addresses the V output.
231233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %1,%2                           \n"
231333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
231433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
231533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm0                     \n"
231633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x10(%0),%%xmm1                 \n"
    // Same 32 bytes of the second row, located stride bytes further on.
231733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0,%4,1),%%xmm2                \n"
231833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x10(%0,%4,1),%%xmm3            \n"
231933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%0),%0                     \n"
    // Rounded per-byte average of the two rows.
232033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm2,%%xmm0                   \n"
232133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm3,%%xmm1                   \n"
    // Keep the high (odd) byte of every word and pack to 16 bytes.
232233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm0                     \n"
232333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
232433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm0                   \n"
    // Even packed bytes go to dst_u, odd packed bytes go to dst_v.
232533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
232633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm0                   \n"
232733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm0,%%xmm0                   \n"
232833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
232933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm1                   \n"
233033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm0,(%1)                     \n"
233133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm1,(%1,%2)                  \n"
233233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x8(%1),%1                      \n"
    // pix -= 16; loop again while the result is still positive.
233333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%3                        \n"
233433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
233533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_yuy2),    // %0
233633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_u),       // %1
233733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_v),       // %2
233833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)          // %3
  // Stride is cast to intptr_t so it can be used as the index register in the
  // (%0,%4,1) addressing above.
233933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "r"(static_cast<intptr_t>(stride_yuy2))  // %4
234033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
234133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
234233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
234333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
234433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
234533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
234633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Splits the chroma bytes of a single packed row into separate dst_u and
// dst_v rows. Only src_yuy2 is read (no second row, no averaging).
// Loads use movdqu, so src_yuy2 need not be 16-byte aligned.
// Each loop pass reads 32 source bytes, stores 8 bytes to dst_u and 8 bytes to
// dst_v, and subtracts 16 from pix; the body runs at least once and repeats
// while pix stays positive.
// NOTE(review): byte order presumed Y0 U Y1 V from the YUY2 name.
234733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
234833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   uint8* dst_u, uint8* dst_v, int pix) {
234933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // xmm5 = 0x00ff in every 16-bit lane.
235033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
235133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm5                     \n"
    // dst_v becomes an offset from dst_u; (%1,%2) addresses the V output.
235233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %1,%2                           \n"
235333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
235433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
235533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm0                     \n"
235633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x10(%0),%%xmm1                 \n"
235733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%0),%0                     \n"
    // Keep the high (odd) byte of every word and pack to 16 bytes.
235833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm0                     \n"
235933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
236033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm0                   \n"
    // Even packed bytes go to dst_u, odd packed bytes go to dst_v.
236133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
236233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm0                   \n"
236333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm0,%%xmm0                   \n"
236433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
236533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm1                   \n"
236633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm0,(%1)                     \n"
236733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm1,(%1,%2)                  \n"
236833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x8(%1),%1                      \n"
    // pix -= 16; loop again while the result is still positive.
236933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%3                        \n"
237033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
237133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_yuy2),    // %0
237233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_u),       // %1
237333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_v),       // %2
237433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)          // %3
237533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
237633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
237733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
237833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm5"
237933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
238033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
238133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
238233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Copies the odd-numbered bytes of src_uyvy into dst_y (the luma bytes,
// presuming the U Y0 V Y1 order implied by the UYVY name).
// Both the loads and the store use movdqa, which faults unless the address is
// 16-byte aligned, so src_uyvy and dst_y must be aligned accordingly.
// Each loop pass reads 32 source bytes, writes 16 bytes to dst_y and subtracts
// 16 from pix; the body runs at least once and repeats while pix stays
// positive. No mask register is needed because a shift alone isolates the
// high byte of each word.
238333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid UYVYToYRow_SSE2(const uint8* src_uyvy, uint8* dst_y, int pix) {
238433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
238533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
238633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
238733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
238833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
238933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%0),%0                     \n"
    // Keep the high (odd) byte of every word and pack 16 words into 16 bytes.
239033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm0                     \n"
239133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
239233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm0                   \n"
    // The store and lea that follow do not modify EFLAGS, so the jg below
    // still tests the result of this subtraction.
239333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%2                        \n"
239433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%1)                     \n"
239533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
239633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
239733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_uyvy),  // %0
239833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_y),     // %1
239933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)        // %2
240033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
240133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
240233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
240333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1"
240433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
240533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
240633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
240733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Produces separate dst_u and dst_v rows from two packed source rows: the row
// at src_uyvy and the row stride_uyvy bytes after it. The two rows are
// averaged byte-wise with pavgb (rounded average) before the even bytes are
// extracted as chroma.
// All four loads use movdqa, which faults unless the address is 16-byte
// aligned, so both src_uyvy and src_uyvy + stride_uyvy must be aligned.
// Each loop pass reads 32 bytes from each row, stores 8 bytes to dst_u and
// 8 bytes to dst_v, and subtracts 16 from pix; the body runs at least once and
// repeats while pix stays positive.
// NOTE(review): byte order presumed U Y0 V Y1 from the UYVY name.
240833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
240933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                      uint8* dst_u, uint8* dst_v, int pix) {
241033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // xmm5 = 0x00ff in every 16-bit lane.
241133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
241233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm5                     \n"
    // dst_v becomes an offset from dst_u; (%1,%2) addresses the V output.
241333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %1,%2                           \n"
241433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
241533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
241633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
241733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
    // Same 32 bytes of the second row, located stride bytes further on.
241833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0,%4,1),%%xmm2                \n"
241933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0,%4,1),%%xmm3            \n"
242033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%0),%0                     \n"
    // Rounded per-byte average of the two rows.
242133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm2,%%xmm0                   \n"
242233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm3,%%xmm1                   \n"
    // Keep the low (even) byte of every word and pack to 16 bytes.
242333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm0                   \n"
242433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm1                   \n"
242533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm0                   \n"
    // Even packed bytes go to dst_u, odd packed bytes go to dst_v.
242633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
242733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm0                   \n"
242833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm0,%%xmm0                   \n"
242933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
243033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm1                   \n"
243133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm0,(%1)                     \n"
243233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm1,(%1,%2)                  \n"
243333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x8(%1),%1                      \n"
    // pix -= 16; loop again while the result is still positive.
243433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%3                        \n"
243533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
243633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_uyvy),    // %0
243733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_u),       // %1
243833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_v),       // %2
243933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)          // %3
  // Stride is cast to intptr_t so it can be used as the index register in the
  // (%0,%4,1) addressing above.
244033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
244133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
244233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
244333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
244433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
244533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
244633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
244733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Splits the chroma bytes of a single packed row into separate dst_u and
// dst_v rows. Only src_uyvy is read (no second row, no averaging).
// Loads use movdqa, which faults unless the address is 16-byte aligned, so
// src_uyvy must be aligned accordingly.
// Each loop pass reads 32 source bytes, stores 8 bytes to dst_u and 8 bytes to
// dst_v, and subtracts 16 from pix; the body runs at least once and repeats
// while pix stays positive.
// NOTE(review): byte order presumed U Y0 V Y1 from the UYVY name.
244833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid UYVYToUV422Row_SSE2(const uint8* src_uyvy,
244933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         uint8* dst_u, uint8* dst_v, int pix) {
245033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // xmm5 = 0x00ff in every 16-bit lane.
245133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
245233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm5                     \n"
    // dst_v becomes an offset from dst_u; (%1,%2) addresses the V output.
245333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %1,%2                           \n"
245433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
245533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
245633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
245733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
245833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%0),%0                     \n"
    // Keep the low (even) byte of every word and pack to 16 bytes.
245933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm0                   \n"
246033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm1                   \n"
246133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm0                   \n"
    // Even packed bytes go to dst_u, odd packed bytes go to dst_v.
246233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
246333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm0                   \n"
246433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm0,%%xmm0                   \n"
246533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
246633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm1                   \n"
246733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm0,(%1)                     \n"
246833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm1,(%1,%2)                  \n"
246933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x8(%1),%1                      \n"
    // pix -= 16; loop again while the result is still positive.
247033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%3                        \n"
247133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
247233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_uyvy),    // %0
247333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_u),       // %1
247433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_v),       // %2
247533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)          // %3
247633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
247733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
247833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
247933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm5"
248033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
248133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
248233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
248333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Copies the odd-numbered bytes of src_uyvy into dst_y (the luma bytes,
// presuming the U Y0 V Y1 order implied by the UYVY name).
// Loads and stores use movdqu, so neither pointer needs 16-byte alignment.
// Each loop pass reads 32 source bytes, writes 16 bytes to dst_y and subtracts
// 16 from pix; the body runs at least once and repeats while pix stays
// positive.
248433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
248533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                               uint8* dst_y, int pix) {
248633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
248733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
248833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
248933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm0                     \n"
249033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x10(%0),%%xmm1                 \n"
249133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%0),%0                     \n"
    // Keep the high (odd) byte of every word and pack 16 words into 16 bytes.
249233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm0                     \n"
249333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
249433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm0                   \n"
    // The store and lea that follow do not modify EFLAGS, so the jg below
    // still tests the result of this subtraction.
249533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%2                        \n"
249633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm0,(%1)                     \n"
249733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
249833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
249933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_uyvy),  // %0
250033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_y),     // %1
250133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)        // %2
250233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
250333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
250433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
250533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1"
250633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
250733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
250833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
250933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Produces separate dst_u and dst_v rows from two packed source rows: the row
// at src_uyvy and the row stride_uyvy bytes after it. The two rows are
// averaged byte-wise with pavgb (rounded average) before the even bytes are
// extracted as chroma.
// Loads use movdqu, so the source rows need not be 16-byte aligned.
// Each loop pass reads 32 bytes from each row, stores 8 bytes to dst_u and
// 8 bytes to dst_v, and subtracts 16 from pix; the body runs at least once and
// repeats while pix stays positive.
// NOTE(review): byte order presumed U Y0 V Y1 from the UYVY name.
251033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
251133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                uint8* dst_u, uint8* dst_v, int pix) {
251233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // xmm5 = 0x00ff in every 16-bit lane.
251333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
251433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm5                     \n"
    // dst_v becomes an offset from dst_u; (%1,%2) addresses the V output.
251533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %1,%2                           \n"
251633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
251733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
251833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm0                     \n"
251933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x10(%0),%%xmm1                 \n"
    // Same 32 bytes of the second row, located stride bytes further on.
252033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0,%4,1),%%xmm2                \n"
252133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x10(%0,%4,1),%%xmm3            \n"
252233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%0),%0                     \n"
    // Rounded per-byte average of the two rows.
252333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm2,%%xmm0                   \n"
252433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     %%xmm3,%%xmm1                   \n"
    // Keep the low (even) byte of every word and pack to 16 bytes.
252533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm0                   \n"
252633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm1                   \n"
252733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm0                   \n"
    // Even packed bytes go to dst_u, odd packed bytes go to dst_v.
252833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
252933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm0                   \n"
253033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm0,%%xmm0                   \n"
253133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
253233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm1                   \n"
253333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm0,(%1)                     \n"
253433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm1,(%1,%2)                  \n"
253533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x8(%1),%1                      \n"
    // pix -= 16; loop again while the result is still positive.
253633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%3                        \n"
253733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
253833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_uyvy),    // %0
253933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_u),       // %1
254033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_v),       // %2
254133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)          // %3
  // Stride is cast to intptr_t so it can be used as the index register in the
  // (%0,%4,1) addressing above.
254233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "r"(static_cast<intptr_t>(stride_uyvy))  // %4
254333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
254433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
254533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
254633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
254733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
254833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
254933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Splits the chroma bytes of a single packed row into separate dst_u and
// dst_v rows. Only src_uyvy is read (no second row, no averaging).
// Loads use movdqu, so src_uyvy need not be 16-byte aligned.
// Each loop pass reads 32 source bytes, stores 8 bytes to dst_u and 8 bytes to
// dst_v, and subtracts 16 from pix; the body runs at least once and repeats
// while pix stays positive.
// NOTE(review): byte order presumed U Y0 V Y1 from the UYVY name.
255033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
255133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   uint8* dst_u, uint8* dst_v, int pix) {
255233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // xmm5 = 0x00ff in every 16-bit lane.
255333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
255433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm5                     \n"
    // dst_v becomes an offset from dst_u; (%1,%2) addresses the V output.
255533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %1,%2                           \n"
255633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
255733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
255833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm0                     \n"
255933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    0x10(%0),%%xmm1                 \n"
256033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%0),%0                     \n"
    // Keep the low (even) byte of every word and pack to 16 bytes.
256133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm0                   \n"
256233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm1                   \n"
256333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm0                   \n"
    // Even packed bytes go to dst_u, odd packed bytes go to dst_v.
256433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
256533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm0                   \n"
256633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm0,%%xmm0                   \n"
256733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
256833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm1                   \n"
256933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm0,(%1)                     \n"
257033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm1,(%1,%2)                  \n"
257133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x8(%1),%1                      \n"
    // pix -= 16; loop again while the result is still positive.
257233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x10,%3                        \n"
257333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
257433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_uyvy),    // %0
257533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_u),       // %1
257633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_v),       // %2
257733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(pix)          // %3
257833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
257933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
258033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
258133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm5"
258233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
258333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
258433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
258533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_YUY2TOYROW_SSE2
258633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
258733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBBLENDROW_SSE2
258833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Blend 4 pixels at a time in the main loop, 1 at a time at the edges.
// Each pixel is 4 bytes; byte 3 (the top byte of the 32-bit value) is used as
// alpha. For every byte the code computes
//   dst = saturate(src_argb0 + ((src_argb1 * (256 - alpha0)) >> 8))
// where alpha0 is the alpha byte of the src_argb0 pixel. src_argb0 is added
// without being scaled by its own alpha, and its alpha byte is forced to 0xff
// before the add, so the stored alpha byte is always 0xff.
// dst_argb may be unaligned: pixels are handled singly until dst_argb is
// 16-byte aligned, then in groups of 4 with aligned stores (the two sources
// are always read with unaligned loads), then singly for the tail.
// Inside the asm, %3 holds a biased count (remaining pixels minus 1, or minus
// 4 around the 4 pixel loop) so the flags from sub/add drive the branches.
// Nothing is read or written when width <= 0.
258933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
259033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                       uint8* dst_argb, int width) {
259133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // Constants: xmm7 = 0x0001 per 16-bit lane, xmm6 = 0x00ff per lane,
    // xmm5 = 0xff00 per lane, xmm4 = 0xff000000 per 32-bit lane (alpha mask).
259233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm7,%%xmm7                   \n"
259333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0xf,%%xmm7                     \n"
259433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm6,%%xmm6                   \n"
259533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm6                     \n"
259633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
259733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psllw     $0x8,%%xmm5                     \n"
259833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm4,%%xmm4                   \n"
259933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslld     $0x18,%%xmm4                    \n"
    // %3 = width - 1. Exactly one pixel goes straight to the tail loop;
    // zero or fewer pixels skips everything.
260033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x1,%3                         \n"
260133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "je        91f                             \n"
260233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jl        99f                             \n"
260333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
260433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 1 pixel loop until destination pointer is aligned.
260533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "10:                                         \n"
260633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "test      $0xf,%2                         \n"
260733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "je        19f                             \n"
    // xmm3 = src_argb0 pixel; xmm0 keeps a copy for the final add.
260833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      (%0),%%xmm3                     \n"
260933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x4(%0),%0                      \n"
261033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm3,%%xmm0                   \n"
    // Invert the alpha byte: alpha -> 255 - alpha.
261133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm3                   \n"
261233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      (%1),%%xmm2                     \n"
    // Bring 255 - alpha down to the low byte of its lane, then copy it into
    // both 16-bit lanes of the pixel (0xf5 selects words 1,1,3,3).
261333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm3                     \n"
261433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
261533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
    // xmm2 = bytes 0 and 2 of the src_argb1 pixel, one per 16-bit lane.
261633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm6,%%xmm2                   \n"
    // xmm3 = 256 - alpha in every lane; scale bytes 0 and 2 by it.
261733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddw     %%xmm7,%%xmm3                   \n"
261833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmullw    %%xmm3,%%xmm2                   \n"
    // xmm1 = bytes 1 and 3 of the src_argb1 pixel, one per 16-bit lane.
261933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      (%1),%%xmm1                     \n"
262033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x4(%1),%1                      \n"
262133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
    // Force the alpha byte of the src_argb0 copy to 0xff.
262233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm4,%%xmm0                   \n"
262333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmullw    %%xmm3,%%xmm1                   \n"
    // Divide the products by 256: bytes 0/2 are shifted back into the low
    // byte, bytes 1/3 are left in the high byte by the 0xff00 mask. Both are
    // then added to src_argb0 with unsigned byte saturation.
262433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm2                     \n"
262533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddusb   %%xmm2,%%xmm0                   \n"
262633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm1                   \n"
262733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddusb   %%xmm1,%%xmm0                   \n"
    // The flags set by sub are consumed by jge below; the movd store and the
    // lea in between do not modify them.
262833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x1,%3                         \n"
262933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%xmm0,(%2)                     \n"
263033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x4(%2),%2                      \n"
263133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jge       10b                             \n"
263233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
263333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "19:                                         \n"
    // Re-bias the counter from (remaining - 1) to (remaining - 4); skip the
    // 4 pixel loop when fewer than 4 pixels remain.
263433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add       $1-4,%3                         \n"
263533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jl        49f                             \n"
26367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
263733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 4 pixel loop.
    // Same arithmetic as the 1 pixel loop on 16 bytes at once. Sources are
    // loaded with movdqu; the store uses movdqa because dst_argb is aligned
    // by the time this loop is reached.
263833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  2                               \n"
263933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "41:                                         \n"
264033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm3                     \n"
264133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
264233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm3,%%xmm0                   \n"
264333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm3                   \n"
264433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%1),%%xmm2                     \n"
264533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm3                     \n"
264633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
264733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
264833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm6,%%xmm2                   \n"
264933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddw     %%xmm7,%%xmm3                   \n"
265033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmullw    %%xmm3,%%xmm2                   \n"
265133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%1),%%xmm1                     \n"
265233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
265333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
265433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm4,%%xmm0                   \n"
265533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmullw    %%xmm3,%%xmm1                   \n"
265633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm2                     \n"
265733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddusb   %%xmm2,%%xmm0                   \n"
265833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm1                   \n"
265933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddusb   %%xmm1,%%xmm0                   \n"
266033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%3                         \n"
266133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%2)                     \n"
266233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%2),%2                     \n"
266333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jge       41b                             \n"
266433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
266533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "49:                                         \n"
    // Re-bias the counter back to (remaining - 1); done if nothing is left.
266633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add       $0x3,%3                         \n"
266733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jl        99f                             \n"
266833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
266933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 1 pixel loop.
    // Handles the 0 to 3 pixels left after the 4 pixel loop, and the
    // width == 1 case that jumps here directly from the top.
267033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "91:                                         \n"
267133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      (%0),%%xmm3                     \n"
267233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x4(%0),%0                      \n"
267333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm3,%%xmm0                   \n"
267433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm3                   \n"
267533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      (%1),%%xmm2                     \n"
267633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm3                     \n"
267733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufhw   $0xf5,%%xmm3,%%xmm3             \n"
267833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshuflw   $0xf5,%%xmm3,%%xmm3             \n"
267933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm6,%%xmm2                   \n"
268033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddw     %%xmm7,%%xmm3                   \n"
268133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmullw    %%xmm3,%%xmm2                   \n"
268233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      (%1),%%xmm1                     \n"
268333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x4(%1),%1                      \n"
268433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
268533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm4,%%xmm0                   \n"
268633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmullw    %%xmm3,%%xmm1                   \n"
268733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm2                     \n"
268833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddusb   %%xmm2,%%xmm0                   \n"
268933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm1                   \n"
269033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddusb   %%xmm1,%%xmm0                   \n"
269133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x1,%3                         \n"
269233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%xmm0,(%2)                     \n"
269333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x4(%2),%2                      \n"
269433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jge       91b                             \n"
269533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "99:                                         \n"
269633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_argb0),    // %0
269733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(src_argb1),    // %1
269833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_argb),     // %2
269933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(width)         // %3
270033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
270133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
  // The xmm registers are only named as clobbers when __SSE2__ is defined.
270233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
270333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
270433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
270533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
270633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
270733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBBLENDROW_SSE2
270833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
270933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBBLENDROW_SSSE3
271033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for isolating alpha.
// Used as the pshufb control operand in ARGBBlendRow_SSSE3. Source byte 3 of
// each 4-byte pixel (indices 3, 7, 11, 15) is copied into the low byte of two
// adjacent 16-bit lanes. The 0x80 entries have the high bit set, which makes
// pshufb write zero, so every lane ends up holding that byte zero-extended.
271133cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec8 kShuffleAlpha = {
271233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
271333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
271433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
271533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
271633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Blend 4 pixels at a time in the main loops, 1 at a time at the edges.
271733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Uses the kShuffleAlpha shuffle table above to isolate the alpha bytes.
271833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
271933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Same as SSE2, but replaces
272033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp//    psrlw      xmm3, 8          // alpha
272133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
272233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp//    pshuflw    xmm3, xmm3,0F5h
272333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// with..
272433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp//    pshufb     xmm3, kShuffleAlpha // alpha
272533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// SSSE3 blend of one row of 4-byte pixels. Byte 3 of each pixel (the top byte
// of the 32-bit value) is used as alpha. For every byte the code computes
//   dst = saturate(src_argb0 + ((src_argb1 * (256 - alpha0)) >> 8))
// where alpha0 is the alpha byte of the src_argb0 pixel. src_argb0 is added
// without being scaled by its own alpha, and its alpha byte is forced to 0xff
// before the add, so the stored alpha byte is always 0xff.
// dst_argb may be unaligned: pixels are handled singly until dst_argb is
// 16-byte aligned, then in groups of 4, then singly for the tail. The 4 pixel
// stage has two variants: aligned loads when both sources are 16-byte aligned
// at that point, unaligned loads otherwise; both use aligned stores.
// Inside the asm, %3 holds a biased count (remaining pixels minus 1, or minus
// 4 around the 4 pixel loops) so the flags from sub/add drive the branches.
// Nothing is read or written when width <= 0.
272633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
272733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                        uint8* dst_argb, int width) {
272833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // Constants: xmm7 = 0x0001 per 16-bit lane, xmm6 = 0x00ff per lane,
    // xmm5 = 0xff00 per lane, xmm4 = 0xff000000 per 32-bit lane (alpha mask).
272933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm7,%%xmm7                   \n"
273033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0xf,%%xmm7                     \n"
273133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm6,%%xmm6                   \n"
273233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm6                     \n"
273333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
273433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psllw     $0x8,%%xmm5                     \n"
273533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm4,%%xmm4                   \n"
273633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslld     $0x18,%%xmm4                    \n"
    // %3 = width - 1. Exactly one pixel goes straight to the tail loop;
    // zero or fewer pixels skips everything.
273733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x1,%3                         \n"
273833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "je        91f                             \n"
273933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jl        99f                             \n"
274033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
274133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 1 pixel loop until destination pointer is aligned.
274233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "10:                                         \n"
274333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "test      $0xf,%2                         \n"
274433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "je        19f                             \n"
    // xmm3 = src_argb0 pixel; xmm0 keeps a copy for the final add.
274533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      (%0),%%xmm3                     \n"
274633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x4(%0),%0                      \n"
274733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm3,%%xmm0                   \n"
    // Invert the alpha byte: alpha -> 255 - alpha.
274833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm3                   \n"
274933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      (%1),%%xmm2                     \n"
    // One shuffle with kShuffleAlpha (%4) puts 255 - alpha, zero-extended,
    // into both 16-bit lanes of the pixel.
275033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %4,%%xmm3                       \n"
    // xmm2 = bytes 0 and 2 of the src_argb1 pixel, one per 16-bit lane.
275133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm6,%%xmm2                   \n"
    // xmm3 = 256 - alpha in every lane; scale bytes 0 and 2 by it.
275233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddw     %%xmm7,%%xmm3                   \n"
275333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmullw    %%xmm3,%%xmm2                   \n"
    // xmm1 = bytes 1 and 3 of the src_argb1 pixel, one per 16-bit lane.
275433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      (%1),%%xmm1                     \n"
275533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x4(%1),%1                      \n"
275633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
    // Force the alpha byte of the src_argb0 copy to 0xff.
275733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm4,%%xmm0                   \n"
275833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmullw    %%xmm3,%%xmm1                   \n"
    // Divide the products by 256: bytes 0/2 are shifted back into the low
    // byte, bytes 1/3 are left in the high byte by the 0xff00 mask. Both are
    // then added to src_argb0 with unsigned byte saturation.
275933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm2                     \n"
276033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddusb   %%xmm2,%%xmm0                   \n"
276133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm1                   \n"
276233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddusb   %%xmm1,%%xmm0                   \n"
    // The flags set by sub are consumed by jge below; the movd store and the
    // lea in between do not modify them.
276333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x1,%3                         \n"
276433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%xmm0,(%2)                     \n"
276533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x4(%2),%2                      \n"
276633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jge       10b                             \n"
276733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
276833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "19:                                         \n"
    // Re-bias the counter from (remaining - 1) to (remaining - 4); skip the
    // 4 pixel loops when fewer than 4 pixels remain.
276933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add       $1-4,%3                         \n"
277033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jl        49f                             \n"
    // Take the unaligned-load loop if either source pointer is not 16-byte
    // aligned; otherwise fall through to the aligned-load loop.
277133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "test      $0xf,%0                         \n"
277233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jne       41f                             \n"
277333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "test      $0xf,%1                         \n"
277433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jne       41f                             \n"
277533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
277633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 4 pixel loop.
    // Aligned variant: both sources are read with movdqa. The arithmetic is
    // the same as in the 1 pixel loop, applied to 16 bytes at once.
277733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  2                               \n"
277833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "40:                                         \n"
277933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm3                     \n"
278033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
278133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm3,%%xmm0                   \n"
278233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm3                   \n"
278333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%1),%%xmm2                     \n"
278433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %4,%%xmm3                       \n"
278533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm6,%%xmm2                   \n"
278633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddw     %%xmm7,%%xmm3                   \n"
278733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmullw    %%xmm3,%%xmm2                   \n"
278833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%1),%%xmm1                     \n"
278933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
279033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
279133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm4,%%xmm0                   \n"
279233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmullw    %%xmm3,%%xmm1                   \n"
279333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm2                     \n"
279433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddusb   %%xmm2,%%xmm0                   \n"
279533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm1                   \n"
279633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddusb   %%xmm1,%%xmm0                   \n"
279733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%3                         \n"
279833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%2)                     \n"
279933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%2),%2                     \n"
280033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jge       40b                             \n"
    // Skip over the unaligned variant to the tail handling.
280133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jmp       49f                             \n"
280233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
280333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 4 pixel unaligned loop.
    // Identical to the loop above except that the sources are read with
    // movdqu; the store is still movdqa because dst_argb is aligned here.
280433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  2                               \n"
280533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "41:                                         \n"
280633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm3                     \n"
280733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
280833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm3,%%xmm0                   \n"
280933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm3                   \n"
281033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%1),%%xmm2                     \n"
281133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %4,%%xmm3                       \n"
281233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm6,%%xmm2                   \n"
281333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddw     %%xmm7,%%xmm3                   \n"
281433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmullw    %%xmm3,%%xmm2                   \n"
281533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%1),%%xmm1                     \n"
281633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
281733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
281833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm4,%%xmm0                   \n"
281933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmullw    %%xmm3,%%xmm1                   \n"
282033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm2                     \n"
282133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddusb   %%xmm2,%%xmm0                   \n"
282233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm1                   \n"
282333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddusb   %%xmm1,%%xmm0                   \n"
282433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%3                         \n"
282533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%2)                     \n"
282633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%2),%2                     \n"
282733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jge       41b                             \n"
282833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
282933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "49:                                         \n"
    // Re-bias the counter back to (remaining - 1); done if nothing is left.
283033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add       $0x3,%3                         \n"
283133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jl        99f                             \n"
283233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
283333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 1 pixel loop.
    // Handles the 0 to 3 pixels left after the 4 pixel loops, and the
    // width == 1 case that jumps here directly from the top.
283433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "91:                                         \n"
283533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      (%0),%%xmm3                     \n"
283633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x4(%0),%0                      \n"
283733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm3,%%xmm0                   \n"
283833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm4,%%xmm3                   \n"
283933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      (%1),%%xmm2                     \n"
284033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %4,%%xmm3                       \n"
284133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm6,%%xmm2                   \n"
284233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddw     %%xmm7,%%xmm3                   \n"
284333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmullw    %%xmm3,%%xmm2                   \n"
284433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      (%1),%%xmm1                     \n"
284533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x4(%1),%1                      \n"
284633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
284733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm4,%%xmm0                   \n"
284833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmullw    %%xmm3,%%xmm1                   \n"
284933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm2                     \n"
285033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddusb   %%xmm2,%%xmm0                   \n"
285133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm1                   \n"
285233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddusb   %%xmm1,%%xmm0                   \n"
285333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x1,%3                         \n"
285433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%xmm0,(%2)                     \n"
285533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x4(%2),%2                      \n"
285633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jge       91b                             \n"
285733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "99:                                         \n"
285833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_argb0),    // %0
285933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(src_argb1),    // %1
286033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_argb),     // %2
286133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(width)         // %3
286233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kShuffleAlpha)  // %4
286333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
  // The xmm registers are only named as clobbers when __SSE2__ is defined.
286433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
286533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
286633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
286733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
286833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
286933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBBLENDROW_SSSE3
287033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
287133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBATTENUATE_SSE2
287233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Attenuate 4 pixels at a time.
287333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// aligned to 16 bytes
287433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
287533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
287633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %0,%1                           \n"
287733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm4,%%xmm4                   \n"
287833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslld     $0x18,%%xmm4                    \n"
287933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm5,%%xmm5                   \n"
288033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrld     $0x8,%%xmm5                     \n"
288133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
288233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 4 pixel loop.
288333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
288433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
288533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
288633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm0,%%xmm0                   \n"
288733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufhw   $0xff,%%xmm0,%%xmm2             \n"
288833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
288933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulhuw   %%xmm2,%%xmm0                   \n"
289033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm1                     \n"
289133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhbw %%xmm1,%%xmm1                   \n"
289233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufhw   $0xff,%%xmm1,%%xmm2             \n"
289333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshuflw   $0xff,%%xmm2,%%xmm2             \n"
289433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulhuw   %%xmm2,%%xmm1                   \n"
289533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm2                     \n"
289633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm0                     \n"
289733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm4,%%xmm2                   \n"
289833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
289933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm0                   \n"
290033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm5,%%xmm0                   \n"
290133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm2,%%xmm0                   \n"
290233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%2                         \n"
290333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%0,%1,1)                \n"
290433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
290533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
290633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_argb),    // %0
290733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_argb),    // %1
290833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(width)        // %2
290933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
291033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
291133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
291233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
291333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
291433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
291533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
291633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBATTENUATE_SSE2
291733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
291833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBATTENUATEROW_SSSE3
291933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table duplicating alpha
292033cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec8 kShuffleAlpha0 = {
292133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
292233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
292333cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST uvec8 kShuffleAlpha1 = {
292433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
292533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
292633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
292733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Attenuate 4 pixels at a time.
292833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// aligned to 16 bytes
292933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
293033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
293133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %0,%1                           \n"
293233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm3,%%xmm3                   \n"
293333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslld     $0x18,%%xmm3                    \n"
293433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %3,%%xmm4                       \n"
293533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %4,%%xmm5                       \n"
293633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
293733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 4 pixel loop.
293833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
293933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
294033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
294133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm4,%%xmm0                   \n"
294233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm1                     \n"
294333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm1,%%xmm1                   \n"
294433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulhuw   %%xmm1,%%xmm0                   \n"
294533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm1                     \n"
294633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufb    %%xmm5,%%xmm1                   \n"
294733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm2                     \n"
294833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhbw %%xmm2,%%xmm2                   \n"
294933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulhuw   %%xmm2,%%xmm1                   \n"
295033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm2                     \n"
295133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm3,%%xmm2                   \n"
295233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm0                     \n"
295333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
295433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm0                   \n"
295533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm2,%%xmm0                   \n"
295633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%2                         \n"
295733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%0,%1,1)                \n"
295833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
295933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
296033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_argb),    // %0
296133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_argb),    // %1
296233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(width)        // %2
296333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kShuffleAlpha0),  // %3
296433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kShuffleAlpha1)  // %4
296533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
296633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
296733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
296833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
296933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
297033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
297133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBATTENUATEROW_SSSE3
297233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
297333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBUNATTENUATEROW_SSE2
297433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Unattenuate 4 pixels at a time.
297533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// aligned to 16 bytes
297633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
29777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                             int width) {
297833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  uintptr_t alpha = 0;
297933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
298033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %0,%1                           \n"
298133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm4,%%xmm4                   \n"
298233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslld     $0x18,%%xmm4                    \n"
298333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
298433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 4 pixel loop.
298533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
298633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
298733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
298833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movzb     0x3(%0),%3                      \n"
298933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm0,%%xmm0                   \n"
299033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      0x0(%4,%3,4),%%xmm2             \n"
299133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movzb     0x7(%0),%3                      \n"
299233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      0x0(%4,%3,4),%%xmm3             \n"
299333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshuflw   $0xc0,%%xmm2,%%xmm2             \n"
299433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshuflw   $0xc0,%%xmm3,%%xmm3             \n"
299533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movlhps   %%xmm3,%%xmm2                   \n"
299633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulhuw   %%xmm2,%%xmm0                   \n"
299733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm1                     \n"
299833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movzb     0xb(%0),%3                      \n"
299933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhbw %%xmm1,%%xmm1                   \n"
300033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      0x0(%4,%3,4),%%xmm2             \n"
300133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movzb     0xf(%0),%3                      \n"
300233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      0x0(%4,%3,4),%%xmm3             \n"
300333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshuflw   $0xc0,%%xmm2,%%xmm2             \n"
300433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshuflw   $0xc0,%%xmm3,%%xmm3             \n"
300533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movlhps   %%xmm3,%%xmm2                   \n"
300633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulhuw   %%xmm2,%%xmm1                   \n"
300733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm2                     \n"
300833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm4,%%xmm2                   \n"
300933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm0                   \n"
301033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm2,%%xmm0                   \n"
301133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%2                         \n"
301233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%0,%1,1)                \n"
301333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
301433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
301533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_argb),    // %0
301633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_argb),    // %1
301733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(width),       // %2
301833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(alpha)        // %3
301933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "r"(fixed_invtbl8)  // %4
302033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
302133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
302233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
302333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
302433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
30257cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
302633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBUNATTENUATEROW_SSE2
30277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
302833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBGRAYROW_SSSE3
302933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Constant for ARGB color to gray scale. 0.11 * B + 0.59 * G + 0.30 * R
303033cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST vec8 kARGBToGray = {
303133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
303233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
303333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
303433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
303533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
303633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
303733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %3,%%xmm4                       \n"
303833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %0,%1                           \n"
303933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
304033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 8 pixel loop.
304133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
304233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
304333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
304433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
304533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm0                   \n"
304633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm1                   \n"
304733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm1,%%xmm0                   \n"
304833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x7,%%xmm0                     \n"
304933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm0,%%xmm0                   \n"
305033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm2                     \n"
305133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm3                 \n"
305233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrld     $0x18,%%xmm2                    \n"
305333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrld     $0x18,%%xmm3                    \n"
305433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm3,%%xmm2                   \n"
305533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm2,%%xmm2                   \n"
305633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm3                   \n"
305733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm0,%%xmm0                   \n"
305833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm2,%%xmm3                   \n"
305933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
306033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm3,%%xmm0                   \n"
306133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm3,%%xmm1                   \n"
306233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%2                         \n"
306333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%0,%1,1)                \n"
306433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,0x10(%0,%1,1)            \n"
306533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%0),%0                     \n"
306633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
306733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_argb),   // %0
306833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_argb),   // %1
306933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(width)       // %2
307033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kARGBToGray)  // %3
307133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
307233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
307333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
307433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
307533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
30767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
307733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBGRAYROW_SSSE3
30787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
307933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBSEPIAROW_SSSE3
308033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp//    b = (r * 35 + g * 68 + b * 17) >> 7
308133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp//    g = (r * 45 + g * 88 + b * 22) >> 7
308233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp//    r = (r * 50 + g * 98 + b * 24) >> 7
308333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Constant for ARGB color to sepia tone
308433cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST vec8 kARGBToSepiaB = {
308533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
308633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
308733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
308833cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST vec8 kARGBToSepiaG = {
308933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
309033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
309133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
309233cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampCONST vec8 kARGBToSepiaR = {
309333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
309433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
309533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
309633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
309733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
309833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
309933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %2,%%xmm2                       \n"
310033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %3,%%xmm3                       \n"
310133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %4,%%xmm4                       \n"
310233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
310333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 8 pixel loop.
310433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
310533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
310633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
310733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm6                 \n"
310833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm2,%%xmm0                   \n"
310933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm2,%%xmm6                   \n"
311033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm6,%%xmm0                   \n"
311133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x7,%%xmm0                     \n"
311233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm0,%%xmm0                   \n"
311333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm5                     \n"
311433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
311533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm3,%%xmm5                   \n"
311633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm3,%%xmm1                   \n"
311733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm1,%%xmm5                   \n"
311833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x7,%%xmm5                     \n"
311933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm5,%%xmm5                   \n"
312033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm5,%%xmm0                   \n"
312133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm5                     \n"
312233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
312333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm5                   \n"
312433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm1                   \n"
312533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddw    %%xmm1,%%xmm5                   \n"
312633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x7,%%xmm5                     \n"
312733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm5,%%xmm5                   \n"
312833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm6                     \n"
312933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
313033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrld     $0x18,%%xmm6                    \n"
313133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrld     $0x18,%%xmm1                    \n"
313233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm6                   \n"
313333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm6,%%xmm6                   \n"
313433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm6,%%xmm5                   \n"
313533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
313633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm5,%%xmm0                   \n"
313733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm5,%%xmm1                   \n"
313833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%1                         \n"
313933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%0)                     \n"
314033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,0x10(%0)                 \n"
314133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%0),%0                     \n"
314233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
314333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(dst_argb),      // %0
314433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(width)          // %1
314533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "m"(kARGBToSepiaB),  // %2
314633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kARGBToSepiaG),  // %3
314733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "m"(kARGBToSepiaR)   // %4
314833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
314933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
315033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
315133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
315233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
315333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
315433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBSEPIAROW_SSSE3
315533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
315633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
// Transform 8 ARGB pixels (32 bytes) with color matrix.
315833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Same as Sepia except matrix is provided.
315933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
316033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                              int width) {
316133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
316233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      (%2),%%xmm2                     \n"
316333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      0x4(%2),%%xmm3                  \n"
316433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      0x8(%2),%%xmm4                  \n"
316533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x0,%%xmm2,%%xmm2              \n"
316633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x0,%%xmm3,%%xmm3              \n"
316733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
316833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
316933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 8 pixel loop.
317033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
317133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
317233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
317333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm6                 \n"
317433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm2,%%xmm0                   \n"
317533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm2,%%xmm6                   \n"
317633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm5                     \n"
317733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
317833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm3,%%xmm5                   \n"
317933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm3,%%xmm1                   \n"
318033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddsw   %%xmm6,%%xmm0                   \n"
318133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddsw   %%xmm1,%%xmm5                   \n"
318233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw     $0x7,%%xmm0                     \n"
318333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw     $0x7,%%xmm5                     \n"
318433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm0,%%xmm0                   \n"
318533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm5,%%xmm5                   \n"
318633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm5,%%xmm0                   \n"
318733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm5                     \n"
318833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
318933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm5                   \n"
319033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm4,%%xmm1                   \n"
319133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "phaddsw   %%xmm1,%%xmm5                   \n"
319233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psraw     $0x7,%%xmm5                     \n"
319333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm5,%%xmm5                   \n"
319433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm6                     \n"
319533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
319633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrld     $0x18,%%xmm6                    \n"
319733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrld     $0x18,%%xmm1                    \n"
319833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm6                   \n"
319933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm6,%%xmm6                   \n"
320033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
320133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm6,%%xmm5                   \n"
320233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm5,%%xmm0                   \n"
320333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm5,%%xmm1                   \n"
320433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x8,%1                         \n"
320533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%0)                     \n"
320633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm1,0x10(%0)                 \n"
320733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x20(%0),%0                     \n"
320833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
320933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(dst_argb),      // %0
321033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(width)          // %1
321133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "r"(matrix_argb)     // %2
321233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
321333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
321433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
321533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
321633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
32177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
321833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
32197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
322033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBQUANTIZEROW_SSE2
322133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Quantize ARGB pixels in place, 4 pixels (16 bytes) per loop pass.
322233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// dst_argb is read and written with movdqa, so it must be aligned to 16 bytes.
//
// Every byte v of a pixel is widened to a 16 bit lane and replaced by
//   ((v * scale) >> 16) * interval_size + interval_offset
// which is then packed back to a byte with unsigned saturation (packuswb).
// Byte 3 of each pixel is additionally masked out of the original data and
// ORed back into the result.
//
// The pixel count is tested after the loop body, so one group of 4 pixels is
// always processed and width is effectively rounded up to a multiple of 4.
//
// NOTE(review): lanes 0..2 of each multiplier hold the low 16 bits of the int
// argument, lane 3 holds its upper 16 bits. Byte 3 is therefore only passed
// through unchanged when scale, interval_size and interval_offset fit in
// 16 bits - presumably always true for callers; confirm.
322333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
322433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                          int interval_offset, int width) {
322533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // Move the three scalars into the low dword of xmm2 / xmm3 / xmm4.
322633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %2,%%xmm2                       \n"
322733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %3,%%xmm3                       \n"
322833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %4,%%xmm4                       \n"
    // pshuflw 0x40: words 0..2 = source word 0, word 3 = source word 1.
    // pshufd 0x44: copy the low qword into the high qword, giving the same
    // four word pattern for both pixels held in an unpacked register.
322933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshuflw   $0x40,%%xmm2,%%xmm2             \n"
323033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x44,%%xmm2,%%xmm2             \n"
323133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshuflw   $0x40,%%xmm3,%%xmm3             \n"
323233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x44,%%xmm3,%%xmm3             \n"
323333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshuflw   $0x40,%%xmm4,%%xmm4             \n"
323433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x44,%%xmm4,%%xmm4             \n"
    // xmm5 = 0, used to zero extend bytes to words.
323533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm5,%%xmm5                   \n"
    // xmm6 = 0xff000000 in every dword: selects byte 3 of each pixel.
323633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pcmpeqb   %%xmm6,%%xmm6                   \n"
323733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pslld     $0x18,%%xmm6                    \n"
323833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
323933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 4 pixel loop.
324033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  2                               \n"
324133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
    // Low two pixels: widen to words, then (v * scale) >> 16.
324233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
324333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm5,%%xmm0                   \n"
324433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulhuw   %%xmm2,%%xmm0                   \n"
    // High two pixels: same treatment.
324533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm1                     \n"
324633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhbw %%xmm5,%%xmm1                   \n"
324733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulhuw   %%xmm2,%%xmm1                   \n"
    // Multiply by interval_size; xmm7 keeps byte 3 of the original pixels.
324833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmullw    %%xmm3,%%xmm0                   \n"
324933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm7                     \n"
325033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmullw    %%xmm3,%%xmm1                   \n"
325133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pand      %%xmm6,%%xmm7                   \n"
    // Add interval_offset, pack words to bytes and OR byte 3 back in.
325233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddw     %%xmm4,%%xmm0                   \n"
325333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddw     %%xmm4,%%xmm1                   \n"
325433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm0                   \n"
325533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "por       %%xmm7,%%xmm0                   \n"
    // The flags set by sub are consumed by jg below; the movdqa store and lea
    // in between do not modify them.
325633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%1                         \n"
325733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%0)                     \n"
325833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
325933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
326033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(dst_argb),       // %0
326133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(width)           // %1
326233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "r"(scale),           // %2
326333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "r"(interval_size),   // %3
326433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "r"(interval_offset)  // %4
326533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
326633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
326733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
32687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#endif
326933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
327033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
327133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBQUANTIZEROW_SSE2
327233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
327333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
327433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Creates a table of cumulative sums where each value is a sum of all values
327533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// above and to the left of the value, inclusive of the value.
//
// row:             input, 4 bytes per pixel; each byte is widened to an int32.
// cumsum:          output, 4 int32 per pixel. Entry i is the running sum of
//                  row pixels 0..i (kept separately for each of the 4 bytes)
//                  plus entry i of previous_cumsum.
// previous_cumsum: read at the same element offset as cumsum is written.
//                  NOTE(review): presumably the cumsum row produced for the
//                  line above - confirm against callers.
// width:           number of pixels to process.
//
// A 4 pixel per pass loop is used while at least 4 pixels remain and cumsum
// is 16 byte aligned; the remaining pixels (or all of them when cumsum is
// unaligned) go through a 1 pixel loop that uses unaligned accesses.
// NOTE(review): the 4 pixel loop loads previous_cumsum with movdqa, but only
// the alignment of cumsum is tested - confirm that callers keep both rows
// equally aligned.
327633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
327733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                  const int32* previous_cumsum, int width) {
327833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // %2 becomes (previous_cumsum - cumsum) so that (%1,%2,1) addresses
    // previous_cumsum while only %1 has to be advanced.
327933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %1,%2                           \n"
    // xmm0 = running sum (4 x int32), xmm1 = 0 for zero extension.
328033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm0,%%xmm0                   \n"
328133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pxor      %%xmm1,%%xmm1                   \n"
    // Fewer than 4 pixels, or cumsum not 16 byte aligned: use the 1 pixel loop.
328233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%3                         \n"
328333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jl        49f                             \n"
328433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "test      $0xf,%1                         \n"
328533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jne       49f                             \n"
328633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
328733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  // 4 pixel loop.
328833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  2                               \n"
328933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "40:                                         \n"
    // Load 4 pixels and widen bytes -> words -> dwords:
    // xmm2 / xmm3 / xmm4 / xmm5 = pixel 0 / 1 / 2 / 3.
329033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%0),%%xmm2                     \n"
329133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
329233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm4                   \n"
329333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm1,%%xmm2                   \n"
329433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm3                   \n"
329533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm1,%%xmm2                   \n"
329633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm1,%%xmm3                   \n"
329733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhbw %%xmm1,%%xmm4                   \n"
329833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm4,%%xmm5                   \n"
329933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm1,%%xmm4                   \n"
330033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhwd %%xmm1,%%xmm5                   \n"
    // For each pixel in turn: add it to the running sum, then form the output
    // as previous_cumsum entry + running sum (reusing the pixel's register).
330133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     %%xmm2,%%xmm0                   \n"
330233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%1,%2,1),%%xmm2                \n"
330333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     %%xmm0,%%xmm2                   \n"
330433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     %%xmm3,%%xmm0                   \n"
330533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%1,%2,1),%%xmm3            \n"
330633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     %%xmm0,%%xmm3                   \n"
330733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     %%xmm4,%%xmm0                   \n"
330833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x20(%1,%2,1),%%xmm4            \n"
330933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     %%xmm0,%%xmm4                   \n"
331033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     %%xmm5,%%xmm0                   \n"
331133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x30(%1,%2,1),%%xmm5            \n"
331233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     %%xmm0,%%xmm5                   \n"
    // Store 4 output pixels (64 bytes) and advance cumsum.
331333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,(%1)                     \n"
331433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm3,0x10(%1)                 \n"
331533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm4,0x20(%1)                 \n"
331633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm5,0x30(%1)                 \n"
331733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x40(%1),%1                     \n"
331833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%3                         \n"
331933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jge       40b                             \n"
332033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
332133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "49:                                         \n"
    // %3 is (remaining - 4) here; adding 3 gives (remaining - 1), which is
    // negative exactly when nothing is left.
332233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add       $0x3,%3                         \n"
332333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jl        19f                             \n"
332433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
332533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  // 1 pixel loop.
332633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  2                               \n"
332733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "10:                                         \n"
    // Same computation for a single pixel, using unaligned loads and stores.
332833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      (%0),%%xmm2                     \n"
332933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x4(%0),%0                      \n"
333033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm1,%%xmm2                   \n"
333133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm1,%%xmm2                   \n"
333233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     %%xmm2,%%xmm0                   \n"
333333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    (%1,%2,1),%%xmm2                \n"
333433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     %%xmm0,%%xmm2                   \n"
333533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm2,(%1)                     \n"
333633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
333733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x1,%3                         \n"
333833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jge       10b                             \n"
33397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
334033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "19:                                         \n"
334133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(row),  // %0
334233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(cumsum),  // %1
334333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(previous_cumsum),  // %2
334433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(width)  // %3
334533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
334633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
334733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
334833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5"
334933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
335033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
335133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
335233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
335333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
335433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
// Converts cumulative sums into averaged pixels.
// For each of count output pixels (4 int32 sums in, 4 bytes out) computes
//   sum = topleft[i] - topleft[i + width] - botleft[i] + botleft[i + width]
// where width is an offset counted in int32 elements, multiplies the sum by
// an approximate reciprocal of area (rcpss), converts back to integer with
// cvtps2dq and packs to bytes with saturation (packssdw then packuswb).
//
// topleft and botleft, including the addresses offset by width, are accessed
// with movdqa or as memory operands of psubd / paddd, all of which require
// 16 byte alignment. dst is written with movdqu / movd and may be unaligned.
// The 4 pixel loop runs while at least 4 pixels remain; a 1 pixel loop
// finishes the rest.
335533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
335633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                 int width, int area, uint8* dst, int count) {
335733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // xmm4 = approximate 1.0f / area, broadcast to all four float lanes.
335833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %5,%%xmm4                       \n"
335933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "cvtdq2ps  %%xmm4,%%xmm4                   \n"
336033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "rcpss     %%xmm4,%%xmm4                   \n"
336133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x0,%%xmm4,%%xmm4              \n"
    // Fewer than 4 pixels: skip straight to the 1 pixel loop.
336233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%3                         \n"
336333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jl        49f                             \n"
336433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
336533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  // 4 pixel loop.
336633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  2                               \n"
336733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "40:                                         \n"
    // xmm0..xmm3 = topleft[i] - topleft[i + width] for pixels 0..3.
336833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
336933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x10(%0),%%xmm1                 \n"
337033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x20(%0),%%xmm2                 \n"
337133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    0x30(%0),%%xmm3                 \n"
337233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubd     (%0,%4,4),%%xmm0                \n"
337333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubd     0x10(%0,%4,4),%%xmm1            \n"
337433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubd     0x20(%0,%4,4),%%xmm2            \n"
337533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubd     0x30(%0,%4,4),%%xmm3            \n"
337633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x40(%0),%0                     \n"
    // ... - botleft[i] + botleft[i + width].
337733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubd     (%1),%%xmm0                     \n"
337833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubd     0x10(%1),%%xmm1                 \n"
337933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubd     0x20(%1),%%xmm2                 \n"
338033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubd     0x30(%1),%%xmm3                 \n"
338133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     (%1,%4,4),%%xmm0                \n"
338233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     0x10(%1,%4,4),%%xmm1            \n"
338333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     0x20(%1,%4,4),%%xmm2            \n"
338433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     0x30(%1,%4,4),%%xmm3            \n"
338533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x40(%1),%1                     \n"
    // int32 -> float, scale by 1 / area, float -> int32.
338633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
338733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "cvtdq2ps  %%xmm1,%%xmm1                   \n"
338833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mulps     %%xmm4,%%xmm0                   \n"
338933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mulps     %%xmm4,%%xmm1                   \n"
339033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "cvtdq2ps  %%xmm2,%%xmm2                   \n"
339133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "cvtdq2ps  %%xmm3,%%xmm3                   \n"
339233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mulps     %%xmm4,%%xmm2                   \n"
339333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mulps     %%xmm4,%%xmm3                   \n"
339433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "cvtps2dq  %%xmm0,%%xmm0                   \n"
339533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "cvtps2dq  %%xmm1,%%xmm1                   \n"
339633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "cvtps2dq  %%xmm2,%%xmm2                   \n"
339733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "cvtps2dq  %%xmm3,%%xmm3                   \n"
    // Narrow 16 x int32 to 16 bytes and store 4 output pixels.
339833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packssdw  %%xmm1,%%xmm0                   \n"
339933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packssdw  %%xmm3,%%xmm2                   \n"
340033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm2,%%xmm0                   \n"
340133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqu    %%xmm0,(%2)                     \n"
340233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%2),%2                     \n"
340333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%3                         \n"
340433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jge       40b                             \n"
340533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
340633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "49:                                         \n"
    // %3 is (remaining - 4) here; adding 3 gives (remaining - 1), which is
    // negative exactly when nothing is left.
340733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add       $0x3,%3                         \n"
340833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jl        19f                             \n"
340933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
341033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  // 1 pixel loop.
341133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  2                               \n"
341233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "10:                                         \n"
    // Same computation for a single pixel; only the low 4 bytes are stored.
341333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
341433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubd     (%0,%4,4),%%xmm0                \n"
341533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
341633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psubd     (%1),%%xmm0                     \n"
341733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "paddd     (%1,%4,4),%%xmm0                \n"
341833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
341933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "cvtdq2ps  %%xmm0,%%xmm0                   \n"
342033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mulps     %%xmm4,%%xmm0                   \n"
342133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "cvtps2dq  %%xmm0,%%xmm0                   \n"
342233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packssdw  %%xmm0,%%xmm0                   \n"
342333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm0,%%xmm0                   \n"
342433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%xmm0,(%2)                     \n"
342533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x4(%2),%2                      \n"
342633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x1,%3                         \n"
342733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jge       10b                             \n"
342833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "19:                                         \n"
342933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(topleft),  // %0
343033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(botleft),  // %1
343133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst),      // %2
343233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+rm"(count)    // %3
  // width is widened to intptr_t so it can serve as an index register in the
  // (%0,%4,4) / (%1,%4,4) address forms.
343333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "r"(static_cast<intptr_t>(width)),  // %4
343433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "rm"(area)     // %5
343533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
343633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
343733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4"
343833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
343933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
344033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
344133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_CUMULATIVESUMTOAVERAGE_SSE2
344233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBSHADE_SSE2
344333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shade 4 pixels at a time by specified value.
344433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Aligned to 16 bytes: src_argb and dst_argb are both accessed with movdqa.
//
// Byte i of value scales byte i of every pixel. Both the pixel byte p and
// the value byte v are duplicated into 16 bit lanes (x * 0x0101), multiplied
// keeping the high 16 bits (pmulhuw) and shifted right by 8, i.e.
//   out = ((p * 0x0101) * (v * 0x0101)) >> 24
//
// The pixel count is tested after the loop body, so one group of 4 pixels is
// always processed and width is effectively rounded up to a multiple of 4.
344533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
344633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                       uint32 value) {
344733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
344833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %3,%%xmm2                       \n"
    // %1 becomes (dst_argb - src_argb) so that (%0,%1,1) addresses the
    // destination while only %0 has to be advanced.
344933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %0,%1                           \n"
    // Duplicate each byte of value into a word, then copy the 4 words into
    // the high qword so one register covers two unpacked pixels.
345033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm2,%%xmm2                   \n"
345133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklqdq %%xmm2,%%xmm2                  \n"
345233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
345333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 4 pixel loop.
345433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  2                               \n"
345533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
    // Load 4 pixels; xmm0 = low 2 pixels, xmm1 = high 2 pixels, each byte
    // duplicated into a word.
345633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%0),%%xmm0                     \n"
345733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
345833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm0,%%xmm0                   \n"
345933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhbw %%xmm1,%%xmm1                   \n"
    // High half of the 16x16 product, then >> 8 and pack back to bytes.
346033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulhuw   %%xmm2,%%xmm0                   \n"
346133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmulhuw   %%xmm2,%%xmm1                   \n"
346233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm0                     \n"
346333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x8,%%xmm1                     \n"
346433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm0                   \n"
    // The flags set by sub are consumed by jg below; the movdqa store and lea
    // in between do not modify them.
346533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%2                         \n"
346633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%0,%1,1)                \n"
346733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%0),%0                     \n"
346833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
346933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_argb),       // %0
347033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_argb),       // %1
347133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(width)           // %2
347233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "r"(value)            // %3
347333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
347433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
347533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2"
347633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
347733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
347833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
347933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBSHADE_SSE2
348033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
348133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBAFFINEROW_SSE2
348233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// TODO(fbarchard): Find 64 bit way to avoid masking.
348333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// TODO(fbarchard): Investigate why 4 pixels is slower than 2 on Core2.
348433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Copy ARGB pixels from source image with slope to a row of destination.
348533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Caveat - in 64 bit, movd is used with 64 bit gpr due to Mac gcc producing
348633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// an error if movq is used. movd  %%xmm0,%1
348733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Writes |width| ARGB pixels (4 bytes each) to dst_argb, fetching each one
// from src_argb at a position that advances linearly per output pixel.
//   uv_dudv[0], uv_dudv[1] - starting coordinate pair (floats).
//   uv_dudv[2], uv_dudv[3] - increment added to the pair for each pixel.
// For every pixel the pair is truncated to integers, packed to signed 16 bit
// with saturation, and converted to a byte offset with pmaddwd as
//   first * 4 + second * src_argb_stride.
// Because the stride is one of the 16 bit pmaddwd multipliers, only its low
// 16 bits take part in the offset computation.
// The computed offsets are not checked against any image bounds here.
// In 64 bit builds the first offset of each extracted pair is masked to
// 28 bits (and $0x0fffffff).
348833cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API
348933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
349033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                        uint8* dst_argb, const float* uv_dudv, int width) {
  // Pointer-sized copy of the stride; operand %1 is later reused as a
  // scratch register for byte offsets.
349133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  intptr_t src_argb_stride_temp = src_argb_stride;
  // Second scratch register (operand %5) for byte offsets.
349233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  intptr_t temp = 0;
349333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // xmm2 low 64 bits = starting pair, xmm7 low 64 bits = increment pair.
349433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      (%3),%%xmm2                     \n"
349533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      0x8(%3),%%xmm7                  \n"
    // Build the pmaddwd multiplier: low word = 4 (bytes per pixel),
    // next word = stride.
349633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shl       $0x10,%1                        \n"
349733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add       $0x4,%1                         \n"
349833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %1,%%xmm5                       \n"
    // Fewer than 4 pixels requested: skip straight to the 1 pixel tail.
349933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%4                         \n"
350033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jl        49f                             \n"
350133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
    // Set up for 4 pixels per iteration:
    //   xmm7 = increment pair duplicated into both halves
    //   xmm5 = multiplier broadcast to all four dwords
    //   xmm2 = coordinates of pixels 0 and 1
    //   xmm3 = coordinates of pixels 2 and 3
    //   xmm4 = 4 * increment in both halves (step per iteration)
350233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x44,%%xmm7,%%xmm7             \n"
350333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
350433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm0                   \n"
350533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "addps     %%xmm7,%%xmm0                   \n"
350633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movlhps   %%xmm0,%%xmm2                   \n"
350733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm7,%%xmm4                   \n"
350833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "addps     %%xmm4,%%xmm4                   \n"
350933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm2,%%xmm3                   \n"
351033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "addps     %%xmm4,%%xmm3                   \n"
351133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "addps     %%xmm4,%%xmm4                   \n"
351233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
351333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  // 4 pixel loop                              \n"
351433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
351533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "40:                                         \n"
    // Truncate the 4 coordinate pairs to int32, pack to int16 (signed
    // saturation) and multiply-add with xmm5: xmm0 = 4 byte offsets.
351633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "cvttps2dq %%xmm2,%%xmm0                   \n"
351733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "cvttps2dq %%xmm3,%%xmm1                   \n"
351833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packssdw  %%xmm1,%%xmm0                   \n"
351933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddwd   %%xmm5,%%xmm0                   \n"
    // Extract offsets 0 and 1 into %1 and %5.
    // 64 bit: one 64 bit move fetches both; the low one is masked to 28 bits,
    // the high one is shifted down, then offsets 2 and 3 are moved to the low
    // half of xmm0.
    // 32 bit: two movd's with a dword rotate (pshufd $0x39) after each.
352033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__x86_64__)
352133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%xmm0,%1                       \n"
352233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov       %1,%5                           \n"
352333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "and       $0x0fffffff,%1                  \n"
352433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shr       $32,%5                          \n"
352533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0xEE,%%xmm0,%%xmm0             \n"
352633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#else
352733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%xmm0,%1                       \n"
352833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
352933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%xmm0,%5                       \n"
353033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
353133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
    // Fetch pixels 0 and 1, combine them into 8 bytes and store; advance the
    // coordinates of pixels 0 and 1 for the next iteration.
353233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      (%0,%1,1),%%xmm1                \n"
353333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      (%0,%5,1),%%xmm6                \n"
353433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckldq %%xmm6,%%xmm1                   \n"
353533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "addps     %%xmm4,%%xmm2                   \n"
353633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm1,(%2)                     \n"
    // Extract offsets 2 and 3 the same way.
353733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__x86_64__)
353833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%xmm0,%1                       \n"
353933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "mov       %1,%5                           \n"
354033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "and       $0x0fffffff,%1                  \n"
354133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shr       $32,%5                          \n"
354233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#else
354333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%xmm0,%1                       \n"
354433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x39,%%xmm0,%%xmm0             \n"
354533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%xmm0,%5                       \n"
354633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
    // Fetch and store pixels 2 and 3, advance their coordinates and the
    // destination pointer. The movq and lea after the sub leave the flags
    // alone, so jge tests the result of the sub.
354733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      (%0,%1,1),%%xmm0                \n"
354833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      (%0,%5,1),%%xmm6                \n"
354933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckldq %%xmm6,%%xmm0                   \n"
355033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "addps     %%xmm4,%%xmm3                   \n"
355133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%4                         \n"
355233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movq      %%xmm0,0x08(%2)                 \n"
355333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%2),%2                     \n"
355433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jge       40b                             \n"
355533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
    // %4 holds (remaining - 4) here; adding 3 gives (remaining - 1), which is
    // negative when no pixels are left.
355633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "49:                                         \n"
355733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add       $0x3,%4                         \n"
355833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jl        19f                             \n"
355933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
356033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  // 1 pixel loop                              \n"
356133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
356233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "10:                                         \n"
    // Only the low coordinate pair of xmm2 and the low dword of the pmaddwd
    // result are used; xmm2 advances by a single increment (xmm7).
356333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "cvttps2dq %%xmm2,%%xmm0                   \n"
356433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packssdw  %%xmm0,%%xmm0                   \n"
356533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddwd   %%xmm5,%%xmm0                   \n"
356633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "addps     %%xmm7,%%xmm2                   \n"
356733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%xmm0,%1                       \n"
    // 64 bit only: the mask keeps the low 28 bits of the offset and clears
    // everything above them in the 64 bit register.
356833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__x86_64__)
356933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "and       $0x0fffffff,%1                  \n"
357033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
357133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      (%0,%1,1),%%xmm0                \n"
357233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x1,%4                         \n"
357333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %%xmm0,(%2)                     \n"
357433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x4(%2),%2                      \n"
357533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jge       10b                             \n"
357633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "19:                                         \n"
  // All operands are read-write: %1 and %5 are used as scratch, %2 and %4 are
  // advanced/decremented by the loops.
357733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(src_argb),  // %0
357833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(src_argb_stride_temp),  // %1
357933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_argb),  // %2
358033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(uv_dudv),   // %3
358133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+rm"(width),    // %4
358233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(temp)   // %5
358333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  :
358433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
  // The xmm registers are only named as clobbers when __SSE2__ is defined.
358533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
358633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
358733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
358833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
358933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
359033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBAFFINEROW_SSE2
359133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
359233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version
// Blends the row at src_ptr with the row at src_ptr + src_stride and writes
// the result to dst_ptr, 4 ARGB pixels (16 bytes) per iteration.
// source_y_fraction is halved first (call the result f); each output byte is
//   (row0 * (128 - f) + row1 * f) >> 7.
// Two values of f take shortcuts: f == 0 copies row0 unchanged, f == 0x40
// averages the two rows with pavgb.
// Every load and store uses movdqa, so src_ptr, src_ptr + src_stride and
// dst_ptr must all be 16 byte aligned.
// Each loop is bottom-tested and counts dst_width down by 4, so at least
// 16 bytes are always processed and the width is effectively rounded up to a
// multiple of 4 pixels.
// NOTE(review): presumably source_y_fraction is in 0..255 so that f and
// 128 - f fit the signed byte weights used by pmaddubsw - confirm against
// callers.
359333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
359433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                              ptrdiff_t src_stride, int dst_width,
359533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                              int source_y_fraction) {
359633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  asm volatile (
    // Turn dst into an offset from src so that (%1,%0,1) addresses the
    // destination and a single pointer increment advances both.
359733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       %1,%0                           \n"
    // f = source_y_fraction >> 1, then dispatch on the two special values.
359833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "shr       %3                              \n"
359933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "cmp       $0x0,%3                         \n"
360033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "je        2f                              \n"
360133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "cmp       $0x40,%3                        \n"
360233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "je        3f                              \n"
    // Build xmm5 = byte pair (128 - f, f) repeated in all 8 words; these are
    // the pmaddubsw weights for interleaved (row0, row1) bytes.
360333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %3,%%xmm0                       \n"
360433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "neg       %3                              \n"
360533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "add       $0x80,%3                        \n"
360633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movd      %3,%%xmm5                       \n"
360733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm0,%%xmm5                   \n"
360833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklwd %%xmm5,%%xmm5                   \n"
360933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pshufd    $0x0,%%xmm5,%%xmm5              \n"
    // General blend loop.
361033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
361133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "1:                                          \n"
    // Load 16 bytes of each row, interleave them bytewise into xmm0 (low
    // half) and xmm1 (high half), apply the weights, scale back by >> 7 and
    // repack to bytes.
361233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%1),%%xmm0                     \n"
361333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%1,%4,1),%%xmm2                \n"
361433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,%%xmm1                   \n"
361533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpcklbw %%xmm2,%%xmm0                   \n"
361633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "punpckhbw %%xmm2,%%xmm1                   \n"
361733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm5,%%xmm0                   \n"
361833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pmaddubsw %%xmm5,%%xmm1                   \n"
361933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x7,%%xmm0                     \n"
362033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "psrlw     $0x7,%%xmm1                     \n"
362133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "packuswb  %%xmm1,%%xmm0                   \n"
    // The movdqa store and the lea do not modify flags, so jg tests the
    // result of the sub.
362233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%2                         \n"
362333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%1,%0,1)                \n"
362433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
362533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        1b                              \n"
362633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jmp       4f                              \n"
    // f == 0: copy row0 to the destination.
362733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
362833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "2:                                          \n"
362933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%1),%%xmm0                     \n"
363033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%2                         \n"
363133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%1,%0,1)                \n"
363233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
363333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        2b                              \n"
363433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jmp       4f                              \n"
    // f == 0x40: rounded byte average of the two rows.
363533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
363633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "3:                                          \n"
363733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    (%1),%%xmm0                     \n"
363833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "pavgb     (%1,%4,1),%%xmm0                \n"
363933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "sub       $0x4,%2                         \n"
364033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "movdqa    %%xmm0,(%1,%0,1)                \n"
364133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "lea       0x10(%1),%1                     \n"
364233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "jg        3b                              \n"
    // Common exit for all three loops.
364333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  "4:                                          \n"
364433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ".p2align  4                               \n"
  // %0..%3 are modified by the asm (read-write); %4 is a read-only input.
364533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "+r"(dst_ptr),     // %0
364633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(src_ptr),     // %1
364733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(dst_width),   // %2
364833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    "+r"(source_y_fraction)  // %3
364933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "r"(static_cast<intptr_t>(src_stride))  // %4
365033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  : "memory", "cc"
  // The xmm registers are only named as clobbers when __SSE2__ is defined.
365133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if defined(__SSE2__)
365233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    , "xmm0", "xmm1", "xmm2", "xmm5"
365333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
365433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  );
365533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
365633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
365733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // defined(__x86_64__) || defined(__i386__)
365833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
365933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef __cplusplus
36607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}  // extern "C"
366133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}  // namespace libyuv
366233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
3663