/*
 *  Copyright 2011 The LibYuv Project Authors. All rights reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
1133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include "libyuv/row.h"
127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
1333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef __cplusplus
1433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampnamespace libyuv {
157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" {
1633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
1733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// This module is for Visual C x86.
1933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86)
207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// TODO(fbarchard): I420ToRGB24, I420ToRAW
227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#ifdef HAS_ARGBTOYROW_SSSE3
237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
2433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Constants for ARGB.
2533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kARGBToY = {
267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
2933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kARGBToU = {
307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
317cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
3333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kARGBToV = {
347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
3733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Constants for BGRA.
3833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kBGRAToY = {
397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
407cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
4233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kBGRAToU = {
437cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
447cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
457cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
4633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kBGRAToV = {
477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
487cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
497cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
5033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Constants for ABGR.
5133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kABGRToY = {
527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
547cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
5533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kABGRToU = {
567cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
5933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kABGRToV = {
607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
6333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Constants for RGBA.
6433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kRGBAToY = {
6533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33
6633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
6733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
6833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kRGBAToU = {
6933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38
7033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
7133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
7233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kRGBAToV = {
7333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112
7433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
7533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
7633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kAddY16 = {
7733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u
787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
8033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kAddUV128 = {
817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
837cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
8533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting RGB24 to ARGB.
8633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleMaskRGB24ToARGB = {
877cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
887cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde// Shuffle table for converting RAW to ARGB.
9133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleMaskRAWToARGB = {
927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
9533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting BGRA to ARGB.
9633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleMaskBGRAToARGB = {
9733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
9833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
9933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
10033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting ABGR to ARGB.
10133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleMaskABGRToARGB = {
10233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
10333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
10433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
10533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting RGBA to ARGB.
10633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleMaskRGBAToARGB = {
10733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
10833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
10933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
11033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting ARGB to RGBA.
11133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleMaskARGBToRGBA = {
11233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
11333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
11433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
11533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting ARGB to RGB24.
11633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleMaskARGBToRGB24 = {
11733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u
11833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
11933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
12033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting ARGB to RAW.
12133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleMaskARGBToRAW = {
12233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u
12333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
12433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
12533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
12633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) {
12733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
12833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]        // src_y
12933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8]        // dst_argb
13033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 12]       // pix
13133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5            // generate mask 0xff000000
13233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslld      xmm5, 24
13333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
13433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
13533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop:
13633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       xmm0, qword ptr [eax]
13733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 8]
13833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm0
13933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
14033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm0, xmm0
14133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm1, xmm1
14233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por        xmm0, xmm5
14333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por        xmm1, xmm5
14433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx], xmm0
14533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx + 16], xmm1
14633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 32]
14733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8
14833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
14933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
15033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
15133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
15233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
15333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
15433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) {
15533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
15633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, [esp + 4]   // src_bgra
15733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       edx, [esp + 8]   // dst_argb
15833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       ecx, [esp + 12]  // pix
15933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm5, kShuffleMaskBGRAToARGB
16033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       edx, eax
16133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
16233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
16333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
16433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm0, [eax]
16533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm0, xmm5
16633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       ecx, 4
16733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [eax + edx], xmm0
16833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       eax, [eax + 16]
16933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg        convertloop
17033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
17133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
17233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
17333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
17433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
17533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) {
17633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
17733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, [esp + 4]   // src_abgr
17833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       edx, [esp + 8]   // dst_argb
17933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       ecx, [esp + 12]  // pix
18033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm5, kShuffleMaskABGRToARGB
18133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       edx, eax
18233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
18333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
18433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
18533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm0, [eax]
18633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm0, xmm5
18733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       ecx, 4
18833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [eax + edx], xmm0
18933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       eax, [eax + 16]
19033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg        convertloop
19133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
19233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
19333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
19433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
19533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
19633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) {
19733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
19833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, [esp + 4]   // src_rgba
19933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       edx, [esp + 8]   // dst_argb
20033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       ecx, [esp + 12]  // pix
20133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm5, kShuffleMaskRGBAToARGB
20233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       edx, eax
20333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
20433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
20533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
20633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm0, [eax]
20733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm0, xmm5
20833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       ecx, 4
20933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [eax + edx], xmm0
21033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       eax, [eax + 16]
21133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg        convertloop
21233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
21333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
21433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
21533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
21633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
21733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) {
21833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
21933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, [esp + 4]   // src_argb
22033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       edx, [esp + 8]   // dst_rgba
22133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       ecx, [esp + 12]  // pix
22233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm5, kShuffleMaskARGBToRGBA
22333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       edx, eax
22433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
22533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
22633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
22733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm0, [eax]
22833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm0, xmm5
22933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       ecx, 4
23033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [eax + edx], xmm0
23133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       eax, [eax + 16]
23233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg        convertloop
23333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
23433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
23533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
23633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
23733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
23833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) {
23933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
24033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, [esp + 4]   // src_rgb24
24133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       edx, [esp + 8]   // dst_argb
24233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       ecx, [esp + 12]  // pix
24333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
24433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslld     xmm5, 24
24533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm4, kShuffleMaskRGB24ToARGB
24633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
24733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
24833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
24933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu    xmm0, [eax]
25033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu    xmm1, [eax + 16]
25133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu    xmm3, [eax + 32]
25233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       eax, [eax + 48]
25333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm2, xmm3
25433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
25533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm2, xmm4
25633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm2, xmm5
25733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
25833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm0, xmm4
25933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [edx + 32], xmm2
26033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm0, xmm5
26133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm1, xmm4
26233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [edx], xmm0
26333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm1, xmm5
26433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
26533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm3, xmm4
26633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [edx + 16], xmm1
26733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm3, xmm5
26833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       ecx, 16
26933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [edx + 48], xmm3
27033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       edx, [edx + 64]
27133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg        convertloop
27233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
27333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
27433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
27533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
27633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
27733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
27833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                        int pix) {
27933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
28033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, [esp + 4]   // src_raw
28133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       edx, [esp + 8]   // dst_argb
28233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       ecx, [esp + 12]  // pix
28333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb   xmm5, xmm5       // generate mask 0xff000000
28433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslld     xmm5, 24
28533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm4, kShuffleMaskRAWToARGB
28633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
28733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
28833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
28933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu    xmm0, [eax]
29033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu    xmm1, [eax + 16]
29133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu    xmm3, [eax + 32]
29233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       eax, [eax + 48]
29333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm2, xmm3
29433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
29533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm2, xmm4
29633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm2, xmm5
29733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    palignr   xmm1, xmm0, 12   // xmm1 = { xmm3[0:7] xmm0[12:15]}
29833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm0, xmm4
29933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [edx + 32], xmm2
30033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm0, xmm5
30133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm1, xmm4
30233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [edx], xmm0
30333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm1, xmm5
30433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
30533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm3, xmm4
30633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [edx + 16], xmm1
30733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm3, xmm5
30833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       ecx, 16
30933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [edx + 48], xmm3
31033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       edx, [edx + 64]
31133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg        convertloop
31233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
31333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
31433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
31533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
31633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// pmul method to replicate bits.
31733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Math to replicate bits:
31833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// (v << 8) | (v << 3)
31933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// v * 256 + v * 8
32033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// v * (256 + 8)
32133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3
32233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 20 instructions.
32333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
32433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb,
32533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                          int pix) {
32633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
32733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
32833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd      xmm5, eax
32933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd    xmm5, xmm5, 0
33033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, 0x20802080  // multiplier shift by 5 and then repeat 6 bits
33133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd      xmm6, eax
33233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd    xmm6, xmm6, 0
33333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
33433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psllw     xmm3, 11
33533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb   xmm4, xmm4       // generate mask 0x07e007e0 for Green
33633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psllw     xmm4, 10
33733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw     xmm4, 5
33833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
33933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psllw     xmm7, 8
34033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
34133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, [esp + 4]   // src_rgb565
34233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       edx, [esp + 8]   // dst_argb
34333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       ecx, [esp + 12]  // pix
34433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       edx, eax
34533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       edx, eax
34633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
34733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
34833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
34933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu    xmm0, [eax]   // fetch 8 pixels of bgr565
35033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm1, xmm0
35133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm2, xmm0
35233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand      xmm1, xmm3    // R in upper 5 bits
35333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psllw     xmm2, 11      // B in upper 5 bits
35433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulhuw   xmm1, xmm5    // * (256 + 8)
35533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulhuw   xmm2, xmm5    // * (256 + 8)
35633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psllw     xmm1, 8
35733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm1, xmm2    // RB
35833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand      xmm0, xmm4    // G in middle 6 bits
35933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulhuw   xmm0, xmm6    // << 5 * (256 + 4)
36033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm0, xmm7    // AG
36133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm2, xmm1
36233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw xmm1, xmm0
36333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhbw xmm2, xmm0
36433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
36533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
36633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       eax, [eax + 16]
36733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       ecx, 8
36833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg        convertloop
36933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
37033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
37133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
37233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
37333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 24 instructions
37433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
37533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb,
37633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                            int pix) {
37733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
37833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, 0x01080108  // generate multiplier to repeat 5 bits
37933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd      xmm5, eax
38033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd    xmm5, xmm5, 0
38133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, 0x42004200  // multiplier shift by 6 and then repeat 5 bits
38233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd      xmm6, eax
38333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd    xmm6, xmm6, 0
38433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb   xmm3, xmm3       // generate mask 0xf800f800 for Red
38533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psllw     xmm3, 11
38633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm4, xmm3       // generate mask 0x03e003e0 for Green
38733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw     xmm4, 6
38833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb   xmm7, xmm7       // generate mask 0xff00ff00 for Alpha
38933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psllw     xmm7, 8
39033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
39133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, [esp + 4]   // src_argb1555
39233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       edx, [esp + 8]   // dst_argb
39333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       ecx, [esp + 12]  // pix
39433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       edx, eax
39533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       edx, eax
39633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
39733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
39833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
39933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu    xmm0, [eax]   // fetch 8 pixels of 1555
40033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm1, xmm0
40133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm2, xmm0
40233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psllw     xmm1, 1       // R in upper 5 bits
40333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psllw     xmm2, 11      // B in upper 5 bits
40433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand      xmm1, xmm3
40533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulhuw   xmm2, xmm5    // * (256 + 8)
40633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulhuw   xmm1, xmm5    // * (256 + 8)
40733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psllw     xmm1, 8
40833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm1, xmm2    // RB
40933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm2, xmm0
41033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand      xmm0, xmm4    // G in middle 5 bits
41133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psraw     xmm2, 8       // A
41233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulhuw   xmm0, xmm6    // << 6 * (256 + 8)
41333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand      xmm2, xmm7
41433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm0, xmm2    // AG
41533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm2, xmm1
41633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw xmm1, xmm0
41733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhbw xmm2, xmm0
41833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [eax * 2 + edx], xmm1  // store 4 pixels of ARGB
41933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [eax * 2 + edx + 16], xmm2  // store next 4 pixels of ARGB
42033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       eax, [eax + 16]
42133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       ecx, 8
42233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg        convertloop
42333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
42433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
42533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
42633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
42733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 18 instructions.
42833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
42933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb,
43033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                            int pix) {
43133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
43233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, 0x0f0f0f0f  // generate mask 0x0f0f0f0f
43333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd      xmm4, eax
43433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd    xmm4, xmm4, 0
43533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm5, xmm4       // 0xf0f0f0f0 for high nibbles
43633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslld     xmm5, 4
43733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, [esp + 4]   // src_argb4444
43833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       edx, [esp + 8]   // dst_argb
43933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       ecx, [esp + 12]  // pix
44033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       edx, eax
44133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       edx, eax
44233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
44333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
44433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
44533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu    xmm0, [eax]   // fetch 8 pixels of bgra4444
44633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm2, xmm0
44733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand      xmm0, xmm4    // mask low nibbles
44833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand      xmm2, xmm5    // mask high nibbles
44933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm1, xmm0
45033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm3, xmm2
45133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psllw     xmm1, 4
45233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw     xmm3, 4
45333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm0, xmm1
45433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm2, xmm3
45533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm1, xmm0
45633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw xmm0, xmm2
45733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhbw xmm1, xmm2
45833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [eax * 2 + edx], xmm0  // store 4 pixels of ARGB
45933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [eax * 2 + edx + 16], xmm1  // store next 4 pixels of ARGB
46033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       eax, [eax + 16]
46133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       ecx, 8
46233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg        convertloop
46333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
46433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
46533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
46633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
46733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
46833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
46933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
47033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, [esp + 4]   // src_argb
47133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       edx, [esp + 8]   // dst_rgb
47233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       ecx, [esp + 12]  // pix
47333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm6, kShuffleMaskARGBToRGB24
47433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
47533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
47633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
47733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm0, [eax]   // fetch 16 pixels of argb
47833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm1, [eax + 16]
47933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm2, [eax + 32]
48033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm3, [eax + 48]
48133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       eax, [eax + 64]
48233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
48333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm1, xmm6
48433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm2, xmm6
48533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm3, xmm6
48633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
48733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrldq    xmm1, 4      // 8 bytes from 1
48833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslldq    xmm4, 12     // 4 bytes from 1 for 0
48933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
49033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm0, xmm4   // 4 bytes from 1 for 0
49133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslldq    xmm5, 8      // 8 bytes from 2 for 1
49233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [edx], xmm0  // store 0
49333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm1, xmm5   // 8 bytes from 2 for 1
49433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrldq    xmm2, 8      // 4 bytes from 2
49533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslldq    xmm3, 4      // 12 bytes from 3 for 2
49633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm2, xmm3   // 12 bytes from 3 for 2
49733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [edx + 16], xmm1   // store 1
49833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [edx + 32], xmm2   // store 2
49933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       edx, [edx + 48]
50033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       ecx, 16
50133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg        convertloop
50233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
50333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
50433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
50533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
50633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
50733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) {
50833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
50933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, [esp + 4]   // src_argb
51033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       edx, [esp + 8]   // dst_rgb
51133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       ecx, [esp + 12]  // pix
51233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm6, kShuffleMaskARGBToRAW
51333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
51433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
51533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
51633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm0, [eax]   // fetch 16 pixels of argb
51733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm1, [eax + 16]
51833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm2, [eax + 32]
51933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm3, [eax + 48]
52033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       eax, [eax + 64]
52133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm0, xmm6    // pack 16 bytes of ARGB to 12 bytes of RGB
52233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm1, xmm6
52333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm2, xmm6
52433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm3, xmm6
52533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm4, xmm1   // 4 bytes from 1 for 0
52633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrldq    xmm1, 4      // 8 bytes from 1
52733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslldq    xmm4, 12     // 4 bytes from 1 for 0
52833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm5, xmm2   // 8 bytes from 2 for 1
52933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm0, xmm4   // 4 bytes from 1 for 0
53033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslldq    xmm5, 8      // 8 bytes from 2 for 1
53133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [edx], xmm0  // store 0
53233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm1, xmm5   // 8 bytes from 2 for 1
53333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrldq    xmm2, 8      // 4 bytes from 2
53433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslldq    xmm3, 4      // 12 bytes from 3 for 2
53533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm2, xmm3   // 12 bytes from 3 for 2
53633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [edx + 16], xmm1   // store 1
53733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [edx + 32], xmm2   // store 2
53833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       edx, [edx + 48]
53933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       ecx, 16
54033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg        convertloop
54133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
54233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
54333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
54433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
54533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
54633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
54733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
54833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, [esp + 4]   // src_argb
54933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       edx, [esp + 8]   // dst_rgb
55033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       ecx, [esp + 12]  // pix
55133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb   xmm3, xmm3       // generate mask 0x0000001f
55233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrld     xmm3, 27
55333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb   xmm4, xmm4       // generate mask 0x000007e0
55433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrld     xmm4, 26
55533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslld     xmm4, 5
55633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb   xmm5, xmm5       // generate mask 0xfffff800
55733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslld     xmm5, 11
55833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
55933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
56033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
56133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
56233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm1, xmm0    // B
56333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm2, xmm0    // G
56433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslld     xmm0, 8       // R
56533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrld     xmm1, 3       // B
56633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrld     xmm2, 5       // G
56733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrad     xmm0, 16      // R
56833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand      xmm1, xmm3    // B
56933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand      xmm2, xmm4    // G
57033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand      xmm0, xmm5    // R
57133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm1, xmm2    // BG
57233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm0, xmm1    // BGR
57333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packssdw  xmm0, xmm0
57433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       eax, [eax + 16]
57533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
57633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       edx, [edx + 8]
57733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       ecx, 4
57833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg        convertloop
57933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
58033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
58133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
58233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
58333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// TODO(fbarchard): Improve sign extension/packing.
58433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
58533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
58633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
58733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, [esp + 4]   // src_argb
58833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       edx, [esp + 8]   // dst_rgb
58933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       ecx, [esp + 12]  // pix
59033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb   xmm4, xmm4       // generate mask 0x0000001f
59133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrld     xmm4, 27
59233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm5, xmm4       // generate mask 0x000003e0
59333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslld     xmm5, 5
59433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm6, xmm4       // generate mask 0x00007c00
59533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslld     xmm6, 10
59633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb   xmm7, xmm7       // generate mask 0xffff8000
59733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslld     xmm7, 15
59833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
59933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
60033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
60133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
60233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm1, xmm0    // B
60333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm2, xmm0    // G
60433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm3, xmm0    // R
60533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrad     xmm0, 16      // A
60633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrld     xmm1, 3       // B
60733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrld     xmm2, 6       // G
60833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrld     xmm3, 9       // R
60933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand      xmm0, xmm7    // A
61033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand      xmm1, xmm4    // B
61133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand      xmm2, xmm5    // G
61233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand      xmm3, xmm6    // R
61333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm0, xmm1    // BA
61433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm2, xmm3    // GR
61533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm0, xmm2    // BGRA
61633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packssdw  xmm0, xmm0
61733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       eax, [eax + 16]
61833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB1555
61933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       edx, [edx + 8]
62033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       ecx, 4
62133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg        convertloop
62233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
62333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
62433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
62533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
62633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
62733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) {
62833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
62933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, [esp + 4]   // src_argb
63033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       edx, [esp + 8]   // dst_rgb
63133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       ecx, [esp + 12]  // pix
63233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb   xmm4, xmm4       // generate mask 0xf000f000
63333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psllw     xmm4, 12
63433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm3, xmm4       // generate mask 0x00f000f0
63533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw     xmm3, 8
63633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
63733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
63833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
63933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm0, [eax]   // fetch 4 pixels of argb
64033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm1, xmm0
64133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand      xmm0, xmm3    // low nibble
64233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand      xmm1, xmm4    // high nibble
64333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrl      xmm0, 4
64433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrl      xmm1, 8
64533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm0, xmm1
64633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb  xmm0, xmm0
64733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       eax, [eax + 16]
64833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq      qword ptr [edx], xmm0  // store 4 pixels of ARGB4444
64933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       edx, [edx + 8]
65033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       ecx, 4
65133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg        convertloop
65233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
65333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
65433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
65533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
65633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Convert 16 ARGB pixels (64 bytes) to 16 Y values.
65733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
6587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
6597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm {
6607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        eax, [esp + 4]   /* src_argb */
6617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        edx, [esp + 8]   /* dst_y */
6627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        ecx, [esp + 12]  /* pix */
66333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kAddY16
66433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, kARGBToY
6657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
66633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
66733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
6687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm0, [eax]
6697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm1, [eax + 16]
6707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm2, [eax + 32]
6717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm3, [eax + 48]
67233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm0, xmm4
67333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm1, xmm4
67433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm2, xmm4
67533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm3, xmm4
6767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        eax, [eax + 64]
6777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm0, xmm1
6787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm2, xmm3
6797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psrlw      xmm0, 7
6807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psrlw      xmm2, 7
6817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    packuswb   xmm0, xmm2
68233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddb      xmm0, xmm5
68333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
6847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     [edx], xmm0
6857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        edx, [edx + 16]
68633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
68733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
68833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
68933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
69033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
69133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
69233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
69333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
69433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]   /* src_argb */
69533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8]   /* dst_y */
69633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 12]  /* pix */
69733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kAddY16
69833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, kARGBToY
69933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
70033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
70133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
70233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm0, [eax]
70333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm1, [eax + 16]
70433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm2, [eax + 32]
70533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm3, [eax + 48]
70633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm0, xmm4
70733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm1, xmm4
70833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm2, xmm4
70933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm3, xmm4
71033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 64]
71133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm0, xmm1
71233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm2, xmm3
71333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm0, 7
71433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm2, 7
71533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm2
71633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddb      xmm0, xmm5
7177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub        ecx, 16
71833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx], xmm0
71933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 16]
72033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
7217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
7227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
7237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
7247cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Converts one row of 32-bit pixels to 8-bit luma, 16 pixels per loop pass,
// using the kBGRAToY multipliers and the kAddY16 bias (both tables are
// defined outside this function).
//   src_argb: source pixels, 4 bytes each (the parameter name is generic; the
//             byte layout is whatever kBGRAToY expects).
//   dst_y:    destination, one byte per pixel.
//   pix:      pixel count.  The loop is bottom-tested and steps by 16, so it
//             runs at least once and reads 64 / writes 16 bytes per pass --
//             presumably callers pass a positive multiple of 16; confirm.
// Every load and store is movdqa, so src_argb and dst_y must both be 16-byte
// aligned.
72533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
7267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
7277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm {
    // Naked function: no compiler prologue, so the arguments are read
    // directly relative to esp and the function ends with an explicit ret.
7287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        eax, [esp + 4]   /* src_argb */
7297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        edx, [esp + 8]   /* dst_y */
7307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        ecx, [esp + 12]  /* pix */
    // xmm5 = bias added to every output byte, xmm4 = per-byte multipliers.
73133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kAddY16
73233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, kBGRAToY
7337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
73433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
73533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
    // Load 16 pixels (64 bytes).
7367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm0, [eax]
7377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm1, [eax + 16]
7387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm2, [eax + 32]
7397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm3, [eax + 48]
    // Multiply each unsigned source byte by its signed multiplier and sum
    // adjacent byte pairs into 16-bit words.
74033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm0, xmm4
74133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm1, xmm4
74233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm2, xmm4
74333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm3, xmm4
7447cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        eax, [eax + 64]
    // Sum adjacent words: one 16-bit weighted sum per pixel (pixels 0-7 in
    // xmm0, pixels 8-15 in xmm2).
7457cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm0, xmm1
7467cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm2, xmm3
    // Scale the sums down by 128 (logical shift).
7477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psrlw      xmm0, 7
7487cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psrlw      xmm2, 7
    // Narrow the 16 sums to bytes with unsigned saturation, then add the bias.
7497cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    packuswb   xmm0, xmm2
75033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddb      xmm0, xmm5
    // The flags set by this sub are consumed by the jg below; the movdqa
    // store and the lea in between do not modify flags.
75133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
7527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     [edx], xmm0
7537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        edx, [edx + 16]
75433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
75533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
75633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
75733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
75833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of 32-bit pixels to 8-bit luma, 16 pixels per loop pass,
// using the kBGRAToY multipliers and the kAddY16 bias (both tables are
// defined outside this function).
//   src_argb: source pixels, 4 bytes each (the parameter name is generic; the
//             byte layout is whatever kBGRAToY expects).
//   dst_y:    destination, one byte per pixel.
//   pix:      pixel count.  The loop is bottom-tested and steps by 16, so it
//             runs at least once and reads 64 / writes 16 bytes per pass --
//             presumably callers pass a positive multiple of 16; confirm.
// Pixel loads and the store use movdqu, so src_argb and dst_y may have any
// alignment.
75933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
76033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
76133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
    // Naked function: no compiler prologue, so the arguments are read
    // directly relative to esp and the function ends with an explicit ret.
76233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]   /* src_argb */
76333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8]   /* dst_y */
76433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 12]  /* pix */
    // xmm5 = bias added to every output byte, xmm4 = per-byte multipliers.
76533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kAddY16
76633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, kBGRAToY
76733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
76833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
76933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
    // Load 16 pixels (64 bytes) with unaligned loads.
77033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm0, [eax]
77133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm1, [eax + 16]
77233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm2, [eax + 32]
77333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm3, [eax + 48]
    // Multiply each unsigned source byte by its signed multiplier and sum
    // adjacent byte pairs into 16-bit words.
77433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm0, xmm4
77533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm1, xmm4
77633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm2, xmm4
77733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm3, xmm4
77833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 64]
    // Sum adjacent words: one 16-bit weighted sum per pixel (pixels 0-7 in
    // xmm0, pixels 8-15 in xmm2).
77933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm0, xmm1
78033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm2, xmm3
    // Scale the sums down by 128 (logical shift).
78133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm0, 7
78233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm2, 7
    // Narrow the 16 sums to bytes with unsigned saturation, then add the bias.
78333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm2
78433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddb      xmm0, xmm5
    // The flags set by this sub are consumed by the jg below; the movdqu
    // store and the lea in between do not modify flags.
7857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub        ecx, 16
78633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx], xmm0
78733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 16]
78833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
7897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
7907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
7917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
7927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Converts one row of 32-bit pixels to 8-bit luma, 16 pixels per loop pass,
// using the kABGRToY multipliers and the kAddY16 bias (both tables are
// defined outside this function).
//   src_argb: source pixels, 4 bytes each (the parameter name is generic; the
//             byte layout is whatever kABGRToY expects).
//   dst_y:    destination, one byte per pixel.
//   pix:      pixel count.  The loop is bottom-tested and steps by 16, so it
//             runs at least once and reads 64 / writes 16 bytes per pass --
//             presumably callers pass a positive multiple of 16; confirm.
// Every load and store is movdqa, so src_argb and dst_y must both be 16-byte
// aligned.
79333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
7947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
7957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm {
    // Naked function: no compiler prologue, so the arguments are read
    // directly relative to esp and the function ends with an explicit ret.
7967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        eax, [esp + 4]   /* src_argb */
7977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        edx, [esp + 8]   /* dst_y */
7987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        ecx, [esp + 12]  /* pix */
    // xmm5 = bias added to every output byte, xmm4 = per-byte multipliers.
79933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kAddY16
80033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, kABGRToY
8017cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
80233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
80333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
    // Load 16 pixels (64 bytes).
8047cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm0, [eax]
8057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm1, [eax + 16]
8067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm2, [eax + 32]
8077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm3, [eax + 48]
    // Multiply each unsigned source byte by its signed multiplier and sum
    // adjacent byte pairs into 16-bit words.
80833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm0, xmm4
80933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm1, xmm4
81033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm2, xmm4
81133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm3, xmm4
8127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        eax, [eax + 64]
    // Sum adjacent words: one 16-bit weighted sum per pixel (pixels 0-7 in
    // xmm0, pixels 8-15 in xmm2).
8137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm0, xmm1
8147cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm2, xmm3
    // Scale the sums down by 128 (logical shift).
8157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psrlw      xmm0, 7
8167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psrlw      xmm2, 7
    // Narrow the 16 sums to bytes with unsigned saturation, then add the bias.
8177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    packuswb   xmm0, xmm2
81833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddb      xmm0, xmm5
    // The flags set by this sub are consumed by the jg below; the movdqa
    // store and the lea in between do not modify flags.
81933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
8207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     [edx], xmm0
8217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        edx, [edx + 16]
82233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
82333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
82433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
82533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
82633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of 32-bit pixels to 8-bit luma, 16 pixels per loop pass,
// using the kABGRToY multipliers and the kAddY16 bias (both tables are
// defined outside this function).
//   src_argb: source pixels, 4 bytes each (the parameter name is generic; the
//             byte layout is whatever kABGRToY expects).
//   dst_y:    destination, one byte per pixel.
//   pix:      pixel count.  The loop is bottom-tested and steps by 16, so it
//             runs at least once and reads 64 / writes 16 bytes per pass --
//             presumably callers pass a positive multiple of 16; confirm.
// Pixel loads and the store use movdqu, so src_argb and dst_y may have any
// alignment.
82733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
82833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
82933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
    // Naked function: no compiler prologue, so the arguments are read
    // directly relative to esp and the function ends with an explicit ret.
83033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]   /* src_argb */
83133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8]   /* dst_y */
83233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 12]  /* pix */
    // xmm5 = bias added to every output byte, xmm4 = per-byte multipliers.
83333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kAddY16
83433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, kABGRToY
83533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
83633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
83733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
    // Load 16 pixels (64 bytes) with unaligned loads.
83833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm0, [eax]
83933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm1, [eax + 16]
84033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm2, [eax + 32]
84133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm3, [eax + 48]
    // Multiply each unsigned source byte by its signed multiplier and sum
    // adjacent byte pairs into 16-bit words.
84233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm0, xmm4
84333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm1, xmm4
84433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm2, xmm4
84533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm3, xmm4
84633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 64]
    // Sum adjacent words: one 16-bit weighted sum per pixel (pixels 0-7 in
    // xmm0, pixels 8-15 in xmm2).
84733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm0, xmm1
84833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm2, xmm3
    // Scale the sums down by 128 (logical shift).
84933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm0, 7
85033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm2, 7
    // Narrow the 16 sums to bytes with unsigned saturation, then add the bias.
85133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm2
85233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddb      xmm0, xmm5
    // The flags set by this sub are consumed by the jg below; the movdqu
    // store and the lea in between do not modify flags.
8537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub        ecx, 16
85433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx], xmm0
85533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 16]
85633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
8577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
8587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
8597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
8607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Converts one row of 32-bit pixels to 8-bit luma, 16 pixels per loop pass,
// using the kRGBAToY multipliers and the kAddY16 bias (both tables are
// defined outside this function).
//   src_argb: source pixels, 4 bytes each (the parameter name is generic; the
//             byte layout is whatever kRGBAToY expects).
//   dst_y:    destination, one byte per pixel.
//   pix:      pixel count.  The loop is bottom-tested and steps by 16, so it
//             runs at least once and reads 64 / writes 16 bytes per pass --
//             presumably callers pass a positive multiple of 16; confirm.
// Every load and store is movdqa, so src_argb and dst_y must both be 16-byte
// aligned.
86133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
86233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
8637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm {
    // Naked function: no compiler prologue, so the arguments are read
    // directly relative to esp and the function ends with an explicit ret.
86433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]   /* src_argb */
86533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8]   /* dst_y */
86633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 12]  /* pix */
    // xmm5 = bias added to every output byte, xmm4 = per-byte multipliers.
86733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kAddY16
86833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, kRGBAToY
8697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
87033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
87133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
    // Load 16 pixels (64 bytes).
8727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm0, [eax]
8737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm1, [eax + 16]
8747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm2, [eax + 32]
8757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm3, [eax + 48]
    // Multiply each unsigned source byte by its signed multiplier and sum
    // adjacent byte pairs into 16-bit words.
87633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm0, xmm4
87733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm1, xmm4
87833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm2, xmm4
87933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm3, xmm4
88033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 64]
    // Sum adjacent words: one 16-bit weighted sum per pixel (pixels 0-7 in
    // xmm0, pixels 8-15 in xmm2).
88133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm0, xmm1
88233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm2, xmm3
    // Scale the sums down by 128 (logical shift).
88333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm0, 7
88433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm2, 7
    // Narrow the 16 sums to bytes with unsigned saturation, then add the bias.
88533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm2
88633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddb      xmm0, xmm5
    // The flags set by this sub are consumed by the jg below; the movdqa
    // store and the lea in between do not modify flags.
88733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
88833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx], xmm0
88933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 16]
89033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
89133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
89233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
89333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
89433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of 32-bit pixels to 8-bit luma, 16 pixels per loop pass,
// using the kRGBAToY multipliers and the kAddY16 bias (both tables are
// defined outside this function).
//   src_argb: source pixels, 4 bytes each (the parameter name is generic; the
//             byte layout is whatever kRGBAToY expects).
//   dst_y:    destination, one byte per pixel.
//   pix:      pixel count.  The loop is bottom-tested and steps by 16, so it
//             runs at least once and reads 64 / writes 16 bytes per pass --
//             presumably callers pass a positive multiple of 16; confirm.
// Pixel loads and the store use movdqu, so src_argb and dst_y may have any
// alignment.
89533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
89633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
89733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
    // Naked function: no compiler prologue, so the arguments are read
    // directly relative to esp and the function ends with an explicit ret.
89833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]   /* src_argb */
89933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8]   /* dst_y */
90033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 12]  /* pix */
    // xmm5 = bias added to every output byte, xmm4 = per-byte multipliers.
90133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kAddY16
90233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, kRGBAToY
90333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
90433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
90533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
    // Load 16 pixels (64 bytes) with unaligned loads.
90633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm0, [eax]
90733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm1, [eax + 16]
90833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm2, [eax + 32]
90933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm3, [eax + 48]
    // Multiply each unsigned source byte by its signed multiplier and sum
    // adjacent byte pairs into 16-bit words.
91033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm0, xmm4
91133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm1, xmm4
91233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm2, xmm4
91333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm3, xmm4
91433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 64]
    // Sum adjacent words: one 16-bit weighted sum per pixel (pixels 0-7 in
    // xmm0, pixels 8-15 in xmm2).
91533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm0, xmm1
91633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm2, xmm3
    // Scale the sums down by 128 (logical shift).
91733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm0, 7
91833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm2, 7
    // Narrow the 16 sums to bytes with unsigned saturation, then add the bias.
91933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm2
92033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddb      xmm0, xmm5
    // The flags set by this sub are consumed by the jg below; the movdqu
    // store and the lea in between do not modify flags.
92133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
92233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx], xmm0
92333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 16]
92433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
92533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
92633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
92733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
92833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Produces 8 U bytes and 8 V bytes per loop pass from a 16x2 block of 32-bit
// pixels: each 2x2 group is averaged to one pixel, then weighted with
// kARGBToU / kARGBToV and offset by kAddUV128 (all three tables are defined
// outside this function).
//   src_argb0:       first source row.
//   src_stride_argb: byte offset from src_argb0 to the second source row.
//   dst_u, dst_v:    destinations; 8 bytes are written to each per pass.
//   width:           source pixel count.  The loop is bottom-tested and steps
//                    by 16, so it runs at least once -- presumably callers
//                    pass a positive multiple of 16; confirm.
// Aligned variant: the movdqa loads and the pavgb memory operands require
// both source rows to be 16-byte aligned.  The 8-byte movlps/movhps stores
// carry no alignment requirement.
92933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
93033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
93133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                       uint8* dst_u, uint8* dst_v, int width) {
93233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
    // Naked function: esi/edi are saved by hand, and the "+ 8" in the
    // argument offsets below accounts for those two pushes.
93333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
93433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
93533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]   // src_argb0
93633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]   // src_stride_argb
93733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 12]  // dst_u
93833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 16]  // dst_v
93933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]  // width
    // xmm7 = U multipliers, xmm6 = V multipliers, xmm5 = bias added to the
    // packed result.
94033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm7, kARGBToU
94133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm6, kARGBToV
94233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kAddUV128
    // edi becomes (dst_v - dst_u) so [edx + edi] addresses V while only edx
    // has to be advanced.
94333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, edx             // stride from u to v
94433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
94533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
94633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
94733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    // Load 16 pixels of the first row, then average byte-wise with the row
    // src_stride_argb bytes further on (vertical average).
94833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [eax]
94933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax + 16]
95033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm2, [eax + 32]
95133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm3, [eax + 48]
95233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm0, [eax + esi]
95333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm1, [eax + esi + 16]
95433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm2, [eax + esi + 32]
95533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm3, [eax + esi + 48]
95633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 64]
    // shufps 0x88 gathers dwords 0 and 2 of each operand (even pixels),
    // 0xdd gathers dwords 1 and 3 (odd pixels); pavgb of the two averages
    // each horizontal pair, leaving 4 pixels in xmm0 and 4 in xmm2.
95733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, xmm0
95833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm0, xmm1, 0x88
95933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm4, xmm1, 0xdd
96033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm0, xmm4
96133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, xmm2
9627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm2, xmm3, 0x88
9637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm4, xmm3, 0xdd
9647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm2, xmm4
9657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
9667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // step 2 - convert to U and V
9677cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // from here down is very similar to Y code except
9687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    // Duplicate the 8 subsampled pixels so one copy is weighted for U and
    // the other for V.
9697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm1, xmm0
9707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm3, xmm2
9717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm0, xmm7  // U
9727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm2, xmm7
9737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm1, xmm6  // V
9747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm3, xmm6
    // After phaddw: xmm0 = 8 signed 16-bit U sums, xmm1 = 8 V sums.
9757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm0, xmm2
9767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm1, xmm3
    // Arithmetic shift keeps the sign; packsswb then narrows with signed
    // saturation into U (low 8 bytes) and V (high 8 bytes).
9777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw      xmm0, 8
9787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw      xmm1, 8
9797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    packsswb   xmm0, xmm1
9807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    paddb      xmm0, xmm5            // -> unsigned
9817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
9827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // step 3 - store 8 U and 8 V values
    // The flags set by this sub are consumed by the jg below; the stores and
    // the lea in between do not modify flags.
98333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
9847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movlps     qword ptr [edx], xmm0 // U
9857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movhps     qword ptr [edx + edi], xmm0 // V
9867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        edx, [edx + 8]
98733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
98833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
    // Restore the saved registers in reverse push order.
98933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
99033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
99133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
99233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
99333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
99433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Produces 8 U bytes and 8 V bytes per loop pass from a 16x2 block of 32-bit
// pixels: each 2x2 group is averaged to one pixel, then weighted with
// kARGBToU / kARGBToV and offset by kAddUV128 (all three tables are defined
// outside this function).
//   src_argb0:       first source row.
//   src_stride_argb: byte offset from src_argb0 to the second source row.
//   dst_u, dst_v:    destinations; 8 bytes are written to each per pass.
//   width:           source pixel count.  The loop is bottom-tested and steps
//                    by 16, so it runs at least once -- presumably callers
//                    pass a positive multiple of 16; confirm.
// Unaligned variant: both source rows are read with movdqu (the second row
// goes through xmm4 instead of being a pavgb memory operand), so the source
// may have any alignment.  The 8-byte movlps/movhps stores carry no alignment
// requirement either.
99533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
99633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
99733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                 uint8* dst_u, uint8* dst_v, int width) {
99833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
    // Naked function: esi/edi are saved by hand, and the "+ 8" in the
    // argument offsets below accounts for those two pushes.
99933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
100033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
100133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]   // src_argb0
100233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]   // src_stride_argb
100333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 12]  // dst_u
100433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 16]  // dst_v
100533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]  // width
    // xmm7 = U multipliers, xmm6 = V multipliers, xmm5 = bias added to the
    // packed result.
100633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm7, kARGBToU
100733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm6, kARGBToV
100833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kAddUV128
    // edi becomes (dst_v - dst_u) so [edx + edi] addresses V while only edx
    // has to be advanced.
100933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, edx             // stride from u to v
101033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
101133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
101233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
101333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    /* step 1 - subsample 16x2 argb pixels to 8x1 */
    // Load 16 pixels of the first row, then average byte-wise with the row
    // src_stride_argb bytes further on (vertical average).  xmm4 is a
    // scratch register for the unaligned second-row loads.
101433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm0, [eax]
101533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm1, [eax + 16]
101633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm2, [eax + 32]
101733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm3, [eax + 48]
101833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm4, [eax + esi]
101933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm0, xmm4
102033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm4, [eax + esi + 16]
102133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm1, xmm4
102233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm4, [eax + esi + 32]
102333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm2, xmm4
102433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm4, [eax + esi + 48]
102533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm3, xmm4
102633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 64]
    // shufps 0x88 gathers dwords 0 and 2 of each operand (even pixels),
    // 0xdd gathers dwords 1 and 3 (odd pixels); pavgb of the two averages
    // each horizontal pair, leaving 4 pixels in xmm0 and 4 in xmm2.
102733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, xmm0
102833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm0, xmm1, 0x88
102933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm4, xmm1, 0xdd
103033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm0, xmm4
103133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, xmm2
103233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm2, xmm3, 0x88
103333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm4, xmm3, 0xdd
103433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm2, xmm4
103533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
103633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // step 2 - convert to U and V
103733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // from here down is very similar to Y code except
103833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
    // Duplicate the 8 subsampled pixels so one copy is weighted for U and
    // the other for V.
103933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
104033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm3, xmm2
104133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm0, xmm7  // U
104233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm2, xmm7
104333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm1, xmm6  // V
104433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm3, xmm6
    // After phaddw: xmm0 = 8 signed 16-bit U sums, xmm1 = 8 V sums.
104533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm0, xmm2
104633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm1, xmm3
    // Arithmetic shift keeps the sign; packsswb then narrows with signed
    // saturation into U (low 8 bytes) and V (high 8 bytes).
104733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psraw      xmm0, 8
104833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psraw      xmm1, 8
104933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packsswb   xmm0, xmm1
105033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddb      xmm0, xmm5            // -> unsigned
105133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
105233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // step 3 - store 8 U and 8 V values
    // The flags set by this sub are consumed by the jg below; the stores and
    // the lea in between do not modify flags.
10537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub        ecx, 16
105433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movlps     qword ptr [edx], xmm0 // U
105533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movhps     qword ptr [edx + edi], xmm0 // V
105633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 8]
105733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
105833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
    // Restore the saved registers in reverse push order.
10597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pop        edi
10607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pop        esi
10617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
10627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
10637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
10647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
106533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
10667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
10677cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                       uint8* dst_u, uint8* dst_v, int width) {
10687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm {
10697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    push       esi
10707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    push       edi
10717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        eax, [esp + 8 + 4]   // src_argb
10727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        esi, [esp + 8 + 8]   // src_stride_argb
10737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        edx, [esp + 8 + 12]  // dst_u
10747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        edi, [esp + 8 + 16]  // dst_v
10757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        ecx, [esp + 8 + 20]  // pix
107633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm7, kBGRAToU
107733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm6, kBGRAToV
107833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kAddUV128
10797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub        edi, edx             // stride from u to v
10807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
108133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
108233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
10837cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    /* step 1 - subsample 16x2 argb pixels to 8x1 */
10847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm0, [eax]
10857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm1, [eax + 16]
10867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm2, [eax + 32]
10877cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm3, [eax + 48]
10887cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm0, [eax + esi]
10897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm1, [eax + esi + 16]
10907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm2, [eax + esi + 32]
10917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm3, [eax + esi + 48]
10927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        eax,  [eax + 64]
10937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm4, xmm0
10947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm0, xmm1, 0x88
10957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm4, xmm1, 0xdd
10967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm0, xmm4
10977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm4, xmm2
10987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm2, xmm3, 0x88
10997cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm4, xmm3, 0xdd
11007cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm2, xmm4
11017cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
11027cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // step 2 - convert to U and V
11037cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // from here down is very similar to Y code except
11047cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // instead of 16 different pixels, its 8 pixels of U and 8 of V
11057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm1, xmm0
11067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm3, xmm2
11077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm0, xmm7  // U
11087cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm2, xmm7
11097cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm1, xmm6  // V
11107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm3, xmm6
11117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm0, xmm2
11127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm1, xmm3
11137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw      xmm0, 8
11147cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw      xmm1, 8
11157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    packsswb   xmm0, xmm1
11167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    paddb      xmm0, xmm5            // -> unsigned
11177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
11187cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // step 3 - store 8 U and 8 V values
111933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
11207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movlps     qword ptr [edx], xmm0 // U
11217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movhps     qword ptr [edx + edi], xmm0 // V
11227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        edx, [edx + 8]
112333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
112433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
112533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
112633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
112733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
112833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
112933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
113033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts two adjacent rows of 4-byte source pixels into one row of U and one
// row of V, subsampled 2x2, using the kBGRAToU / kBGRAToV coefficient vectors
// (defined elsewhere in this file). Unaligned variant: the source rows are
// loaded with movdqu, so they need no particular alignment.
//
//   src_argb0        First source row; the second row is read from
//                    src_argb0 + src_stride_argb.
//   src_stride_argb  Byte offset from the first source row to the second.
//   dst_u, dst_v     Output rows; each loop pass stores 8 bytes to each.
//   width            Pixel count; each loop pass consumes 16 pixels.
//
// The loop body runs before the count is tested and always handles a full
// 16 pixels, so width is presumably a positive multiple of 16 - TODO confirm
// against the callers. Naked function: arguments are read directly off the
// stack and esi/edi are saved and restored by hand.
113133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
113233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
113333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                 uint8* dst_u, uint8* dst_v, int width) {
113433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
113533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
113633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
    // In the operands below the leading 8 accounts for the two pushes above;
    // the second term is the argument's stack offset at function entry.
113733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]   // src_argb0
113833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]   // src_stride_argb
113933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 12]  // dst_u
114033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 16]  // dst_v
114133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]  // width (pixels remaining)
    // xmm7 = U coefficients, xmm6 = V coefficients, xmm5 = byte bias added
    // after packing (see the paddb at the end of step 2).
114233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm7, kBGRAToU
114333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm6, kBGRAToV
114433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kAddUV128
    // edi becomes dst_v - dst_u, so [edx + edi] addresses V while only edx
    // has to be advanced.
114533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, edx             // stride from u to v
114633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
114733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
114833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
114933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    /* step 1 - subsample 16x2 source pixels to 8x1 */
    // Load 64 bytes (16 pixels) of the first row.
115033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm0, [eax]
115133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm1, [eax + 16]
115233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm2, [eax + 32]
115333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm3, [eax + 48]
    // Average each byte with the matching byte of the second row. The second
    // row goes through xmm4 so that an unaligned load can be used.
115433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm4, [eax + esi]
115533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm0, xmm4
115633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm4, [eax + esi + 16]
115733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm1, xmm4
115833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm4, [eax + esi + 32]
115933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm2, xmm4
116033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm4, [eax + esi + 48]
116133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm3, xmm4
116233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 64]
    // shufps 0x88 gathers dwords 0 and 2 of each operand (even pixels) and
    // 0xdd gathers dwords 1 and 3 (odd pixels); averaging the two results
    // merges each horizontal pixel pair. xmm0 and xmm2 end up with 4 pixels
    // each.
116333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, xmm0
116433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm0, xmm1, 0x88
116533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm4, xmm1, 0xdd
116633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm0, xmm4
116733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, xmm2
116833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm2, xmm3, 0x88
116933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm4, xmm3, 0xdd
117033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm2, xmm4
117133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
117233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // step 2 - convert to U and V
117333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // from here down is very similar to Y code except
117433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // instead of 16 different pixels, it's 8 values of U and 8 of V
    // Copies of the averaged pixels are kept so that the same data can be
    // multiplied by both coefficient vectors.
117533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
117633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm3, xmm2
    // pmaddubsw: unsigned pixel bytes times signed coefficient bytes, adjacent
    // products summed into 16-bit words (two words per pixel).
117733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm0, xmm7  // U
117833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm2, xmm7
117933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm1, xmm6  // V
118033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm3, xmm6
    // phaddw adds the two words of each pixel, leaving 8 sums per register;
    // psraw then drops the low 8 bits of each sum.
118133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm0, xmm2
118233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm1, xmm3
118333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psraw      xmm0, 8
118433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psraw      xmm1, 8
    // Pack with signed saturation: low 8 bytes = U, high 8 bytes = V.
118533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packsswb   xmm0, xmm1
118633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddb      xmm0, xmm5            // -> unsigned
118733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
118833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // step 3 - store 8 U and 8 V values
    // The sub sets the flags tested by the jg below; movlps, movhps and lea
    // do not modify flags, so the stores can sit between them.
11897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub        ecx, 16
119033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movlps     qword ptr [edx], xmm0 // U
119133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movhps     qword ptr [edx + edi], xmm0 // V
119233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 8]
119333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
119433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
    // Restore the registers saved at entry, in reverse order.
11957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pop        edi
11967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pop        esi
11977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
11987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
11997cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
12007cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Converts two adjacent rows of 4-byte source pixels into one row of U and one
// row of V, subsampled 2x2, using the kABGRToU / kABGRToV coefficient vectors
// (defined elsewhere in this file). Aligned variant: the source rows are read
// with movdqa and with pavgb memory operands, both of which require
// 16-byte-aligned addresses, so src_argb0 and src_argb0 + src_stride_argb
// must be 16-byte aligned.
//
//   src_argb0        First source row; the second row is read from
//                    src_argb0 + src_stride_argb.
//   src_stride_argb  Byte offset from the first source row to the second.
//   dst_u, dst_v     Output rows; each loop pass stores 8 bytes to each.
//   width            Pixel count; each loop pass consumes 16 pixels.
//
// The loop body runs before the count is tested and always handles a full
// 16 pixels, so width is presumably a positive multiple of 16 - TODO confirm
// against the callers. Naked function: arguments are read directly off the
// stack and esi/edi are saved and restored by hand.
120133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
12027cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
12037cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                       uint8* dst_u, uint8* dst_v, int width) {
12047cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm {
12057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    push       esi
12067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    push       edi
    // In the operands below the leading 8 accounts for the two pushes above;
    // the second term is the argument's stack offset at function entry.
12077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        eax, [esp + 8 + 4]   // src_argb0
12087cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        esi, [esp + 8 + 8]   // src_stride_argb
12097cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        edx, [esp + 8 + 12]  // dst_u
12107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        edi, [esp + 8 + 16]  // dst_v
12117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        ecx, [esp + 8 + 20]  // width (pixels remaining)
    // xmm7 = U coefficients, xmm6 = V coefficients, xmm5 = byte bias added
    // after packing (see the paddb at the end of step 2).
121233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm7, kABGRToU
121333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm6, kABGRToV
121433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kAddUV128
    // edi becomes dst_v - dst_u, so [edx + edi] addresses V while only edx
    // has to be advanced.
12157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub        edi, edx             // stride from u to v
12167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
121733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
121833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
12197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    /* step 1 - subsample 16x2 source pixels to 8x1 */
    // Load 64 bytes (16 pixels) of the first row.
12207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm0, [eax]
12217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm1, [eax + 16]
12227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm2, [eax + 32]
12237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm3, [eax + 48]
    // Average each byte with the matching byte of the second row, read
    // directly from memory (this is what requires the aligned second row).
12247cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm0, [eax + esi]
12257cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm1, [eax + esi + 16]
12267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm2, [eax + esi + 32]
12277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm3, [eax + esi + 48]
12287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        eax,  [eax + 64]
    // shufps 0x88 gathers dwords 0 and 2 of each operand (even pixels) and
    // 0xdd gathers dwords 1 and 3 (odd pixels); averaging the two results
    // merges each horizontal pixel pair. xmm0 and xmm2 end up with 4 pixels
    // each.
12297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm4, xmm0
12307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm0, xmm1, 0x88
12317cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm4, xmm1, 0xdd
12327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm0, xmm4
12337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm4, xmm2
12347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm2, xmm3, 0x88
12357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm4, xmm3, 0xdd
12367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm2, xmm4
12377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
12387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // step 2 - convert to U and V
12397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // from here down is very similar to Y code except
12407cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // instead of 16 different pixels, it's 8 values of U and 8 of V
    // Copies of the averaged pixels are kept so that the same data can be
    // multiplied by both coefficient vectors.
12417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm1, xmm0
12427cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm3, xmm2
    // pmaddubsw: unsigned pixel bytes times signed coefficient bytes, adjacent
    // products summed into 16-bit words (two words per pixel).
12437cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm0, xmm7  // U
12447cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm2, xmm7
12457cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm1, xmm6  // V
12467cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm3, xmm6
    // phaddw adds the two words of each pixel, leaving 8 sums per register;
    // psraw then drops the low 8 bits of each sum.
12477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm0, xmm2
12487cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm1, xmm3
12497cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw      xmm0, 8
12507cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw      xmm1, 8
    // Pack with signed saturation: low 8 bytes = U, high 8 bytes = V.
12517cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    packsswb   xmm0, xmm1
12527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    paddb      xmm0, xmm5            // -> unsigned
12537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
12547cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // step 3 - store 8 U and 8 V values
    // The sub sets the flags tested by the jg below; movlps, movhps and lea
    // do not modify flags, so the stores can sit between them.
125533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
12567cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movlps     qword ptr [edx], xmm0 // U
12577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movhps     qword ptr [edx + edi], xmm0 // V
12587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        edx, [edx + 8]
125933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
126033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
    // Restore the registers saved at entry, in reverse order.
12617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pop        edi
12627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pop        esi
12637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
12647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
12657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
12667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Converts two adjacent rows of 4-byte source pixels into one row of U and one
// row of V, subsampled 2x2, using the kABGRToU / kABGRToV coefficient vectors
// (defined elsewhere in this file). Unaligned variant: the source rows are
// loaded with movdqu, so they need no particular alignment.
//
//   src_argb0        First source row; the second row is read from
//                    src_argb0 + src_stride_argb.
//   src_stride_argb  Byte offset from the first source row to the second.
//   dst_u, dst_v     Output rows; each loop pass stores 8 bytes to each.
//   width            Pixel count; each loop pass consumes 16 pixels.
//
// The loop body runs before the count is tested and always handles a full
// 16 pixels, so width is presumably a positive multiple of 16 - TODO confirm
// against the callers. Naked function: arguments are read directly off the
// stack and esi/edi are saved and restored by hand.
126733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
126833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
126933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                 uint8* dst_u, uint8* dst_v, int width) {
12707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm {
127133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
127233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
    // In the operands below the leading 8 accounts for the two pushes above;
    // the second term is the argument's stack offset at function entry.
127333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]   // src_argb0
127433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]   // src_stride_argb
127533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 12]  // dst_u
127633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 16]  // dst_v
127733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]  // width (pixels remaining)
    // xmm7 = U coefficients, xmm6 = V coefficients, xmm5 = byte bias added
    // after packing (see the paddb at the end of step 2).
127833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm7, kABGRToU
127933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm6, kABGRToV
128033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kAddUV128
    // edi becomes dst_v - dst_u, so [edx + edi] addresses V while only edx
    // has to be advanced.
128133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, edx             // stride from u to v
12827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
128333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
128433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
128533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    /* step 1 - subsample 16x2 source pixels to 8x1 */
    // Load 64 bytes (16 pixels) of the first row.
128633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm0, [eax]
128733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm1, [eax + 16]
128833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm2, [eax + 32]
128933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm3, [eax + 48]
    // Average each byte with the matching byte of the second row. The second
    // row goes through xmm4 so that an unaligned load can be used.
129033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm4, [eax + esi]
129133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm0, xmm4
129233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm4, [eax + esi + 16]
129333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm1, xmm4
129433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm4, [eax + esi + 32]
129533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm2, xmm4
129633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm4, [eax + esi + 48]
129733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm3, xmm4
129833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 64]
    // shufps 0x88 gathers dwords 0 and 2 of each operand (even pixels) and
    // 0xdd gathers dwords 1 and 3 (odd pixels); averaging the two results
    // merges each horizontal pixel pair. xmm0 and xmm2 end up with 4 pixels
    // each.
129933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, xmm0
130033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm0, xmm1, 0x88
130133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm4, xmm1, 0xdd
130233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm0, xmm4
130333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, xmm2
130433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm2, xmm3, 0x88
130533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm4, xmm3, 0xdd
130633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm2, xmm4
13077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
130833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // step 2 - convert to U and V
130933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // from here down is very similar to Y code except
131033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // instead of 16 different pixels, it's 8 values of U and 8 of V
    // Copies of the averaged pixels are kept so that the same data can be
    // multiplied by both coefficient vectors.
131133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
131233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm3, xmm2
    // pmaddubsw: unsigned pixel bytes times signed coefficient bytes, adjacent
    // products summed into 16-bit words (two words per pixel).
131333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm0, xmm7  // U
131433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm2, xmm7
131533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm1, xmm6  // V
131633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm3, xmm6
    // phaddw adds the two words of each pixel, leaving 8 sums per register;
    // psraw then drops the low 8 bits of each sum.
131733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm0, xmm2
131833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm1, xmm3
131933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psraw      xmm0, 8
132033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psraw      xmm1, 8
    // Pack with signed saturation: low 8 bytes = U, high 8 bytes = V.
132133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packsswb   xmm0, xmm1
132233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddb      xmm0, xmm5            // -> unsigned
13237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
132433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // step 3 - store 8 U and 8 V values
    // The sub sets the flags tested by the jg below; movlps, movhps and lea
    // do not modify flags, so the stores can sit between them.
132533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
132633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movlps     qword ptr [edx], xmm0 // U
132733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movhps     qword ptr [edx + edi], xmm0 // V
132833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 8]
132933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
133033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
    // Restore the registers saved at entry, in reverse order.
133133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
133233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
13337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
13347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
13357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
13367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Converts two adjacent rows of 4-byte source pixels into one row of U and one
// row of V, subsampled 2x2, using the kRGBAToU / kRGBAToV coefficient vectors
// (defined elsewhere in this file). Aligned variant: the source rows are read
// with movdqa and with pavgb memory operands, both of which require
// 16-byte-aligned addresses, so src_argb0 and src_argb0 + src_stride_argb
// must be 16-byte aligned.
//
//   src_argb0        First source row; the second row is read from
//                    src_argb0 + src_stride_argb.
//   src_stride_argb  Byte offset from the first source row to the second.
//   dst_u, dst_v     Output rows; each loop pass stores 8 bytes to each.
//   width            Pixel count; each loop pass consumes 16 pixels.
//
// The loop body runs before the count is tested and always handles a full
// 16 pixels, so width is presumably a positive multiple of 16 - TODO confirm
// against the callers. Naked function: arguments are read directly off the
// stack and esi/edi are saved and restored by hand.
133733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
133833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
133933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                       uint8* dst_u, uint8* dst_v, int width) {
134033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
134133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
134233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
    // In the operands below the leading 8 accounts for the two pushes above;
    // the second term is the argument's stack offset at function entry.
134333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]   // src_argb0
134433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]   // src_stride_argb
134533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 12]  // dst_u
134633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 16]  // dst_v
134733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]  // width (pixels remaining)
    // xmm7 = U coefficients, xmm6 = V coefficients, xmm5 = byte bias added
    // after packing (see the paddb at the end of step 2).
134833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm7, kRGBAToU
134933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm6, kRGBAToV
135033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kAddUV128
    // edi becomes dst_v - dst_u, so [edx + edi] addresses V while only edx
    // has to be advanced.
135133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, edx             // stride from u to v
135233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
135333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
135433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
135533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    /* step 1 - subsample 16x2 source pixels to 8x1 */
    // Load 64 bytes (16 pixels) of the first row.
135633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [eax]
135733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax + 16]
135833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm2, [eax + 32]
135933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm3, [eax + 48]
    // Average each byte with the matching byte of the second row, read
    // directly from memory (this is what requires the aligned second row).
136033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm0, [eax + esi]
136133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm1, [eax + esi + 16]
136233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm2, [eax + esi + 32]
136333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm3, [eax + esi + 48]
136433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 64]
    // shufps 0x88 gathers dwords 0 and 2 of each operand (even pixels) and
    // 0xdd gathers dwords 1 and 3 (odd pixels); averaging the two results
    // merges each horizontal pixel pair. xmm0 and xmm2 end up with 4 pixels
    // each.
136533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, xmm0
136633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm0, xmm1, 0x88
136733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm4, xmm1, 0xdd
136833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm0, xmm4
136933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, xmm2
137033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm2, xmm3, 0x88
137133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm4, xmm3, 0xdd
137233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm2, xmm4
137333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
137433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // step 2 - convert to U and V
137533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // from here down is very similar to Y code except
137633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // instead of 16 different pixels, it's 8 values of U and 8 of V
    // Copies of the averaged pixels are kept so that the same data can be
    // multiplied by both coefficient vectors.
137733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
137833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm3, xmm2
    // pmaddubsw: unsigned pixel bytes times signed coefficient bytes, adjacent
    // products summed into 16-bit words (two words per pixel).
137933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm0, xmm7  // U
138033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm2, xmm7
138133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm1, xmm6  // V
138233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm3, xmm6
    // phaddw adds the two words of each pixel, leaving 8 sums per register;
    // psraw then drops the low 8 bits of each sum.
138333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm0, xmm2
138433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm1, xmm3
138533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psraw      xmm0, 8
138633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psraw      xmm1, 8
    // Pack with signed saturation: low 8 bytes = U, high 8 bytes = V.
138733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packsswb   xmm0, xmm1
138833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddb      xmm0, xmm5            // -> unsigned
138933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
139033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // step 3 - store 8 U and 8 V values
    // The sub sets the flags tested by the jg below; movlps, movhps and lea
    // do not modify flags, so the stores can sit between them.
139133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
139233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movlps     qword ptr [edx], xmm0 // U
139333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movhps     qword ptr [edx + edi], xmm0 // V
139433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 8]
139533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
139633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
    // Restore the registers saved at entry, in reverse order.
139733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
139833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
139933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
140033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
140133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
140233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts two adjacent rows of 4-byte source pixels into one row of U and one
// row of V, subsampled 2x2, using the kRGBAToU / kRGBAToV coefficient vectors
// (defined elsewhere in this file). Unaligned variant: the source rows are
// loaded with movdqu, so they need no particular alignment.
//
//   src_argb0        First source row; the second row is read from
//                    src_argb0 + src_stride_argb.
//   src_stride_argb  Byte offset from the first source row to the second.
//   dst_u, dst_v     Output rows; each loop pass stores 8 bytes to each.
//   width            Pixel count; each loop pass consumes 16 pixels.
//
// The loop body runs before the count is tested and always handles a full
// 16 pixels, so width is presumably a positive multiple of 16 - TODO confirm
// against the callers. Naked function: arguments are read directly off the
// stack and esi/edi are saved and restored by hand.
140333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
140433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
140533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                 uint8* dst_u, uint8* dst_v, int width) {
140633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
140733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
140833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
    // In the operands below the leading 8 accounts for the two pushes above;
    // the second term is the argument's stack offset at function entry.
140933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]   // src_argb0
141033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]   // src_stride_argb
141133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 12]  // dst_u
141233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 16]  // dst_v
141333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]  // width (pixels remaining)
    // xmm7 = U coefficients, xmm6 = V coefficients, xmm5 = byte bias added
    // after packing (see the paddb at the end of step 2).
141433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm7, kRGBAToU
141533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm6, kRGBAToV
141633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kAddUV128
    // edi becomes dst_v - dst_u, so [edx + edi] addresses V while only edx
    // has to be advanced.
141733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, edx             // stride from u to v
141833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
141933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
142033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
142133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    /* step 1 - subsample 16x2 source pixels to 8x1 */
    // Load 64 bytes (16 pixels) of the first row.
142233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm0, [eax]
142333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm1, [eax + 16]
142433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm2, [eax + 32]
142533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm3, [eax + 48]
    // Average each byte with the matching byte of the second row. The second
    // row goes through xmm4 so that an unaligned load can be used.
142633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm4, [eax + esi]
142733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm0, xmm4
142833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm4, [eax + esi + 16]
142933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm1, xmm4
143033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm4, [eax + esi + 32]
143133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm2, xmm4
143233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm4, [eax + esi + 48]
143333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm3, xmm4
143433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 64]
    // shufps 0x88 gathers dwords 0 and 2 of each operand (even pixels) and
    // 0xdd gathers dwords 1 and 3 (odd pixels); averaging the two results
    // merges each horizontal pixel pair. xmm0 and xmm2 end up with 4 pixels
    // each.
143533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, xmm0
143633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm0, xmm1, 0x88
143733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm4, xmm1, 0xdd
143833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm0, xmm4
143933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, xmm2
144033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm2, xmm3, 0x88
144133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shufps     xmm4, xmm3, 0xdd
144233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm2, xmm4
144333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
144433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // step 2 - convert to U and V
144533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // from here down is very similar to Y code except
144633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // instead of 16 different pixels, it's 8 values of U and 8 of V
    // Copies of the averaged pixels are kept so that the same data can be
    // multiplied by both coefficient vectors.
144733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
144833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm3, xmm2
    // pmaddubsw: unsigned pixel bytes times signed coefficient bytes, adjacent
    // products summed into 16-bit words (two words per pixel).
144933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm0, xmm7  // U
145033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm2, xmm7
145133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm1, xmm6  // V
145233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm3, xmm6
    // phaddw adds the two words of each pixel, leaving 8 sums per register;
    // psraw then drops the low 8 bits of each sum.
145333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm0, xmm2
145433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm1, xmm3
145533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psraw      xmm0, 8
145633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psraw      xmm1, 8
    // Pack with signed saturation: low 8 bytes = U, high 8 bytes = V.
145733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packsswb   xmm0, xmm1
145833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddb      xmm0, xmm5            // -> unsigned
145933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
146033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // step 3 - store 8 U and 8 V values
    // The sub sets the flags tested by the jg below; movlps, movhps and lea
    // do not modify flags, so the stores can sit between them.
146133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
146233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movlps     qword ptr [edx], xmm0 // U
146333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movhps     qword ptr [edx + edi], xmm0 // V
146433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 8]
146533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
146633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
    // Restore the registers saved at entry, in reverse order.
146733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
146833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
146933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
147033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
147133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
147233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBTOYROW_SSSE3
147333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
147433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_I422TOARGBROW_SSSE3
147533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Fixed-point YUV -> RGB coefficients, scaled by 64 (6 fractional bits) so
// that they fit in a signed 8-bit lane for pmaddubsw.
#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */

// 2.018 * 64 = 129 does not fit in int8, so the blue U gain is clamped to 127.
#define UB 127 /* min(127, static_cast<int>(2.018 * 64)) */
#define UG (-25) /* static_cast<int8>(-0.391 * 64 - 0.5) */
#define UR 0

#define VB 0
#define VG (-52) /* static_cast<int8>(-0.813 * 64 - 0.5) */
#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */

// Bias: U and V are multiplied as unsigned bytes, so the contribution of the
// implicit 128 offset is subtracted afterwards.  Parenthesized so the macros
// expand safely inside larger expressions.
#define BB (UB * 128 + VB * 128)
#define BG (UG * 128 + VG * 128)
#define BR (UR * 128 + VR * 128)
149033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
149133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kUVToB = {
149233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
149333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
149433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
149533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kUVToR = {
149633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
149733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
149833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
149933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kUVToG = {
150033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
150133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
150233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
150333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kVUToB = {
150433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
150533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
150633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
150733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kVUToR = {
150833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
150933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
151033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
151133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kVUToG = {
151233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
151333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
151433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
151533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
151633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
151733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
151833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
151933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
152033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
152133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// TODO(fbarchard): NV12/NV21 fetch UV and use directly.
152233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
152333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Read 8 UV from 444.
// In:  esi = U pointer, edi = (V pointer - U pointer).
// Out: xmm0 = 8 interleaved UV byte pairs; esi advanced by 8.
// Clobbers xmm1.
#define READYUV444 __asm {                                                     \
    __asm movq       xmm0, qword ptr [esi] /* U */                /* NOLINT */ \
    __asm movq       xmm1, qword ptr [esi + edi] /* V */          /* NOLINT */ \
    __asm lea        esi,  [esi + 8]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
  }
153133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Read 4 UV from 422, upsample to 8 UV.
// In:  esi = U pointer, edi = (V pointer - U pointer).
// Out: xmm0 = 8 interleaved UV byte pairs (each source pair duplicated once);
//      esi advanced by 4.
// Clobbers xmm1.
#define READYUV422 __asm {                                                     \
    __asm movd       xmm0, [esi]          /* U */                              \
    __asm movd       xmm1, [esi + edi]    /* V */                              \
    __asm lea        esi,  [esi + 4]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
  }
154033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Read 2 UV from 411, upsample to 8 UV.
// In:  esi = U pointer, edi = (V pointer - U pointer).
// Out: xmm0 = 8 interleaved UV byte pairs (each source pair repeated 4 times);
//      esi advanced by 2.
// Clobbers xmm1.
// Only 2 bytes per plane are consumed per iteration, so load exactly one
// 16-bit word with pinsrw; a 32-bit movd would read 2 bytes beyond the end of
// the U and V rows on the final iteration.  The bytes pinsrw leaves untouched
// in xmm0/xmm1 are discarded by the unpacks below, which only propagate the
// low 4 bytes of the interleaved result.
#define READYUV411 __asm {                                                     \
    __asm pinsrw     xmm0, [esi], 0        /* U */                             \
    __asm pinsrw     xmm1, [esi + edi], 0  /* V */                             \
    __asm lea        esi,  [esi + 2]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
    __asm punpckldq  xmm0, xmm0           /* UVUV (upsample) */                \
  }
155033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Read 4 UV from NV12, upsample to 8 UV.
// In:  esi = pointer to interleaved chroma byte pairs.
// Out: xmm0 = 8 byte pairs (each source pair duplicated once);
//      esi advanced by 8.
// The pair order is passed through unchanged, so the same read serves VU
// ordered input when followed by YVUTORGB.
#define READNV12 __asm {                                                       \
    __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \
    __asm lea        esi,  [esi + 8]                                           \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
  }
155733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Convert 8 pixels: 8 UV and 8 Y.
// In:  xmm0 = 8 interleaved UV byte pairs, eax = Y pointer, xmm4 = zero.
// Out: low 8 bytes of xmm0 / xmm1 / xmm2 = B / G / R; eax advanced by 8.
// Clobbers xmm3.
#define YUVTORGB __asm {                                                       \
    /* Step 1: Find UV contributions to 8 R,G,B values */                      \
    __asm movdqa     xmm1, xmm0                                                \
    __asm movdqa     xmm2, xmm0                                                \
    __asm pmaddubsw  xmm0, kUVToB        /* scale B UV */                      \
    __asm pmaddubsw  xmm1, kUVToG        /* scale G UV */                      \
    __asm pmaddubsw  xmm2, kUVToR        /* scale R UV */                      \
    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
    __asm psubw      xmm1, kUVBiasG                                            \
    __asm psubw      xmm2, kUVBiasR                                            \
    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
    __asm lea        eax, [eax + 8]                                            \
    __asm punpcklbw  xmm3, xmm4          /* zero-extend Y to 16 bits */        \
    __asm psubsw     xmm3, kYSub16                                             \
    __asm pmullw     xmm3, kYToRgb                                             \
    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
    __asm psraw      xmm0, 6             /* drop the 6 fractional bits */      \
    __asm psraw      xmm1, 6                                                   \
    __asm psraw      xmm2, 6                                                   \
    __asm packuswb   xmm0, xmm0           /* B */                              \
    __asm packuswb   xmm1, xmm1           /* G */                              \
    __asm packuswb   xmm2, xmm2           /* R */                              \
  }
158533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Convert 8 pixels: 8 VU and 8 Y.
// Identical to YUVTORGB except that the chroma pairs in xmm0 are ordered VU,
// so the swapped kVUTo* coefficient tables are used.
// In:  xmm0 = 8 interleaved VU byte pairs, eax = Y pointer, xmm4 = zero.
// Out: low 8 bytes of xmm0 / xmm1 / xmm2 = B / G / R; eax advanced by 8.
// Clobbers xmm3.
#define YVUTORGB __asm {                                                       \
    /* Step 1: Find VU contributions to 8 R,G,B values */                      \
    __asm movdqa     xmm1, xmm0                                                \
    __asm movdqa     xmm2, xmm0                                                \
    __asm pmaddubsw  xmm0, kVUToB        /* scale B UV */                      \
    __asm pmaddubsw  xmm1, kVUToG        /* scale G UV */                      \
    __asm pmaddubsw  xmm2, kVUToR        /* scale R UV */                      \
    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
    __asm psubw      xmm1, kUVBiasG                                            \
    __asm psubw      xmm2, kUVBiasR                                            \
    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
    __asm lea        eax, [eax + 8]                                            \
    __asm punpcklbw  xmm3, xmm4          /* zero-extend Y to 16 bits */        \
    __asm psubsw     xmm3, kYSub16                                             \
    __asm pmullw     xmm3, kYToRgb                                             \
    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
    __asm psraw      xmm0, 6             /* drop the 6 fractional bits */      \
    __asm psraw      xmm1, 6                                                   \
    __asm psraw      xmm2, 6                                                   \
    __asm packuswb   xmm0, xmm0           /* B */                              \
    __asm packuswb   xmm1, xmm1           /* G */                              \
    __asm packuswb   xmm2, xmm2           /* R */                              \
  }
161333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
161433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 pixels, dest aligned 16.
161533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
161633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
161733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I444ToARGBRow_SSSE3(const uint8* y_buf,
161833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         const uint8* u_buf,
161933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         const uint8* v_buf,
162033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         uint8* argb_buf,
162133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         int width) {
162233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
162333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
162433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
162533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]   // Y
162633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]   // U
162733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 12]  // V
162833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 16]  // argb
162933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]  // width
163033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, esi
163133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
163233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm4, xmm4
163333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
163433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
163533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
163633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV444
163733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
163833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
163933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Step 3: Weave into ARGB
164033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm1           // BG
164133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm2, xmm5           // RA
164233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
164333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
164433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
164533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx], xmm0
164633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx + 16], xmm1
164733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx,  [edx + 32]
164833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8
164933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
165033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
165133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
165233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
165333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
165433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
165533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
165633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
165733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 pixels, dest aligned 16.
165833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
165933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
166033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I422ToARGBRow_SSSE3(const uint8* y_buf,
166133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         const uint8* u_buf,
166233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         const uint8* v_buf,
166333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         uint8* argb_buf,
166433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         int width) {
166533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
166633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
166733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
166833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]   // Y
166933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]   // U
167033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 12]  // V
167133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 16]  // argb
167233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]  // width
167333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, esi
167433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
167533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm4, xmm4
167633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
167733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
167833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
167933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV422
168033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
168133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
168233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Step 3: Weave into ARGB
168333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm1           // BG
168433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm2, xmm5           // RA
168533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
168633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
168733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
168833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx], xmm0
168933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx + 16], xmm1
169033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx,  [edx + 32]
169133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8
169233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
169333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
169433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
169533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
169633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
169733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
169833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
169933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
170033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 pixels, dest aligned 16.
170133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
170233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Similar to I420 but duplicate UV once more.
170333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
170433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I411ToARGBRow_SSSE3(const uint8* y_buf,
170533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         const uint8* u_buf,
170633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         const uint8* v_buf,
170733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         uint8* argb_buf,
170833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         int width) {
170933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
171033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
171133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
171233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]   // Y
171333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]   // U
171433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 12]  // V
171533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 16]  // argb
171633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]  // width
171733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, esi
171833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
171933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm4, xmm4
172033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
172133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
172233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
172333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV411
172433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
172533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
172633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Step 3: Weave into ARGB
172733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm1           // BG
172833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm2, xmm5           // RA
172933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
173033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
173133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
173233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx], xmm0
173333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx + 16], xmm1
173433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx,  [edx + 32]
173533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8
173633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
173733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
173833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
173933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
174033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
174133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
174233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
174333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
174433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 pixels, dest aligned 16.
174533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
174633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
174733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid NV12ToARGBRow_SSSE3(const uint8* y_buf,
174833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         const uint8* uv_buf,
174933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         uint8* argb_buf,
175033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         int width) {
175133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
175233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
175333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4 + 4]   // Y
175433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 4 + 8]   // UV
175533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 4 + 12]  // argb
175633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 4 + 16]  // width
175733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
175833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm4, xmm4
175933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
176033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
176133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
176233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READNV12
176333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
176433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
176533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Step 3: Weave into ARGB
176633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm1           // BG
176733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm2, xmm5           // RA
176833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
176933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
177033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
177133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx], xmm0
177233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx + 16], xmm1
177333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx,  [edx + 32]
177433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8
177533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
177633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
177733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
177833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
177933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
178033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
178133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
178233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 pixels, dest aligned 16.
178333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Converts one row of NV21 (y_buf plus the interleaved VU data passed in
// uv_buf) to ARGB. Output is stored with movdqa, so argb_buf must be 16 byte
// aligned. The loop always stores whole groups of 8 pixels and repeats while
// the remaining width is > 0, so a width that is not a multiple of 8 is
// effectively rounded up.
// NOTE(review): READNV12 and YVUTORGB are macros defined elsewhere in this
// file; they presumably advance eax/esi and leave B, G, R in xmm0, xmm1, xmm2
// (as the weave comments below imply) - confirm against their definitions.
178433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
178533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid NV21ToARGBRow_SSSE3(const uint8* y_buf,
178633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         const uint8* uv_buf,
178733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         uint8* argb_buf,
178833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         int width) {
178933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
179033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi                  // saved here, restored before ret
    // Arguments are addressed past the saved esi (4) and return address (4).
179133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4 + 4]   // Y
179233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 4 + 8]   // VU
179333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 4 + 12]  // argb
179433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 4 + 16]  // width
179533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
179633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm4, xmm4           // xmm4 = 0 (presumably used by the macros)
179733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
179833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
179933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
180033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READNV12
180133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YVUTORGB
180233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
180333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Step 3: Weave into ARGB
180433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm1           // BG
180533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm2, xmm5           // RA
180633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0           // copy BG so both halves can be woven
180733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
180833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
180933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx], xmm0          // aligned store, pixels 0..3
181033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx + 16], xmm1     // aligned store, pixels 4..7
181133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx,  [edx + 32]     // advance dest by 8 pixels * 4 bytes
181233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8               // 8 pixels consumed
181333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop          // loop while remaining width > 0
181433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
181533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
181633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
181733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
181833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
181933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
182033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 pixels, unaligned.
182133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
// Converts one row of I444 (separate y_buf, u_buf, v_buf) to ARGB. Output is
// stored with movdqu, so argb_buf needs no particular alignment. The loop
// always stores whole groups of 8 pixels and repeats while the remaining
// width is > 0, so a width that is not a multiple of 8 is effectively
// rounded up.
// NOTE(review): READYUV444 and YUVTORGB are macros defined elsewhere in this
// file; they presumably advance eax/esi and leave B, G, R in xmm0, xmm1, xmm2
// (as the weave comments below imply) - confirm against their definitions.
182233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
182333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
182433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   const uint8* u_buf,
182533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   const uint8* v_buf,
182633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   uint8* argb_buf,
182733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   int width) {
182833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
182933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi                  // saved here, restored before ret
183033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi                  // saved here, restored before ret
    // Arguments are addressed past the two saved registers (8) and the
    // return address (4).
183133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]   // Y
183233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]   // U
183333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 12]  // V
183433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 16]  // argb
183533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]  // width
183633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, esi             // edi = V - U, i.e. V is at [esi + edi]
183733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
183833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm4, xmm4           // xmm4 = 0 (presumably used by the macros)
183933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
184033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
184133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
184233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV444
184333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
184433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
184533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Step 3: Weave into ARGB
184633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm1           // BG
184733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm2, xmm5           // RA
184833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0           // copy BG so both halves can be woven
184933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
185033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
185133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx], xmm0          // unaligned store, pixels 0..3
185233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx + 16], xmm1     // unaligned store, pixels 4..7
185333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx,  [edx + 32]     // advance dest by 8 pixels * 4 bytes
185433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8               // 8 pixels consumed
185533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop          // loop while remaining width > 0
185633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
185733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
185833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
185933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
186033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
186133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
186233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
186333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 pixels, unaligned.
186433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Converts one row of I422 (separate y_buf, u_buf, v_buf) to ARGB. Output is
// stored with movdqu, so argb_buf needs no particular alignment. The loop
// always stores whole groups of 8 pixels and repeats while the remaining
// width is > 0, so a width that is not a multiple of 8 is effectively
// rounded up.
// NOTE(review): READYUV422 and YUVTORGB are macros defined elsewhere in this
// file; they presumably advance eax/esi and leave B, G, R in xmm0, xmm1, xmm2
// (as the weave comments below imply) - confirm against their definitions.
186533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
186633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
186733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   const uint8* u_buf,
186833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   const uint8* v_buf,
186933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   uint8* argb_buf,
187033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   int width) {
187133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
187233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi                  // saved here, restored before ret
187333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi                  // saved here, restored before ret
    // Arguments are addressed past the two saved registers (8) and the
    // return address (4).
187433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]   // Y
187533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]   // U
187633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 12]  // V
187733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 16]  // argb
187833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]  // width
187933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, esi             // edi = V - U, i.e. V is at [esi + edi]
188033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
188133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm4, xmm4           // xmm4 = 0 (presumably used by the macros)
188233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
188333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
188433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
188533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV422
188633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
188733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
188833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Step 3: Weave into ARGB
188933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm1           // BG
189033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm2, xmm5           // RA
189133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0           // copy BG so both halves can be woven
189233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
189333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
189433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx], xmm0          // unaligned store, pixels 0..3
189533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx + 16], xmm1     // unaligned store, pixels 4..7
189633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx,  [edx + 32]     // advance dest by 8 pixels * 4 bytes
189733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8               // 8 pixels consumed
189833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop          // loop while remaining width > 0
189933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
190033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
190133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
190233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
190333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
190433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
190533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
190633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 pixels, unaligned.
190733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
190833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Similar to I420 but duplicate UV once more.
// Converts one row of I411 (separate y_buf, u_buf, v_buf) to ARGB. Output is
// stored with movdqu, so argb_buf needs no particular alignment. The loop
// always stores whole groups of 8 pixels and repeats while the remaining
// width is > 0, so a width that is not a multiple of 8 is effectively
// rounded up.
// NOTE(review): READYUV411 and YUVTORGB are macros defined elsewhere in this
// file; they presumably advance eax/esi and leave B, G, R in xmm0, xmm1, xmm2
// (as the weave comments below imply) - confirm against their definitions.
190933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
191033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
191133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   const uint8* u_buf,
191233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   const uint8* v_buf,
191333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   uint8* argb_buf,
191433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   int width) {
191533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
191633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi                  // saved here, restored before ret
191733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi                  // saved here, restored before ret
    // Arguments are addressed past the two saved registers (8) and the
    // return address (4).
191833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]   // Y
191933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]   // U
192033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 12]  // V
192133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 16]  // argb
192233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]  // width
192333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, esi             // edi = V - U, i.e. V is at [esi + edi]
192433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
192533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm4, xmm4           // xmm4 = 0 (presumably used by the macros)
192633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
192733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
192833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
192933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV411
193033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
193133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
193233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Step 3: Weave into ARGB
193333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm1           // BG
193433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm2, xmm5           // RA
193533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0           // copy BG so both halves can be woven
193633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
193733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
193833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx], xmm0          // unaligned store, pixels 0..3
193933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx + 16], xmm1     // unaligned store, pixels 4..7
194033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx,  [edx + 32]     // advance dest by 8 pixels * 4 bytes
194133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8               // 8 pixels consumed
194233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop          // loop while remaining width > 0
194333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
194433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
194533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
194633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
194733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
194833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
194933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
195033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
195133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 pixels, unaligned.
195233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Converts one row of NV12 (y_buf plus the interleaved UV data in uv_buf) to
// ARGB. Output is stored with movdqu, so argb_buf needs no particular
// alignment. The loop always stores whole groups of 8 pixels and repeats
// while the remaining width is > 0, so a width that is not a multiple of 8
// is effectively rounded up.
// NOTE(review): READNV12 and YUVTORGB are macros defined elsewhere in this
// file; they presumably advance eax/esi and leave B, G, R in xmm0, xmm1, xmm2
// (as the weave comments below imply) - confirm against their definitions.
195333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
195433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
195533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   const uint8* uv_buf,
195633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   uint8* argb_buf,
195733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   int width) {
195833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
195933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi                  // saved here, restored before ret
    // Arguments are addressed past the saved esi (4) and return address (4).
196033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4 + 4]   // Y
196133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 4 + 8]   // UV
196233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 4 + 12]  // argb
196333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 4 + 16]  // width
196433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
196533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm4, xmm4           // xmm4 = 0 (presumably used by the macros)
196633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
196733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
196833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
196933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READNV12
197033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
197133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
197233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Step 3: Weave into ARGB
197333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm1           // BG
197433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm2, xmm5           // RA
197533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0           // copy BG so both halves can be woven
197633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
197733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
197833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx], xmm0          // unaligned store, pixels 0..3
197933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx + 16], xmm1     // unaligned store, pixels 4..7
198033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx,  [edx + 32]     // advance dest by 8 pixels * 4 bytes
198133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8               // 8 pixels consumed
198233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop          // loop while remaining width > 0
198333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
198433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
198533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
198633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
198733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
198833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
198933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 pixels, unaligned.
199033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes).
// Converts one row of NV21 (y_buf plus the interleaved VU data passed in
// uv_buf) to ARGB. Output is stored with movdqu, so argb_buf needs no
// particular alignment. The loop always stores whole groups of 8 pixels and
// repeats while the remaining width is > 0, so a width that is not a
// multiple of 8 is effectively rounded up.
// NOTE(review): READNV12 and YVUTORGB are macros defined elsewhere in this
// file; they presumably advance eax/esi and leave B, G, R in xmm0, xmm1, xmm2
// (as the weave comments below imply) - confirm against their definitions.
199133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
199233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
199333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   const uint8* uv_buf,
199433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   uint8* argb_buf,
199533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   int width) {
199633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
199733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi                  // saved here, restored before ret
    // Arguments are addressed past the saved esi (4) and return address (4).
199833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4 + 4]   // Y
199933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 4 + 8]   // VU
200033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 4 + 12]  // argb
200133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 4 + 16]  // width
200233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
200333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm4, xmm4           // xmm4 = 0 (presumably used by the macros)
200433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
200533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
200633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
200733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READNV12
200833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YVUTORGB
200933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
201033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Step 3: Weave into ARGB
201133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm1           // BG
201233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm2, xmm5           // RA
201333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0           // copy BG so both halves can be woven
201433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
201533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
201633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx], xmm0          // unaligned store, pixels 0..3
201733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx + 16], xmm1     // unaligned store, pixels 4..7
201833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx,  [edx + 32]     // advance dest by 8 pixels * 4 bytes
201933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8               // 8 pixels consumed
202033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop          // loop while remaining width > 0
202133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
202233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
202333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
202433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
202533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
202633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of I422 (separate y_buf, u_buf, v_buf) to BGRA, 8 pixels
// per loop iteration. Each pixel is stored as the bytes A, R, G, B in memory,
// assuming the macro register layout in the NOTE below.
// Output is stored with movdqa, so bgra_buf must be 16 byte aligned. The loop
// always stores whole groups of 8 pixels and repeats while the remaining
// width is > 0, so a width that is not a multiple of 8 is effectively
// rounded up.
// NOTE(review): READYUV422 and YUVTORGB are macros defined elsewhere in this
// file; they presumably advance eax/esi and leave B, G, R in xmm0, xmm1, xmm2
// (as the weave comments below imply) - confirm against their definitions.
202733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
202833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I422ToBGRARow_SSSE3(const uint8* y_buf,
202933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         const uint8* u_buf,
203033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         const uint8* v_buf,
203133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         uint8* bgra_buf,
203233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         int width) {
203333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
203433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi                  // saved here, restored before ret
203533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi                  // saved here, restored before ret
    // Arguments are addressed past the two saved registers (8) and the
    // return address (4).
203633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]   // Y
203733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]   // U
203833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 12]  // V
203933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 16]  // bgra
204033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]  // width
204133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, esi             // edi = V - U, i.e. V is at [esi + edi]
204233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm4, xmm4           // xmm4 = 0 (presumably used by the macros)
204333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
204433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
204533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
204633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV422
204733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
204833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
204933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Step 3: Weave into BGRA
    // xmm5 is the destination of the AR weave below and is overwritten on
    // every pass, so the alpha mask is regenerated inside the loop.
205033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
205133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm1, xmm0           // GB
205233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm5, xmm2           // AR
205333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, xmm5           // copy AR so both halves can be woven
205433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels (bytes A,R,G,B)
205533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels (bytes A,R,G,B)
205633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx], xmm5          // aligned store, pixels 0..3
205733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx + 16], xmm0     // aligned store, pixels 4..7
205833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx,  [edx + 32]     // advance dest by 8 pixels * 4 bytes
205933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8               // 8 pixels consumed
206033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop          // loop while remaining width > 0
206133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
206233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
206333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
206433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
206533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
206633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
206733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of I422 (separate y_buf, u_buf, v_buf) to BGRA, 8 pixels
// per loop iteration. Each pixel is stored as the bytes A, R, G, B in memory,
// assuming the macro register layout in the NOTE below.
// Output is stored with movdqu, so bgra_buf needs no particular alignment.
// The loop always stores whole groups of 8 pixels and repeats while the
// remaining width is > 0, so a width that is not a multiple of 8 is
// effectively rounded up.
// NOTE(review): READYUV422 and YUVTORGB are macros defined elsewhere in this
// file; they presumably advance eax/esi and leave B, G, R in xmm0, xmm1, xmm2
// (as the weave comments below imply) - confirm against their definitions.
206833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
206933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf,
207033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   const uint8* u_buf,
207133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   const uint8* v_buf,
207233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   uint8* bgra_buf,
207333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   int width) {
207433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
207533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi                  // saved here, restored before ret
207633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi                  // saved here, restored before ret
    // Arguments are addressed past the two saved registers (8) and the
    // return address (4).
207733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]   // Y
207833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]   // U
207933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 12]  // V
208033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 16]  // bgra
208133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]  // width
208233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, esi             // edi = V - U, i.e. V is at [esi + edi]
208333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm4, xmm4           // xmm4 = 0 (presumably used by the macros)
208433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
208533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
208633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
208733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV422
208833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
208933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
209033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Step 3: Weave into BGRA
    // xmm5 is the destination of the AR weave below and is overwritten on
    // every pass, so the alpha mask is regenerated inside the loop.
209133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
209233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm1, xmm0           // GB
209333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm5, xmm2           // AR
209433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, xmm5           // copy AR so both halves can be woven
209533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm5, xmm1           // BGRA first 4 pixels (bytes A,R,G,B)
209633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm0, xmm1           // BGRA next 4 pixels (bytes A,R,G,B)
209733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx], xmm5          // unaligned store, pixels 0..3
209833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx + 16], xmm0     // unaligned store, pixels 4..7
209933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx,  [edx + 32]     // advance dest by 8 pixels * 4 bytes
210033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8               // 8 pixels consumed
210133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop          // loop while remaining width > 0
210233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
210333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
210433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
210533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
210633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
210733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
210833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of I422 (separate y_buf, u_buf, v_buf) to ABGR, 8 pixels
// per loop iteration. Each pixel is stored as the bytes R, G, B, A in memory,
// assuming the macro register layout in the NOTE below.
// Output is stored with movdqa, so abgr_buf must be 16 byte aligned. The loop
// always stores whole groups of 8 pixels and repeats while the remaining
// width is > 0, so a width that is not a multiple of 8 is effectively
// rounded up.
// NOTE(review): READYUV422 and YUVTORGB are macros defined elsewhere in this
// file; they presumably advance eax/esi and leave B, G, R in xmm0, xmm1, xmm2
// (as the weave comments below imply) - confirm against their definitions.
210933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
211033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I422ToABGRRow_SSSE3(const uint8* y_buf,
211133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         const uint8* u_buf,
211233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         const uint8* v_buf,
211333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         uint8* abgr_buf,
211433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         int width) {
211533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
211633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi                  // saved here, restored before ret
211733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi                  // saved here, restored before ret
    // Arguments are addressed past the two saved registers (8) and the
    // return address (4).
211833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]   // Y
211933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]   // U
212033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 12]  // V
212133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 16]  // abgr
212233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]  // width
212333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, esi             // edi = V - U, i.e. V is at [esi + edi]
212433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
212533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm4, xmm4           // xmm4 = 0 (presumably used by the macros)
212633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
212733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
212833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
212933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV422
213033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
213133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
213233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Step 3: Weave into ABGR
213333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm2, xmm1           // RG
213433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm5           // BA
213533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm2           // copy RG so both halves can be woven
213633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
213733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
213833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx], xmm2          // aligned store, pixels 0..3
213933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx + 16], xmm1     // aligned store, pixels 4..7
214033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx,  [edx + 32]     // advance dest by 8 pixels * 4 bytes
214133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8               // 8 pixels consumed
214233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop          // loop while remaining width > 0
214333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
214433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
214533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
214633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
214733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
214833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
214933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of I422 (separate Y, U and V planes) to 32-bit ABGR
// pixels, 8 pixels per loop iteration. This is the unaligned variant: the
// destination is written with movdqu, so abgr_buf need not be 16-byte aligned.
//   y_buf, u_buf, v_buf - source planes (consumed by the READYUV422 macro).
//   abgr_buf            - destination, 4 bytes per pixel.
//   width               - pixel count. The loop body always runs at least
//                         once and stores 32 bytes per iteration, so a width
//                         that is not a positive multiple of 8 writes more
//                         than width pixels.
// Naked function: there is no compiler prologue, so arguments are read
// directly from the stack.
215033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
215133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf,
215233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   const uint8* u_buf,
215333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   const uint8* v_buf,
215433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   uint8* abgr_buf,
215533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   int width) {
215633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
    // esi and edi are saved on the stack, so the first argument sits at
    // [esp + 8 + 4] (8 bytes of saved registers + 4 bytes of return address).
215733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
215833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
215933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]   // Y
216033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]   // U
216133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 12]  // V
216233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 16]  // abgr
216333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]  // width
    // edi becomes the byte offset from the U plane to the V plane.
    // NOTE(review): presumably so READYUV422 can address V as [esi + edi]
    // while advancing only esi - confirm against the macro.
216433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, esi
216533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    // xmm4 = 0. Not referenced in this body; presumably consumed by the
    // macros below - confirm.
216633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm4, xmm4
216733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
216833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
216933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
    // READYUV422 and YUVTORGB are macros defined outside this block. Going by
    // the weave comments below, they are expected to leave B in xmm0, G in
    // xmm1 and R in xmm2 for 8 pixels - confirm against the macros.
217033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV422
217133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
217233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
217333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Step 3: Weave into ABGR (bytes stored in memory order R, G, B, A)
217433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm2, xmm1           // RG
217533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm5           // BA
    // Keep a copy of RG so the low and high halves can be woven separately.
217633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm2
217733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm2, xmm0           // RGBA first 4 pixels
217833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm1, xmm0           // RGBA next 4 pixels
    // Unaligned stores: 8 pixels * 4 bytes = 32 bytes per iteration.
217933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx], xmm2
218033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx + 16], xmm1
218133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx,  [edx + 32]
218233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8
218333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
218433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
218533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
218633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
218733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
218833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
218933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
219033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of I422 (separate Y, U and V planes) to 32-bit RGBA
// pixels, 8 pixels per loop iteration. This is the aligned variant: the
// destination is written with movdqa, so rgba_buf must be 16-byte aligned.
//   y_buf, u_buf, v_buf - source planes (consumed by the READYUV422 macro).
//   rgba_buf            - destination, 4 bytes per pixel, 16-byte aligned.
//   width               - pixel count. The loop body always runs at least
//                         once and stores 32 bytes per iteration, so a width
//                         that is not a positive multiple of 8 writes more
//                         than width pixels.
// Naked function: there is no compiler prologue, so arguments are read
// directly from the stack.
219133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
219233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I422ToRGBARow_SSSE3(const uint8* y_buf,
219333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         const uint8* u_buf,
219433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         const uint8* v_buf,
219533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         uint8* rgba_buf,
219633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         int width) {
219733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
    // esi and edi are saved on the stack, so the first argument sits at
    // [esp + 8 + 4] (8 bytes of saved registers + 4 bytes of return address).
219833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
219933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
220033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]   // Y
220133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]   // U
220233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 12]  // V
220333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 16]  // rgba
220433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]  // width
    // edi becomes the byte offset from the U plane to the V plane.
    // NOTE(review): presumably so READYUV422 can address V as [esi + edi]
    // while advancing only esi - confirm against the macro.
220533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, esi
    // xmm4 = 0. Not referenced in this body; presumably consumed by the
    // macros below - confirm.
220633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm4, xmm4
220733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
220833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
220933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
    // READYUV422 and YUVTORGB are macros defined outside this block. Going by
    // the weave comments below, they are expected to leave B in xmm0, G in
    // xmm1 and R in xmm2 for 8 pixels - confirm against the macros.
221033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV422
221133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
221233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
221333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Step 3: Weave into RGBA
    // Alpha is regenerated on every iteration because the weave below uses
    // xmm5 as a destination and overwrites it.
221433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
221533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm1, xmm2           // GR
221633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm5, xmm0           // AB
    // Keep a copy of AB so the low and high halves can be woven separately.
221733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, xmm5
221833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
221933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
    // Aligned stores: 8 pixels * 4 bytes = 32 bytes per iteration, laid out
    // in memory as A, B, G, R per pixel.
222033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx], xmm5
222133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx + 16], xmm0
222233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx,  [edx + 32]
222333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8
222433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
222533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
222633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
222733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
222833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
222933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
223033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
223133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Converts one row of I422 (separate Y, U and V planes) to 32-bit RGBA
// pixels, 8 pixels per loop iteration. This is the unaligned variant: the
// destination is written with movdqu, so rgba_buf need not be 16-byte aligned.
//   y_buf, u_buf, v_buf - source planes (consumed by the READYUV422 macro).
//   rgba_buf            - destination, 4 bytes per pixel.
//   width               - pixel count. The loop body always runs at least
//                         once and stores 32 bytes per iteration, so a width
//                         that is not a positive multiple of 8 writes more
//                         than width pixels.
// Naked function: there is no compiler prologue, so arguments are read
// directly from the stack.
223233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
223333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf,
223433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   const uint8* u_buf,
223533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   const uint8* v_buf,
223633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   uint8* rgba_buf,
223733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   int width) {
223833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
    // esi and edi are saved on the stack, so the first argument sits at
    // [esp + 8 + 4] (8 bytes of saved registers + 4 bytes of return address).
223933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
224033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
224133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]   // Y
224233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]   // U
224333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 12]  // V
224433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 16]  // rgba
224533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]  // width
    // edi becomes the byte offset from the U plane to the V plane.
    // NOTE(review): presumably so READYUV422 can address V as [esi + edi]
    // while advancing only esi - confirm against the macro.
224633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, esi
    // xmm4 = 0. Not referenced in this body; presumably consumed by the
    // macros below - confirm.
224733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm4, xmm4
224833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
224933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
225033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
    // READYUV422 and YUVTORGB are macros defined outside this block. Going by
    // the weave comments below, they are expected to leave B in xmm0, G in
    // xmm1 and R in xmm2 for 8 pixels - confirm against the macros.
225133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    READYUV422
225233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    YUVTORGB
225333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
225433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Step 3: Weave into RGBA
    // Alpha is regenerated on every iteration because the weave below uses
    // xmm5 as a destination and overwrites it.
225533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
225633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm1, xmm2           // GR
225733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm5, xmm0           // AB
    // Keep a copy of AB so the low and high halves can be woven separately.
225833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, xmm5
225933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm5, xmm1           // RGBA first 4 pixels
226033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm0, xmm1           // RGBA next 4 pixels
    // Unaligned stores: 8 pixels * 4 bytes = 32 bytes per iteration, laid out
    // in memory as A, B, G, R per pixel.
226133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx], xmm5
226233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx + 16], xmm0
226333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx,  [edx + 32]
226433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8
226533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
226633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
226733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
226833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
226933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
227033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
227133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
227233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
227333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_I422TOARGBROW_SSSE3
227433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
227533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_YTOARGBROW_SSE2
// Converts a row of 8-bit Y (luma) samples to grey 32-bit pixels, 8 pixels
// per loop iteration. Each output pixel gets the same scaled luma value in
// its three low bytes and 0xff in its top (alpha) byte.
//   y_buf   - source luma; read 8 bytes at a time with movq.
//   rgb_buf - destination, 4 bytes per pixel; written with movdqa, so it
//             must be 16-byte aligned.
//   width   - pixel count. The loop body always runs at least once and
//             stores 32 bytes per iteration, so a width that is not a
//             positive multiple of 8 writes more than width pixels.
// Naked function with no pushes: arguments start at [esp + 4].
227633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
227733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YToARGBRow_SSE2(const uint8* y_buf,
227833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                     uint8* rgb_buf,
227933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                     int width) {
228033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
228133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm4, xmm4           // generate mask 0xff000000
228233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslld      xmm4, 24
    // xmm3 = 0x1000 in every 16-bit lane: the luma offset 16 in 8.8 fixed
    // point (16 << 8). pshufd with 0 broadcasts the low dword.
228333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax,0x10001000
228433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm3,eax
228533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd     xmm3,xmm3,0
    // xmm2 = 0x012a (298) in every 16-bit lane: the 1.164 gain as 298 / 256.
228633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax,0x012a012a
228733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm2,eax
228833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd     xmm2,xmm2,0
228933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]       // Y
229033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8]       // rgb
229133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 12]      // width
229233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
229333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
229433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
229533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164
    // Duplicating each byte gives y * 0x0101 per 16-bit lane. psubusw then
    // subtracts 16 << 8 with unsigned saturation (inputs below the offset
    // clamp to 0), pmulhuw keeps the high 16 bits of the product with 298,
    // and packuswb saturates the result to 0..255.
229633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       xmm0, qword ptr [eax]
229733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 8]
229833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm0           // Y.Y
229933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psubusw    xmm0, xmm3
230033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulhuw    xmm0, xmm2
230133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm0           // G
230233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
230333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // Step 2: Weave into ARGB
    // Replicate each grey byte four times, then OR in the alpha mask so the
    // top byte of every pixel becomes 0xff.
230433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm0           // GG
230533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
230633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm0, xmm0           // BGRA first 4 pixels
230733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm1, xmm1           // BGRA next 4 pixels
230833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por        xmm0, xmm4
230933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por        xmm1, xmm4
231033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx], xmm0
231133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx + 16], xmm1
231233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx,  [edx + 32]
231333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8
231433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
231533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
231633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
231733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
231833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
231933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_YTOARGBROW_SSE2
232033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
232133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_MIRRORROW_SSSE3
232233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
232333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for reversing the bytes.
// Used as a pshufb control mask: output byte i takes input byte 15 - i, so
// all 16 bytes of the register come out in reverse order.
232433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleMirror = {
232533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u
232633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
232733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Mirrors a row of bytes, 16 bytes per loop iteration: the source is read
// from its end towards its start while the destination is written forwards,
// giving dst[i] = src[width - 1 - i].
//   src   - source row; loaded with movdqa at src + width - 16, so those
//           addresses must be 16-byte aligned.
//   dst   - destination row; stored with movdqa, must be 16-byte aligned.
//   width - byte count. The loop body always runs at least once and moves 16
//           bytes per iteration, so a positive multiple of 16 is expected.
// Naked function with no pushes: arguments start at [esp + 4].
232833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
232933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
233033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
233133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, [esp + 4]   // src
233233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       edx, [esp + 8]   // dst
233333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       ecx, [esp + 12]  // width
233433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm5, kShuffleMirror
    // Bias src by -16 so [eax + ecx] addresses the last 16 bytes not yet
    // consumed.
233533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       eax, [eax - 16]
233633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
233733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
233833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
233933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm0, [eax + ecx]
234033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm0, xmm5
    // sub sets the flags tested by jg below; the movdqa and lea in between
    // do not modify flags.
234133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       ecx, 16
234233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [edx], xmm0
234333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       edx, [edx + 16]
234433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg        convertloop
234533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
234633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
234733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
234833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_MIRRORROW_SSSE3
234933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
235033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_MIRRORROW_SSE2
235133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// The SSE2 version uses movdqu, so it can be used on unaligned buffers where
235233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// the SSSE3 version cannot.
// Mirrors a row of bytes, 16 bytes per loop iteration, giving
// dst[i] = src[width - 1 - i]. Without pshufb the 16-byte reversal is built
// in three stages: swap the two bytes of each word, reverse the four words
// within each 64-bit half, then swap the two halves.
//   src   - source row (no alignment requirement).
//   dst   - destination row (no alignment requirement).
//   width - byte count. The loop body always runs at least once and moves 16
//           bytes per iteration, so a positive multiple of 16 is expected.
// Naked function with no pushes: arguments start at [esp + 4].
235333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
235433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid MirrorRow_SSE2(const uint8* src, uint8* dst, int width) {
235533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
235633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, [esp + 4]   // src
235733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       edx, [esp + 8]   // dst
235833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       ecx, [esp + 12]  // width
    // Bias src by -16 so [eax + ecx] addresses the last 16 bytes not yet
    // consumed.
235933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       eax, [eax - 16]
236033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
236133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
236233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
236333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu    xmm0, [eax + ecx]
236433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm1, xmm0        // swap bytes
236533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psllw     xmm0, 8
236633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw     xmm1, 8
236733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por       xmm0, xmm1
    // 0x1b selects words 3, 2, 1, 0: reverses the word order within a half.
236833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshuflw   xmm0, xmm0, 0x1b  // swap words
236933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufhw   xmm0, xmm0, 0x1b
    // 0x4e selects dwords 2, 3, 0, 1: exchanges the low and high halves.
237033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd    xmm0, xmm0, 0x4e  // swap qwords
    // sub sets the flags tested by jg below; the movdqu and lea in between
    // do not modify flags.
237133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       ecx, 16
237233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu    [edx], xmm0
237333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       edx, [edx + 16]
237433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg        convertloop
237533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
237633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
237733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
237833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_MIRRORROW_SSE2
237933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
238033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_MIRRORROW_UV_SSSE3
238133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for reversing the bytes of UV channels.
// Used as a pshufb control mask: the low 8 output bytes are the even input
// bytes in reverse order (14, 12, ..., 0) and the high 8 output bytes are the
// odd input bytes in reverse order (15, 13, ..., 1).
238233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleMirrorUV = {
238333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u
238433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
238533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Mirrors a row of interleaved byte pairs and splits it into two planes,
// 8 pairs (16 source bytes) per loop iteration. Even source bytes go to
// dst_u and odd source bytes go to dst_v, both in reversed order:
//   dst_u[i] = src[2 * (width - 1 - i)]
//   dst_v[i] = src[2 * (width - 1 - i) + 1]
//   src   - interleaved source; loaded with movdqa starting at
//           src + width * 2 - 16, so those addresses must be 16-byte aligned.
//   dst_u - destination for even bytes; written 8 bytes at a time.
//   dst_v - destination for odd bytes; written 8 bytes at a time.
//   width - number of pairs. The loop body always runs at least once and
//           handles 8 pairs per iteration, so a positive multiple of 8 is
//           expected.
// Naked function: edi is saved on the stack, so arguments start at
// [esp + 4 + 4].
238633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
238733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v,
238833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                       int width) {
238933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
239033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push      edi
239133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, [esp + 4 + 4]   // src
239233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       edx, [esp + 4 + 8]   // dst_u
239333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       edi, [esp + 4 + 12]  // dst_v
239433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       ecx, [esp + 4 + 16]  // width
239533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm1, kShuffleMirrorUV
    // Point eax at the last 16 source bytes (2 bytes per pair).
239633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       eax, [eax + ecx * 2 - 16]
    // edi = dst_v - dst_u, so [edx + edi] addresses dst_v while only edx is
    // advanced.
239733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       edi, edx
239833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
239933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
240033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
240133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm0, [eax]
240233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       eax, [eax - 16]
240333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm0, xmm1
    // sub sets the flags tested by jg below; the stores and lea in between
    // do not modify flags.
240433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       ecx, 8
    // Low 8 bytes (reversed even bytes) go to dst_u, high 8 bytes (reversed
    // odd bytes) go to dst_v.
240533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movlpd    qword ptr [edx], xmm0
240633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movhpd    qword ptr [edx + edi], xmm0
24077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       edx, [edx + 8]
240833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg        convertloop
240933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
241033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop       edi
241133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
241233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
241333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
241433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_MIRRORROW_UV_SSSE3
241533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
241633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBMIRRORROW_SSSE3
241733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
241833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for reversing the bytes.
// Used as a pshufb control mask: it reverses the order of the four 4-byte
// groups in the register while keeping the byte order inside each group.
241933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kARGBShuffleMirror = {
242033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u
242133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
242233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Mirrors a row of 4-byte pixels, 4 pixels (16 bytes) per loop iteration.
// Pixel order is reversed while the bytes inside each pixel keep their
// order: destination pixel i is source pixel width - 1 - i.
//   src   - source row; loaded with movdqa at src + width * 4 - 16, so
//           those addresses must be 16-byte aligned.
//   dst   - destination row; stored with movdqa, must be 16-byte aligned.
//   width - pixel count. The loop body always runs at least once and moves
//           4 pixels per iteration, so a positive multiple of 4 is expected.
// Naked function with no pushes: arguments start at [esp + 4].
242333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
242433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) {
242533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm {
242633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       eax, [esp + 4]   // src
242733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       edx, [esp + 8]   // dst
242833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov       ecx, [esp + 12]  // width
242933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm5, kARGBShuffleMirror
    // Bias src by -16 so [eax + ecx * 4] addresses the last 4 pixels not yet
    // consumed.
243033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       eax, [eax - 16]
243133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
243233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
243333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
243433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    xmm0, [eax + ecx * 4]
243533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb    xmm0, xmm5
    // sub sets the flags tested by jg below; the movdqa and lea in between
    // do not modify flags.
243633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub       ecx, 4
243733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa    [edx], xmm0
243833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea       edx, [edx + 16]
243933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg        convertloop
244033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
244133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
244233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
244333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBMIRRORROW_SSSE3
244433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
244533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_SPLITUV_SSE2
// De-interleaves a row of byte pairs into two planes, 16 pairs (32 source
// bytes) per loop iteration: dst_u[i] = src_uv[2 * i] and
// dst_v[i] = src_uv[2 * i + 1].
//   src_uv - interleaved source; loaded with movdqa, must be 16-byte aligned.
//   dst_u  - destination for even bytes; stored with movdqa, must be
//            16-byte aligned.
//   dst_v  - destination for odd bytes; stored with movdqa, must be
//            16-byte aligned.
//   pix    - number of pairs. The loop body always runs at least once and
//            handles 16 pairs per iteration, so a positive multiple of 16
//            is expected.
// Naked function: edi is saved on the stack, so arguments start at
// [esp + 4 + 4].
244633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
244733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) {
244833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
244933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
245033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4 + 4]    // src_uv
245133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 4 + 8]    // dst_u
245233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 4 + 12]   // dst_v
245333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 4 + 16]   // pix
245433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
245533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm5, 8
    // edi = dst_v - dst_u, so [edx + edi] addresses dst_v while only edx is
    // advanced.
245633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, edx
245733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
245833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
245933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop:
246033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [eax]
246133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax + 16]
246233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 32]
    // Keep copies: xmm0/xmm1 yield the even bytes, xmm2/xmm3 the odd bytes.
246333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm2, xmm0
246433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm3, xmm1
246533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm0, xmm5   // even bytes
246633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm1, xmm5
    // Every word is now <= 0xff, so packuswb narrows without clamping.
246733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm1
246833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm2, 8      // odd bytes
246933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm3, 8
247033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm2, xmm3
247133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx], xmm0
247233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx + edi], xmm2
247333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 16]
247433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
247533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
247633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
247733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
247833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
247933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
248033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
248133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_SPLITUV_SSE2
248233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
248333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_COPYROW_SSE2
248433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// CopyRow copies 'count' bytes using 16 byte loads/stores, 32 bytes at a time.
//   src   - source; loaded with movdqa, must be 16-byte aligned.
//   dst   - destination; stored with movdqa, must be 16-byte aligned.
//   count - byte count. The loop body always runs at least once and moves 32
//           bytes per iteration, so a positive multiple of 32 is expected.
// Naked function with no pushes: arguments start at [esp + 4].
248533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
248633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid CopyRow_SSE2(const uint8* src, uint8* dst, int count) {
248733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
248833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]   // src
248933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8]   // dst
249033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 12]  // count
    // edx = dst - src, so [eax + edx] addresses dst while only eax is
    // advanced.
249133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edx, eax
249233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
249333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
249433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop:
249533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [eax]
249633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax + 16]
249733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [eax + edx], xmm0
249833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [eax + edx + 16], xmm1
249933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 32]
250033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 32
250133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
250233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
250333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
250433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
250533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_COPYROW_SSE2
250633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
250733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_COPYROW_X86
// Copies a row using rep movsd, 4 bytes per step, with no SSE instructions.
//   src   - source row.
//   dst   - destination row.
//   count - byte count. Only count / 4 dwords are copied (shr ecx, 2), so
//           any trailing count % 4 bytes are NOT copied.
// Naked function: esi/edi are preserved in eax/edx rather than on the stack,
// which is why arguments are still read from [esp + 4].
250833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
250933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid CopyRow_X86(const uint8* src, uint8* dst, int count) {
251033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
    // Save esi/edi in scratch registers; rep movsd uses esi/edi/ecx.
251133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, esi
251233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, edi
251333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 4]   // src
251433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8]   // dst
251533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 12]  // count
    // Convert the byte count to a dword count for rep movsd.
251633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shr        ecx, 2
251733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    rep movsd
    // Restore the saved esi/edi before returning.
251833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, edx
251933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, eax
252033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
252133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
252233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
252333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_COPYROW_X86
252433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
252533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_SETROW_X86
252633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// SetRow8 writes 'count' bytes using a 32 bit value repeated.
// The fill is done with rep stosd, which stores v32 count / 4 times; the
// trailing count % 4 bytes are not written.
// edi is stashed in edx and restored before returning, so the stack is not
// touched and the argument offsets stay at esp + 4/8/12.
252733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
252833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid SetRow8_X86(uint8* dst, uint32 v32, int count) {
252933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
    // Save edi in a scratch register.
253033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, edi
253133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 4]   // dst
253233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8]   // v32
253333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 12]  // count
    // Convert the byte count to a dword count for rep stosd.
253433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shr        ecx, 2
253533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    rep stosd
    // Restore edi.
253633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, edx
253733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
253833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
253933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
254033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
254133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// SetRows32 writes 'width' 32 bit words of v32 on each of 'height' rows.
// After each row dst advances by dst_stride - width * 4 bytes, so consecutive
// rows start dst_stride bytes apart.
// The row loop tests height after writing, so one row is written even when
// height is less than 1.
254233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
254333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid SetRows32_X86(uint8* dst, uint32 v32, int width,
254433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                   int dst_stride, int height) {
254533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
    // Three pushes move the arguments 12 bytes further from esp.
254633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
254733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
254833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       ebp
254933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 12 + 4]   // dst
255033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 12 + 8]   // v32
255133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ebp, [esp + 12 + 12]  // width
255233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 12 + 16]  // dst_stride
255333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 12 + 20]  // height
255433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        ecx, [ebp * 4]
255533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edx, ecx             // stride - width * 4
255633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
255733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
255833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop:
    // rep stosd consumes ecx, so reload the per-row word count each row.
255933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, ebp
256033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    rep stosd
    // Skip the remainder of the stride to reach the start of the next row.
256133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    add        edi, edx
256233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        esi, 1
256333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
256433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
256533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        ebp
256633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
256733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
256833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
256933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
257033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
257133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_SETROW_X86
257233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
257333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_YUY2TOYROW_SSE2
// Extracts the Y bytes of a YUY2 row into dst_y.
// Each iteration reads 32 source bytes and writes 16 Y bytes, decrementing
// pix by 16. The loop tests pix after the store, so pix is effectively
// rounded up to a multiple of 16.
// Loads and the store use movdqa, so src_yuy2 and dst_y must be 16 byte
// aligned.
257433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
257533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YUY2ToYRow_SSE2(const uint8* src_yuy2,
257633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                     uint8* dst_y, int pix) {
257733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
257833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]    // src_yuy2
257933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8]    // dst_y
258033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 12]   // pix
258133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
258233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm5, 8
258333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
258433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
258533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop:
258633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [eax]
258733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax + 16]
258833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 32]
258933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm0, xmm5   // even bytes are Y
259033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm1, xmm5
259133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm1
    // The movdqa and lea below do not modify flags, so jg still tests the
    // result of this sub.
259233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
259333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx], xmm0
259433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 16]
259533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
259633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
259733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
259833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
259933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Extracts U and V from two YUY2 rows, averaging them vertically.
// The second row is read at src_yuy2 + stride_yuy2 and combined with the
// first using pavgb (rounded byte average).
// Each iteration reads 32 bytes from each row and writes 8 U bytes and 8 V
// bytes, decrementing pix by 16. The loop tests pix after the stores, so pix
// is effectively rounded up to a multiple of 16.
// All four loads use movdqa, so both rows must be 16 byte aligned; the
// outputs are written with movq.
260033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
260133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2,
260233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                      uint8* dst_u, uint8* dst_v, int pix) {
260333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
    // Two pushes move the arguments 8 bytes further from esp.
260433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
260533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
260633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]    // src_yuy2
260733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]    // stride_yuy2
260833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 12]   // dst_u
260933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 16]   // dst_v
261033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]   // pix
261133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
261233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm5, 8
    // edi = dst_v - dst_u, so [edx + edi] addresses dst_v while only edx
    // advances.
261333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, edx
261433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
261533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
261633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop:
261733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [eax]
261833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax + 16]
261933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm2, [eax + esi]
262033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm3, [eax + esi + 16]
262133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 32]
    // Average the two rows byte by byte.
262233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm0, xmm2
262333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm1, xmm3
262433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm0, 8      // YUYV -> UVUV
262533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8
262633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm1
    // Split the interleaved UV bytes: even bytes to xmm0 (U), odd bytes to
    // xmm1 (V).
262733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
262833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm0, xmm5  // U
262933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm0
263033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8     // V
263133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm1, xmm1
263233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       qword ptr [edx], xmm0
263333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       qword ptr [edx + edi], xmm1
263433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 8]
263533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
263633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
263733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
263833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
263933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
264033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
264133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
264233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
264333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Extracts U and V from a single YUY2 row, with no vertical averaging.
// Each iteration reads 32 source bytes and writes 8 U bytes and 8 V bytes,
// decrementing pix by 16. The loop tests pix after the stores, so pix is
// effectively rounded up to a multiple of 16.
// Loads use movdqa, so src_yuy2 must be 16 byte aligned; the outputs are
// written with movq.
264433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
264533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YUY2ToUV422Row_SSE2(const uint8* src_yuy2,
264633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         uint8* dst_u, uint8* dst_v, int pix) {
264733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
    // One push moves the arguments 4 bytes further from esp.
264833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
264933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4 + 4]    // src_yuy2
265033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 4 + 8]    // dst_u
265133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 4 + 12]   // dst_v
265233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 4 + 16]   // pix
265333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
265433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm5, 8
    // edi = dst_v - dst_u, so [edx + edi] addresses dst_v while only edx
    // advances.
265533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, edx
265633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
265733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
265833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop:
265933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [eax]
266033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax + 16]
266133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 32]
266233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm0, 8      // YUYV -> UVUV
266333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8
266433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm1
    // Split the interleaved UV bytes: even bytes to xmm0 (U), odd bytes to
    // xmm1 (V).
266533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
266633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm0, xmm5  // U
266733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm0
266833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8     // V
266933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm1, xmm1
267033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       qword ptr [edx], xmm0
267133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       qword ptr [edx + edi], xmm1
267233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 8]
267333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
267433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
267533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
267633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
267733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
267833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
267933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
268033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Extracts the Y bytes of a YUY2 row into dst_y, using movdqu for the loads
// and the store so neither pointer has to be 16 byte aligned.
// Each iteration reads 32 source bytes and writes 16 Y bytes, decrementing
// pix by 16. The loop tests pix after the store, so pix is effectively
// rounded up to a multiple of 16.
268133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
268233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2,
268333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                               uint8* dst_y, int pix) {
268433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
268533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]    // src_yuy2
268633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8]    // dst_y
268733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 12]   // pix
268833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5        // generate mask 0x00ff00ff
268933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm5, 8
269033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
269133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
269233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop:
269333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm0, [eax]
269433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm1, [eax + 16]
269533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 32]
269633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm0, xmm5   // even bytes are Y
269733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm1, xmm5
269833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm1
    // The movdqu and lea below do not modify flags, so jg still tests the
    // result of this sub.
269933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
270033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx], xmm0
270133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 16]
270233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
270333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
270433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
270533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
270633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Extracts U and V from two YUY2 rows, averaging them vertically, using
// movdqu for the loads so the source rows need not be 16 byte aligned.
// The second row is read at src_yuy2 + stride_yuy2 and combined with the
// first using pavgb (rounded byte average).
// Each iteration reads 32 bytes from each row and writes 8 U bytes and 8 V
// bytes, decrementing pix by 16. The loop tests pix after the stores, so pix
// is effectively rounded up to a multiple of 16.
270733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
270833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2,
270933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                uint8* dst_u, uint8* dst_v, int pix) {
271033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
    // Two pushes move the arguments 8 bytes further from esp.
271133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
271233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
271333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]    // src_yuy2
271433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]    // stride_yuy2
271533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 12]   // dst_u
271633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 16]   // dst_v
271733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]   // pix
271833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
271933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm5, 8
    // edi = dst_v - dst_u, so [edx + edi] addresses dst_v while only edx
    // advances.
272033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, edx
272133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
272233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
272333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop:
272433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm0, [eax]
272533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm1, [eax + 16]
272633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm2, [eax + esi]
272733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm3, [eax + esi + 16]
272833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 32]
    // Average the two rows byte by byte.
272933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm0, xmm2
273033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm1, xmm3
273133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm0, 8      // YUYV -> UVUV
273233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8
273333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm1
    // Split the interleaved UV bytes: even bytes to xmm0 (U), odd bytes to
    // xmm1 (V).
273433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
273533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm0, xmm5  // U
273633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm0
273733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8     // V
273833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm1, xmm1
273933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       qword ptr [edx], xmm0
274033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       qword ptr [edx + edi], xmm1
274133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 8]
274233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
274333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
274433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
274533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
274633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
274733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
274833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
274933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
275033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Extracts U and V from a single YUY2 row with no vertical averaging, using
// movdqu for the loads so src_yuy2 need not be 16 byte aligned.
// Each iteration reads 32 source bytes and writes 8 U bytes and 8 V bytes,
// decrementing pix by 16. The loop tests pix after the stores, so pix is
// effectively rounded up to a multiple of 16.
275133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
275233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2,
275333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   uint8* dst_u, uint8* dst_v, int pix) {
275433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
    // One push moves the arguments 4 bytes further from esp.
275533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
275633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4 + 4]    // src_yuy2
275733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 4 + 8]    // dst_u
275833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 4 + 12]   // dst_v
275933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 4 + 16]   // pix
276033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
276133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm5, 8
    // edi = dst_v - dst_u, so [edx + edi] addresses dst_v while only edx
    // advances.
276233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, edx
276333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
276433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
276533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop:
276633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm0, [eax]
276733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm1, [eax + 16]
276833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 32]
276933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm0, 8      // YUYV -> UVUV
277033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8
277133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm1
    // Split the interleaved UV bytes: even bytes to xmm0 (U), odd bytes to
    // xmm1 (V).
277233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
277333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm0, xmm5  // U
277433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm0
277533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8     // V
277633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm1, xmm1
277733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       qword ptr [edx], xmm0
277833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       qword ptr [edx + edi], xmm1
277933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 8]
278033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
278133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
278233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
278333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
278433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
278533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
278633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
278733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Extracts the Y bytes of a UYVY row into dst_y.
// Y is taken from the odd bytes by shifting each 16 bit word right by 8, so
// no mask register is needed.
// Each iteration reads 32 source bytes and writes 16 Y bytes, decrementing
// pix by 16. The loop tests pix after the store, so pix is effectively
// rounded up to a multiple of 16.
// Loads and the store use movdqa, so src_uyvy and dst_y must be 16 byte
// aligned.
278833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
278933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid UYVYToYRow_SSE2(const uint8* src_uyvy,
279033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                     uint8* dst_y, int pix) {
279133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
279233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]    // src_uyvy
279333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8]    // dst_y
279433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 12]   // pix
279533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
279633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
279733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop:
279833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [eax]
279933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax + 16]
280033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 32]
280133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm0, 8    // odd bytes are Y
280233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8
280333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm1
    // The movdqa and lea below do not modify flags, so jg still tests the
    // result of this sub.
280433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
280533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx], xmm0
280633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 16]
280733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
280833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
280933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
281033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
281133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Extracts U and V from two UYVY rows, averaging them vertically.
// The second row is read at src_uyvy + stride_uyvy and combined with the
// first using pavgb (rounded byte average). U and V are taken from the even
// bytes with the 0x00ff mask.
// Each iteration reads 32 bytes from each row and writes 8 U bytes and 8 V
// bytes, decrementing pix by 16. The loop tests pix after the stores, so pix
// is effectively rounded up to a multiple of 16.
// All four loads use movdqa, so both rows must be 16 byte aligned; the
// outputs are written with movq.
281233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
281333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid UYVYToUVRow_SSE2(const uint8* src_uyvy, int stride_uyvy,
281433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                      uint8* dst_u, uint8* dst_v, int pix) {
281533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
    // Two pushes move the arguments 8 bytes further from esp.
281633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
281733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
281833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]    // src_uyvy
281933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]    // stride_uyvy
282033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 12]   // dst_u
282133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 16]   // dst_v
282233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]   // pix
282333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
282433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm5, 8
    // edi = dst_v - dst_u, so [edx + edi] addresses dst_v while only edx
    // advances.
282533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, edx
282633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
282733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
282833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop:
282933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [eax]
283033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax + 16]
283133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm2, [eax + esi]
283233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm3, [eax + esi + 16]
283333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 32]
    // Average the two rows byte by byte.
283433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm0, xmm2
283533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm1, xmm3
283633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm0, xmm5   // UYVY -> UVUV
283733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm1, xmm5
283833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm1
    // Split the interleaved UV bytes: even bytes to xmm0 (U), odd bytes to
    // xmm1 (V).
283933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
284033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm0, xmm5  // U
284133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm0
284233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8     // V
284333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm1, xmm1
284433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       qword ptr [edx], xmm0
284533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       qword ptr [edx + edi], xmm1
284633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 8]
284733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
284833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
284933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
285033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
285133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
285233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
285333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
285433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
285533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Extracts U and V from a single UYVY row, with no vertical averaging.
// U and V are taken from the even bytes with the 0x00ff mask.
// Each iteration reads 32 source bytes and writes 8 U bytes and 8 V bytes,
// decrementing pix by 16. The loop tests pix after the stores, so pix is
// effectively rounded up to a multiple of 16.
// Loads use movdqa, so src_uyvy must be 16 byte aligned; the outputs are
// written with movq.
285633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
285733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid UYVYToUV422Row_SSE2(const uint8* src_uyvy,
285833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                         uint8* dst_u, uint8* dst_v, int pix) {
285933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
    // One push moves the arguments 4 bytes further from esp.
286033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
286133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4 + 4]    // src_uyvy
286233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 4 + 8]    // dst_u
286333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 4 + 12]   // dst_v
286433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 4 + 16]   // pix
286533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
286633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm5, 8
    // edi = dst_v - dst_u, so [edx + edi] addresses dst_v while only edx
    // advances.
286733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, edx
28687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
286933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
287033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop:
287133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [eax]
287233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax + 16]
287333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 32]
287433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm0, xmm5   // UYVY -> UVUV
287533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm1, xmm5
287633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm1
    // Split the interleaved UV bytes: even bytes to xmm0 (U), odd bytes to
    // xmm1 (V).
287733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
287833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm0, xmm5  // U
287933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm0
288033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8     // V
288133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm1, xmm1
288233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       qword ptr [edx], xmm0
288333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       qword ptr [edx + edi], xmm1
288433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 8]
288533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
288633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
288733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
288833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
28897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
28907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
28917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
28927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Extracts the luma bytes of one UYVY row into a Y plane.  Every loop
// iteration consumes 32 source bytes and writes 16 Y bytes.
//   src_uyvy: packed source row; read with movdqu, so no alignment is
//             required.
//   dst_y: output plane; written with movdqu, so no alignment is required.
//   pix: width in pixels.  The count is tested after the loop body, so one
//        iteration always runs and the work is rounded up to 16 pixels.
289333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
289433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy,
289533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                               uint8* dst_y, int pix) {
289633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
289733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]    // src_uyvy
289833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8]    // dst_y
289933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 12]   // pix
290033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
290133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
290233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop:
290333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm0, [eax]
290433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm1, [eax + 16]
290533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 32]
    // Shifting each 16-bit word right by 8 moves the odd byte into the low
    // half and clears the high half, so packuswb below cannot saturate.
290633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm0, 8    // odd bytes are Y
290733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8
290833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm1
    // The flags set by this sub are consumed by the jg below; the movdqu and
    // lea in between do not modify flags.
290933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
291033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx], xmm0
291133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 16]
291233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
291333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
291433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
291533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
291633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Averages two UYVY rows and splits the resulting chroma into separate U and
// V planes.  Every loop iteration consumes 32 bytes from each of the two rows
// (16 pixels) and emits 8 U bytes and 8 V bytes.
//   src_uyvy: first packed source row; read with movdqu, so no alignment is
//             required.
//   stride_uyvy: byte offset from src_uyvy to the second row that is averaged
//                in (it is used directly as an address offset).
//   dst_u, dst_v: output planes; written with movq.
//   pix: width in pixels.  The count is tested after the loop body, so one
//        iteration always runs and the work is rounded up to 16 pixels.
291733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
291833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy,
291933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                uint8* dst_u, uint8* dst_v, int pix) {
292033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
292133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
292233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
    // The arguments sit 8 bytes further up the stack because of the two
    // pushes above.
292333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]    // src_uyvy
292433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]    // stride_uyvy
292533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 12]   // dst_u
292633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 16]   // dst_v
292733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 20]   // pix
292833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
292933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm5, 8
    // edi becomes the (dst_v - dst_u) offset, so only edx has to be advanced
    // in the loop and [edx + edi] still addresses dst_v.
293033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, edx
293133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
293233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
293333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop:
293433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm0, [eax]
293533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm1, [eax + 16]
293633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm2, [eax + esi]
293733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm3, [eax + esi + 16]
293833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 32]
    // Byte-wise rounded average of the two rows.  The Y bytes are averaged
    // too, but they are masked away immediately below.
293933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm0, xmm2
294033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm1, xmm3
294133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm0, xmm5   // UYVY -> UVUV
294233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm1, xmm5
294333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm1
294433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
294533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm0, xmm5  // U
294633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm0
294733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8     // V
294833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm1, xmm1
    // Only the low 8 bytes of each register hold results.
294933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       qword ptr [edx], xmm0
295033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       qword ptr [edx + edi], xmm1
295133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 8]
295233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
295333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
295433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
295533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
295633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
295733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
295833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
295933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
296033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
// Splits the chroma of one UYVY row into separate U and V planes, with no
// vertical averaging.  Same instruction sequence as UYVYToUV422Row_SSE2 except
// that the source is read with movdqu, so src_uyvy needs no alignment.
// Every loop iteration consumes 32 source bytes (16 pixels) and emits 8 U
// bytes and 8 V bytes.
//   src_uyvy: packed source row.
//   dst_u, dst_v: output planes; written with movq.
//   pix: width in pixels.  The count is tested after the loop body, so one
//        iteration always runs and the work is rounded up to 16 pixels.
296133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
296233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy,
296333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                   uint8* dst_u, uint8* dst_v, int pix) {
296433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
296533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
    // The arguments sit 4 bytes further up the stack because of the push above.
296633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4 + 4]    // src_uyvy
296733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 4 + 8]    // dst_u
296833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 4 + 12]   // dst_v
296933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 4 + 16]   // pix
297033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5            // generate mask 0x00ff00ff
297133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm5, 8
    // edi becomes the (dst_v - dst_u) offset, so only edx has to be advanced
    // in the loop and [edx + edi] still addresses dst_v.
297233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, edx
297333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
297433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
297533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop:
297633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm0, [eax]
297733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm1, [eax + 16]
297833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax,  [eax + 32]
    // Keep the even bytes (the chroma samples) and drop the odd bytes (Y).
297933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm0, xmm5   // UYVY -> UVUV
298033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm1, xmm5
298133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm1
298233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
298333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm0, xmm5  // U
298433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm0
298533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8     // V
298633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm1, xmm1
    // Only the low 8 bytes of each register hold results.
298733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       qword ptr [edx], xmm0
298833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       qword ptr [edx + edi], xmm1
298933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 8]
299033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 16
299133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
299233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
299333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
299433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
299533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
299633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
299733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_YUY2TOYROW_SSE2
299833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
299933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBBLENDROW_SSE2
300033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Blend 4 pixels at a time, with 1 pixel loops for alignment and remainder.
// Blends a row of src_argb0 over src_argb1 and writes the result to dst_argb.
// With a = the top byte of each 32-bit src_argb0 pixel (the 0xff000000 mask),
// every other byte of the output pixel is
//   saturate(src_argb0 + ((src_argb1 * (256 - a)) >> 8))
// and the output alpha byte is forced to 255.
// NOTE(review): src_argb0 is added without being scaled by its own alpha,
// which suggests it is expected to be premultiplied - confirm against callers.
//
// Structure:
//   alignloop1   - 1 pixel per pass until dst_argb is 16-byte aligned.
//   convertloop4 - 4 pixels per pass; unaligned loads, aligned store.
//   convertloop1 - 1 pixel per pass for whatever is left.
// ecx holds (remaining - 1) inside the 1 pixel loops and (remaining - 4)
// inside the 4 pixel loop, so each loop can continue on "jge".
// A width of 0 or less returns without reading or writing any pixel.
300133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
300233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1,
300333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                       uint8* dst_argb, int width) {
300433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
300533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
300633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4 + 4]   // src_argb0
300733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 4 + 8]   // src_argb1
300833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 4 + 12]  // dst_argb
300933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 4 + 16]  // width
301033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm7, xmm7       // generate constant 1
301133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm7, 15
301233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
301333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm6, 8
301433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
301533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psllw      xmm5, 8
301633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
301733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslld      xmm4, 24
301833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
    // From here on ecx is biased to (remaining - 1).
301933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 1
302033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    je         convertloop1     // only 1 pixel?
302133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jl         convertloop1b
302233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
302333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 1 pixel loop until destination pointer is aligned.
302433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  alignloop1:
302533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    test       edx, 15          // aligned?
302633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    je         alignloop1b
302733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm3, [eax]
302833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 4]
302933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, xmm3       // src argb
303033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm3, xmm4       // ~alpha
303133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm2, [esi]      // _r_b
303233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm3, 8          // alpha
303333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
303433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshuflw    xmm3, xmm3,0F5h
303533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm2, xmm6       // _r_b
303633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddw      xmm3, xmm7       // 256 - alpha
303733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
303833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm1, [esi]      // _a_g
303933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        esi, [esi + 4]
304033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8          // _a_g
304133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por        xmm0, xmm4       // set alpha to 255
304233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
304333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm2, 8          // _r_b convert to 8 bits again
304433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddusb    xmm0, xmm2       // + src argb
304533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
304633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddusb    xmm0, xmm1       // + src argb
    // The flags set by this sub are consumed by the jge below; the movd and
    // lea in between do not modify flags.
304733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 1
304833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       [edx], xmm0
304933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 4]
305033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jge        alignloop1
305133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
305233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  alignloop1b:
    // Re-bias ecx from (remaining - 1) to (remaining - 4); a negative result
    // means fewer than 4 pixels are left.
305333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    add        ecx, 1 - 4
305433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jl         convertloop4b
305533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
305633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 4 pixel loop.
305733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop4:
305833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm3, [eax]      // src argb
305933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 16]
306033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, xmm3       // src argb
306133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm3, xmm4       // ~alpha
306233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm2, [esi]      // _r_b
306333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm3, 8          // alpha
306433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
306533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshuflw    xmm3, xmm3,0F5h
306633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm2, xmm6       // _r_b
306733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddw      xmm3, xmm7       // 256 - alpha
306833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
306933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm1, [esi]      // _a_g
307033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        esi, [esi + 16]
307133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8          // _a_g
307233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por        xmm0, xmm4       // set alpha to 255
307333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
307433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm2, 8          // _r_b convert to 8 bits again
307533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddusb    xmm0, xmm2       // + src argb
307633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
307733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddusb    xmm0, xmm1       // + src argb
307833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 4
    // Aligned store: edx is 16-byte aligned whenever this loop is reached
    // through alignloop1.
307933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx], xmm0
308033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 16]
308133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jge        convertloop4
308233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
308333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop4b:
    // Re-bias ecx from (remaining - 4) back to (remaining - 1); a negative
    // result means no pixels are left.
308433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    add        ecx, 4 - 1
308533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jl         convertloop1b
308633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
308733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 1 pixel loop.
308833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop1:
308933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm3, [eax]      // src argb
309033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 4]
309133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, xmm3       // src argb
309233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm3, xmm4       // ~alpha
309333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm2, [esi]      // _r_b
309433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm3, 8          // alpha
309533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
309633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshuflw    xmm3, xmm3,0F5h
309733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm2, xmm6       // _r_b
309833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddw      xmm3, xmm7       // 256 - alpha
309933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmullw     xmm2, xmm3       // _r_b * (256 - alpha)
310033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm1, [esi]      // _a_g
310133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        esi, [esi + 4]
310233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8          // _a_g
310333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por        xmm0, xmm4       // set alpha to 255
310433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmullw     xmm1, xmm3       // _a_g * (256 - alpha)
310533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm2, 8          // _r_b convert to 8 bits again
310633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddusb    xmm0, xmm2       // + src argb
310733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
310833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddusb    xmm0, xmm1       // + src argb
310933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 1
311033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       [edx], xmm0
311133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 4]
311233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jge        convertloop1
311333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
311433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop1b:
311533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
311633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
311733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
311833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
311933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBBLENDROW_SSE2
312033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
312133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBBLENDROW_SSSE3
312233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for isolating alpha.
// Used as a pshufb control.  Each pair of entries describes one 16-bit lane:
// the low byte is copied from source byte 3, 7, 11 or 15 (the top byte of each
// 32-bit pixel) and the high byte is zeroed, because index 0x80 has its high
// bit set.  Each pixel's byte is placed in both of that pixel's lanes.
312333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleAlpha = {
312433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80,
312533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80
312633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
312733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Same as SSE2, but replaces:
312833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp//    psrlw      xmm3, 8          // alpha
312933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp//    pshufhw    xmm3, xmm3,0F5h  // 8 alpha words
313033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp//    pshuflw    xmm3, xmm3,0F5h
313133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// with..
313233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp//    pshufb     xmm3, kShuffleAlpha // alpha
313333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Blend 4 pixels at a time.
313433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
313533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
313633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1,
313733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                        uint8* dst_argb, int width) {
313833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
313933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
314033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4 + 4]   // src_argb0
314133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 4 + 8]   // src_argb1
314233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 4 + 12]  // dst_argb
314333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 4 + 16]  // width
314433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm7, xmm7       // generate constant 1
314533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm7, 15
314633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm6, xmm6       // generate mask 0x00ff00ff
314733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm6, 8
314833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5       // generate mask 0xff00ff00
314933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psllw      xmm5, 8
315033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000
315133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslld      xmm4, 24
315233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
315333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 1
315433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    je         convertloop1     // only 1 pixel?
315533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jl         convertloop1b
315633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
315733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 1 pixel loop until destination pointer is aligned.
315833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  alignloop1:
315933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    test       edx, 15          // aligned?
316033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    je         alignloop1b
316133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm3, [eax]
316233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 4]
316333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, xmm3       // src argb
316433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm3, xmm4       // ~alpha
316533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm2, [esi]      // _r_b
316633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb     xmm3, kShuffleAlpha // alpha
316733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm2, xmm6       // _r_b
316833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddw      xmm3, xmm7       // 256 - alpha
316933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmullw     xmm2, xmm3       // _r_b * alpha
317033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm1, [esi]      // _a_g
317133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        esi, [esi + 4]
317233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8          // _a_g
317333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por        xmm0, xmm4       // set alpha to 255
317433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmullw     xmm1, xmm3       // _a_g * alpha
317533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm2, 8          // _r_b convert to 8 bits again
317633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddusb    xmm0, xmm2       // + src argb
317733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
317833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddusb    xmm0, xmm1       // + src argb
317933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 1
318033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       [edx], xmm0
318133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 4]
318233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jge        alignloop1
318333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
318433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  alignloop1b:
318533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    add        ecx, 1 - 4
318633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jl         convertloop4b
318733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
318833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    test       eax, 15          // unaligned?
318933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jne        convertuloop4
319033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    test       esi, 15          // unaligned?
319133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jne        convertuloop4
319233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
319333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 4 pixel loop.
319433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop4:
319533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm3, [eax]      // src argb
319633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 16]
319733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, xmm3       // src argb
319833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm3, xmm4       // ~alpha
319933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm2, [esi]      // _r_b
320033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb     xmm3, kShuffleAlpha // alpha
320133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm2, xmm6       // _r_b
320233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddw      xmm3, xmm7       // 256 - alpha
320333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmullw     xmm2, xmm3       // _r_b * alpha
320433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [esi]      // _a_g
320533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        esi, [esi + 16]
320633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8          // _a_g
320733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por        xmm0, xmm4       // set alpha to 255
320833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmullw     xmm1, xmm3       // _a_g * alpha
320933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm2, 8          // _r_b convert to 8 bits again
321033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddusb    xmm0, xmm2       // + src argb
321133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
321233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddusb    xmm0, xmm1       // + src argb
321333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 4
321433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx], xmm0
321533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 16]
321633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jge        convertloop4
321733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jmp        convertloop4b
321833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
321933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 4 pixel unaligned loop.
322033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertuloop4:
322133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm3, [eax]      // src argb
322233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 16]
322333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, xmm3       // src argb
322433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm3, xmm4       // ~alpha
322533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm2, [esi]      // _r_b
322633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb     xmm3, kShuffleAlpha // alpha
322733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm2, xmm6       // _r_b
322833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddw      xmm3, xmm7       // 256 - alpha
322933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmullw     xmm2, xmm3       // _r_b * alpha
323033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm1, [esi]      // _a_g
323133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        esi, [esi + 16]
323233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8          // _a_g
323333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por        xmm0, xmm4       // set alpha to 255
323433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmullw     xmm1, xmm3       // _a_g * alpha
323533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm2, 8          // _r_b convert to 8 bits again
323633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddusb    xmm0, xmm2       // + src argb
323733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
323833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddusb    xmm0, xmm1       // + src argb
323933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 4
324033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx], xmm0
324133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 16]
324233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jge        convertuloop4
324333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
324433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop4b:
324533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    add        ecx, 4 - 1
324633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jl         convertloop1b
324733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
324833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 1 pixel loop.
324933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop1:
325033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm3, [eax]      // src argb
325133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 4]
325233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, xmm3       // src argb
325333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm3, xmm4       // ~alpha
325433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm2, [esi]      // _r_b
325533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb     xmm3, kShuffleAlpha // alpha
325633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm2, xmm6       // _r_b
325733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddw      xmm3, xmm7       // 256 - alpha
325833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmullw     xmm2, xmm3       // _r_b * alpha
325933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm1, [esi]      // _a_g
326033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        esi, [esi + 4]
326133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8          // _a_g
326233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por        xmm0, xmm4       // set alpha to 255
326333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmullw     xmm1, xmm3       // _a_g * alpha
326433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm2, 8          // _r_b convert to 8 bits again
326533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddusb    xmm0, xmm2       // + src argb
326633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm1, xmm5       // a_g_ convert to 8 bits again
326733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddusb    xmm0, xmm1       // + src argb
326833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 1
326933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       [edx], xmm0
327033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 4]
327133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jge        convertloop1
327233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
327333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  convertloop1b:
327433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
327533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
327633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
327733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
327833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBBLENDROW_SSSE3
327933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
328033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBATTENUATE_SSE2
328133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Attenuate (premultiply by alpha) ARGB pixels, 4 pixels at a time.
328233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// src_argb and dst_argb are accessed with movdqa, so both must be 16 byte aligned.
// Each color byte c and its pixel's alpha byte a are widened to the words
// c * 0x101 and a * 0x101, multiplied keeping the high 16 bits, then shifted
// right by 8 to get back to a byte. The alpha byte itself is copied through
// from the source unchanged.
// The count is tested after each group, so at least 4 pixels are always
// processed and width is effectively rounded up to a multiple of 4.
328333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
328433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) {
328533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
328633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]   // src_argb
328733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8]   // dst_argb
328833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 12]  // width
328933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edx, eax         // edx = dst - src, so [eax + edx] addresses dst
329033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000 (alpha byte)
329133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslld      xmm4, 24
329233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm5, xmm5       // generate mask 0x00ffffff (color bytes)
329333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrld      xmm5, 8
329433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
329533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
329633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
329733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [eax]      // read 4 pixels
329833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm0       // first 2 pixels, each byte b becomes word b * 0x101
329933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufhw    xmm2, xmm0,0FFh  // broadcast alpha word (word 3) within the high 4 words
330033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshuflw    xmm2, xmm2,0FFh  // and within the low 4 words: 8 alpha words
330133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulhuw    xmm0, xmm2       // argb * a, high 16 bits of each product
330233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax]      // read 4 pixels again
330333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhbw  xmm1, xmm1       // next 2 pixels, widened the same way
330433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufhw    xmm2, xmm1,0FFh  // 8 alpha words for pixels 2 and 3
330533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshuflw    xmm2, xmm2,0FFh
330633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulhuw    xmm1, xmm2       // argb * a, high 16 bits of each product
330733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm2, [eax]      // read 4 pixels to recover the original alphas
330833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm0, 8          // scale products back to 8 bits
330933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm2, xmm4       // isolate original alpha bytes
331033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8          // scale products back to 8 bits
331133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm1       // words back to 16 bytes (4 pixels)
331233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm0, xmm5       // drop the computed alpha lane, keep attenuated colors
331333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por        xmm0, xmm2       // merge in the original alphas
331433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 4           // 4 pixels done; sets the flags tested by jg below
331533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [eax + edx], xmm0  // store to dst (does not modify flags)
331633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 16]  // advance src (lea does not modify flags)
331733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop      // loop while remaining count > 0
331833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
331933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
332033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
332133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
332233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBATTENUATE_SSE2
332333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
332433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBATTENUATEROW_SSSE3
332533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle tables duplicating alpha, used as pshufb control masks.
// For each pixel the alpha byte index is repeated over the 6 bytes that line
// up with the B, G and R words, and the last 2 bytes use index 128 (high bit
// set), which makes pshufb write zero so the alpha word multiplies to 0.
// kShuffleAlpha0 covers pixels 0 and 1 (alpha at byte 3 and byte 7).
332633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleAlpha0 = {
332733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u,
332833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
// kShuffleAlpha1 covers pixels 2 and 3 (alpha at byte 11 and byte 15).
332933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleAlpha1 = {
333033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u,
333133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u,
333233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
// Attenuate (premultiply by alpha) ARGB pixels, 4 pixels at a time, using
// pshufb to build the alpha multipliers.
// src_argb and dst_argb are accessed with movdqa, so both must be 16 byte aligned.
// The shuffle tables leave a zero word in each pixel's alpha lane, so the
// multiplied alpha lane is 0 and the original alpha can simply be OR'ed in.
// The count is tested after each group, so at least 4 pixels are always
// processed and width is effectively rounded up to a multiple of 4.
333333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
333433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
333533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
333633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]   // src_argb
333733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8]   // dst_argb
333833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 12]  // width
333933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edx, eax         // edx = dst - src, so [eax + edx] addresses dst
334033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm3, xmm3       // generate mask 0xff000000 (alpha byte)
334133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslld      xmm3, 24
334233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, kShuffleAlpha0  // alpha shuffle for pixels 0 and 1
334333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, kShuffleAlpha1  // alpha shuffle for pixels 2 and 3
334433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
334533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
334633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
334733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [eax]      // read 4 pixels
334833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb     xmm0, xmm4       // first 2 alphas as words a * 0x101, alpha lane = 0
334933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax]      // read 4 pixels
335033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm1, xmm1       // first 2 pixels, each byte b becomes word b * 0x101
335133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulhuw    xmm0, xmm1       // rgb * a, high 16 bits of each product
335233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax]      // read 4 pixels
335333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufb     xmm1, xmm5       // next 2 alphas as words a * 0x101, alpha lane = 0
335433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm2, [eax]      // read 4 pixels
335533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhbw  xmm2, xmm2       // next 2 pixels, widened the same way
335633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulhuw    xmm1, xmm2       // rgb * a, high 16 bits of each product
335733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm2, [eax]      // read 4 pixels to recover the original alphas
335833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm2, xmm3       // isolate original alpha bytes
335933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm0, 8          // scale products back to 8 bits
336033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8
336133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm1       // words back to 16 bytes (4 pixels)
336233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por        xmm0, xmm2       // copy original alpha into the zeroed alpha lane
336333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 4           // 4 pixels done; sets the flags tested by jg below
336433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [eax + edx], xmm0  // store to dst (does not modify flags)
336533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 16]  // advance src (lea does not modify flags)
336633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop      // loop while remaining count > 0
336733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
336833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
336933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
337033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
337133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBATTENUATEROW_SSSE3
337233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
337333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBUNATTENUATEROW_SSE2
337433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Unattenuate 4 pixels at a time.
337533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// src_argb and dst_argb are accessed with movdqa, so both must be 16 byte aligned.
// Each pixel's alpha byte indexes fixed_invtbl8 (defined elsewhere in the
// project; presumably a fixed point reciprocal-of-alpha table - confirm
// against its definition). The low word of the table entry multiplies the
// widened B, G and R values; the alpha byte is copied through unchanged.
// The count is tested after each group, so at least 4 pixels are always
// processed and width is effectively rounded up to a multiple of 4.
337633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
337733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb,
337833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                             int width) {
337933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
338033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi              // saved and restored; used for alpha indices
338133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
338233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 4]   // src_argb (+ 8 skips the 2 pushes)
338333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 8]   // dst_argb
338433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 12]  // width
338533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edx, eax         // edx = dst - src, so [eax + edx] addresses dst
338633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm4, xmm4       // generate mask 0xff000000 (alpha byte)
338733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslld      xmm4, 24
338833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
338933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
339033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
339133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [eax]      // read 4 pixels
339233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movzx      esi, byte ptr [eax + 3]  // first alpha
339333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movzx      edi, byte ptr [eax + 7]  // second alpha
339433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm0       // first 2 pixels, each byte b becomes word b * 0x101
339533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]  // table entry for first alpha
339633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]  // table entry for second alpha
339733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshuflw    xmm2, xmm2,0C0h  // words 0,0,0,3: low word x3, alpha lane = 0 (movd zeroed word 3)
339833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshuflw    xmm3, xmm3,0C0h  // same for the second pixel
339933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movlhps    xmm2, xmm3       // combine: low 4 words pixel 0, high 4 words pixel 1
340033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulhuw    xmm0, xmm2       // rgb * inv_alpha, high 16 bits of each product
340133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
340233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax]      // read 4 pixels
340333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movzx      esi, byte ptr [eax + 11]  // third alpha
340433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movzx      edi, byte ptr [eax + 15]  // fourth alpha
340533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhbw  xmm1, xmm1       // next 2 pixels, widened the same way
340633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm2, dword ptr fixed_invtbl8[esi * 4]  // table entry for third alpha
340733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm3, dword ptr fixed_invtbl8[edi * 4]  // table entry for fourth alpha
340833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshuflw    xmm2, xmm2,0C0h  // words 0,0,0,3: low word x3, alpha lane = 0
340933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshuflw    xmm3, xmm3,0C0h  // same for the fourth pixel
341033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movlhps    xmm2, xmm3       // combine: low 4 words pixel 2, high 4 words pixel 3
341133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulhuw    xmm1, xmm2       // rgb * inv_alpha, high 16 bits of each product
341233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
341333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm2, [eax]      // read 4 pixels to recover the original alphas
341433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm2, xmm4       // isolate original alpha bytes
341533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm1       // pack words to bytes with saturation; alpha lane is 0
341633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por        xmm0, xmm2       // merge in the original alphas
341733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 4           // 4 pixels done; sets the flags tested by jg below
341833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [eax + edx], xmm0  // store to dst (does not modify flags)
341933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 16]  // advance src (lea does not modify flags)
342033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop      // loop while remaining count > 0
342133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
342233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
342333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
342433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
342533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
342633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBUNATTENUATEROW_SSE2
342733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
342833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBGRAYROW_SSSE3
342933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Constant for ARGB color to gray scale: 0.11 * B + 0.59 * G + 0.30 * R
// Coefficients are scaled by 128 (14 + 76 + 38 = 128) and stored in B, G, R, A
// byte order with a 0 weight for alpha; the user shifts the sum right by 7.
343033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kARGBToGray = {
343133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0
343233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
343333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
343433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Convert 8 ARGB pixels (32 bytes) to 8 Gray ARGB pixels.
// The gray value is written to the B, G and R bytes of each output pixel and
// the source alpha is copied through.
// src_argb and dst_argb are accessed with movdqa, so both must be 16 byte aligned.
// The count is tested after each group, so at least 8 pixels are always
// processed and width is effectively rounded up to a multiple of 8.
343533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
343633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) {
343733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
343833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]   /* src_argb */
343933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8]   /* dst_argb */
344033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 12]  /* width */
344133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, kARGBToGray  // per-channel weights, scaled by 128
344233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edx, eax         // edx = dst - src, so [eax + edx] addresses dst
344333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
344433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
344533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
344633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [eax]  // G: first 4 pixels
344733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax + 16]  // next 4 pixels
344833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm0, xmm4   // words: B * 14 + G * 76, R * 38 + A * 0
344933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm1, xmm4
345033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm0, xmm1   // add word pairs: one weighted sum per pixel, 8 words
345133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm0, 7      // divide by 128
345233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm0   // 8 G bytes
345333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm2, [eax]  // A: reload the same 8 pixels
345433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm3, [eax + 16]
345533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrld      xmm2, 24     // alpha moved to the low byte of each dword
345633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrld      xmm3, 24
345733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm2, xmm3   // 8 A words
345833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm2, xmm2   // 8 A bytes
345933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm3, xmm0   // Weave into GG, GA, then GGGA
346033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm0   // 8 GG words
346133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm3, xmm2   // 8 GA words
346233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0   // copy GG words for the high half
346333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm0, xmm3   // GGGA first 4
346433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm1, xmm3   // GGGA next 4
346533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8       // 8 pixels done; sets the flags tested by jg below
346633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [eax + edx], xmm0  // store to dst (does not modify flags)
346733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [eax + edx + 16], xmm1
346833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 32]  // advance src (lea does not modify flags)
346933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop  // loop while remaining count > 0
347033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
347133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
347233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
347333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBGRAYROW_SSSE3
347433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
347533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBSEPIAROW_SSSE3
// Sepia tone formulas; each output channel is a weighted sum of the input
// B, G and R, divided by 128:
347633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp//    b = (r * 35 + g * 68 + b * 17) >> 7
347733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp//    g = (r * 45 + g * 88 + b * 22) >> 7
347833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp//    r = (r * 50 + g * 98 + b * 24) >> 7
347933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Constants for ARGB color to sepia tone, one table per output channel.
// Each table stores its weights in B, G, R, A byte order with a 0 weight for
// alpha, repeated for 4 pixels.
348033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kARGBToSepiaB = {
348133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0
348233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
348333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
348433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kARGBToSepiaG = {
348533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0
348633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
348733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
348833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kARGBToSepiaR = {
348933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0
349033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp};
349133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
349233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// Works in place: pixels are read from and written back to dst_argb, and the
// alpha byte of each pixel is preserved.
// dst_argb is accessed with movdqa, so it must be 16 byte aligned.
// The count is tested after each group, so at least 8 pixels are always
// processed and width is effectively rounded up to a multiple of 8.
349333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
349433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) {
349533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
349633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]   /* dst_argb */
349733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8]   /* width */
349833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm2, kARGBToSepiaB  // weights producing the new B
349933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm3, kARGBToSepiaG  // weights producing the new G
350033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, kARGBToSepiaR  // weights producing the new R
350133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
350233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
350333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
350433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [eax]  // B: first 4 pixels
350533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm6, [eax + 16]  // next 4 pixels
350633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm0, xmm2   // words: B * 17 + G * 68, R * 35 + A * 0
350733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm6, xmm2
350833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm0, xmm6   // add word pairs: one weighted sum per pixel
350933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm0, 7      // divide by 128
351033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm0   // 8 B values
351133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, [eax]  // G: reload the same 8 pixels
351233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax + 16]
351333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm5, xmm3   // words: B * 22 + G * 88, R * 45 + A * 0
351433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm1, xmm3
351533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm5, xmm1   // one weighted sum per pixel
351633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm5, 7      // divide by 128
351733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm5, xmm5   // 8 G values, saturated to 255
351833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm5   // 8 BG values
351933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, [eax]  // R: reload the same 8 pixels
352033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax + 16]
352133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm5, xmm4   // words: B * 24 + G * 98, R * 50 + A * 0
352233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm1, xmm4
352333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddw     xmm5, xmm1   // one weighted sum per pixel
352433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm5, 7      // divide by 128
352533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm5, xmm5   // 8 R values, saturated to 255
352633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm6, [eax]  // A: reload the same 8 pixels
352733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax + 16]
352833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrld      xmm6, 24     // alpha moved to the low byte of each dword
352933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrld      xmm1, 24
353033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm6, xmm1   // 8 A words
353133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm6, xmm6   // 8 A values
353233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm5, xmm6   // 8 RA values
353333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0   // Weave BG, RA together
353433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm0, xmm5   // BGRA first 4
353533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm1, xmm5   // BGRA next 4
353633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8       // 8 pixels done; sets the flags tested by jg below
353733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [eax], xmm0  // write back in place (does not modify flags)
353833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [eax + 16], xmm1
353933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 32]  // advance (lea does not modify flags)
354033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop  // loop while remaining count > 0
354133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
354233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
354333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
354433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBSEPIAROW_SSSE3
354533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
354633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3
354733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Transform 8 ARGB pixels (32 bytes) with color matrix.
354833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Same as Sepia except matrix is provided.
354933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R
355033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd.
355133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
355233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb,
355333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                              int width) {
355433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
355533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]   /* dst_argb */
355633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8]   /* matrix_argb */
355733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 12]  /* width */
355833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm2, [edx]
355933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm3, [edx + 4]
356033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm4, [edx + 8]
356133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd     xmm2, xmm2, 0
356233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd     xmm3, xmm3, 0
356333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd     xmm4, xmm4, 0
356433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
356533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
356633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
356733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [eax]  // B
356833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm6, [eax + 16]
356933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm0, xmm2
357033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm6, xmm2
357133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, [eax]  // G
357233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax + 16]
357333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm5, xmm3
357433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm1, xmm3
357533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddsw    xmm0, xmm6   // B
357633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddsw    xmm5, xmm1   // G
357733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psraw      xmm0, 7      // B
357833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psraw      xmm5, 7      // G
357933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm0   // 8 B values
358033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm5, xmm5   // 8 G values
358133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm5   // 8 BG values
358233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, [eax]  // R
358333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax + 16]
358433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm5, xmm4
358533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm1, xmm4
358633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    phaddsw    xmm5, xmm1
358733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psraw      xmm5, 7
358833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm5, xmm5   // 8 R values
358933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm6, [eax]  // A
359033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax + 16]
359133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrld      xmm6, 24
359233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrld      xmm1, 24
359333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm6, xmm1
359433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm6, xmm6   // 8 A values
359533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0   // Weave BG, RA together
359633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm5, xmm6   // 8 RA values
359733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm0, xmm5   // BGRA first 4
359833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm1, xmm5   // BGRA next 4
359933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 8
360033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [eax], xmm0
360133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [eax + 16], xmm1
360233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 32]
360333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
360433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
360533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
360633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
360733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBCOLORMATRIXROW_SSSE3
360833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
360933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBCOLORTABLEROW_X86
361033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Transform ARGB pixels in place through a color table.
// dst_argb:   pixels; each one is read, looked up and rewritten.
// table_argb: byte table indexed as table_argb[value * 4 + channel] with
//             channel 0 = B, 1 = G, 2 = R, 3 = A, so byte offsets 0..1023 can
//             be read.
// width:      number of pixels.  The loop body runs once before width is
//             tested.
361133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
361233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb,
361333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                           int width) {
361433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
361533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       ebx
361633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
361733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
361833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       ebp
    // The 4 pushes above moved the arguments 16 bytes further from esp.
361933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 16 + 4]   /* dst_argb */
362033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 16 + 8]   /* table_argb */
362133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 16 + 12]  /* width */
362233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    xor        ebx, ebx  // bl carries looked up B, then R
362333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    xor        edx, edx  // dl carries looked up G, then A
362433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
362533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
362633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
362733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ebp, dword ptr [eax]  // BGRA
362833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, ebp
362933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    and        ebp, 255              // ebp = B index
363033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shr        esi, 8
363133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    and        esi, 255              // esi = G index
363233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        bl, [edi + ebp * 4 + 0]  // B
363333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        dl, [edi + esi * 4 + 1]  // G
363433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ebp, dword ptr [eax]  // BGRA
363533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, ebp
363633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shr        ebp, 16
363733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shr        esi, 24               // esi = A index (no mask needed)
363833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    and        ebp, 255              // ebp = R index
363933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        [eax], bl
364033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        [eax + 1], dl
364133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        bl, [edi + ebp * 4 + 2]  // R
364233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        dl, [edi + esi * 4 + 3]  // A
364333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        [eax + 2], bl
364433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        [eax + 3], dl
364533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 4]
364633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 1
364733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
    // Restore the saved registers in reverse push order.
364833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        ebp
364933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
365033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
365133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        ebx
365233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
365333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
365433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
365533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBCOLORTABLEROW_X86
365633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
365733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBQUANTIZEROW_SSE2
365833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Quantize 4 ARGB pixels (16 bytes) per iteration, in place.
365933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Aligned to 16 bytes.
// For each B, G and R byte v the loop computes, in 16 bit lanes,
//   ((v * scale) >> 16) * interval_size + interval_offset
// and saturates the result to a byte when repacking.  The alpha byte of the
// input is ORed back into the result.
// width is in pixels; the loop body runs once before width is tested.
366033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
366133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size,
366233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                          int interval_offset, int width) {
366333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
366433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]    /* dst_argb */
366533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm2, [esp + 8]   /* scale */
366633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm3, [esp + 12]  /* interval_size */
366733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm4, [esp + 16]  /* interval_offset */
366833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 20]   /* width */
    // pshuflw 40h selects words {0, 0, 0, 1}: the B, G and R lanes receive
    // the low 16 bits of the argument and the A lane its upper 16 bits.
    // pshufd 44h then copies the low 64 bits into the high 64 bits so the
    // second pixel in the register gets the same multipliers.
366933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshuflw    xmm2, xmm2, 040h
367033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd     xmm2, xmm2, 044h
367133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshuflw    xmm3, xmm3, 040h
367233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd     xmm3, xmm3, 044h
367333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshuflw    xmm4, xmm4, 040h
367433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd     xmm4, xmm4, 044h
367533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm5, xmm5  // constant 0
367633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pcmpeqb    xmm6, xmm6  // generate mask 0xff000000
367733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pslld      xmm6, 24
367833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
367933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
368033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
368133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [eax]  // read 4 pixels
368233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm5   // first 2 pixels
368333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulhuw    xmm0, xmm2   // pixel * scale >> 16
368433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax]  // read 4 pixels
368533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhbw  xmm1, xmm5   // next 2 pixels
368633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulhuw    xmm1, xmm2
368733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmullw     xmm0, xmm3   // * interval_size
368833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm7, [eax]  // read 4 pixels
368933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmullw     xmm1, xmm3
369033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pand       xmm7, xmm6   // mask alpha
369133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddw      xmm0, xmm4   // + interval_offset
369233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddw      xmm1, xmm4
369333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm1   // back to 4 pixels of bytes, saturated
    // OR the original alpha in.  The computed A lane is zero whenever the
    // upper 16 bits of scale, interval_size and interval_offset are zero.
369433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    por        xmm0, xmm7
369533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 4       // sets the flags consumed by jg below
369633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [eax], xmm0
369733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 16]
369833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
369933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
370033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
370133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
370233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBQUANTIZEROW_SSE2
370333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
370433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2
370533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Consider float CumulativeSum.
370633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Consider calling CumulativeSum one row at a time as needed.
370733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Consider circular CumulativeSum buffer of radius * 2 + 1 height.
370833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Convert cumulative sum for an area to an average for 1 pixel.
370933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// topleft is pointer to top left of CumulativeSum buffer for area.
371033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// botleft is pointer to bottom left of CumulativeSum buffer.
371133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// width is offset from left to right of area in CumulativeSum buffer measured
371233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp//   in number of ints.
371333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// area is the number of pixels in the area being averaged.
371433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// dst points to pixel to store result to.
371533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// count is number of averaged pixels to produce.
371633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte
371733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// aligned.
// Each output pixel consumes one 16 byte group of 4 int32 sums from each
// corner and is computed per channel as
//   (topleft - topright - botleft + botright) * (1 / area)
// where the "right" corners lie width ints after the "left" ones.  1 / area
// comes from rcpss, which yields an approximate reciprocal.  dst is written
// with unaligned stores.
// This function is not naked: arguments are referenced by name and the
// compiler supplies the prologue and epilogue.
371833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft,
371933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                 int width, int area, uint8* dst, int count) {
372033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
372133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, topleft  // eax topleft
372233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, botleft  // esi botleft
372333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, width
372433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm4, area
372533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, dst
372633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, count
372733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    cvtdq2ps   xmm4, xmm4  // area as float
372833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    rcpss      xmm4, xmm4  // 1.0f / area
372933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd     xmm4, xmm4, 0  // replicate to all 4 channels
373033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 4
373133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jl         l4b  // fewer than 4 pixels: skip the 4 pixel loop
373233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
373333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 4 pixel loop
373433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      4
373533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  l4:
373633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // top left
373733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [eax]
373833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, [eax + 16]
373933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm2, [eax + 32]
374033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm3, [eax + 48]
374133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
374233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // - top right
374333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psubd      xmm0, [eax + edx * 4]
374433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psubd      xmm1, [eax + edx * 4 + 16]
374533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psubd      xmm2, [eax + edx * 4 + 32]
374633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psubd      xmm3, [eax + edx * 4 + 48]
374733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 64]
374833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
374933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // - bottom left
375033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psubd      xmm0, [esi]
375133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psubd      xmm1, [esi + 16]
375233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psubd      xmm2, [esi + 32]
375333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psubd      xmm3, [esi + 48]
375433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
375533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // + bottom right
375633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm0, [esi + edx * 4]
375733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm1, [esi + edx * 4 + 16]
375833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm2, [esi + edx * 4 + 32]
375933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm3, [esi + edx * 4 + 48]
376033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        esi, [esi + 64]
376133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
376233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    cvtdq2ps   xmm0, xmm0   // Average = Sum * 1 / Area
376333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    cvtdq2ps   xmm1, xmm1
376433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mulps      xmm0, xmm4
376533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mulps      xmm1, xmm4
376633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    cvtdq2ps   xmm2, xmm2
376733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    cvtdq2ps   xmm3, xmm3
376833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mulps      xmm2, xmm4
376933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mulps      xmm3, xmm4
377033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    cvtps2dq   xmm0, xmm0   // back to int32
377133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    cvtps2dq   xmm1, xmm1
377233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    cvtps2dq   xmm2, xmm2
377333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    cvtps2dq   xmm3, xmm3
377433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packssdw   xmm0, xmm1   // int32 -> int16, saturating
377533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packssdw   xmm2, xmm3
377633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm2   // int16 -> uint8, saturating: 4 pixels
377733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edi], xmm0
377833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edi, [edi + 16]
377933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 4
378033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jge        l4
378133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
378233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  l4b:
    // ecx is (remaining - 4) here; adding 3 leaves (remaining - 1), which is
    // negative exactly when no pixels are left.
378333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    add        ecx, 4 - 1
378433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jl         l1b
378533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
378633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 1 pixel loop
378733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      4
378833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  l1:
378933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [eax]            // top left
379033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psubd      xmm0, [eax + edx * 4]  // - top right
379133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 16]
379233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psubd      xmm0, [esi]            // - bottom left
379333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm0, [esi + edx * 4]  // + bottom right
379433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        esi, [esi + 16]
379533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    cvtdq2ps   xmm0, xmm0
379633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mulps      xmm0, xmm4
379733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    cvtps2dq   xmm0, xmm0
379833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packssdw   xmm0, xmm0
379933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm0
380033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       dword ptr [edi], xmm0  // store 1 pixel (4 bytes)
380133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edi, [edi + 4]
380233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 1
380333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jge        l1
380433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  l1b:
380533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
380633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
380733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_CUMULATIVESUMTOAVERAGE_SSE2
380833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
380933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2
381033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Creates a table of cumulative sums where each value is a sum of all values
381133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// above and to the left of the value.
// row:             4 byte pixels for the current row (read unaligned).
// cumsum:          output; 4 int32 sums (16 bytes) are written per pixel.
// previous_cumsum: sums for the row above, same layout as cumsum.
// width:           number of pixels.
// For every pixel a per channel running sum of the row so far is kept and
// added to the matching entry of previous_cumsum.
// This function is not naked: arguments are referenced by name and the
// compiler supplies the prologue and epilogue.
381233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum,
381333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                                  const int32* previous_cumsum, int width) {
381433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
381533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, row
381633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, cumsum
381733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, previous_cumsum
381833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, width
381933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        esi, edx    // [edx + esi] now addresses previous_cumsum
382033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm0, xmm0  // running sum for this row, 4 x int32
382133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pxor       xmm1, xmm1  // constant 0 used to widen bytes
382233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
382333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 4
382433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jl         l4b         // fewer than 4 pixels
    // When cumsum is not 16 byte aligned the whole row goes through the
    // 1 pixel loop, which uses unaligned accesses.
    // NOTE(review): only cumsum is tested; the aligned loads from
    // [edx + esi] below assume previous_cumsum shares that alignment.
382533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    test       edx, 15
382633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jne        l4b
38277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
382833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 4 pixel loop
382933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      4
383033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  l4:
383133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm2, [eax]  // 4 argb pixels 16 bytes.
383233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 16]
383333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, xmm2
383433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
    // Widen bytes to int32: xmm2 = pixel 0, xmm3 = pixel 1.
383533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm2, xmm1
383633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm3, xmm2
383733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm2, xmm1
383833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm3, xmm1
383933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
    // xmm4 = pixel 2, xmm5 = pixel 3.
384033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhbw  xmm4, xmm1
384133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, xmm4
384233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm4, xmm1
384333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhwd  xmm5, xmm1
384433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
    // For each pixel in turn: extend the running sum, then add the entry
    // from the row above to form the output value.
384533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm0, xmm2
384633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm2, [edx + esi]  // previous row above.
384733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm2, xmm0
384833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
384933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm0, xmm3
385033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm3, [edx + esi + 16]
385133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm3, xmm0
385233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
385333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm0, xmm4
385433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, [edx + esi + 32]
385533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm4, xmm0
385633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
385733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm0, xmm5
385833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm5, [edx + esi + 48]
385933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm5, xmm0
386033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
386133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx], xmm2
386233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx + 16], xmm3
386333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx + 32], xmm4
386433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [edx + 48], xmm5
386533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
386633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 64]
386733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 4
386833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jge        l4
386933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
387033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  l4b:
    // ecx is (remaining - 4) here; adding 3 leaves (remaining - 1), which is
    // negative exactly when no pixels are left.
387133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    add        ecx, 4 - 1
387233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jl         l1b
387333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
387433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 1 pixel loop
387533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      4
387633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  l1:
387733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm2, dword ptr [eax]  // 1 argb pixel 4 bytes.
387833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 4]
387933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm2, xmm1
388033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm2, xmm1             // 4 bytes -> 4 int32
388133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm0, xmm2             // extend the running sum
388233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     xmm2, [edx + esi]      // previous row above
388333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    paddd      xmm2, xmm0
388433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqu     [edx], xmm2
388533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 16]
388633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 1
388733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jge        l1
388833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
388933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp l1b:
389033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
389133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
389233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_COMPUTECUMULATIVESUMROW_SSE2
389333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
389433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBSHADE_SSE2
389533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shade 4 pixels at a time by specified value.
389633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Aligned to 16 bytes.
// src_argb: source pixels, read with movdqa.
// dst_argb: destination pixels, written with movdqa.
// width:    number of pixels.  The loop body runs once before width is
//           tested and consumes 4 pixels per pass.
// value:    4 bytes; byte i scales byte i of every pixel.
// Both operands are widened by duplicating each byte b into the word
// b * 0x101, multiplied keeping the high 16 bits, then shifted right by 8.
389733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
389833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width,
389933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                       uint32 value) {
390033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
390133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 4]   // src_argb
390233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8]   // dst_argb
390333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 12]  // width
390433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm2, [esp + 16]  // value
390533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edx, eax         // [eax + edx] now addresses dst_argb
390633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm2, xmm2       // each value byte b -> word b * 0x101
390733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklqdq xmm2, xmm2       // same 4 words for the second pixel
390833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
390933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
391033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop:
391133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [eax]      // read 4 pixels
391233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
391333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm0       // first 2
391433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhbw  xmm1, xmm1       // next 2
391533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulhuw    xmm0, xmm2       // argb * value
391633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmulhuw    xmm1, xmm2       // argb * value
391733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm0, 8          // keep the top byte of each product word
391833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 8
391933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm1       // back to 4 pixels of bytes
392033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 4           // sets the flags consumed by jg below
392133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [eax + edx], xmm0
392233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        eax, [eax + 16]
392333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         convertloop
392433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
392533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
392633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
392733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
392833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBSHADE_SSE2
392933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
393033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBAFFINEROW_SSE2
393133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Copy ARGB pixels from source image with slope to a row of destination.
//
// uv_dudv points to 4 floats: the starting source coordinate (u, v) followed
// by the per-destination-pixel step (du, dv).  For each of the width output
// pixels the current (u, v) is truncated toward zero to integers (x, y), one
// 4-byte pixel is read from src_argb + x * 4 + y * src_argb_stride and stored
// to dst_argb, and (u, v) is then advanced by (du, dv).
//
// Notes:
// - x and y are saturated to signed 16 bits (packssdw) and the stride is used
//   as a signed 16-bit pmaddwd multiplier; only the low 16 bits of
//   src_argb_stride survive the "shl esi, 16" below.
// - The computed offset is used as-is; this routine does no bounds checking
//   of the source coordinates.
// - A width of zero or less writes nothing.
// - Arguments are read from the stack; esi and edi are saved and restored.
393233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
393333cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API
393433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride,
393533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                        uint8* dst_argb, const float* uv_dudv, int width) {
393633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
    // After the two pushes the first argument sits at [esp + 12]:
    // 8 bytes of saved registers plus 4 bytes of return address.
393733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
393833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
393933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 12]   // src_argb
394033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 16]  // stride
394133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 20]  // dst_argb
394233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 24]  // pointer to uv_dudv
394333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       xmm2, qword ptr [ecx]  // uv: floats 0 and 1
394433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       xmm7, qword ptr [ecx + 8]  // dudv: floats 2 and 3
394533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 28]  // width
    // Build the pair of 16-bit multipliers (4, stride) in one dword so that
    // pmaddwd can turn a packed (x, y) pair of shorts into x * 4 + y * stride.
394633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shl        esi, 16          // high word = stride, low word = 4 (next line)
394733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    add        esi, 4
394833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm5, esi
    // Fewer than 4 pixels: go straight to the 1 pixel loop.  That loop only
    // consumes the low lanes of xmm2, xmm5 and xmm7, so it does not need the
    // broadcast setup below.
394933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 4
395033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jl         l4b
395133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
395233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // setup for 4 pixel loop
    // Result: xmm2 = coordinates of pixels 0 and 1, xmm3 = coordinates of
    // pixels 2 and 3, xmm4 = 4 * (du, dv) in both halves, xmm7 = (du, dv) in
    // both halves (still needed by the 1 pixel loop).
395333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd     xmm7, xmm7, 0x44  // dup dudv into both halves
395433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd     xmm5, xmm5, 0  // dup 4, stride into all four dwords
395533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, xmm2    // low half = x0, y0
395633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    addps      xmm0, xmm7
395733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movlhps    xmm2, xmm0
395833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm4, xmm7
395933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    addps      xmm4, xmm4    // dudv *= 2
396033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm3, xmm2    // x0, y0, x1, y1; becomes x2, y2, x3, y3 below
396133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    addps      xmm3, xmm4
396233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    addps      xmm4, xmm4    // dudv *= 4
396333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
396433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 4 pixel loop
    // esi and edi are reused as scratch registers for the byte offsets.
396533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      4
396633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  l4:
396733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    cvttps2dq  xmm0, xmm2    // x, y float to int (truncating) first 2
396833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    cvttps2dq  xmm1, xmm3    // x, y float to int (truncating) next 2
396933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packssdw   xmm0, xmm1    // x, y as 8 shorts (signed saturation)
397033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddwd    xmm0, xmm5    // offsets = x * 4 + y * stride.
397133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       esi, xmm0
397233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd     xmm0, xmm0, 0x39  // rotate next offset into the low dword
397333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       edi, xmm0
397433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd     xmm0, xmm0, 0x39  // rotate next offset into the low dword
397533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm1, [eax + esi]  // read pixel 0
397633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm6, [eax + edi]  // read pixel 1
397733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckldq  xmm1, xmm6     // combine pixel 0 and 1
397833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    addps      xmm2, xmm4    // x, y += 4 * (dx, dy) first 2
397933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       qword ptr [edx], xmm1
398033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       esi, xmm0
398133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd     xmm0, xmm0, 0x39  // rotate next offset into the low dword
398233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       edi, xmm0
398333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm6, [eax + esi]  // read pixel 2
398433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm0, [eax + edi]  // read pixel 3
398533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckldq  xmm6, xmm0     // combine pixel 2 and 3
398633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    addps      xmm3, xmm4    // x, y += 4 * (dx, dy) next 2
    // The flags set by this sub are still intact at the jge: the movq store
    // and the lea in between do not modify them.
398733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 4
398833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movq       qword ptr 8[edx], xmm6
398933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 16]
399033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jge        l4
399133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
399233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  l4b:
    // ecx is (pixels remaining - 4) here; turn it into (pixels remaining - 1)
    // and leave if nothing is left.
399333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    add        ecx, 4 - 1
399433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jl         l1b
399533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
399633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    // 1 pixel loop
    // Only the low (x, y) pair of xmm2 and the low dword of xmm5 matter here.
399733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      4
399833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  l1:
399933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    cvttps2dq  xmm0, xmm2    // x, y float to int (truncating)
400033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packssdw   xmm0, xmm0    // x, y as shorts (signed saturation)
400133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddwd    xmm0, xmm5    // offset = x * 4 + y * stride
400233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    addps      xmm2, xmm7    // x, y += dx, dy
400333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       esi, xmm0
400433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm0, [eax + esi]  // copy a pixel
400533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 1
400633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       [edx], xmm0
400733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        edx, [edx + 4]
400833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jge        l1
400933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  l1b:
401033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
401133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
401233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
401333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
401433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
401533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // HAS_ARGBAFFINEROW_SSE2
401633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
401733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version.
//
// Blends the row at src_ptr with the row at src_ptr + src_stride and writes
// the result to dst_ptr, 16 bytes (4 ARGB pixels) per loop pass.  With
// f = source_y_fraction >> 1, each output byte is
//   (row0 * (128 - f) + row1 * f) >> 7.
// Two values of f take shortcuts: f == 0 copies row0 unchanged (the second
// row is never read) and f == 64 uses pavgb, the rounded average of both rows.
//
// Notes:
// - Every 16-byte load and store is movdqa (or pavgb with a memory operand),
//   so dst_ptr, src_ptr and src_ptr + src_stride must be 16-byte aligned.
// - Each loop tests the count after its body, so at least 4 pixels are always
//   written and dst_width is effectively rounded up to a multiple of 4.
// - Arguments are read from the stack; esi and edi are saved and restored.
401833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16))
401933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr,
402033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                              ptrdiff_t src_stride, int dst_width,
402133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp                              int source_y_fraction) {
402233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  __asm {
    // The "8 +" in the offsets below accounts for the two pushed registers.
402333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       esi
402433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    push       edi
402533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edi, [esp + 8 + 4]   // dst_ptr
402633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        esi, [esp + 8 + 8]   // src_ptr
402733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        edx, [esp + 8 + 12]  // src_stride
402833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        ecx, [esp + 8 + 16]  // dst_width
402933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    mov        eax, [esp + 8 + 20]  // source_y_fraction (0..255)
    // edi = dst_ptr - src_ptr, so [esi + edi] addresses the destination while
    // esi alone walks the source; only one pointer has to be advanced.
403033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        edi, esi
    // f = source_y_fraction >> 1, the 7-bit weight of the second row.
403133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    shr        eax, 1
    // f == 0: plain copy.  Besides being faster, this keeps the weight
    // (128 - f) at 127 or less in the general path, where pmaddubsw reads the
    // weights as signed bytes.
403233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    cmp        eax, 0
403333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    je         xloop1
    // f == 64: both rows weigh the same, so use the averaging loop.
403433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    cmp        eax, 64
403533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    je         xloop2
    // Broadcast the byte pair (128 - f, f) to all 8 words of xmm5.
403633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm0, eax  // high fraction f, 1..127 for inputs in 0..255
403733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    neg        eax
403833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    add        eax, 128
403933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movd       xmm5, eax  // low fraction 128 - f, 127..1 for inputs in 0..255
404033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm5, xmm0
404133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklwd  xmm5, xmm5
404233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pshufd     xmm5, xmm5, 0
404333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
    // General path: weighted blend of the two rows.
404433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
404533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  xloop:
404633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [esi]
404733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm2, [esi + edx]
404833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm1, xmm0
    // Interleave row0 and row1 bytes so each word holds (row0, row1), matching
    // the (128 - f, f) weight pairs in xmm5.
404933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpcklbw  xmm0, xmm2
405033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    punpckhbw  xmm1, xmm2
    // Per word: row0 * (128 - f) + row1 * f.
405133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm0, xmm5
405233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pmaddubsw  xmm1, xmm5
    // The weights sum to 128, so shifting right by 7 scales back to bytes.
405333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm0, 7
405433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    psrlw      xmm1, 7
405533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    packuswb   xmm0, xmm1
    // The flags set by this sub are still intact at the jg: movdqa and lea do
    // not modify them.
405633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 4
405733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [esi + edi], xmm0
405833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        esi, [esi + 16]
405933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         xloop
406033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
406133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
406233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
406333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
406433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
    // f == 0: copy the first row unchanged.
406533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
406633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  xloop1:
406733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [esi]
406833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 4
406933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [esi + edi], xmm0
407033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        esi, [esi + 16]
407133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         xloop1
407233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
407333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
407433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
407533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
407633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
    // f == 64: rounded average of the two rows.
407733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    align      16
407833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  xloop2:
407933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     xmm0, [esi]
408033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pavgb      xmm0, [esi + edx]
408133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    sub        ecx, 4
408233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    movdqa     [esi + edi], xmm0
408333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    lea        esi, [esi + 16]
408433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    jg         xloop2
408533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
408633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        edi
408733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    pop        esi
408833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp    ret
408933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp  }
409033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}
409133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
409233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif  // _M_IX86
409333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp
409433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef __cplusplus
40957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}  // extern "C"
409633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}  // namespace libyuv
409733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif
4098