/*
 *  Copyright (c) 2011 The LibYuv project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#include "row.h"
127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" {
147cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#ifdef HAS_ARGBTOYROW_SSSE3
// TALIGN16(t, var) declares a file-local (static) object of type t, aligned
// to 16 bytes, whose name is var with a leading underscore pasted on; e.g.
// TALIGN16(const int8, kFoo[16]) defines _kFoo[16].  The assembly in this
// file loads these tables with movdqa, which needs that 16-byte alignment.
167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var
177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
187cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde// Constant multiplication table for converting ARGB to I400.
// Sixteen signed bytes: the per-byte weights {13, 65, 33, 0} repeated four
// times, so one 16-byte register holds the weights for four 4-byte pixels.
// ARGBToYRow_SSSE3 loads it into xmm7 as the pmaddubsw multiplier.
197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const int8, kARGBToY[16]) = {
207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0
217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Per-byte weights {112, -74, -38, 0}, repeated for four pixels; the U
// multiplier loaded into xmm7 by ARGBToUVRow_SSSE3.
237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const int8, kARGBToU[16]) = {
247cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0
257cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Per-byte weights {-18, -94, 112, 0}, repeated for four pixels; the V
// multiplier loaded into xmm6 by ARGBToUVRow_SSSE3.
277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const int8, kARGBToV[16]) = {
287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0,
297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
317cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde// Constants for BGRA
// Same three weight sets as the ARGB tables, but with the zero weight on the
// first byte of each 4-byte pixel and the remaining weights in reverse order.
// Y weights {0, 33, 65, 13}; loaded into xmm7 by BGRAToYRow_SSSE3.
327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const int8, kBGRAToY[16]) = {
337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13
347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// U weights {0, -38, -74, 112}; loaded into xmm7 by BGRAToUVRow_SSSE3.
367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const int8, kBGRAToU[16]) = {
377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112
387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// V weights {0, 112, -94, -18}; loaded into xmm6 by BGRAToUVRow_SSSE3.
407cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const int8, kBGRAToV[16]) = {
417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18
427cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
437cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
447cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde// Constants for ABGR
// Same three weight sets as the ARGB tables, with the weights on the first
// and third byte of each 4-byte pixel swapped; the fourth byte stays at 0.
// Y weights {33, 65, 13, 0}; loaded into xmm7 by ABGRToYRow_SSSE3.
457cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const int8, kABGRToY[16]) = {
467cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0
477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
487cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// U weights {-38, -74, 112, 0}; loaded into xmm7 by ABGRToUVRow_SSSE3.
497cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const int8, kABGRToU[16]) = {
507cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0
517cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// V weights {112, -94, -18, 0}; loaded into xmm6 by ABGRToUVRow_SSSE3.
537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const int8, kABGRToV[16]) = {
547cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0
557cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
567cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Sixteen bytes of 16.  The *ToYRow_SSSE3 functions add this (paddb) to the
// sixteen packed Y bytes produced by each loop iteration.
577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const uint8, kAddY16[16]) = {
587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u,
607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Sixteen bytes of 128.  The *ToUVRow_SSSE3 functions add this (paddb) to the
// signed-saturated U/V bytes to turn them into unsigned values.
627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const uint8, kAddUV128[16]) = {
637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u,
647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u
657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
677cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde// Shuffle table for converting BG24 to ARGB.
// pshufb byte indices: each 4-byte output group takes three consecutive
// source bytes in their original order (0,1,2 / 3,4,5 / 6,7,8 / 9,10,11).
// The fourth index (12..15) only fills the slot; BG24ToARGBRow_SSSE3 ORs
// 0xff into that byte after the shuffle.
687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = {
697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u
707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde// Shuffle table for converting RAW to ARGB.
// Byte indices laid out like kShuffleMaskBG24ToARGB, except that the three
// source bytes of every group are taken in reverse order (2,1,0 / 5,4,3 /
// 8,7,6 / 11,10,9).  The fourth index of each group is again 12..15.
// NOTE(review): presumably consumed via pshufb by RAWToARGBRow_SSSE3, whose
// body is not fully visible here -- confirm.
737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = {
747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u
757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde};
767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde// Convert 16 ARGB pixels (64 bytes) to 16 Y values
// per loop iteration.  For a 4-byte pixel with bytes b0..b3 in memory order
// the stored byte is ((b0 * 13 + b1 * 65 + b2 * 33) >> 7) + 16, using the
// weights in _kARGBToY and the offset in _kAddY16.
//
//   src_argb  source pixels; read with movdqa, so must be 16-byte aligned.
//   dst_y     output bytes; written with movdqa, so must be 16-byte aligned.
//   pix       pixel count.  The loop body runs before the count is tested and
//             handles 16 pixels per pass, so a count that is not a multiple
//             of 16 is effectively rounded up.
//
// The function is naked: it reads its arguments directly from the stack,
// issues its own ret, and uses eax, ecx, edx, xmm0-xmm3, xmm6 and xmm7
// without saving them.
787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked)
797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm {
817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        eax, [esp + 4]   /* src_argb */
827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        edx, [esp + 8]   /* dst_y */
837cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        ecx, [esp + 12]  /* pix */
847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm7, _kARGBToY  // per-byte Y weights
857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm6, _kAddY16   // sixteen bytes of 16
867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
877cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde convertloop :
887cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm0, [eax]       // pixels 0-3
897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm1, [eax + 16]  // pixels 4-7
907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm2, [eax + 32]  // pixels 8-11
917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm3, [eax + 48]  // pixels 12-15
927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm0, xmm7        // byte * weight, adjacent pairs summed to words
937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm1, xmm7
947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm2, xmm7
957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm3, xmm7
967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        eax, [eax + 64]   // advance source by 16 pixels
977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm0, xmm1        // one word per pixel: pixels 0-7
987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm2, xmm3        // one word per pixel: pixels 8-15
997cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psrlw      xmm0, 7           // scale the weighted sums down by 128
1007cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psrlw      xmm2, 7
1017cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    packuswb   xmm0, xmm2        // 16 words -> 16 bytes
1027cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    paddb      xmm0, xmm6        // add 16 to every byte
1037cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     [edx], xmm0
1047cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        edx, [edx + 16]
1057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub        ecx, 16
1067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ja         convertloop       // repeat while the count was above 16 (unsigned)
1077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
1087cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
1097cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
1107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Converts 16 four-byte pixels (64 bytes) to 16 Y bytes per loop iteration.
// For a pixel with bytes b0..b3 in memory order the stored byte is
// ((b1 * 33 + b2 * 65 + b3 * 13) >> 7) + 16, using the weights in _kBGRAToY
// and the offset in _kAddY16.
//
//   src_argb  source pixels; read with movdqa, so must be 16-byte aligned.
//   dst_y     output bytes; written with movdqa, so must be 16-byte aligned.
//   pix       pixel count.  The loop body runs before the count is tested and
//             handles 16 pixels per pass, so a count that is not a multiple
//             of 16 is effectively rounded up.
//
// The function is naked: it reads its arguments directly from the stack,
// issues its own ret, and uses eax, ecx, edx, xmm0-xmm3, xmm6 and xmm7
// without saving them.
1117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked)
1127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm {
1147cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        eax, [esp + 4]   /* src_argb */
1157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        edx, [esp + 8]   /* dst_y */
1167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        ecx, [esp + 12]  /* pix */
1177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm7, _kBGRAToY  // per-byte Y weights
1187cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm6, _kAddY16   // sixteen bytes of 16
1197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
1207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde convertloop :
1217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm0, [eax]       // pixels 0-3
1227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm1, [eax + 16]  // pixels 4-7
1237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm2, [eax + 32]  // pixels 8-11
1247cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm3, [eax + 48]  // pixels 12-15
1257cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm0, xmm7        // byte * weight, adjacent pairs summed to words
1267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm1, xmm7
1277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm2, xmm7
1287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm3, xmm7
1297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        eax, [eax + 64]   // advance source by 16 pixels
1307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm0, xmm1        // one word per pixel: pixels 0-7
1317cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm2, xmm3        // one word per pixel: pixels 8-15
1327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psrlw      xmm0, 7           // scale the weighted sums down by 128
1337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psrlw      xmm2, 7
1347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    packuswb   xmm0, xmm2        // 16 words -> 16 bytes
1357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    paddb      xmm0, xmm6        // add 16 to every byte
1367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     [edx], xmm0
1377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        edx, [edx + 16]
1387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub        ecx, 16
1397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ja         convertloop       // repeat while the count was above 16 (unsigned)
1407cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
1417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
1427cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
1437cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Converts 16 four-byte pixels (64 bytes) to 16 Y bytes per loop iteration.
// For a pixel with bytes b0..b3 in memory order the stored byte is
// ((b0 * 33 + b1 * 65 + b2 * 13) >> 7) + 16, using the weights in _kABGRToY
// and the offset in _kAddY16.
//
//   src_argb  source pixels; read with movdqa, so must be 16-byte aligned.
//   dst_y     output bytes; written with movdqa, so must be 16-byte aligned.
//   pix       pixel count.  The loop body runs before the count is tested and
//             handles 16 pixels per pass, so a count that is not a multiple
//             of 16 is effectively rounded up.
//
// The function is naked: it reads its arguments directly from the stack,
// issues its own ret, and uses eax, ecx, edx, xmm0-xmm3, xmm6 and xmm7
// without saving them.
1447cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked)
1457cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) {
1467cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm {
1477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        eax, [esp + 4]   /* src_argb */
1487cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        edx, [esp + 8]   /* dst_y */
1497cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        ecx, [esp + 12]  /* pix */
1507cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm7, _kABGRToY  // per-byte Y weights
1517cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm6, _kAddY16   // sixteen bytes of 16
1527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
1537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde convertloop :
1547cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm0, [eax]       // pixels 0-3
1557cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm1, [eax + 16]  // pixels 4-7
1567cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm2, [eax + 32]  // pixels 8-11
1577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm3, [eax + 48]  // pixels 12-15
1587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm0, xmm7        // byte * weight, adjacent pairs summed to words
1597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm1, xmm7
1607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm2, xmm7
1617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm3, xmm7
1627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        eax, [eax + 64]   // advance source by 16 pixels
1637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm0, xmm1        // one word per pixel: pixels 0-7
1647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm2, xmm3        // one word per pixel: pixels 8-15
1657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psrlw      xmm0, 7           // scale the weighted sums down by 128
1667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psrlw      xmm2, 7
1677cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    packuswb   xmm0, xmm2        // 16 words -> 16 bytes
1687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    paddb      xmm0, xmm6        // add 16 to every byte
1697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     [edx], xmm0
1707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        edx, [edx + 16]
1717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub        ecx, 16
1727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ja         convertloop       // repeat while the count was above 16 (unsigned)
1737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
1747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
1757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
1767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Produces 8 U bytes and 8 V bytes per loop iteration from a 16x2 block of
// four-byte pixels.  Each 2x2 group of pixels is averaged with pavgb (rows
// first, then horizontal neighbours); for the averaged bytes a0..a3 the
// outputs are
//   U = ((a0 * 112 + a1 * -74 + a2 * -38) >> 8) + 128   (weights _kARGBToU)
//   V = ((a0 * -18 + a1 * -94 + a2 * 112) >> 8) + 128   (weights _kARGBToV)
// with an arithmetic shift and signed saturation before the +128.
//
//   src_argb0        first source row; read with movdqa (16-byte aligned).
//   src_stride_argb  byte offset added to src_argb0 to reach the second row;
//                    that row is a pavgb memory operand, so it must be
//                    16-byte aligned as well.
//   dst_u, dst_v     output planes; 8 bytes are stored to each per iteration.
//   width            pixel count.  The loop body runs before the count is
//                    tested and consumes 16 source pixels per pass.
//
// The function is naked: arguments are read from the stack past the two
// pushed registers, esi and edi are saved and restored, and eax, ecx, edx and
// xmm0-xmm7 are used without being saved.
1777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked)
1787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
1797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                       uint8* dst_u, uint8* dst_v, int width) {
1807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm {
1817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    push       esi
1827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    push       edi
1837cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        eax, [esp + 8 + 4]   // src_argb0 (the + 8 skips the two pushes)
1847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        esi, [esp + 8 + 8]   // src_stride_argb
1857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        edx, [esp + 8 + 12]  // dst_u
1867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        edi, [esp + 8 + 16]  // dst_v
1877cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        ecx, [esp + 8 + 20]  // width
1887cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm7, _kARGBToU
1897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm6, _kARGBToV
1907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm5, _kAddUV128
1917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub        edi, edx             // edi = dst_v - dst_u, so [edx + edi] addresses V
1927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
1937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde convertloop :
1947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    /* step 1 - subsample 16x2 argb pixels to 8x1 */
1957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm0, [eax]
1967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm1, [eax + 16]
1977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm2, [eax + 32]
1987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm3, [eax + 48]
1997cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm0, [eax + esi]       // average with the row one stride away
2007cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm1, [eax + esi + 16]
2017cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm2, [eax + esi + 32]
2027cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm3, [eax + esi + 48]
2037cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        eax,  [eax + 64]
2047cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm4, xmm0
2057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm0, xmm1, 0x88        // even-numbered pixels of the first eight
2067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm4, xmm1, 0xdd        // odd-numbered pixels of the first eight
2077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm0, xmm4              // average horizontal neighbours -> 4 pixels
2087cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm4, xmm2
2097cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm2, xmm3, 0x88        // same for the second eight pixels
2107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm4, xmm3, 0xdd
2117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm2, xmm4
2127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
2137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // step 2 - convert to U and V
2147cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // from here down is very similar to Y code except
2157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
2167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm1, xmm0
2177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm3, xmm2
2187cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm0, xmm7  // U
2197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm2, xmm7
2207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm1, xmm6  // V
2217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm3, xmm6
2227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm0, xmm2  // 8 U words
2237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm1, xmm3  // 8 V words
2247cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw      xmm0, 8
2257cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw      xmm1, 8
2267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    packsswb   xmm0, xmm1  // low 8 bytes = U, high 8 bytes = V (signed)
2277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    paddb      xmm0, xmm5            // -> unsigned
2287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
2297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // step 3 - store 8 U and 8 V values
2307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movlps     qword ptr [edx], xmm0 // U
2317cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movhps     qword ptr [edx + edi], xmm0 // V
2327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        edx, [edx + 8]
2337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub        ecx, 16
2347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ja         convertloop           // repeat while the count was above 16 (unsigned)
2357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pop        edi
2367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pop        esi
2377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
2387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
2397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
2407cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Produces 8 U bytes and 8 V bytes per loop iteration from a 16x2 block of
// four-byte pixels.  Each 2x2 group of pixels is averaged with pavgb (rows
// first, then horizontal neighbours); for the averaged bytes a0..a3 the
// outputs are
//   U = ((a1 * -38 + a2 * -74 + a3 * 112) >> 8) + 128   (weights _kBGRAToU)
//   V = ((a1 * 112 + a2 * -94 + a3 * -18) >> 8) + 128   (weights _kBGRAToV)
// with an arithmetic shift and signed saturation before the +128.
//
//   src_argb0        first source row; read with movdqa (16-byte aligned).
//   src_stride_argb  byte offset added to src_argb0 to reach the second row;
//                    that row is a pavgb memory operand, so it must be
//                    16-byte aligned as well.
//   dst_u, dst_v     output planes; 8 bytes are stored to each per iteration.
//   width            pixel count.  The loop body runs before the count is
//                    tested and consumes 16 source pixels per pass.
//
// The function is naked: arguments are read from the stack past the two
// pushed registers, esi and edi are saved and restored, and eax, ecx, edx and
// xmm0-xmm7 are used without being saved.
2417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked)
2427cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
2437cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                       uint8* dst_u, uint8* dst_v, int width) {
2447cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm {
2457cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    push       esi
2467cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    push       edi
2477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        eax, [esp + 8 + 4]   // src_argb0 (the + 8 skips the two pushes)
2487cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        esi, [esp + 8 + 8]   // src_stride_argb
2497cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        edx, [esp + 8 + 12]  // dst_u
2507cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        edi, [esp + 8 + 16]  // dst_v
2517cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        ecx, [esp + 8 + 20]  // width
2527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm7, _kBGRAToU
2537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm6, _kBGRAToV
2547cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm5, _kAddUV128
2557cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub        edi, edx             // edi = dst_v - dst_u, so [edx + edi] addresses V
2567cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
2577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde convertloop :
2587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    /* step 1 - subsample 16x2 argb pixels to 8x1 */
2597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm0, [eax]
2607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm1, [eax + 16]
2617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm2, [eax + 32]
2627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm3, [eax + 48]
2637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm0, [eax + esi]       // average with the row one stride away
2647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm1, [eax + esi + 16]
2657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm2, [eax + esi + 32]
2667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm3, [eax + esi + 48]
2677cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        eax,  [eax + 64]
2687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm4, xmm0
2697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm0, xmm1, 0x88        // even-numbered pixels of the first eight
2707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm4, xmm1, 0xdd        // odd-numbered pixels of the first eight
2717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm0, xmm4              // average horizontal neighbours -> 4 pixels
2727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm4, xmm2
2737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm2, xmm3, 0x88        // same for the second eight pixels
2747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm4, xmm3, 0xdd
2757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm2, xmm4
2767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
2777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // step 2 - convert to U and V
2787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // from here down is very similar to Y code except
2797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
2807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm1, xmm0
2817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm3, xmm2
2827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm0, xmm7  // U
2837cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm2, xmm7
2847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm1, xmm6  // V
2857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm3, xmm6
2867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm0, xmm2  // 8 U words
2877cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm1, xmm3  // 8 V words
2887cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw      xmm0, 8
2897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw      xmm1, 8
2907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    packsswb   xmm0, xmm1  // low 8 bytes = U, high 8 bytes = V (signed)
2917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    paddb      xmm0, xmm5            // -> unsigned
2927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
2937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // step 3 - store 8 U and 8 V values
2947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movlps     qword ptr [edx], xmm0 // U
2957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movhps     qword ptr [edx + edi], xmm0 // V
2967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        edx, [edx + 8]
2977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub        ecx, 16
2987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ja         convertloop           // repeat while the count was above 16 (unsigned)
2997cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pop        edi
3007cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pop        esi
3017cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
3027cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
3037cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
3047cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Produces 8 U bytes and 8 V bytes per loop iteration from a 16x2 block of
// four-byte pixels.  Each 2x2 group of pixels is averaged with pavgb (rows
// first, then horizontal neighbours); for the averaged bytes a0..a3 the
// outputs are
//   U = ((a0 * -38 + a1 * -74 + a2 * 112) >> 8) + 128   (weights _kABGRToU)
//   V = ((a0 * 112 + a1 * -94 + a2 * -18) >> 8) + 128   (weights _kABGRToV)
// with an arithmetic shift and signed saturation before the +128.
//
//   src_argb0        first source row; read with movdqa (16-byte aligned).
//   src_stride_argb  byte offset added to src_argb0 to reach the second row;
//                    that row is a pavgb memory operand, so it must be
//                    16-byte aligned as well.
//   dst_u, dst_v     output planes; 8 bytes are stored to each per iteration.
//   width            pixel count.  The loop body runs before the count is
//                    tested and consumes 16 source pixels per pass.
//
// The function is naked: arguments are read from the stack past the two
// pushed registers, esi and edi are saved and restored, and eax, ecx, edx and
// xmm0-xmm7 are used without being saved.
3057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked)
3067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb,
3077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                       uint8* dst_u, uint8* dst_v, int width) {
3087cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm {
3097cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    push       esi
3107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    push       edi
3117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        eax, [esp + 8 + 4]   // src_argb0 (the + 8 skips the two pushes)
3127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        esi, [esp + 8 + 8]   // src_stride_argb
3137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        edx, [esp + 8 + 12]  // dst_u
3147cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        edi, [esp + 8 + 16]  // dst_v
3157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov        ecx, [esp + 8 + 20]  // width
3167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm7, _kABGRToU
3177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm6, _kABGRToV
3187cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm5, _kAddUV128
3197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub        edi, edx             // edi = dst_v - dst_u, so [edx + edi] addresses V
3207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
3217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde convertloop :
3227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    /* step 1 - subsample 16x2 argb pixels to 8x1 */
3237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm0, [eax]
3247cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm1, [eax + 16]
3257cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm2, [eax + 32]
3267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm3, [eax + 48]
3277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm0, [eax + esi]       // average with the row one stride away
3287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm1, [eax + esi + 16]
3297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm2, [eax + esi + 32]
3307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm3, [eax + esi + 48]
3317cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        eax,  [eax + 64]
3327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm4, xmm0
3337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm0, xmm1, 0x88        // even-numbered pixels of the first eight
3347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm4, xmm1, 0xdd        // odd-numbered pixels of the first eight
3357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm0, xmm4              // average horizontal neighbours -> 4 pixels
3367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm4, xmm2
3377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm2, xmm3, 0x88        // same for the second eight pixels
3387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    shufps     xmm4, xmm3, 0xdd
3397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pavgb      xmm2, xmm4
3407cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
3417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // step 2 - convert to U and V
3427cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // from here down is very similar to Y code except
3437cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // instead of 16 different pixels, it's 8 pixels of U and 8 of V
3447cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm1, xmm0
3457cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa     xmm3, xmm2
3467cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm0, xmm7  // U
3477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm2, xmm7
3487cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm1, xmm6  // V
3497cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pmaddubsw  xmm3, xmm6
3507cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm0, xmm2  // 8 U words
3517cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    phaddw     xmm1, xmm3  // 8 V words
3527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw      xmm0, 8
3537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw      xmm1, 8
3547cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    packsswb   xmm0, xmm1  // low 8 bytes = U, high 8 bytes = V (signed)
3557cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    paddb      xmm0, xmm5            // -> unsigned
3567cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
3577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    // step 3 - store 8 U and 8 V values
3587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movlps     qword ptr [edx], xmm0 // U
3597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movhps     qword ptr [edx + edi], xmm0 // V
3607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea        edx, [edx + 8]
3617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub        ecx, 16
3627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ja         convertloop           // repeat while the count was above 16 (unsigned)
3637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pop        edi
3647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pop        esi
3657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
3667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
3677cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
3687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Expands 16 three-byte pixels (48 bytes) into 16 four-byte pixels (64 bytes)
// per loop iteration.  The three source bytes of each pixel are copied in
// order through _kShuffleMaskBG24ToARGB, and the fourth byte of every output
// pixel is forced to 0xff.
//
//   src_bg24  source bytes; read with movdqa, so must be 16-byte aligned.
//   dst_argb  output pixels; written with movdqa, so must be 16-byte aligned.
//   pix       pixel count.  The loop body runs before the count is tested and
//             handles 16 pixels per pass, so a count that is not a multiple
//             of 16 is effectively rounded up.
//
// The function is naked: it reads its arguments directly from the stack,
// issues its own ret, and uses eax, ecx, edx, xmm0-xmm3, xmm6 and xmm7
// without saving them.
3697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked)
3707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) {
3717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm {
3727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       eax, [esp + 4]   // src_bg24
3737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       edx, [esp + 8]   // dst_argb
3747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       ecx, [esp + 12]  // pix
3757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pcmpeqb   xmm7, xmm7       // generate mask 0xff000000
3767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pslld     xmm7, 24
3777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm6, _kShuffleMaskBG24ToARGB
3787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
3797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde convertloop :
3807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm0, [eax]       // source bytes 0-15
3817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm1, [eax + 16]  // source bytes 16-31
3827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm3, [eax + 32]  // source bytes 32-47
3837cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       eax, [eax + 48]
3847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm2, xmm3
3857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
3867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pshufb    xmm2, xmm6
3877cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    por       xmm2, xmm7       // set the fourth byte of each pixel to 0xff
3887cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:7] xmm0[12:15]}
3897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pshufb    xmm0, xmm6
3907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    [edx + 32], xmm2 // pixels 8-11
3917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    por       xmm0, xmm7
3927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pshufb    xmm1, xmm6
3937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    [edx], xmm0      // pixels 0-3
3947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    por       xmm1, xmm7
3957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
3967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pshufb    xmm3, xmm6
3977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    [edx + 16], xmm1 // pixels 4-7
3987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    por       xmm3, xmm7
3997cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    [edx + 48], xmm3 // pixels 12-15
4007cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       edx, [edx + 64]
4017cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub       ecx, 16
4027cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ja        convertloop      // repeat while the count was above 16 (unsigned)
4037cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
4047cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
4057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
4067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Expands one row of packed 3-byte RAW pixels into 4-byte ARGB pixels using
// SSSE3 byte shuffles.
//
// The instruction sequence matches BG24ToARGBRow_SSSE3 except that the byte
// reordering comes from the _kShuffleMaskRAWToARGB table (defined outside this
// block). Every loop pass reads 48 source bytes (16 pixels), shuffles them and
// ORs 0xff000000 into every 32-bit lane (presumably the opaque alpha byte),
// then writes 64 destination bytes.
//
//   src_raw  - source row; loaded with movdqa, which requires 16-byte
//              alignment.
//   dst_argb - destination row; stored with movdqa, so it must be 16-byte
//              aligned as well.
//   pix      - pixel count. The counter drops by 16 per pass and the body runs
//              before the first test, so one full pass (16 pixels) is always
//              executed - presumably callers pass a positive multiple of 16;
//              TODO confirm.
//
// The function is naked: arguments are read directly relative to esp and the
// block ends with its own ret.
4077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked)
4087cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
4097cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                        int pix) {
4107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm {
4117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       eax, [esp + 4]   // src_raw
4127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       edx, [esp + 8]   // dst_argb
4137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       ecx, [esp + 12]  // pix
4147cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pcmpeqb   xmm7, xmm7       // generate mask 0xff000000
4157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pslld     xmm7, 24
4167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm6, _kShuffleMaskRAWToARGB
4177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
4187cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde convertloop :
    // Load 48 bytes = 16 source pixels, then advance the source pointer.
4197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm0, [eax]
4207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm1, [eax + 16]
4217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm3, [eax + 32]
4227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       eax, [eax + 48]
    // The four output vectors (xmm0..xmm3 -> dst +0/+16/+32/+48) are built in
    // interleaved order; each one is shuffled by xmm6 and then ORed with xmm7.
4237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    xmm2, xmm3
4247cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:3] xmm1[8:15]}
4257cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pshufb    xmm2, xmm6
4267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    por       xmm2, xmm7
4277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:7] xmm0[12:15]}
4287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pshufb    xmm0, xmm6
4297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    [edx + 32], xmm2
4307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    por       xmm0, xmm7
4317cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pshufb    xmm1, xmm6
4327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    [edx], xmm0
4337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    por       xmm1, xmm7
4347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    palignr   xmm3, xmm3, 4    // xmm3 = { xmm3[4:15]}
4357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pshufb    xmm3, xmm6
4367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    [edx + 16], xmm1
4377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    por       xmm3, xmm7
4387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movdqa    [edx + 48], xmm3
4397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       edx, [edx + 64]
    // lea leaves the flags alone, so ja tests the result of the sub: it loops
    // only while ecx was (unsigned) greater than 16 before the subtraction.
4407cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub       ecx, 16
4417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ja        convertloop
4427cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
4437cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
4447cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
4457cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Converts a row of planar Y/U/V samples to 32-bit pixels, two pixels per
// pass, using MMX lookups into the _kCoefficientsRgbY table.
//
// Each pass reads one U byte, one V byte and two Y bytes, so both output
// pixels share the same chroma pair. The table is indexed with 8-byte entries:
// Y entries start at offset 0, U entries at +2048 and V entries at +4096. The
// U and V entries are summed once, added to each Y entry with signed
// saturation, shifted right arithmetically by 6 and packed to unsigned bytes
// with saturation.
//
//   y_buf   - luma samples, two consumed per pass.
//   u_buf   - U samples, one consumed per pass.
//   v_buf   - V samples, one consumed per pass.
//   rgb_buf - destination, 8 bytes written per pass with the non-temporal
//             movntq store.
//   width   - pixel count. The counter drops by 2 per pass and the body runs
//             before the first test, so at least one pair is always written
//             and an odd width still stores a full pair on the last pass.
//
// NOTE(review): MMX registers are used and no emms is executed here, so the
// caller is presumably expected to issue emms before x87 code - confirm.
4467cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked)
4477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid FastConvertYUVToRGB32Row(const uint8* y_buf,
4487cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                              const uint8* u_buf,
4497cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                              const uint8* v_buf,
4507cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                              uint8* rgb_buf,
4517cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                              int width) {
4527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  __asm {
    // pushad saves eight 32-bit registers, hence the extra 32 in each offset.
4537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pushad
4547cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       edx, [esp + 32 + 4]   // y_buf
4557cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       edi, [esp + 32 + 8]   // u_buf
4567cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       esi, [esp + 32 + 12]  // v_buf
4577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       ebp, [esp + 32 + 16]  // rgb_buf
4587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       ecx, [esp + 32 + 20]  // width
4597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
4607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde convertloop :
    // mm0 = U entry + V entry (shared by both pixels of this pass).
4617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movzx     eax, byte ptr [edi]
4627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       edi, [edi + 1]
4637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movzx     ebx, byte ptr [esi]
4647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       esi, [esi + 1]
4657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]
4667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movzx     eax, byte ptr [edx]
4677cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    paddsw    mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]
    // mm1 / mm2 = Y entries for the first and second pixel.
4687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movzx     ebx, byte ptr [edx + 1]
4697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      mm1, [_kCoefficientsRgbY + 8 * eax]
4707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       edx, [edx + 2]
4717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      mm2, [_kCoefficientsRgbY + 8 * ebx]
4727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    paddsw    mm1, mm0
4737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    paddsw    mm2, mm0
4747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw     mm1, 6
4757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw     mm2, 6
    // Pack both pixels into 8 bytes and store them, bypassing the cache.
4767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    packuswb  mm1, mm2
4777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movntq    [ebp], mm1
4787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       ebp, [ebp + 8]
4797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub       ecx, 2
4807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ja        convertloop
4817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
4827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    popad
4837cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
4847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
4857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
4867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Converts a row of planar Y/U/V samples to 32-bit pixels, two pixels per
// pass, using MMX lookups into the _kCoefficientsBgraY table.
//
// The instruction sequence is the same as FastConvertYUVToRGB32Row; only the
// lookup table differs (presumably yielding BGRA byte order, as the name
// suggests - the table contents are not visible here). Y entries start at
// offset 0 of the table, U entries at +2048 and V entries at +4096, each entry
// being 8 bytes wide.
//
//   y_buf   - luma samples, two consumed per pass.
//   u_buf   - U samples, one consumed per pass.
//   v_buf   - V samples, one consumed per pass.
//   rgb_buf - destination, 8 bytes written per pass with the non-temporal
//             movntq store.
//   width   - pixel count. The counter drops by 2 per pass and the body runs
//             before the first test, so at least one pair is always written
//             and an odd width still stores a full pair on the last pass.
//
// NOTE(review): MMX registers are used and no emms is executed here, so the
// caller is presumably expected to issue emms before x87 code - confirm.
4877cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked)
4887cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid FastConvertYUVToBGRARow(const uint8* y_buf,
4897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                             const uint8* u_buf,
4907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                             const uint8* v_buf,
4917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                             uint8* rgb_buf,
4927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                             int width) {
4937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  __asm {
    // pushad saves eight 32-bit registers, hence the extra 32 in each offset.
4947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pushad
4957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       edx, [esp + 32 + 4]   // y_buf
4967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       edi, [esp + 32 + 8]   // u_buf
4977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       esi, [esp + 32 + 12]  // v_buf
4987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       ebp, [esp + 32 + 16]  // rgb_buf
4997cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       ecx, [esp + 32 + 20]  // width
5007cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
5017cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde convertloop :
    // mm0 = U entry + V entry (shared by both pixels of this pass).
5027cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movzx     eax, byte ptr [edi]
5037cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       edi, [edi + 1]
5047cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movzx     ebx, byte ptr [esi]
5057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       esi, [esi + 1]
5067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      mm0, [_kCoefficientsBgraY + 2048 + 8 * eax]
5077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movzx     eax, byte ptr [edx]
5087cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    paddsw    mm0, [_kCoefficientsBgraY + 4096 + 8 * ebx]
    // mm1 / mm2 = Y entries for the first and second pixel.
5097cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movzx     ebx, byte ptr [edx + 1]
5107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      mm1, [_kCoefficientsBgraY + 8 * eax]
5117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       edx, [edx + 2]
5127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      mm2, [_kCoefficientsBgraY + 8 * ebx]
5137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    paddsw    mm1, mm0
5147cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    paddsw    mm2, mm0
5157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw     mm1, 6
5167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw     mm2, 6
    // Pack both pixels into 8 bytes and store them, bypassing the cache.
5177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    packuswb  mm1, mm2
5187cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movntq    [ebp], mm1
5197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       ebp, [ebp + 8]
5207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub       ecx, 2
5217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ja        convertloop
5227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
5237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    popad
5247cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
5257cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
5267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
5277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Converts a row of planar Y/U/V samples to 32-bit pixels, two pixels per
// pass, using MMX lookups into the _kCoefficientsAbgrY table.
//
// The instruction sequence is the same as FastConvertYUVToRGB32Row; only the
// lookup table differs (presumably yielding ABGR byte order, as the name
// suggests - the table contents are not visible here). Y entries start at
// offset 0 of the table, U entries at +2048 and V entries at +4096, each entry
// being 8 bytes wide.
//
//   y_buf   - luma samples, two consumed per pass.
//   u_buf   - U samples, one consumed per pass.
//   v_buf   - V samples, one consumed per pass.
//   rgb_buf - destination, 8 bytes written per pass with the non-temporal
//             movntq store.
//   width   - pixel count. The counter drops by 2 per pass and the body runs
//             before the first test, so at least one pair is always written
//             and an odd width still stores a full pair on the last pass.
//
// NOTE(review): MMX registers are used and no emms is executed here, so the
// caller is presumably expected to issue emms before x87 code - confirm.
5287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked)
5297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid FastConvertYUVToABGRRow(const uint8* y_buf,
5307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                             const uint8* u_buf,
5317cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                             const uint8* v_buf,
5327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                             uint8* rgb_buf,
5337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                             int width) {
5347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  __asm {
    // pushad saves eight 32-bit registers, hence the extra 32 in each offset.
5357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pushad
5367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       edx, [esp + 32 + 4]   // y_buf
5377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       edi, [esp + 32 + 8]   // u_buf
5387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       esi, [esp + 32 + 12]  // v_buf
5397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       ebp, [esp + 32 + 16]  // rgb_buf
5407cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       ecx, [esp + 32 + 20]  // width
5417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
5427cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde convertloop :
    // mm0 = U entry + V entry (shared by both pixels of this pass).
5437cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movzx     eax, byte ptr [edi]
5447cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       edi, [edi + 1]
5457cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movzx     ebx, byte ptr [esi]
5467cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       esi, [esi + 1]
5477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      mm0, [_kCoefficientsAbgrY + 2048 + 8 * eax]
5487cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movzx     eax, byte ptr [edx]
5497cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    paddsw    mm0, [_kCoefficientsAbgrY + 4096 + 8 * ebx]
    // mm1 / mm2 = Y entries for the first and second pixel.
5507cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movzx     ebx, byte ptr [edx + 1]
5517cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      mm1, [_kCoefficientsAbgrY + 8 * eax]
5527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       edx, [edx + 2]
5537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      mm2, [_kCoefficientsAbgrY + 8 * ebx]
5547cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    paddsw    mm1, mm0
5557cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    paddsw    mm2, mm0
5567cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw     mm1, 6
5577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw     mm2, 6
    // Pack both pixels into 8 bytes and store them, bypassing the cache.
5587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    packuswb  mm1, mm2
5597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movntq    [ebp], mm1
5607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       ebp, [ebp + 8]
5617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub       ecx, 2
5627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ja        convertloop
5637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
5647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    popad
5657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
5667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
5677cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
5687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Converts a row of planar Y/U/V samples to 32-bit pixels, one pixel per pass,
// where every pixel has its own U and V sample.
//
// Each pass reads one byte from each plane, sums the three matching 8-byte
// entries of _kCoefficientsRgbY (Y at offset 0, U at +2048, V at +4096) with
// signed saturation, shifts right arithmetically by 6, packs to unsigned bytes
// with saturation and stores the low 4 bytes with movd.
//
//   y_buf   - luma samples, one consumed per pass.
//   u_buf   - U samples, one consumed per pass.
//   v_buf   - V samples, one consumed per pass.
//   rgb_buf - destination, 4 bytes written per pass.
//   width   - pixel count. The body runs before the first test, so one pixel
//             is written even when width is 0.
//
// NOTE(review): MMX registers are used and no emms is executed here, so the
// caller is presumably expected to issue emms before x87 code - confirm.
5697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked)
5707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid FastConvertYUV444ToRGB32Row(const uint8* y_buf,
5717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                                 const uint8* u_buf,
5727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                                 const uint8* v_buf,
5737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                                 uint8* rgb_buf,
5747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                                 int width) {
5757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  __asm {
    // pushad saves eight 32-bit registers, hence the extra 32 in each offset.
5767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pushad
5777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       edx, [esp + 32 + 4]   // Y
5787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       edi, [esp + 32 + 8]   // U
5797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       esi, [esp + 32 + 12]  // V
5807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       ebp, [esp + 32 + 16]  // rgb
5817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       ecx, [esp + 32 + 20]  // width
5827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
5837cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde convertloop :
    // mm0 accumulates the U, V and Y table entries for this single pixel.
5847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movzx     eax, byte ptr [edi]
5857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       edi, [edi + 1]
5867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movzx     ebx, byte ptr [esi]
5877cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       esi, [esi + 1]
5887cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]
5897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movzx     eax, byte ptr [edx]
5907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    paddsw    mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]
5917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       edx, [edx + 1]
5927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    paddsw    mm0, [_kCoefficientsRgbY + 8 * eax]
5937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw     mm0, 6
    // Pack the four words to bytes; only the low dword is stored.
5947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    packuswb  mm0, mm0
5957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movd      [ebp], mm0
5967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       ebp, [ebp + 4]
5977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub       ecx, 1
5987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ja        convertloop
5997cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
6007cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    popad
6017cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
6027cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
6037cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
6047cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
// Converts a row of Y (luma-only) samples to 32-bit pixels, two pixels per
// pass, using only the Y portion of the _kCoefficientsRgbY table.
//
// Each pass looks up the 8-byte table entry for two consecutive Y bytes,
// shifts each right arithmetically by 6, packs both to unsigned bytes with
// saturation and stores the resulting 8 bytes with movq. No U/V contribution
// is added.
//
//   y_buf   - luma samples, two consumed per pass.
//   rgb_buf - destination, 8 bytes written per pass.
//   width   - pixel count. The counter drops by 2 per pass and the body runs
//             before the first test, so at least one pair is always written
//             and an odd width still stores a full pair on the last pass.
//
// NOTE(review): MMX registers are used and no emms is executed here, so the
// caller is presumably expected to issue emms before x87 code - confirm.
6057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked)
6067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid FastConvertYToRGB32Row(const uint8* y_buf,
6077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                            uint8* rgb_buf,
6087cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde                            int width) {
6097cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  __asm {
    // Only ebx is saved, hence the extra 4 in each argument offset.
6107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    push      ebx
6117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       eax, [esp + 4 + 4]   // Y
6127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       edx, [esp + 4 + 8]   // rgb
6137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    mov       ecx, [esp + 4 + 12]  // width
6147cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
6157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde convertloop :
    // mm0 = first pixel, mm1 = second pixel.
6167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movzx     ebx, byte ptr [eax]
6177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      mm0, [_kCoefficientsRgbY + 8 * ebx]
6187cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw     mm0, 6
6197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movzx     ebx, byte ptr [eax + 1]
6207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      mm1, [_kCoefficientsRgbY + 8 * ebx]
6217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    psraw     mm1, 6
6227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    packuswb  mm0, mm1
6237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       eax, [eax + 2]
6247cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    movq      [edx], mm0
6257cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    lea       edx, [edx + 8]
6267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    sub       ecx, 2
6277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ja        convertloop
6287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
6297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    pop       ebx
6307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde    ret
6317cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde  }
6327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}
6337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
6347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#endif
6357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde
6367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}  // extern "C"
637