17cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde/* 27cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde * Copyright (c) 2011 The LibYuv project authors. All Rights Reserved. 37cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde * 47cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde * Use of this source code is governed by a BSD-style license 57cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde * that can be found in the LICENSE file in the root of the source 67cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde * tree. An additional intellectual property rights grant can be found 77cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde * in the file PATENTS. All contributing project authors may 87cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde * be found in the AUTHORS file in the root of the source tree. 97cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde */ 107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#include "row.h" 127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" { 147cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#ifdef HAS_ARGBTOYROW_SSSE3 167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#define TALIGN16(t, var) static __declspec(align(16)) t _ ## var 177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 187cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde// Constant multiplication table for converting ARGB to I400. 
197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const int8, kARGBToY[16]) = { 207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const int8, kARGBToU[16]) = { 247cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 257cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const int8, kARGBToV[16]) = { 287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, 297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 317cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde// Constants for BGRA 327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const int8, kBGRAToY[16]) = { 337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const int8, kBGRAToU[16]) = { 377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 407cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const int8, kBGRAToV[16]) = { 417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 427cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri 
Borde}; 437cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 447cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde// Constants for ABGR 457cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const int8, kABGRToY[16]) = { 467cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 487cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 497cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const int8, kABGRToU[16]) = { 507cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 517cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const int8, kABGRToV[16]) = { 547cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 557cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 567cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const uint8, kAddY16[16]) = { 587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const uint8, kAddUV128[16]) = { 637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u 657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 677cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde// Shuffle 
table for converting BG24 to ARGB. 687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const uint8, kShuffleMaskBG24ToARGB[16]) = { 697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u 707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde// Shuffle table for converting RAW to ARGB. 737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" TALIGN16(const uint8, kShuffleMaskRAWToARGB[16]) = { 747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u 757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde// Convert 16 ARGB pixels (64 bytes) to 16 Y values 787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked) 797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm { 817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov eax, [esp + 4] /* src_argb */ 827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov edx, [esp + 8] /* dst_y */ 837cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov ecx, [esp + 12] /* pix */ 847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm7, _kARGBToY 857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm6, _kAddY16 867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 877cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde convertloop : 887cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm0, [eax] 897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm1, [eax + 16] 907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm2, [eax + 32] 917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm3, 
[eax + 48] 927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm0, xmm7 937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm1, xmm7 947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm2, xmm7 957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm3, xmm7 967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea eax, [eax + 64] 977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm0, xmm1 987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm2, xmm3 997cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psrlw xmm0, 7 1007cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psrlw xmm2, 7 1017cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde packuswb xmm0, xmm2 1027cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde paddb xmm0, xmm6 1037cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa [edx], xmm0 1047cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea edx, [edx + 16] 1057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde sub ecx, 16 1067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ja convertloop 1077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ret 1087cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde } 1097cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde} 1107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 1117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked) 1127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 1137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm { 1147cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov eax, [esp + 4] /* src_argb */ 1157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov edx, [esp + 8] /* dst_y */ 1167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov ecx, [esp + 12] /* pix */ 1177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm7, _kBGRAToY 1187cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm6, _kAddY16 
1197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 1207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde convertloop : 1217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm0, [eax] 1227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm1, [eax + 16] 1237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm2, [eax + 32] 1247cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm3, [eax + 48] 1257cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm0, xmm7 1267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm1, xmm7 1277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm2, xmm7 1287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm3, xmm7 1297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea eax, [eax + 64] 1307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm0, xmm1 1317cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm2, xmm3 1327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psrlw xmm0, 7 1337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psrlw xmm2, 7 1347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde packuswb xmm0, xmm2 1357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde paddb xmm0, xmm6 1367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa [edx], xmm0 1377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea edx, [edx + 16] 1387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde sub ecx, 16 1397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ja convertloop 1407cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ret 1417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde } 1427cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde} 1437cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 1447cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked) 1457cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 1467cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm { 
1477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov eax, [esp + 4] /* src_argb */ 1487cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov edx, [esp + 8] /* dst_y */ 1497cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov ecx, [esp + 12] /* pix */ 1507cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm7, _kABGRToY 1517cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm6, _kAddY16 1527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 1537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde convertloop : 1547cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm0, [eax] 1557cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm1, [eax + 16] 1567cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm2, [eax + 32] 1577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm3, [eax + 48] 1587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm0, xmm7 1597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm1, xmm7 1607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm2, xmm7 1617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm3, xmm7 1627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea eax, [eax + 64] 1637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm0, xmm1 1647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm2, xmm3 1657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psrlw xmm0, 7 1667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psrlw xmm2, 7 1677cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde packuswb xmm0, xmm2 1687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde paddb xmm0, xmm6 1697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa [edx], xmm0 1707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea edx, [edx + 16] 1717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde sub ecx, 16 1727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ja convertloop 1737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ret 
1747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde } 1757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde} 1767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 1777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked) 1787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 1797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde uint8* dst_u, uint8* dst_v, int width) { 1807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm { 1817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde push esi 1827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde push edi 1837cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov eax, [esp + 8 + 4] // src_argb 1847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov esi, [esp + 8 + 8] // src_stride_argb 1857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov edx, [esp + 8 + 12] // dst_u 1867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov edi, [esp + 8 + 16] // dst_v 1877cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov ecx, [esp + 8 + 20] // pix 1887cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm7, _kARGBToU 1897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm6, _kARGBToV 1907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm5, _kAddUV128 1917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde sub edi, edx // stride from u to v 1927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 1937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde convertloop : 1947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde /* step 1 - subsample 16x2 argb pixels to 8x1 */ 1957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm0, [eax] 1967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm1, [eax + 16] 1977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm2, [eax + 32] 1987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm3, [eax + 48] 1997cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb 
xmm0, [eax + esi] 2007cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm1, [eax + esi + 16] 2017cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm2, [eax + esi + 32] 2027cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm3, [eax + esi + 48] 2037cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea eax, [eax + 64] 2047cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm4, xmm0 2057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm0, xmm1, 0x88 2067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm4, xmm1, 0xdd 2077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm0, xmm4 2087cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm4, xmm2 2097cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm2, xmm3, 0x88 2107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm4, xmm3, 0xdd 2117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm2, xmm4 2127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 2137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // step 2 - convert to U and V 2147cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // from here down is very similar to Y code except 2157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // instead of 16 different pixels, its 8 pixels of U and 8 of V 2167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm1, xmm0 2177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm3, xmm2 2187cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm0, xmm7 // U 2197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm2, xmm7 2207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm1, xmm6 // V 2217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm3, xmm6 2227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm0, xmm2 2237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm1, xmm3 2247cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psraw xmm0, 8 
2257cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psraw xmm1, 8 2267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde packsswb xmm0, xmm1 2277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde paddb xmm0, xmm5 // -> unsigned 2287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 2297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // step 3 - store 8 U and 8 V values 2307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movlps qword ptr [edx], xmm0 // U 2317cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movhps qword ptr [edx + edi], xmm0 // V 2327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea edx, [edx + 8] 2337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde sub ecx, 16 2347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ja convertloop 2357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pop edi 2367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pop esi 2377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ret 2387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde } 2397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde} 2407cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 2417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked) 2427cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 2437cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde uint8* dst_u, uint8* dst_v, int width) { 2447cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm { 2457cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde push esi 2467cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde push edi 2477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov eax, [esp + 8 + 4] // src_argb 2487cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov esi, [esp + 8 + 8] // src_stride_argb 2497cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov edx, [esp + 8 + 12] // dst_u 2507cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov edi, [esp + 8 + 16] // dst_v 2517cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 
mov ecx, [esp + 8 + 20] // pix 2527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm7, _kBGRAToU 2537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm6, _kBGRAToV 2547cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm5, _kAddUV128 2557cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde sub edi, edx // stride from u to v 2567cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 2577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde convertloop : 2587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde /* step 1 - subsample 16x2 argb pixels to 8x1 */ 2597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm0, [eax] 2607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm1, [eax + 16] 2617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm2, [eax + 32] 2627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm3, [eax + 48] 2637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm0, [eax + esi] 2647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm1, [eax + esi + 16] 2657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm2, [eax + esi + 32] 2667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm3, [eax + esi + 48] 2677cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea eax, [eax + 64] 2687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm4, xmm0 2697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm0, xmm1, 0x88 2707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm4, xmm1, 0xdd 2717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm0, xmm4 2727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm4, xmm2 2737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm2, xmm3, 0x88 2747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm4, xmm3, 0xdd 2757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm2, xmm4 2767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 2777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // step 
2 - convert to U and V 2787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // from here down is very similar to Y code except 2797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // instead of 16 different pixels, its 8 pixels of U and 8 of V 2807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm1, xmm0 2817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm3, xmm2 2827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm0, xmm7 // U 2837cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm2, xmm7 2847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm1, xmm6 // V 2857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm3, xmm6 2867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm0, xmm2 2877cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm1, xmm3 2887cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psraw xmm0, 8 2897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psraw xmm1, 8 2907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde packsswb xmm0, xmm1 2917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde paddb xmm0, xmm5 // -> unsigned 2927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 2937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // step 3 - store 8 U and 8 V values 2947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movlps qword ptr [edx], xmm0 // U 2957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movhps qword ptr [edx + edi], xmm0 // V 2967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea edx, [edx + 8] 2977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde sub ecx, 16 2987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ja convertloop 2997cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pop edi 3007cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pop esi 3017cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ret 3027cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde } 3037cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde} 
3047cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 3057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked) 3067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 3077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde uint8* dst_u, uint8* dst_v, int width) { 3087cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm { 3097cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde push esi 3107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde push edi 3117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov eax, [esp + 8 + 4] // src_argb 3127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov esi, [esp + 8 + 8] // src_stride_argb 3137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov edx, [esp + 8 + 12] // dst_u 3147cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov edi, [esp + 8 + 16] // dst_v 3157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov ecx, [esp + 8 + 20] // pix 3167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm7, _kABGRToU 3177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm6, _kABGRToV 3187cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm5, _kAddUV128 3197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde sub edi, edx // stride from u to v 3207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 3217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde convertloop : 3227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde /* step 1 - subsample 16x2 argb pixels to 8x1 */ 3237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm0, [eax] 3247cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm1, [eax + 16] 3257cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm2, [eax + 32] 3267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm3, [eax + 48] 3277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm0, [eax + esi] 3287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm1, [eax + esi + 16] 
3297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm2, [eax + esi + 32] 3307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm3, [eax + esi + 48] 3317cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea eax, [eax + 64] 3327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm4, xmm0 3337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm0, xmm1, 0x88 3347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm4, xmm1, 0xdd 3357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm0, xmm4 3367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm4, xmm2 3377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm2, xmm3, 0x88 3387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm4, xmm3, 0xdd 3397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm2, xmm4 3407cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 3417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // step 2 - convert to U and V 3427cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // from here down is very similar to Y code except 3437cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // instead of 16 different pixels, its 8 pixels of U and 8 of V 3447cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm1, xmm0 3457cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm3, xmm2 3467cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm0, xmm7 // U 3477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm2, xmm7 3487cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm1, xmm6 // V 3497cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm3, xmm6 3507cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm0, xmm2 3517cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm1, xmm3 3527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psraw xmm0, 8 3537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psraw xmm1, 8 3547cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde packsswb 
xmm0, xmm1 3557cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde paddb xmm0, xmm5 // -> unsigned 3567cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 3577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // step 3 - store 8 U and 8 V values 3587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movlps qword ptr [edx], xmm0 // U 3597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movhps qword ptr [edx + edi], xmm0 // V 3607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea edx, [edx + 8] 3617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde sub ecx, 16 3627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ja convertloop 3637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pop edi 3647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pop esi 3657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ret 3667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde } 3677cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde} 3687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 3697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked) 3707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid BG24ToARGBRow_SSSE3(const uint8* src_bg24, uint8* dst_argb, int pix) { 3717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm { 3727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov eax, [esp + 4] // src_bg24 3737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov edx, [esp + 8] // dst_argb 3747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov ecx, [esp + 12] // pix 3757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pcmpeqb xmm7, xmm7 // generate mask 0xff000000 3767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pslld xmm7, 24 3777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm6, _kShuffleMaskBG24ToARGB 3787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 3797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde convertloop : 3807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm0, [eax] 3817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri 
Borde movdqa xmm1, [eax + 16] 3827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm3, [eax + 32] 3837cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea eax, [eax + 48] 3847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm2, xmm3 3857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} 3867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pshufb xmm2, xmm6 3877cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde por xmm2, xmm7 3887cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} 3897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pshufb xmm0, xmm6 3907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa [edx + 32], xmm2 3917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde por xmm0, xmm7 3927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pshufb xmm1, xmm6 3937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa [edx], xmm0 3947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde por xmm1, xmm7 3957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} 3967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pshufb xmm3, xmm6 3977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa [edx + 16], xmm1 3987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde por xmm3, xmm7 3997cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa [edx + 48], xmm3 4007cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea edx, [edx + 64] 4017cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde sub ecx, 16 4027cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ja convertloop 4037cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ret 4047cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde } 4057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde} 4067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 4077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__declspec(naked) 4087cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri 
// Expands 3-byte-per-pixel RAW data into 4-byte-per-pixel output, 16 pixels
// (48 source bytes -> 64 destination bytes) per loop iteration.
//
// The function is naked (the __declspec(naked) qualifier immediately precedes
// this definition), so the arguments are read straight from the stack:
//   src_raw  - source bytes; read with movdqa, so it must be 16-byte aligned.
//   dst_argb - destination; written with movdqa, so it must be 16-byte aligned.
//   pix      - pixel count; decremented by 16 per iteration.
//
// The loop body always runs at least once, and the exit test (ja) is an
// unsigned comparison, so pix is expected to be a positive value. A pix that
// is not a multiple of 16 still processes a full group of 16 in the last
// iteration.
// NOTE(review): _kShuffleMaskRAWToARGB is defined outside this section;
// presumably it reorders the low 12 bytes of a register into four 4-byte
// pixels - confirm against its definition.
void RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb,
                        int pix) {
__asm {
    mov       eax, [esp + 4]   // src_raw
    mov       edx, [esp + 8]   // dst_argb
    mov       ecx, [esp + 12]  // pix
    pcmpeqb   xmm7, xmm7       // generate mask 0xff000000
    pslld     xmm7, 24         // all-ones shifted left 24 in every dword
    movdqa    xmm6, _kShuffleMaskRAWToARGB

 convertloop :
    movdqa    xmm0, [eax]       // source bytes  0..15
    movdqa    xmm1, [eax + 16]  // source bytes 16..31
    movdqa    xmm3, [eax + 32]  // source bytes 32..47
    lea       eax, [eax + 48]   // advance source by 48 bytes
    movdqa    xmm2, xmm3
    palignr   xmm2, xmm1, 8    // xmm2 = { xmm3[0:7] xmm1[8:15] }, source bytes 24..39
    pshufb    xmm2, xmm6
    por       xmm2, xmm7       // force the top byte of each dword to 0xff
    palignr   xmm1, xmm0, 12   // xmm1 = { xmm1[0:11] xmm0[12:15] }, source bytes 12..27
    pshufb    xmm0, xmm6       // must follow the palignr above, which needs the unshuffled xmm0
    movdqa    [edx + 32], xmm2  // third group of 16 output bytes
    por       xmm0, xmm7
    pshufb    xmm1, xmm6
    movdqa    [edx], xmm0       // first group of 16 output bytes
    por       xmm1, xmm7
    palignr   xmm3, xmm3, 4    // rotate right 4 bytes: low 12 bytes = xmm3[4:15]
    pshufb    xmm3, xmm6
    movdqa    [edx + 16], xmm1  // second group of 16 output bytes
    por       xmm3, xmm7
    movdqa    [edx + 48], xmm3  // fourth group of 16 output bytes
    lea       edx, [edx + 64]   // advance destination by 64 bytes
    sub       ecx, 16
    ja        convertloop      // unsigned test: loop while pixels remain
    ret
  }
}

// Converts a row of planar Y/U/V bytes to 4-byte pixels, two pixels per
// iteration, using the 8-byte-per-entry lookup table _kCoefficientsRgbY
// (defined outside this section).
//
// Per iteration the loop consumes two bytes from y_buf, one byte from u_buf
// and one byte from v_buf, and stores 8 bytes to rgb_buf. Table layout as
// used here: byte offset 0 is indexed by the Y byte, offset 2048 (256 * 8) by
// the U byte and offset 4096 by the V byte.
//
// Naked function: pushad saves the eight general registers (32 bytes), which
// is why every argument offset below carries "32 +".
//
// The loop body always runs at least once and always handles two pixels, so
// an odd width still stores 8 bytes in its final iteration. The exit test
// (ja) is unsigned, so width is expected to be positive.
// No emms is executed here; the MMX state is left active on return.
// NOTE(review): callers presumably issue emms (and a store fence for the
// non-temporal movntq stores) themselves - confirm.
__declspec(naked)
void FastConvertYUVToRGB32Row(const uint8* y_buf,
                              const uint8* u_buf,
                              const uint8* v_buf,
                              uint8* rgb_buf,
                              int width) {
  __asm {
    pushad
    mov       edx, [esp + 32 + 4]   // y_buf
    mov       edi, [esp + 32 + 8]   // u_buf
    mov       esi, [esp + 32 + 12]  // v_buf
    mov       ebp, [esp + 32 + 16]  // rgb_buf
    mov       ecx, [esp + 32 + 20]  // width

 convertloop :
    movzx     eax, byte ptr [edi]   // U byte
    lea       edi, [edi + 1]
    movzx     ebx, byte ptr [esi]   // V byte
    lea       esi, [esi + 1]
    movq      mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]  // U entry
    movzx     eax, byte ptr [edx]   // first Y byte
    paddsw    mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]  // + V entry (saturating)
    movzx     ebx, byte ptr [edx + 1]  // second Y byte
    movq      mm1, [_kCoefficientsRgbY + 8 * eax]  // Y entry, pixel 0
    lea       edx, [edx + 2]
    movq      mm2, [_kCoefficientsRgbY + 8 * ebx]  // Y entry, pixel 1
    paddsw    mm1, mm0              // both pixels share the same U/V sum
    paddsw    mm2, mm0
    psraw     mm1, 6                // arithmetic shift right by 6 on each 16-bit lane
    psraw     mm2, 6
    packuswb  mm1, mm2              // saturate words to bytes: pixel 0 low, pixel 1 high
    movntq    [ebp], mm1            // non-temporal 8-byte store
    lea       ebp, [ebp + 8]
    sub       ecx, 2
    ja        convertloop           // unsigned test: loop while pixels remain

    popad
    ret
  }
}

// Same conversion loop as FastConvertYUVToRGB32Row, but looks values up in
// _kCoefficientsBgraY (defined outside this section) instead of
// _kCoefficientsRgbY.
//
// Consumes two y_buf bytes, one u_buf byte and one v_buf byte per iteration
// and stores 8 bytes to rgb_buf. The loop body always runs at least once and
// always handles two pixels; width is expected to be positive because the
// exit test (ja) is unsigned. No emms is executed before returning.
__declspec(naked)
void FastConvertYUVToBGRARow(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width) {
  __asm {
    pushad                          // saves 32 bytes, hence "32 +" below
    mov       edx, [esp + 32 + 4]   // y_buf
    mov       edi, [esp + 32 + 8]   // u_buf
    mov       esi, [esp + 32 + 12]  // v_buf
    mov       ebp, [esp + 32 + 16]  // rgb_buf
    mov       ecx, [esp + 32 + 20]  // width

 convertloop :
    movzx     eax, byte ptr [edi]   // U byte
    lea       edi, [edi + 1]
    movzx     ebx, byte ptr [esi]   // V byte
    lea       esi, [esi + 1]
    movq      mm0, [_kCoefficientsBgraY + 2048 + 8 * eax]  // U entry
    movzx     eax, byte ptr [edx]   // first Y byte
    paddsw    mm0, [_kCoefficientsBgraY + 4096 + 8 * ebx]  // + V entry (saturating)
    movzx     ebx, byte ptr [edx + 1]  // second Y byte
    movq      mm1, [_kCoefficientsBgraY + 8 * eax]  // Y entry, pixel 0
    lea       edx, [edx + 2]
    movq      mm2, [_kCoefficientsBgraY + 8 * ebx]  // Y entry, pixel 1
    paddsw    mm1, mm0
    paddsw    mm2, mm0
    psraw     mm1, 6
    psraw     mm2, 6
    packuswb  mm1, mm2              // saturate words to bytes: pixel 0 low, pixel 1 high
    movntq    [ebp], mm1            // non-temporal 8-byte store
    lea       ebp, [ebp + 8]
    sub       ecx, 2
    ja        convertloop           // unsigned test: loop while pixels remain

    popad
    ret
  }
}

// Same conversion loop as FastConvertYUVToRGB32Row, but looks values up in
// _kCoefficientsAbgrY (defined outside this section) instead of
// _kCoefficientsRgbY.
//
// Consumes two y_buf bytes, one u_buf byte and one v_buf byte per iteration
// and stores 8 bytes to rgb_buf. The loop body always runs at least once and
// always handles two pixels; width is expected to be positive because the
// exit test (ja) is unsigned. No emms is executed before returning.
__declspec(naked)
void FastConvertYUVToABGRRow(const uint8* y_buf,
                             const uint8* u_buf,
                             const uint8* v_buf,
                             uint8* rgb_buf,
                             int width) {
  __asm {
    pushad                          // saves 32 bytes, hence "32 +" below
    mov       edx, [esp + 32 + 4]   // y_buf
    mov       edi, [esp + 32 + 8]   // u_buf
    mov       esi, [esp + 32 + 12]  // v_buf
    mov       ebp, [esp + 32 + 16]  // rgb_buf
    mov       ecx, [esp + 32 + 20]  // width

 convertloop :
    movzx     eax, byte ptr [edi]   // U byte
    lea       edi, [edi + 1]
    movzx     ebx, byte ptr [esi]   // V byte
    lea       esi, [esi + 1]
    movq      mm0, [_kCoefficientsAbgrY + 2048 + 8 * eax]  // U entry
    movzx     eax, byte ptr [edx]   // first Y byte
    paddsw    mm0, [_kCoefficientsAbgrY + 4096 + 8 * ebx]  // + V entry (saturating)
    movzx     ebx, byte ptr [edx + 1]  // second Y byte
    movq      mm1, [_kCoefficientsAbgrY + 8 * eax]  // Y entry, pixel 0
    lea       edx, [edx + 2]
    movq      mm2, [_kCoefficientsAbgrY + 8 * ebx]  // Y entry, pixel 1
    paddsw    mm1, mm0
    paddsw    mm2, mm0
    psraw     mm1, 6
    psraw     mm2, 6
    packuswb  mm1, mm2              // saturate words to bytes: pixel 0 low, pixel 1 high
    movntq    [ebp], mm1            // non-temporal 8-byte store
    lea       ebp, [ebp + 8]
    sub       ecx, 2
    ja        convertloop           // unsigned test: loop while pixels remain

    popad
    ret
  }
}

// Converts a row in which every pixel has its own Y, U and V byte: one byte
// is consumed from each of the three input buffers and 4 bytes are stored to
// rgb_buf per iteration. Uses the same _kCoefficientsRgbY table (defined
// outside this section) and the same sub-table offsets as
// FastConvertYUVToRGB32Row.
//
// The loop body always runs at least once; width is expected to be positive
// because the exit test (ja) is unsigned. No emms is executed before
// returning.
__declspec(naked)
void FastConvertYUV444ToRGB32Row(const uint8* y_buf,
                                 const uint8* u_buf,
                                 const uint8* v_buf,
                                 uint8* rgb_buf,
                                 int width) {
  __asm {
    pushad                          // saves 32 bytes, hence "32 +" below
    mov       edx, [esp + 32 + 4]   // Y
    mov       edi, [esp + 32 + 8]   // U
    mov       esi, [esp + 32 + 12]  // V
    mov       ebp, [esp + 32 + 16]  // rgb
    mov       ecx, [esp + 32 + 20]  // width

 convertloop :
    movzx     eax, byte ptr [edi]   // U byte
    lea       edi, [edi + 1]
    movzx     ebx, byte ptr [esi]   // V byte
    lea       esi, [esi + 1]
    movq      mm0, [_kCoefficientsRgbY + 2048 + 8 * eax]  // U entry
    movzx     eax, byte ptr [edx]   // Y byte
    paddsw    mm0, [_kCoefficientsRgbY + 4096 + 8 * ebx]  // + V entry (saturating)
    lea       edx, [edx + 1]
    paddsw    mm0, [_kCoefficientsRgbY + 8 * eax]  // + Y entry (saturating)
    psraw     mm0, 6
    packuswb  mm0, mm0              // saturate words to bytes; low 4 bytes hold the pixel
    movd      [ebp], mm0            // store the low 4 bytes
    lea       ebp, [ebp + 4]
    sub       ecx, 1
    ja        convertloop           // unsigned test: loop while pixels remain

    popad
    ret
  }
}

// Converts a row of Y bytes only to 4-byte pixels, two pixels per iteration,
// using just the Y-indexed part of _kCoefficientsRgbY (defined outside this
// section); no U/V term is added.
//
// Naked function: ebx is saved with push (4 bytes), which is why the argument
// offsets below carry "4 +". Unlike the YUV variants above, the result is
// written with a regular movq store rather than movntq.
//
// The loop body always runs at least once and always handles two pixels, so
// an odd width still stores 8 bytes in its final iteration; width is expected
// to be positive because the exit test (ja) is unsigned. No emms is executed
// before returning.
__declspec(naked)
void FastConvertYToRGB32Row(const uint8* y_buf,
                            uint8* rgb_buf,
                            int width) {
  __asm {
    push      ebx                  // ebx is used as the table index below
    mov       eax, [esp + 4 + 4]   // Y
    mov       edx, [esp + 4 + 8]   // rgb
    mov       ecx, [esp + 4 + 12]  // width

 convertloop :
    movzx     ebx, byte ptr [eax]      // first Y byte
    movq      mm0, [_kCoefficientsRgbY + 8 * ebx]
    psraw     mm0, 6
    movzx     ebx, byte ptr [eax + 1]  // second Y byte
    movq      mm1, [_kCoefficientsRgbY + 8 * ebx]
    psraw     mm1, 6
    packuswb  mm0, mm1             // saturate words to bytes: pixel 0 low, pixel 1 high
    lea       eax, [eax + 2]
    movq      [edx], mm0           // regular 8-byte store
    lea       edx, [edx + 8]
    sub       ecx, 2
    ja        convertloop          // unsigned test: loop while pixels remain

    pop       ebx
    ret
  }
}

#endif

}  // extern "C"