17cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde/* 233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp * Copyright 2011 The LibYuv Project Authors. All rights reserved. 37cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde * 47cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde * Use of this source code is governed by a BSD-style license 57cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde * that can be found in the LICENSE file in the root of the source 67cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde * tree. An additional intellectual property rights grant can be found 77cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde * in the file PATENTS. All contributing project authors may 87cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde * be found in the AUTHORS file in the root of the source tree. 97cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde */ 107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 1133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#include "libyuv/row.h" 127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 1333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef __cplusplus 1433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampnamespace libyuv { 157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordeextern "C" { 1633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif 1733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 1833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// This module is for Visual C x86. 1933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#if !defined(YUV_DISABLE_ASM) && defined(_M_IX86) 207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 2133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// TODO(fbarchard): I420ToRGB24, I420ToRAW 227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde#ifdef HAS_ARGBTOYROW_SSSE3 237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 2433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Constants for ARGB. 
2533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kARGBToY = { 267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0 277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 2933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kARGBToU = { 307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0 317cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 3333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kARGBToV = { 347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, 357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 3733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Constants for BGRA. 
3833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kBGRAToY = { 397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13 407cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 4233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kBGRAToU = { 437cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112 447cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 457cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 4633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kBGRAToV = { 477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18 487cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 497cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 5033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Constants for ABGR. 
5133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kABGRToY = { 527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0, 33, 65, 13, 0 537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 547cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 5533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kABGRToU = { 567cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0, -38, -74, 112, 0 577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 5933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kABGRToV = { 607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0, 112, -94, -18, 0 617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 6333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Constants for RGBA. 
6433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kRGBAToY = { 6533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33, 0, 13, 65, 33 6633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 6733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 6833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kRGBAToU = { 6933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38, 0, 112, -74, -38 7033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 7133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 7233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kRGBAToV = { 7333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112, 0, -18, -94, 112 7433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 7533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 7633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kAddY16 = { 7733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u, 16u 787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 8033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kAddUV128 = { 817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u, 827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 128u, 128u, 128u, 128u, 128u, 128u, 128u, 128u 837cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 8533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting RGB24 to ARGB. 
8633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleMaskRGB24ToARGB = { 877cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 0u, 1u, 2u, 12u, 3u, 4u, 5u, 13u, 6u, 7u, 8u, 14u, 9u, 10u, 11u, 15u 887cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde// Shuffle table for converting RAW to ARGB. 9133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleMaskRAWToARGB = { 927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 2u, 1u, 0u, 12u, 5u, 4u, 3u, 13u, 8u, 7u, 6u, 14u, 11u, 10u, 9u, 15u 937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde}; 947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 9533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting BGRA to ARGB. 9633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleMaskBGRAToARGB = { 9733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u 9833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 9933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 10033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting ABGR to ARGB. 10133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleMaskABGRToARGB = { 10233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u 10333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 10433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 10533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting RGBA to ARGB. 
10633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleMaskRGBAToARGB = { 10733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u 10833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 10933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 11033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting ARGB to RGBA. 11133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleMaskARGBToRGBA = { 11233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u 11333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 11433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 11533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting ARGB to RGB24. 11633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleMaskARGBToRGB24 = { 11733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 0u, 1u, 2u, 4u, 5u, 6u, 8u, 9u, 10u, 12u, 13u, 14u, 128u, 128u, 128u, 128u 11833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 11933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 12033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for converting ARGB to RAW. 
12133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleMaskARGBToRAW = { 12233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 2u, 1u, 0u, 6u, 5u, 4u, 10u, 9u, 8u, 14u, 13u, 12u, 128u, 128u, 128u, 128u 12333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 12433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 12533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 12633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I400ToARGBRow_SSE2(const uint8* src_y, uint8* dst_argb, int pix) { 12733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 12833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_y 12933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_argb 13033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // pix 13133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate mask 0xff000000 13233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslld xmm5, 24 13333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 13433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 13533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 13633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq xmm0, qword ptr [eax] 13733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 8] 13833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm0 13933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 14033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm0, xmm0 14133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm1, xmm1 14233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm5 14333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm1, xmm5 14433cfdeb7b267ab635413797fffb046b73272f7ecHendrik 
Dahlkamp movdqa [edx], xmm0 14533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 16], xmm1 14633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 32] 14733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 14833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 14933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 15033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 15133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 15233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 15333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 15433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid BGRAToARGBRow_SSSE3(const uint8* src_bgra, uint8* dst_argb, int pix) { 15533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 15633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_bgra 15733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_argb 15833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // pix 15933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kShuffleMaskBGRAToARGB 16033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edx, eax 16133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 16233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 16333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 16433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] 16533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm0, xmm5 16633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 16733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax + edx], xmm0 16833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 16933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 
17033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 17133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 17233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 17333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 17433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 17533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ABGRToARGBRow_SSSE3(const uint8* src_abgr, uint8* dst_argb, int pix) { 17633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 17733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_abgr 17833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_argb 17933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // pix 18033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kShuffleMaskABGRToARGB 18133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edx, eax 18233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 18333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 18433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 18533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] 18633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm0, xmm5 18733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 18833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax + edx], xmm0 18933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 19033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 19133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 19233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 19333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 19433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 19533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) 
__declspec(align(16)) 19633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RGBAToARGBRow_SSSE3(const uint8* src_rgba, uint8* dst_argb, int pix) { 19733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 19833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_rgba 19933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_argb 20033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // pix 20133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kShuffleMaskRGBAToARGB 20233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edx, eax 20333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 20433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 20533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 20633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] 20733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm0, xmm5 20833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 20933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax + edx], xmm0 21033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 21133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 21233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 21333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 21433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 21533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 21633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 21733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToRGBARow_SSSE3(const uint8* src_argb, uint8* dst_rgba, int pix) { 21833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 21933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_argb 
22033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_rgba 22133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // pix 22233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kShuffleMaskARGBToRGBA 22333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edx, eax 22433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 22533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 22633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 22733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] 22833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm0, xmm5 22933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 23033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax + edx], xmm0 23133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 23233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 23333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 23433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 23533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 23633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 23733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 23833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RGB24ToARGBRow_SSSE3(const uint8* src_rgb24, uint8* dst_argb, int pix) { 23933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 24033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_rgb24 24133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_argb 24233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // pix 24333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate mask 0xff000000 24433cfdeb7b267ab635413797fffb046b73272f7ecHendrik 
Dahlkamp pslld xmm5, 24 24533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, kShuffleMaskRGB24ToARGB 24633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 24733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 24833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 24933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm0, [eax] 25033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm1, [eax + 16] 25133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm3, [eax + 32] 25233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 48] 25333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, xmm3 25433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} 25533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm2, xmm4 25633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm2, xmm5 25733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} 25833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm0, xmm4 25933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 32], xmm2 26033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm5 26133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm1, xmm4 26233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm0 26333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm1, xmm5 26433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} 26533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm3, xmm4 26633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 16], xmm1 26733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm3, xmm5 26833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 
26933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 48], xmm3 27033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 64] 27133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 27233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 27333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 27433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 27533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 27633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 27733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RAWToARGBRow_SSSE3(const uint8* src_raw, uint8* dst_argb, 27833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int pix) { 27933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 28033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_raw 28133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_argb 28233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // pix 28333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate mask 0xff000000 28433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslld xmm5, 24 28533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, kShuffleMaskRAWToARGB 28633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 28733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 28833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 28933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm0, [eax] 29033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm1, [eax + 16] 29133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm3, [eax + 32] 29233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 48] 29333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, xmm3 
29433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp palignr xmm2, xmm1, 8 // xmm2 = { xmm3[0:3] xmm1[8:15]} 29533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm2, xmm4 29633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm2, xmm5 29733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp palignr xmm1, xmm0, 12 // xmm1 = { xmm3[0:7] xmm0[12:15]} 29833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm0, xmm4 29933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 32], xmm2 30033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm5 30133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm1, xmm4 30233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm0 30333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm1, xmm5 30433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp palignr xmm3, xmm3, 4 // xmm3 = { xmm3[4:15]} 30533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm3, xmm4 30633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 16], xmm1 30733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm3, xmm5 30833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 30933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 48], xmm3 31033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 64] 31133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 31233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 31333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 31433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 31533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 31633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// pmul method to replicate bits. 
31733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Math to replicate bits: 31833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// (v << 8) | (v << 3) 31933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// v * 256 + v * 8 32033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// v * (256 + 8) 32133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// G shift of 5 is incorporated, so shift is 5 + 8 and 5 + 3 32233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 20 instructions. 32333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 32433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RGB565ToARGBRow_SSE2(const uint8* src_rgb565, uint8* dst_argb, 32533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int pix) { 32633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 32733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, 0x01080108 // generate multiplier to repeat 5 bits 32833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm5, eax 32933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm5, xmm5, 0 33033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, 0x20802080 // multiplier shift by 5 and then repeat 6 bits 33133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm6, eax 33233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm6, xmm6, 0 33333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red 33433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psllw xmm3, 11 33533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm4, xmm4 // generate mask 0x07e007e0 for Green 33633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psllw xmm4, 10 33733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm4, 5 33833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm7, xmm7 // generate mask 
0xff00ff00 for Alpha 33933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psllw xmm7, 8 34033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 34133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_rgb565 34233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_argb 34333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // pix 34433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edx, eax 34533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edx, eax 34633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 34733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 34833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 34933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm0, [eax] // fetch 8 pixels of bgr565 35033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 35133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, xmm0 35233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm1, xmm3 // R in upper 5 bits 35333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psllw xmm2, 11 // B in upper 5 bits 35433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmulhuw xmm1, xmm5 // * (256 + 8) 35533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmulhuw xmm2, xmm5 // * (256 + 8) 35633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psllw xmm1, 8 35733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm1, xmm2 // RB 35833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm4 // G in middle 6 bits 35933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmulhuw xmm0, xmm6 // << 5 * (256 + 4) 36033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm7 // AG 36133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, xmm1 36233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw 
xmm1, xmm0 36333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhbw xmm2, xmm0 36433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB 36533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB 36633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 36733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 36833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 36933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 37033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 37133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 37233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 37333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 24 instructions 37433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 37533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGB1555ToARGBRow_SSE2(const uint8* src_argb1555, uint8* dst_argb, 37633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int pix) { 37733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 37833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, 0x01080108 // generate multiplier to repeat 5 bits 37933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm5, eax 38033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm5, xmm5, 0 38133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, 0x42004200 // multiplier shift by 6 and then repeat 5 bits 38233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm6, eax 38333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm6, xmm6, 0 38433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm3, xmm3 // generate mask 0xf800f800 for Red 38533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psllw xmm3, 
11 38633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, xmm3 // generate mask 0x03e003e0 for Green 38733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm4, 6 38833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm7, xmm7 // generate mask 0xff00ff00 for Alpha 38933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psllw xmm7, 8 39033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 39133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_argb1555 39233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_argb 39333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // pix 39433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edx, eax 39533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edx, eax 39633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 39733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 39833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 39933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm0, [eax] // fetch 8 pixels of 1555 40033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 40133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, xmm0 40233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psllw xmm1, 1 // R in upper 5 bits 40333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psllw xmm2, 11 // B in upper 5 bits 40433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm1, xmm3 40533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmulhuw xmm2, xmm5 // * (256 + 8) 40633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmulhuw xmm1, xmm5 // * (256 + 8) 40733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psllw xmm1, 8 40833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm1, xmm2 // RB 40933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 
movdqa xmm2, xmm0 41033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm4 // G in middle 5 bits 41133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psraw xmm2, 8 // A 41233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmulhuw xmm0, xmm6 // << 6 * (256 + 8) 41333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm2, xmm7 41433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm2 // AG 41533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, xmm1 41633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm1, xmm0 41733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhbw xmm2, xmm0 41833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax * 2 + edx], xmm1 // store 4 pixels of ARGB 41933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax * 2 + edx + 16], xmm2 // store next 4 pixels of ARGB 42033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 42133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 42233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 42333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 42433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 42533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 42633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 42733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 18 instructions. 
42833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 42933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGB4444ToARGBRow_SSE2(const uint8* src_argb4444, uint8* dst_argb, 43033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int pix) { 43133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 43233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, 0x0f0f0f0f // generate mask 0x0f0f0f0f 43333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm4, eax 43433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm4, xmm4, 0 43533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, xmm4 // 0xf0f0f0f0 for high nibbles 43633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslld xmm5, 4 43733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_argb4444 43833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_argb 43933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // pix 44033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edx, eax 44133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edx, eax 44233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 44333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 44433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 44533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm0, [eax] // fetch 8 pixels of bgra4444 44633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, xmm0 44733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm4 // mask low nibbles 44833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm2, xmm5 // mask high nibbles 44933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 45033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, xmm2 
45133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psllw xmm1, 4 45233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm3, 4 45333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm1 45433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm2, xmm3 45533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 45633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm2 45733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhbw xmm1, xmm2 45833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax * 2 + edx], xmm0 // store 4 pixels of ARGB 45933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax * 2 + edx + 16], xmm1 // store next 4 pixels of ARGB 46033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 46133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 46233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 46333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 46433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 46533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 46633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 46733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 46833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToRGB24Row_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { 46933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 47033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_argb 47133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_rgb 47233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // pix 47333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm6, kShuffleMaskARGBToRGB24 47433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 
47533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 47633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 47733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] // fetch 16 pixels of argb 47833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax + 16] 47933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, [eax + 32] 48033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, [eax + 48] 48133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 64] 48233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB 48333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm1, xmm6 48433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm2, xmm6 48533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm3, xmm6 48633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, xmm1 // 4 bytes from 1 for 0 48733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrldq xmm1, 4 // 8 bytes from 1 48833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslldq xmm4, 12 // 4 bytes from 1 for 0 48933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, xmm2 // 8 bytes from 2 for 1 49033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm4 // 4 bytes from 1 for 0 49133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslldq xmm5, 8 // 8 bytes from 2 for 1 49233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm0 // store 0 49333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm1, xmm5 // 8 bytes from 2 for 1 49433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrldq xmm2, 8 // 4 bytes from 2 49533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslldq xmm3, 4 // 12 bytes from 3 for 2 49633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm2, xmm3 // 12 bytes 
from 3 for 2 49733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 16], xmm1 // store 1 49833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 32], xmm2 // store 2 49933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 48] 50033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 50133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 50233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 50333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 50433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 50533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 50633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 50733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToRAWRow_SSSE3(const uint8* src_argb, uint8* dst_rgb, int pix) { 50833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 50933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_argb 51033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_rgb 51133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // pix 51233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm6, kShuffleMaskARGBToRAW 51333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 51433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 51533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 51633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] // fetch 16 pixels of argb 51733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax + 16] 51833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, [eax + 32] 51933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, [eax + 48] 52033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 64] 
52133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm0, xmm6 // pack 16 bytes of ARGB to 12 bytes of RGB 52233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm1, xmm6 52333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm2, xmm6 52433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm3, xmm6 52533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, xmm1 // 4 bytes from 1 for 0 52633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrldq xmm1, 4 // 8 bytes from 1 52733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslldq xmm4, 12 // 4 bytes from 1 for 0 52833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, xmm2 // 8 bytes from 2 for 1 52933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm4 // 4 bytes from 1 for 0 53033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslldq xmm5, 8 // 8 bytes from 2 for 1 53133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm0 // store 0 53233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm1, xmm5 // 8 bytes from 2 for 1 53333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrldq xmm2, 8 // 4 bytes from 2 53433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslldq xmm3, 4 // 12 bytes from 3 for 2 53533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm2, xmm3 // 12 bytes from 3 for 2 53633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 16], xmm1 // store 1 53733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 32], xmm2 // store 2 53833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 48] 53933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 54033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 54133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 54233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 
54333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 54433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 54533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 54633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToRGB565Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { 54733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 54833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_argb 54933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_rgb 55033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // pix 55133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm3, xmm3 // generate mask 0x0000001f 55233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrld xmm3, 27 55333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm4, xmm4 // generate mask 0x000007e0 55433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrld xmm4, 26 55533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslld xmm4, 5 55633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate mask 0xfffff800 55733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslld xmm5, 11 55833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 55933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 56033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 56133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] // fetch 4 pixels of argb 56233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 // B 56333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, xmm0 // G 56433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslld xmm0, 8 // R 56533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrld xmm1, 3 // B 56633cfdeb7b267ab635413797fffb046b73272f7ecHendrik 
Dahlkamp psrld xmm2, 5 // G 56733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrad xmm0, 16 // R 56833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm1, xmm3 // B 56933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm2, xmm4 // G 57033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm5 // R 57133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm1, xmm2 // BG 57233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm1 // BGR 57333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packssdw xmm0, xmm0 57433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 57533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 57633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 8] 57733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 57833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 57933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 58033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 58133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 58233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 58333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// TODO(fbarchard): Improve sign extension/packing. 
58433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 58533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToARGB1555Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { 58633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 58733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_argb 58833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_rgb 58933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // pix 59033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm4, xmm4 // generate mask 0x0000001f 59133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrld xmm4, 27 59233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, xmm4 // generate mask 0x000003e0 59333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslld xmm5, 5 59433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm6, xmm4 // generate mask 0x00007c00 59533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslld xmm6, 10 59633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm7, xmm7 // generate mask 0xffff8000 59733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslld xmm7, 15 59833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 59933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 60033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 60133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] // fetch 4 pixels of argb 60233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 // B 60333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, xmm0 // G 60433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, xmm0 // R 60533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrad xmm0, 16 // A 60633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrld 
xmm1, 3 // B 60733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrld xmm2, 6 // G 60833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrld xmm3, 9 // R 60933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm7 // A 61033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm1, xmm4 // B 61133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm2, xmm5 // G 61233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm3, xmm6 // R 61333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm1 // BA 61433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm2, xmm3 // GR 61533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm2 // BGRA 61633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packssdw xmm0, xmm0 61733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 61833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq qword ptr [edx], xmm0 // store 4 pixels of ARGB1555 61933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 8] 62033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 62133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 62233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 62333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 62433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 62533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 62633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 62733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToARGB4444Row_SSE2(const uint8* src_argb, uint8* dst_rgb, int pix) { 62833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 62933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_argb 63033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_rgb 
63133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // pix 63233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm4, xmm4 // generate mask 0xf000f000 63333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psllw xmm4, 12 63433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, xmm4 // generate mask 0x00f000f0 63533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm3, 8 63633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 63733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 63833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 63933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] // fetch 4 pixels of argb 64033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 64133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm3 // low nibble 64233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm1, xmm4 // high nibble 64333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrl xmm0, 4 64433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrl xmm1, 8 64533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm1 64633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm0 64733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 64833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq qword ptr [edx], xmm0 // store 4 pixels of ARGB4444 64933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 8] 65033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 65133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 65233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 65333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 65433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 65533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 
65633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Convert 16 ARGB pixels (64 bytes) to 16 Y values. 65733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 6587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid ARGBToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 6597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm { 6607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov eax, [esp + 4] /* src_argb */ 6617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov edx, [esp + 8] /* dst_y */ 6627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov ecx, [esp + 12] /* pix */ 66333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kAddY16 66433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, kARGBToY 6657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 66633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 66733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 6687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm0, [eax] 6697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm1, [eax + 16] 6707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm2, [eax + 32] 6717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm3, [eax + 48] 67233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm0, xmm4 67333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm1, xmm4 67433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm2, xmm4 67533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm3, xmm4 6767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea eax, [eax + 64] 6777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm0, xmm1 6787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm2, xmm3 6797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psrlw xmm0, 7 6807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psrlw xmm2, 7 
6817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde packuswb xmm0, xmm2 68233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddb xmm0, xmm5 68333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 6847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa [edx], xmm0 6857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea edx, [edx + 16] 68633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 68733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 68833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 68933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 69033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 69133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 69233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 69333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 69433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] /* src_argb */ 69533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] /* dst_y */ 69633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] /* pix */ 69733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kAddY16 69833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, kARGBToY 69933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 70033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 70133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 70233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm0, [eax] 70333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm1, [eax + 16] 70433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm2, [eax + 32] 70533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm3, [eax + 48] 
70633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm0, xmm4 70733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm1, xmm4 70833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm2, xmm4 70933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm3, xmm4 71033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 64] 71133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm0, xmm1 71233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm2, xmm3 71333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm0, 7 71433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm2, 7 71533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm2 71633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddb xmm0, xmm5 7177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde sub ecx, 16 71833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx], xmm0 71933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 16] 72033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 7217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ret 7227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde } 7237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde} 7247cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 72533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 7267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid BGRAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 7277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm { 7287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov eax, [esp + 4] /* src_argb */ 7297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov edx, [esp + 8] /* dst_y */ 7307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov ecx, [esp + 12] /* pix */ 73133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa 
xmm5, kAddY16 73233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, kBGRAToY 7337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 73433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 73533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 7367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm0, [eax] 7377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm1, [eax + 16] 7387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm2, [eax + 32] 7397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm3, [eax + 48] 74033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm0, xmm4 74133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm1, xmm4 74233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm2, xmm4 74333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm3, xmm4 7447cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea eax, [eax + 64] 7457cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm0, xmm1 7467cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm2, xmm3 7477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psrlw xmm0, 7 7487cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psrlw xmm2, 7 7497cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde packuswb xmm0, xmm2 75033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddb xmm0, xmm5 75133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 7527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa [edx], xmm0 7537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea edx, [edx + 16] 75433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 75533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 75633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 75733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 75833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 
75933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 76033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid BGRAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 76133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 76233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] /* src_argb */ 76333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] /* dst_y */ 76433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] /* pix */ 76533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kAddY16 76633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, kBGRAToY 76733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 76833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 76933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 77033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm0, [eax] 77133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm1, [eax + 16] 77233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm2, [eax + 32] 77333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm3, [eax + 48] 77433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm0, xmm4 77533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm1, xmm4 77633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm2, xmm4 77733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm3, xmm4 77833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 64] 77933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm0, xmm1 78033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm2, xmm3 78133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm0, 7 78233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm2, 7 
78333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm2 78433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddb xmm0, xmm5 7857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde sub ecx, 16 78633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx], xmm0 78733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 16] 78833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 7897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ret 7907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde } 7917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde} 7927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 79333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 7947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid ABGRToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 7957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm { 7967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov eax, [esp + 4] /* src_argb */ 7977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov edx, [esp + 8] /* dst_y */ 7987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov ecx, [esp + 12] /* pix */ 79933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kAddY16 80033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, kABGRToY 8017cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 80233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 80333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 8047cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm0, [eax] 8057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm1, [eax + 16] 8067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm2, [eax + 32] 8077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm3, [eax + 48] 80833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm0, xmm4 
80933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm1, xmm4 81033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm2, xmm4 81133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm3, xmm4 8127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea eax, [eax + 64] 8137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm0, xmm1 8147cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm2, xmm3 8157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psrlw xmm0, 7 8167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psrlw xmm2, 7 8177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde packuswb xmm0, xmm2 81833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddb xmm0, xmm5 81933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 8207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa [edx], xmm0 8217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea edx, [edx + 16] 82233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 82333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 82433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 82533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 82633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 82733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 82833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ABGRToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 82933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 83033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] /* src_argb */ 83133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] /* dst_y */ 83233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] /* pix */ 83333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kAddY16 
83433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, kABGRToY 83533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 83633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 83733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 83833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm0, [eax] 83933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm1, [eax + 16] 84033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm2, [eax + 32] 84133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm3, [eax + 48] 84233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm0, xmm4 84333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm1, xmm4 84433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm2, xmm4 84533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm3, xmm4 84633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 64] 84733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm0, xmm1 84833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm2, xmm3 84933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm0, 7 85033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm2, 7 85133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm2 85233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddb xmm0, xmm5 8537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde sub ecx, 16 85433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx], xmm0 85533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 16] 85633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 8577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ret 8587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde } 8597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde} 8607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri 
Borde 86133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 86233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RGBAToYRow_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 8637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm { 86433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] /* src_argb */ 86533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] /* dst_y */ 86633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] /* pix */ 86733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kAddY16 86833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, kRGBAToY 8697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 87033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 87133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 8727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm0, [eax] 8737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm1, [eax + 16] 8747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm2, [eax + 32] 8757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm3, [eax + 48] 87633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm0, xmm4 87733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm1, xmm4 87833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm2, xmm4 87933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm3, xmm4 88033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 64] 88133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm0, xmm1 88233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm2, xmm3 88333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm0, 7 88433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm2, 7 88533cfdeb7b267ab635413797fffb046b73272f7ecHendrik 
Dahlkamp packuswb xmm0, xmm2 88633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddb xmm0, xmm5 88733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 88833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm0 88933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 16] 89033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 89133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 89233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 89333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 89433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 89533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 89633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RGBAToYRow_Unaligned_SSSE3(const uint8* src_argb, uint8* dst_y, int pix) { 89733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 89833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] /* src_argb */ 89933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] /* dst_y */ 90033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] /* pix */ 90133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kAddY16 90233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, kRGBAToY 90333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 90433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 90533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 90633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm0, [eax] 90733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm1, [eax + 16] 90833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm2, [eax + 32] 90933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm3, [eax + 48] 91033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 
pmaddubsw xmm0, xmm4 91133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm1, xmm4 91233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm2, xmm4 91333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm3, xmm4 91433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 64] 91533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm0, xmm1 91633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm2, xmm3 91733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm0, 7 91833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm2, 7 91933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm2 92033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddb xmm0, xmm5 92133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 92233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx], xmm0 92333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 16] 92433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 92533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 92633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 92733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 92833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 92933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 93033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 93133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_u, uint8* dst_v, int width) { 93233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 93333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 93433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 93533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // src_argb 
93633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // src_stride_argb 93733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 12] // dst_u 93833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 16] // dst_v 93933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // pix 94033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm7, kARGBToU 94133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm6, kARGBToV 94233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kAddUV128 94333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, edx // stride from u to v 94433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 94533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 94633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 94733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp /* step 1 - subsample 16x2 argb pixels to 8x1 */ 94833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] 94933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax + 16] 95033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, [eax + 32] 95133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, [eax + 48] 95233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm0, [eax + esi] 95333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm1, [eax + esi + 16] 95433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm2, [eax + esi + 32] 95533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm3, [eax + esi + 48] 95633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 64] 95733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, xmm0 95833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm0, xmm1, 0x88 
95933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm4, xmm1, 0xdd 96033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm0, xmm4 96133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, xmm2 9627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm2, xmm3, 0x88 9637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm4, xmm3, 0xdd 9647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm2, xmm4 9657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 9667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // step 2 - convert to U and V 9677cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // from here down is very similar to Y code except 9687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // instead of 16 different pixels, its 8 pixels of U and 8 of V 9697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm1, xmm0 9707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm3, xmm2 9717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm0, xmm7 // U 9727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm2, xmm7 9737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm1, xmm6 // V 9747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm3, xmm6 9757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm0, xmm2 9767cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm1, xmm3 9777cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psraw xmm0, 8 9787cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psraw xmm1, 8 9797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde packsswb xmm0, xmm1 9807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde paddb xmm0, xmm5 // -> unsigned 9817cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 9827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // step 3 - store 8 U and 8 V values 98333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 9847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movlps 
qword ptr [edx], xmm0 // U 9857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movhps qword ptr [edx + edi], xmm0 // V 9867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea edx, [edx + 8] 98733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 98833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 98933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 99033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 99133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 99233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 99333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 99433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 99533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 99633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, 99733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_u, uint8* dst_v, int width) { 99833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 99933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 100033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 100133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // src_argb 100233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // src_stride_argb 100333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 12] // dst_u 100433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 16] // dst_v 100533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // pix 100633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm7, kARGBToU 100733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm6, kARGBToV 100833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kAddUV128 
100933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, edx // stride from u to v 101033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 101133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 101233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 101333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp /* step 1 - subsample 16x2 argb pixels to 8x1 */ 101433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm0, [eax] 101533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm1, [eax + 16] 101633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm2, [eax + 32] 101733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm3, [eax + 48] 101833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm4, [eax + esi] 101933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm0, xmm4 102033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm4, [eax + esi + 16] 102133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm1, xmm4 102233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm4, [eax + esi + 32] 102333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm2, xmm4 102433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm4, [eax + esi + 48] 102533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm3, xmm4 102633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 64] 102733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, xmm0 102833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm0, xmm1, 0x88 102933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm4, xmm1, 0xdd 103033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm0, xmm4 103133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, xmm2 103233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm2, xmm3, 0x88 
103333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm4, xmm3, 0xdd 103433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm2, xmm4 103533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 103633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // step 2 - convert to U and V 103733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // from here down is very similar to Y code except 103833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // instead of 16 different pixels, its 8 pixels of U and 8 of V 103933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 104033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, xmm2 104133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm0, xmm7 // U 104233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm2, xmm7 104333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm1, xmm6 // V 104433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm3, xmm6 104533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm0, xmm2 104633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm1, xmm3 104733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psraw xmm0, 8 104833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psraw xmm1, 8 104933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packsswb xmm0, xmm1 105033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddb xmm0, xmm5 // -> unsigned 105133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 105233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // step 3 - store 8 U and 8 V values 10537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde sub ecx, 16 105433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movlps qword ptr [edx], xmm0 // U 105533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movhps qword ptr [edx + edi], xmm0 // V 
105633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 8] 105733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 105833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 10597cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pop edi 10607cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pop esi 10617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ret 10627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde } 10637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde} 10647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 106533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 10667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid BGRAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 10677cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde uint8* dst_u, uint8* dst_v, int width) { 10687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm { 10697cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde push esi 10707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde push edi 10717cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov eax, [esp + 8 + 4] // src_argb 10727cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov esi, [esp + 8 + 8] // src_stride_argb 10737cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov edx, [esp + 8 + 12] // dst_u 10747cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov edi, [esp + 8 + 16] // dst_v 10757cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov ecx, [esp + 8 + 20] // pix 107633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm7, kBGRAToU 107733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm6, kBGRAToV 107833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kAddUV128 10797cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde sub edi, edx // stride from u to v 10807cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 108133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 
108233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 10837cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde /* step 1 - subsample 16x2 argb pixels to 8x1 */ 10847cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm0, [eax] 10857cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm1, [eax + 16] 10867cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm2, [eax + 32] 10877cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm3, [eax + 48] 10887cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm0, [eax + esi] 10897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm1, [eax + esi + 16] 10907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm2, [eax + esi + 32] 10917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm3, [eax + esi + 48] 10927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea eax, [eax + 64] 10937cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm4, xmm0 10947cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm0, xmm1, 0x88 10957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm4, xmm1, 0xdd 10967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm0, xmm4 10977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm4, xmm2 10987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm2, xmm3, 0x88 10997cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm4, xmm3, 0xdd 11007cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm2, xmm4 11017cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 11027cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // step 2 - convert to U and V 11037cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // from here down is very similar to Y code except 11047cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // instead of 16 different pixels, its 8 pixels of U and 8 of V 11057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm1, xmm0 11067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 
movdqa xmm3, xmm2 11077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm0, xmm7 // U 11087cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm2, xmm7 11097cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm1, xmm6 // V 11107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm3, xmm6 11117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm0, xmm2 11127cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm1, xmm3 11137cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psraw xmm0, 8 11147cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psraw xmm1, 8 11157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde packsswb xmm0, xmm1 11167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde paddb xmm0, xmm5 // -> unsigned 11177cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 11187cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // step 3 - store 8 U and 8 V values 111933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 11207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movlps qword ptr [edx], xmm0 // U 11217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movhps qword ptr [edx + edi], xmm0 // V 11227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea edx, [edx + 8] 112333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 112433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 112533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 112633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 112733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 112833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 112933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 113033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 113133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 113233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid BGRAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, 
int src_stride_argb, 113333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_u, uint8* dst_v, int width) { 113433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 113533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 113633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 113733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // src_argb 113833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // src_stride_argb 113933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 12] // dst_u 114033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 16] // dst_v 114133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // pix 114233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm7, kBGRAToU 114333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm6, kBGRAToV 114433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kAddUV128 114533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, edx // stride from u to v 114633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 114733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 114833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 114933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp /* step 1 - subsample 16x2 argb pixels to 8x1 */ 115033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm0, [eax] 115133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm1, [eax + 16] 115233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm2, [eax + 32] 115333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm3, [eax + 48] 115433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm4, [eax + esi] 115533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm0, xmm4 
115633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm4, [eax + esi + 16] 115733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm1, xmm4 115833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm4, [eax + esi + 32] 115933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm2, xmm4 116033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm4, [eax + esi + 48] 116133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm3, xmm4 116233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 64] 116333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, xmm0 116433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm0, xmm1, 0x88 116533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm4, xmm1, 0xdd 116633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm0, xmm4 116733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, xmm2 116833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm2, xmm3, 0x88 116933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm4, xmm3, 0xdd 117033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm2, xmm4 117133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 117233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // step 2 - convert to U and V 117333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // from here down is very similar to Y code except 117433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // instead of 16 different pixels, its 8 pixels of U and 8 of V 117533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 117633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, xmm2 117733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm0, xmm7 // U 117833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm2, xmm7 
117933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm1, xmm6 // V 118033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm3, xmm6 118133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm0, xmm2 118233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm1, xmm3 118333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psraw xmm0, 8 118433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psraw xmm1, 8 118533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packsswb xmm0, xmm1 118633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddb xmm0, xmm5 // -> unsigned 118733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 118833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // step 3 - store 8 U and 8 V values 11897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde sub ecx, 16 119033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movlps qword ptr [edx], xmm0 // U 119133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movhps qword ptr [edx + edi], xmm0 // V 119233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 8] 119333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 119433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 11957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pop edi 11967cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pop esi 11977cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ret 11987cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde } 11997cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde} 12007cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 120133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 12027cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Bordevoid ABGRToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 12037cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde uint8* dst_u, uint8* dst_v, int width) { 
12047cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm { 12057cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde push esi 12067cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde push edi 12077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov eax, [esp + 8 + 4] // src_argb 12087cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov esi, [esp + 8 + 8] // src_stride_argb 12097cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov edx, [esp + 8 + 12] // dst_u 12107cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov edi, [esp + 8 + 16] // dst_v 12117cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde mov ecx, [esp + 8 + 20] // pix 121233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm7, kABGRToU 121333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm6, kABGRToV 121433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kAddUV128 12157cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde sub edi, edx // stride from u to v 12167cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 121733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 121833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 12197cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde /* step 1 - subsample 16x2 argb pixels to 8x1 */ 12207cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm0, [eax] 12217cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm1, [eax + 16] 12227cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm2, [eax + 32] 12237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm3, [eax + 48] 12247cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm0, [eax + esi] 12257cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm1, [eax + esi + 16] 12267cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm2, [eax + esi + 32] 12277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm3, [eax + esi + 48] 12287cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea eax, [eax + 64] 
12297cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm4, xmm0 12307cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm0, xmm1, 0x88 12317cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm4, xmm1, 0xdd 12327cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm0, xmm4 12337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm4, xmm2 12347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm2, xmm3, 0x88 12357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde shufps xmm4, xmm3, 0xdd 12367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pavgb xmm2, xmm4 12377cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 12387cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // step 2 - convert to U and V 12397cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // from here down is very similar to Y code except 12407cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // instead of 16 different pixels, its 8 pixels of U and 8 of V 12417cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm1, xmm0 12427cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movdqa xmm3, xmm2 12437cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm0, xmm7 // U 12447cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm2, xmm7 12457cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm1, xmm6 // V 12467cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pmaddubsw xmm3, xmm6 12477cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm0, xmm2 12487cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde phaddw xmm1, xmm3 12497cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psraw xmm0, 8 12507cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde psraw xmm1, 8 12517cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde packsswb xmm0, xmm1 12527cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde paddb xmm0, xmm5 // -> unsigned 12537cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 12547cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde // step 3 - 
store 8 U and 8 V values 125533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 12567cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movlps qword ptr [edx], xmm0 // U 12577cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde movhps qword ptr [edx + edi], xmm0 // V 12587cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea edx, [edx + 8] 125933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 126033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 12617cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pop edi 12627cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde pop esi 12637cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ret 12647cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde } 12657cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde} 12667cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 126733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 126833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ABGRToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, 126933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_u, uint8* dst_v, int width) { 12707cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde__asm { 127133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 127233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 127333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // src_argb 127433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // src_stride_argb 127533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 12] // dst_u 127633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 16] // dst_v 127733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // pix 127833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm7, kABGRToU 
127933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm6, kABGRToV 128033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kAddUV128 128133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, edx // stride from u to v 12827cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 128333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 128433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 128533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp /* step 1 - subsample 16x2 argb pixels to 8x1 */ 128633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm0, [eax] 128733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm1, [eax + 16] 128833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm2, [eax + 32] 128933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm3, [eax + 48] 129033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm4, [eax + esi] 129133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm0, xmm4 129233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm4, [eax + esi + 16] 129333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm1, xmm4 129433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm4, [eax + esi + 32] 129533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm2, xmm4 129633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm4, [eax + esi + 48] 129733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm3, xmm4 129833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 64] 129933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, xmm0 130033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm0, xmm1, 0x88 130133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm4, xmm1, 0xdd 130233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm0, xmm4 
130333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, xmm2 130433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm2, xmm3, 0x88 130533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm4, xmm3, 0xdd 130633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm2, xmm4 13077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 130833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // step 2 - convert to U and V 130933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // from here down is very similar to Y code except 131033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // instead of 16 different pixels, its 8 pixels of U and 8 of V 131133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 131233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, xmm2 131333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm0, xmm7 // U 131433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm2, xmm7 131533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm1, xmm6 // V 131633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm3, xmm6 131733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm0, xmm2 131833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm1, xmm3 131933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psraw xmm0, 8 132033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psraw xmm1, 8 132133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packsswb xmm0, xmm1 132233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddb xmm0, xmm5 // -> unsigned 13237cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 132433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // step 3 - store 8 U and 8 V values 132533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 132633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movlps qword ptr [edx], 
xmm0 // U 132733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movhps qword ptr [edx + edi], xmm0 // V 132833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 8] 132933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 133033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 133133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 133233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 13337cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ret 13347cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde } 13357cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde} 13367cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 133733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 133833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RGBAToUVRow_SSSE3(const uint8* src_argb0, int src_stride_argb, 133933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_u, uint8* dst_v, int width) { 134033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 134133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 134233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 134333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // src_argb 134433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // src_stride_argb 134533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 12] // dst_u 134633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 16] // dst_v 134733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // pix 134833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm7, kRGBAToU 134933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm6, kRGBAToV 135033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kAddUV128 
135133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, edx // stride from u to v 135233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 135333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 135433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 135533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp /* step 1 - subsample 16x2 argb pixels to 8x1 */ 135633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] 135733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax + 16] 135833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, [eax + 32] 135933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, [eax + 48] 136033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm0, [eax + esi] 136133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm1, [eax + esi + 16] 136233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm2, [eax + esi + 32] 136333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm3, [eax + esi + 48] 136433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 64] 136533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, xmm0 136633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm0, xmm1, 0x88 136733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm4, xmm1, 0xdd 136833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm0, xmm4 136933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, xmm2 137033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm2, xmm3, 0x88 137133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm4, xmm3, 0xdd 137233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm2, xmm4 137333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 137433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // step 2 - convert to U and V 
137533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // from here down is very similar to Y code except 137633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // instead of 16 different pixels, its 8 pixels of U and 8 of V 137733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 137833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, xmm2 137933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm0, xmm7 // U 138033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm2, xmm7 138133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm1, xmm6 // V 138233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm3, xmm6 138333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm0, xmm2 138433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm1, xmm3 138533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psraw xmm0, 8 138633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psraw xmm1, 8 138733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packsswb xmm0, xmm1 138833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddb xmm0, xmm5 // -> unsigned 138933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 139033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // step 3 - store 8 U and 8 V values 139133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 139233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movlps qword ptr [edx], xmm0 // U 139333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movhps qword ptr [edx + edi], xmm0 // V 139433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 8] 139533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 139633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 139733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 139833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 
139933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 140033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 140133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 140233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 140333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 140433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb, 140533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_u, uint8* dst_v, int width) { 140633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 140733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 140833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 140933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // src_argb 141033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // src_stride_argb 141133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 12] // dst_u 141233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 16] // dst_v 141333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // pix 141433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm7, kRGBAToU 141533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm6, kRGBAToV 141633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kAddUV128 141733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, edx // stride from u to v 141833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 141933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 142033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 142133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp /* step 1 - subsample 16x2 argb pixels to 8x1 */ 142233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 
movdqu xmm0, [eax] 142333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm1, [eax + 16] 142433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm2, [eax + 32] 142533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm3, [eax + 48] 142633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm4, [eax + esi] 142733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm0, xmm4 142833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm4, [eax + esi + 16] 142933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm1, xmm4 143033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm4, [eax + esi + 32] 143133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm2, xmm4 143233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm4, [eax + esi + 48] 143333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm3, xmm4 143433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 64] 143533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, xmm0 143633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm0, xmm1, 0x88 143733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm4, xmm1, 0xdd 143833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm0, xmm4 143933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, xmm2 144033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm2, xmm3, 0x88 144133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shufps xmm4, xmm3, 0xdd 144233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm2, xmm4 144333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 144433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // step 2 - convert to U and V 144533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // from here down is very similar to Y code except 144633cfdeb7b267ab635413797fffb046b73272f7ecHendrik 
Dahlkamp // instead of 16 different pixels, its 8 pixels of U and 8 of V 144733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 144833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, xmm2 144933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm0, xmm7 // U 145033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm2, xmm7 145133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm1, xmm6 // V 145233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm3, xmm6 145333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm0, xmm2 145433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm1, xmm3 145533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psraw xmm0, 8 145633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psraw xmm1, 8 145733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packsswb xmm0, xmm1 145833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddb xmm0, xmm5 // -> unsigned 145933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 146033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // step 3 - store 8 U and 8 V values 146133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 146233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movlps qword ptr [edx], xmm0 // U 146333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movhps qword ptr [edx + edi], xmm0 // V 146433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 8] 146533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 146633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 146733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 146833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 146933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 147033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 
147133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 147233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_ARGBTOYROW_SSSE3 147333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 147433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_I422TOARGBROW_SSSE3 147533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 147633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define YG 74 /* static_cast<int8>(1.164 * 64 + 0.5) */ 147733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 147833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define UB 127 /* min(63,static_cast<int8>(2.018 * 64)) */ 147933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define UG -25 /* static_cast<int8>(-0.391 * 64 - 0.5) */ 148033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define UR 0 148133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 148233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define VB 0 148333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define VG -52 /* static_cast<int8>(-0.813 * 64 - 0.5) */ 148433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define VR 102 /* static_cast<int8>(1.596 * 64 + 0.5) */ 148533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 148633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Bias 148733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define BB UB * 128 + VB * 128 148833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define BG UG * 128 + VG * 128 148933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#define BR UR * 128 + VR * 128 149033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 149133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kUVToB = { 149233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB 149333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 
149433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 149533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kUVToR = { 149633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR 149733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 149833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 149933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kUVToG = { 150033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG 150133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 150233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 150333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kVUToB = { 150433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, 150533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 150633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 150733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kVUToR = { 150833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, 150933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 151033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 151133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kVUToG = { 151233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, 151333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 151433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 151533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG }; 151633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 }; 
// Per-channel bias words (BB/BG/BR repeated 8 times). The conversion macros
// subtract these from the pmaddubsw products ("unbias back to signed").
static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };

// TODO(fbarchard): NV12/NV21 fetch UV and use directly.
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.

// Read 8 UV from 444.
// In:  esi = U pointer, edi = offset from the U pointer to the V pointer.
// Out: xmm0 = 8 interleaved UV byte pairs; esi advanced by 8. Clobbers xmm1.
#define READYUV444 __asm {                                                     \
    __asm movq       xmm0, qword ptr [esi] /* U */                /* NOLINT */ \
    __asm movq       xmm1, qword ptr [esi + edi] /* V */          /* NOLINT */ \
    __asm lea        esi,  [esi + 8]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
  }

// Read 4 UV from 422, upsample to 8 UV.
// In:  esi = U pointer, edi = offset from the U pointer to the V pointer.
// Out: xmm0 = 8 UV byte pairs (each of the 4 pairs read appears twice);
//      esi advanced by 4. Clobbers xmm1.
#define READYUV422 __asm {                                                     \
    __asm movd       xmm0, [esi]          /* U */                              \
    __asm movd       xmm1, [esi + edi]    /* V */                              \
    __asm lea        esi,  [esi + 4]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
  }

// Read 2 UV from 411, upsample to 8 UV.
// In:  esi = U pointer, edi = offset from the U pointer to the V pointer.
// Out: xmm0 = 8 UV byte pairs (each of the 2 pairs used appears four times);
//      esi advanced by 2. Clobbers xmm1.
// NOTE(review): each movd loads 4 bytes although only 2 are consumed per
// iteration, so the last iteration may read up to 2 bytes past the end of
// the U and V rows - confirm the buffers tolerate this.
#define READYUV411 __asm {                                                     \
    __asm movd       xmm0, [esi]          /* U */                              \
    __asm movd       xmm1, [esi + edi]    /* V */                              \
    __asm lea        esi,  [esi + 2]                                           \
    __asm punpcklbw  xmm0, xmm1           /* UV */                             \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
    __asm punpckldq  xmm0, xmm0           /* UVUV (upsample) */                \
  }

// Read 4 UV from NV12, upsample to 8 UV.
// In:  esi = pointer to interleaved chroma byte pairs.
// Out: xmm0 = 8 byte pairs (each of the 4 pairs read appears twice);
//      esi advanced by 8.
#define READNV12 __asm {                                                       \
    __asm movq       xmm0, qword ptr [esi] /* UV */               /* NOLINT */ \
    __asm lea        esi,  [esi + 8]                                           \
    __asm punpcklwd  xmm0, xmm0           /* UVUV (upsample) */                \
  }

// Convert 8 pixels: 8 UV and 8 Y.
// In:  xmm0 = 8 interleaved UV byte pairs, eax = Y pointer, and xmm4 = zero
//      (it is the high-byte source when Y is widened to words; the macro
//      does not clear it itself).
// Out: xmm0 = B, xmm1 = G, xmm2 = R as unsigned saturated bytes; each result
//      is packed against itself, so the 8 bytes appear in both halves.
//      eax advanced by 8. Clobbers xmm3.
// Arithmetic is fixed point with 6 fractional bits (final psraw 6).
#define YUVTORGB __asm {                                                       \
    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
    __asm movdqa     xmm1, xmm0                                                \
    __asm movdqa     xmm2, xmm0                                                \
    __asm pmaddubsw  xmm0, kUVToB        /* scale B UV */                      \
    __asm pmaddubsw  xmm1, kUVToG        /* scale G UV */                      \
    __asm pmaddubsw  xmm2, kUVToR        /* scale R UV */                      \
    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
    __asm psubw      xmm1, kUVBiasG                                            \
    __asm psubw      xmm2, kUVBiasR                                            \
    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
    __asm lea        eax, [eax + 8]                                            \
    __asm punpcklbw  xmm3, xmm4          /* widen Y bytes to words */          \
    __asm psubsw     xmm3, kYSub16                                             \
    __asm pmullw     xmm3, kYToRgb                                             \
    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
    __asm psraw      xmm0, 6                                                   \
    __asm psraw      xmm1, 6                                                   \
    __asm psraw      xmm2, 6                                                   \
    __asm packuswb   xmm0, xmm0           /* B */                              \
    __asm packuswb   xmm1, xmm1           /* G */                              \
    __asm packuswb   xmm2, xmm2           /* R */                              \
  }

// Convert 8 pixels: 8 VU and 8 Y.
// Same sequence as YUVTORGB, but multiplies by the kVUTo* tables, whose
// coefficients are stored in V,U order; the bias tables are shared.
// In:  xmm0 = 8 interleaved VU byte pairs, eax = Y pointer, and xmm4 = zero
//      (the macro does not clear it itself).
// Out: xmm0 = B, xmm1 = G, xmm2 = R as unsigned saturated bytes;
//      eax advanced by 8. Clobbers xmm3.
#define YVUTORGB __asm {                                                       \
    /* Step 1: Find 4 UV contributions to 8 R,G,B values */                    \
    __asm movdqa     xmm1, xmm0                                                \
    __asm movdqa     xmm2, xmm0                                                \
    __asm pmaddubsw  xmm0, kVUToB        /* scale B UV */                      \
    __asm pmaddubsw  xmm1, kVUToG        /* scale G UV */                      \
    __asm pmaddubsw  xmm2, kVUToR        /* scale R UV */                      \
    __asm psubw      xmm0, kUVBiasB      /* unbias back to signed */           \
    __asm psubw      xmm1, kUVBiasG                                            \
    __asm psubw      xmm2, kUVBiasR                                            \
    /* Step 2: Find Y contribution to 8 R,G,B values */                        \
    __asm movq       xmm3, qword ptr [eax]                        /* NOLINT */ \
    __asm lea        eax, [eax + 8]                                            \
    __asm punpcklbw  xmm3, xmm4          /* widen Y bytes to words */          \
    __asm psubsw     xmm3, kYSub16                                             \
    __asm pmullw     xmm3, kYToRgb                                             \
    __asm paddsw     xmm0, xmm3           /* B += Y */                         \
    __asm paddsw     xmm1, xmm3           /* G += Y */                         \
    __asm paddsw     xmm2, xmm3           /* R += Y */                         \
    __asm psraw      xmm0, 6                                                   \
    __asm psraw      xmm1, 6                                                   \
    __asm psraw      xmm2, 6                                                   \
    __asm packuswb   xmm0, xmm0           /* B */                              \
    __asm packuswb   xmm1, xmm1           /* G */                              \
    __asm packuswb   xmm2, xmm2           /* R */                              \
  }

// 8 pixels, dest aligned 16.
// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes).
// The loop body runs before the count is tested and consumes width in steps
// of 8 pixels, so at least one full iteration is always executed.
__declspec(naked) __declspec(align(16))
void I444ToARGBRow_SSSE3(const uint8* y_buf,
                         const uint8* u_buf,
                         const uint8* v_buf,
                         uint8* argb_buf,
                         int width) {
  __asm {
    // Naked function: arguments are fetched relative to esp; the extra 8
    // accounts for the two registers pushed below.
    push       esi
    push       edi
    mov        eax, [esp + 8 + 4]   // Y
    mov        esi, [esp + 8 + 8]   // U
    mov        edi, [esp + 8 + 12]  // V
    mov        edx, [esp + 8 + 16]  // argb
    mov        ecx, [esp + 8 + 20]  // width
    sub        edi, esi             // offset from U to V, as READYUV444 expects
    pcmpeqb    xmm5, xmm5           // generate 0xffffffff for alpha
    pxor       xmm4, xmm4           // zero register used by YUVTORGB to widen Y

    align      16
 convertloop:
    READYUV444
    YUVTORGB

    // Step 3: Weave into ARGB
    punpcklbw  xmm0, xmm1           // BG
    punpcklbw  xmm2, xmm5           // RA
    movdqa     xmm1, xmm0
    punpcklwd  xmm0, xmm2           // BGRA first 4 pixels
    punpckhwd  xmm1, xmm2           // BGRA next 4 pixels
    movdqa     [edx], xmm0          // aligned stores: argb must be 16-byte aligned
    movdqa     [edx + 16], xmm1
    lea        edx,  [edx + 32]
    sub        ecx, 8
    jg         convertloop

    pop        edi
    pop        esi
    ret
  }
Dahlkamp} 165633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 165733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 pixels, dest aligned 16. 165833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 165933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 166033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I422ToARGBRow_SSSE3(const uint8* y_buf, 166133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* u_buf, 166233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* v_buf, 166333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* argb_buf, 166433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width) { 166533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 166633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 166733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 166833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // Y 166933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // U 167033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 12] // V 167133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 16] // argb 167233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // width 167333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, esi 167433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 167533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm4, xmm4 167633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 167733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 167833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 
167933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp READYUV422 168033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp YUVTORGB 168133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 168233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Step 3: Weave into ARGB 168333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm1 // BG 168433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm2, xmm5 // RA 168533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 168633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm0, xmm2 // BGRA first 4 pixels 168733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm1, xmm2 // BGRA next 4 pixels 168833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm0 168933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 16], xmm1 169033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 32] 169133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 169233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 169333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 169433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 169533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 169633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 169733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 169833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 169933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 170033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 pixels, dest aligned 16. 170133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 170233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Similar to I420 but duplicate UV once more. 
170333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 170433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I411ToARGBRow_SSSE3(const uint8* y_buf, 170533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* u_buf, 170633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* v_buf, 170733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* argb_buf, 170833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width) { 170933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 171033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 171133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 171233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // Y 171333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // U 171433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 12] // V 171533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 16] // argb 171633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // width 171733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, esi 171833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 171933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm4, xmm4 172033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 172133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 172233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 172333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp READYUV411 172433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp YUVTORGB 172533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 172633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Step 3: Weave into ARGB 172733cfdeb7b267ab635413797fffb046b73272f7ecHendrik 
Dahlkamp punpcklbw xmm0, xmm1 // BG 172833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm2, xmm5 // RA 172933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 173033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm0, xmm2 // BGRA first 4 pixels 173133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm1, xmm2 // BGRA next 4 pixels 173233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm0 173333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 16], xmm1 173433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 32] 173533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 173633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 173733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 173833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 173933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 174033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 174133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 174233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 174333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 174433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 pixels, dest aligned 16. 174533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
174633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 174733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid NV12ToARGBRow_SSSE3(const uint8* y_buf, 174833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* uv_buf, 174933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* argb_buf, 175033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width) { 175133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 175233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 175333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4 + 4] // Y 175433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 4 + 8] // UV 175533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 4 + 12] // argb 175633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 4 + 16] // width 175733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 175833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm4, xmm4 175933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 176033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 176133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 176233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp READNV12 176333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp YUVTORGB 176433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 176533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Step 3: Weave into ARGB 176633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm1 // BG 176733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm2, xmm5 // RA 176833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 176933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm0, xmm2 // BGRA first 4 pixels 
177033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm1, xmm2 // BGRA next 4 pixels 177133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm0 177233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 16], xmm1 177333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 32] 177433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 177533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 177633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 177733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 177833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 177933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 178033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 178133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 178233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 pixels, dest aligned 16. 178333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
178433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 178533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid NV21ToARGBRow_SSSE3(const uint8* y_buf, 178633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* uv_buf, 178733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* argb_buf, 178833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width) { 178933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 179033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 179133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4 + 4] // Y 179233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 4 + 8] // VU 179333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 4 + 12] // argb 179433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 4 + 16] // width 179533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 179633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm4, xmm4 179733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 179833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 179933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 180033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp READNV12 180133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp YVUTORGB 180233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 180333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Step 3: Weave into ARGB 180433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm1 // BG 180533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm2, xmm5 // RA 180633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 180733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm0, xmm2 // BGRA first 4 pixels 
180833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm1, xmm2 // BGRA next 4 pixels 180933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm0 181033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 16], xmm1 181133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 32] 181233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 181333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 181433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 181533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 181633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 181733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 181833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 181933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 182033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 pixels, unaligned. 182133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 UV values, mixed with 8 Y producing 8 ARGB (32 bytes). 
182233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 182333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I444ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 182433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* u_buf, 182533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* v_buf, 182633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* argb_buf, 182733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width) { 182833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 182933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 183033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 183133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // Y 183233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // U 183333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 12] // V 183433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 16] // argb 183533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // width 183633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, esi 183733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 183833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm4, xmm4 183933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 184033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 184133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 184233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp READYUV444 184333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp YUVTORGB 184433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 184533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Step 3: Weave into ARGB 
184633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm1 // BG 184733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm2, xmm5 // RA 184833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 184933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm0, xmm2 // BGRA first 4 pixels 185033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm1, xmm2 // BGRA next 4 pixels 185133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx], xmm0 185233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx + 16], xmm1 185333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 32] 185433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 185533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 185633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 185733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 185833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 185933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 186033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 186133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 186233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 186333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 pixels, unaligned. 186433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
186533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 186633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 186733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* u_buf, 186833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* v_buf, 186933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* argb_buf, 187033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width) { 187133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 187233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 187333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 187433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // Y 187533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // U 187633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 12] // V 187733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 16] // argb 187833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // width 187933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, esi 188033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 188133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm4, xmm4 188233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 188333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 188433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 188533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp READYUV422 188633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp YUVTORGB 188733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 188833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Step 3: Weave into ARGB 
188933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm1 // BG 189033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm2, xmm5 // RA 189133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 189233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm0, xmm2 // BGRA first 4 pixels 189333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm1, xmm2 // BGRA next 4 pixels 189433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx], xmm0 189533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx + 16], xmm1 189633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 32] 189733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 189833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 189933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 190033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 190133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 190233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 190333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 190433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 190533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 190633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 pixels, unaligned. 190733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 2 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 190833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Similar to I420 but duplicate UV once more. 
190933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 191033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I411ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 191133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* u_buf, 191233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* v_buf, 191333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* argb_buf, 191433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width) { 191533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 191633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 191733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 191833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // Y 191933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // U 192033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 12] // V 192133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 16] // argb 192233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // width 192333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, esi 192433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 192533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm4, xmm4 192633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 192733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 192833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 192933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp READYUV411 193033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp YUVTORGB 193133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 193233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Step 3: Weave into ARGB 
193333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm1 // BG 193433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm2, xmm5 // RA 193533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 193633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm0, xmm2 // BGRA first 4 pixels 193733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm1, xmm2 // BGRA next 4 pixels 193833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx], xmm0 193933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx + 16], xmm1 194033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 32] 194133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 194233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 194333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 194433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 194533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 194633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 194733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 194833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 194933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 195033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 195133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 pixels, dest aligned 16. 195233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
195333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 195433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid NV12ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 195533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* uv_buf, 195633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* argb_buf, 195733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width) { 195833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 195933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 196033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4 + 4] // Y 196133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 4 + 8] // UV 196233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 4 + 12] // argb 196333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 4 + 16] // width 196433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 196533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm4, xmm4 196633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 196733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 196833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 196933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp READNV12 197033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp YUVTORGB 197133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 197233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Step 3: Weave into ARGB 197333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm1 // BG 197433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm2, xmm5 // RA 197533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 197633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm0, xmm2 // BGRA first 4 pixels 
197733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm1, xmm2 // BGRA next 4 pixels 197833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx], xmm0 197933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx + 16], xmm1 198033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 32] 198133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 198233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 198333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 198433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 198533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 198633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 198733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 198833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 198933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 8 pixels, dest aligned 16. 199033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 4 UV values upsampled to 8 UV, mixed with 8 Y producing 8 ARGB (32 bytes). 
199133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 199233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid NV21ToARGBRow_Unaligned_SSSE3(const uint8* y_buf, 199333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* uv_buf, 199433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* argb_buf, 199533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width) { 199633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 199733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 199833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4 + 4] // Y 199933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 4 + 8] // VU 200033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 4 + 12] // argb 200133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 4 + 16] // width 200233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 200333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm4, xmm4 200433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 200533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 200633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 200733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp READNV12 200833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp YVUTORGB 200933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 201033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Step 3: Weave into ARGB 201133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm1 // BG 201233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm2, xmm5 // RA 201333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 201433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm0, xmm2 // BGRA first 4 pixels 
201533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm1, xmm2 // BGRA next 4 pixels 201633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx], xmm0 201733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx + 16], xmm1 201833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 32] 201933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 202033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 202133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 202233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 202333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 202433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 202533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 202633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 202733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 202833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I422ToBGRARow_SSSE3(const uint8* y_buf, 202933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* u_buf, 203033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* v_buf, 203133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* bgra_buf, 203233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width) { 203333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 203433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 203533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 203633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // Y 203733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // U 203833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 12] // V 203933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 16] // bgra 
204033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // width 204133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, esi 204233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm4, xmm4 204333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 204433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 204533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 204633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp READYUV422 204733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp YUVTORGB 204833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 204933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Step 3: Weave into BGRA 205033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 205133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm1, xmm0 // GB 205233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm5, xmm2 // AR 205333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, xmm5 205433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm5, xmm1 // BGRA first 4 pixels 205533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm0, xmm1 // BGRA next 4 pixels 205633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm5 205733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 16], xmm0 205833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 32] 205933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 206033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 206133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 206233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 206333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 206433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 
206533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 206633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 206733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 206833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 206933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I422ToBGRARow_Unaligned_SSSE3(const uint8* y_buf, 207033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* u_buf, 207133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* v_buf, 207233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* bgra_buf, 207333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width) { 207433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 207533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 207633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 207733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // Y 207833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // U 207933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 12] // V 208033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 16] // bgra 208133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // width 208233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, esi 208333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm4, xmm4 208433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 208533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 208633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 208733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp READYUV422 208833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp YUVTORGB 208933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 209033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 
// Step 3: Weave into BGRA 209133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 209233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm1, xmm0 // GB 209333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm5, xmm2 // AR 209433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, xmm5 209533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm5, xmm1 // BGRA first 4 pixels 209633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm0, xmm1 // BGRA next 4 pixels 209733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx], xmm5 209833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx + 16], xmm0 209933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 32] 210033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 210133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 210233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 210333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 210433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 210533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 210633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 210733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 210833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 210933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 211033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I422ToABGRRow_SSSE3(const uint8* y_buf, 211133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* u_buf, 211233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* v_buf, 211333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* abgr_buf, 211433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width) { 
211533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 211633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 211733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 211833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // Y 211933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // U 212033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 12] // V 212133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 16] // abgr 212233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // width 212333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, esi 212433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 212533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm4, xmm4 212633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 212733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 212833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 212933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp READYUV422 213033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp YUVTORGB 213133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 213233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Step 3: Weave into ARGB 213333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm2, xmm1 // RG 213433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm5 // BA 213533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm2 213633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm2, xmm0 // RGBA first 4 pixels 213733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm1, xmm0 // RGBA next 4 pixels 213833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm2 
213933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 16], xmm1 214033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 32] 214133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 214233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 214333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 214433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 214533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 214633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 214733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 214833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 214933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 215033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 215133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I422ToABGRRow_Unaligned_SSSE3(const uint8* y_buf, 215233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* u_buf, 215333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* v_buf, 215433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* abgr_buf, 215533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width) { 215633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 215733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 215833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 215933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // Y 216033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // U 216133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 12] // V 216233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 16] // abgr 216333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // width 
216433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, esi 216533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 216633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm4, xmm4 216733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 216833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 216933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 217033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp READYUV422 217133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp YUVTORGB 217233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 217333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Step 3: Weave into ARGB 217433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm2, xmm1 // RG 217533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm5 // BA 217633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm2 217733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm2, xmm0 // RGBA first 4 pixels 217833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm1, xmm0 // RGBA next 4 pixels 217933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx], xmm2 218033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx + 16], xmm1 218133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 32] 218233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 218333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 218433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 218533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 218633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 218733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 218833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 218933cfdeb7b267ab635413797fffb046b73272f7ecHendrik 
Dahlkamp} 219033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 219133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 219233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I422ToRGBARow_SSSE3(const uint8* y_buf, 219333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* u_buf, 219433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* v_buf, 219533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* rgba_buf, 219633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width) { 219733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 219833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 219933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 220033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // Y 220133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // U 220233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 12] // V 220333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 16] // rgba 220433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // width 220533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, esi 220633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm4, xmm4 220733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 220833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 220933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 221033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp READYUV422 221133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp YUVTORGB 221233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 221333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Step 3: Weave into RGBA 221433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate 
0xffffffff for alpha 221533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm1, xmm2 // GR 221633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm5, xmm0 // AB 221733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, xmm5 221833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm5, xmm1 // RGBA first 4 pixels 221933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm0, xmm1 // RGBA next 4 pixels 222033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm5 222133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 16], xmm0 222233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 32] 222333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 222433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 222533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 222633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 222733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 222833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 222933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 223033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 223133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 223233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 223333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid I422ToRGBARow_Unaligned_SSSE3(const uint8* y_buf, 223433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* u_buf, 223533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const uint8* v_buf, 223633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* rgba_buf, 223733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width) { 223833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 223933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 
224033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 224133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // Y 224233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // U 224333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 12] // V 224433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 16] // rgba 224533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // width 224633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, esi 224733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm4, xmm4 224833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 224933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 225033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 225133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp READYUV422 225233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp YUVTORGB 225333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 225433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Step 3: Weave into RGBA 225533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate 0xffffffff for alpha 225633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm1, xmm2 // GR 225733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm5, xmm0 // AB 225833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, xmm5 225933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm5, xmm1 // RGBA first 4 pixels 226033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm0, xmm1 // RGBA next 4 pixels 226133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx], xmm5 226233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx + 16], xmm0 226333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 32] 
226433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 226533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 226633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 226733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 226833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 226933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 227033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 227133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 227233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 227333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_I422TOARGBROW_SSSE3 227433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 227533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_YTOARGBROW_SSE2 227633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 227733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YToARGBRow_SSE2(const uint8* y_buf, 227833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* rgb_buf, 227933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width) { 228033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 228133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm4, xmm4 // generate mask 0xff000000 228233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslld xmm4, 24 228333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax,0x10001000 228433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm3,eax 228533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm3,xmm3,0 228633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax,0x012a012a 228733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm2,eax 228833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm2,xmm2,0 228933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // Y 
229033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // rgb 229133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // width 229233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 229333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 229433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 229533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Step 1: Scale Y contribution to 8 G values. G = (y - 16) * 1.164 229633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq xmm0, qword ptr [eax] 229733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 8] 229833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm0 // Y.Y 229933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psubusw xmm0, xmm3 230033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmulhuw xmm0, xmm2 230133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm0 // G 230233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 230333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // Step 2: Weave into ARGB 230433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm0 // GG 230533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 230633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm0, xmm0 // BGRA first 4 pixels 230733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm1, xmm1 // BGRA next 4 pixels 230833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm4 230933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm1, xmm4 231033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm0 231133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 16], xmm1 231233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 32] 231333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 
sub ecx, 8 231433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 231533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 231633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 231733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 231833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 231933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_YTOARGBROW_SSE2 232033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 232133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_MIRRORROW_SSSE3 232233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 232333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for reversing the bytes. 232433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleMirror = { 232533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 15u, 14u, 13u, 12u, 11u, 10u, 9u, 8u, 7u, 6u, 5u, 4u, 3u, 2u, 1u, 0u 232633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 232733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 232833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 232933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid MirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { 233033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 233133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src 233233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst 233333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // width 233433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kShuffleMirror 233533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax - 16] 233633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 233733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 233833cfdeb7b267ab635413797fffb046b73272f7ecHendrik 
Dahlkamp convertloop: 233933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax + ecx] 234033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm0, xmm5 234133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 234233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm0 234333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 16] 234433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 234533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 234633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 234733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 234833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_MIRRORROW_SSSE3 234933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 235033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_MIRRORROW_SSE2 235133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// SSE2 version has movdqu so it can be used on unaligned buffers when SSSE3 235233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// version can not. 
235333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 235433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid MirrorRow_SSE2(const uint8* src, uint8* dst, int width) { 235533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 235633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src 235733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst 235833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // width 235933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax - 16] 236033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 236133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 236233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 236333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm0, [eax + ecx] 236433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 // swap bytes 236533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psllw xmm0, 8 236633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 236733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm1 236833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshuflw xmm0, xmm0, 0x1b // swap words 236933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufhw xmm0, xmm0, 0x1b 237033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm0, xmm0, 0x4e // swap qwords 237133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 237233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx], xmm0 237333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 16] 237433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 237533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 237633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 
237733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 237833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_MIRRORROW_SSE2 237933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 238033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_MIRRORROW_UV_SSSE3 238133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for reversing the bytes of UV channels. 238233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleMirrorUV = { 238333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 14u, 12u, 10u, 8u, 6u, 4u, 2u, 0u, 15u, 13u, 11u, 9u, 7u, 5u, 3u, 1u 238433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 238533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 238633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 238733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid MirrorRowUV_SSSE3(const uint8* src, uint8* dst_u, uint8* dst_v, 238833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width) { 238933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 239033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 239133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4 + 4] // src 239233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 4 + 8] // dst_u 239333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 4 + 12] // dst_v 239433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 4 + 16] // width 239533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, kShuffleMirrorUV 239633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + ecx * 2 - 16] 239733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, edx 239833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 239933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 
240033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 240133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] 240233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax - 16] 240333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm0, xmm1 240433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 240533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movlpd qword ptr [edx], xmm0 240633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movhpd qword ptr [edx + edi], xmm0 24077cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde lea edx, [edx + 8] 240833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 240933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 241033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 241133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 241233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 241333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 241433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_MIRRORROW_UV_SSSE3 241533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 241633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBMIRRORROW_SSSE3 241733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 241833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for reversing the bytes. 
241933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kARGBShuffleMirror = { 242033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 12u, 13u, 14u, 15u, 8u, 9u, 10u, 11u, 4u, 5u, 6u, 7u, 0u, 1u, 2u, 3u 242133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 242233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 242333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 242433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBMirrorRow_SSSE3(const uint8* src, uint8* dst, int width) { 242533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__asm { 242633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src 242733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst 242833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // width 242933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kARGBShuffleMirror 243033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax - 16] 243133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 243233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 243333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 243433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax + ecx * 4] 243533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm0, xmm5 243633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 243733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm0 243833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 16] 243933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 244033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 244133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 244233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 
244333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_ARGBMIRRORROW_SSSE3 244433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 244533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_SPLITUV_SSE2 244633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 244733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid SplitUV_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) { 244833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 244933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 245033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4 + 4] // src_uv 245133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 4 + 8] // dst_u 245233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 4 + 12] // dst_v 245333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 4 + 16] // pix 245433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 245533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm5, 8 245633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, edx 245733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 245833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 245933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 246033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] 246133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax + 16] 246233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 32] 246333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, xmm0 246433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, xmm1 246533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm5 // even bytes 246633cfdeb7b267ab635413797fffb046b73272f7ecHendrik 
Dahlkamp pand xmm1, xmm5 246733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm1 246833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm2, 8 // odd bytes 246933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm3, 8 247033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm2, xmm3 247133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm0 247233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + edi], xmm2 247333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 16] 247433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 247533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 247633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 247733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 247833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 247933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 248033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 248133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_SPLITUV_SSE2 248233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 248333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_COPYROW_SSE2 248433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// CopyRow copys 'count' bytes using a 16 byte load/store, 32 bytes at time. 
248533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 248633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid CopyRow_SSE2(const uint8* src, uint8* dst, int count) { 248733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 248833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src 248933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst 249033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // count 249133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edx, eax 249233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 249333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 249433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 249533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] 249633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax + 16] 249733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax + edx], xmm0 249833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax + edx + 16], xmm1 249933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 32] 250033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 32 250133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 250233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 250333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 250433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 250533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_COPYROW_SSE2 250633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 250733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_COPYROW_X86 250833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 250933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid 
CopyRow_X86(const uint8* src, uint8* dst, int count) { 251033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 251133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, esi 251233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, edi 251333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 4] // src 251433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8] // dst 251533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // count 251633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shr ecx, 2 251733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp rep movsd 251833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, edx 251933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, eax 252033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 252133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 252233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 252333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_COPYROW_X86 252433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 252533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_SETROW_X86 252633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// SetRow8 writes 'count' bytes using a 32 bit value repeated. 
252733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 252833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid SetRow8_X86(uint8* dst, uint32 v32, int count) { 252933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 253033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, edi 253133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 4] // dst 253233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8] // v32 253333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // count 253433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shr ecx, 2 253533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp rep stosd 253633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, edx 253733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 253833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 253933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 254033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 254133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// SetRow32 writes 'count' words using a 32 bit value repeated. 
254233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 254333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid SetRows32_X86(uint8* dst, uint32 v32, int width, 254433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int dst_stride, int height) { 254533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 254633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 254733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 254833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push ebp 254933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 12 + 4] // dst 255033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 12 + 8] // v32 255133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ebp, [esp + 12 + 12] // width 255233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 12 + 16] // dst_stride 255333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 12 + 20] // height 255433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea ecx, [ebp * 4] 255533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edx, ecx // stride - width * 4 255633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 255733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 255833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 255933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, ebp 256033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp rep stosd 256133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp add edi, edx 256233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub esi, 1 256333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 256433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 256533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop ebp 256633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 
pop edi 256733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 256833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 256933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 257033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 257133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_SETROW_X86 257233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 257333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_YUY2TOYROW_SSE2 257433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 257533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YUY2ToYRow_SSE2(const uint8* src_yuy2, 257633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_y, int pix) { 257733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 257833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_yuy2 257933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_y 258033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // pix 258133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 258233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm5, 8 258333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 258433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 258533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 258633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] 258733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax + 16] 258833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 32] 258933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm5 // even bytes are Y 259033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm1, xmm5 259133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 
packuswb xmm0, xmm1 259233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 259333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm0 259433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 16] 259533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 259633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 259733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 259833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 259933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 260033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 260133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YUY2ToUVRow_SSE2(const uint8* src_yuy2, int stride_yuy2, 260233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_u, uint8* dst_v, int pix) { 260333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 260433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 260533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 260633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // src_yuy2 260733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // stride_yuy2 260833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 12] // dst_u 260933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 16] // dst_v 261033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // pix 261133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 261233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm5, 8 261333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, edx 261433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 261533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 
261633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 261733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] 261833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax + 16] 261933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, [eax + esi] 262033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, [eax + esi + 16] 262133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 32] 262233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm0, xmm2 262333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm1, xmm3 262433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm0, 8 // YUYV -> UVUV 262533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 262633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm1 262733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 262833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm5 // U 262933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm0 263033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 // V 263133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm1, xmm1 263233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq qword ptr [edx], xmm0 263333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq qword ptr [edx + edi], xmm1 263433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 8] 263533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 263633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 263733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 263833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 263933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 264033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 
264133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 264233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 264333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 264433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 264533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YUY2ToUV422Row_SSE2(const uint8* src_yuy2, 264633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_u, uint8* dst_v, int pix) { 264733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 264833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 264933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4 + 4] // src_yuy2 265033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 4 + 8] // dst_u 265133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 4 + 12] // dst_v 265233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 4 + 16] // pix 265333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 265433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm5, 8 265533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, edx 265633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 265733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 265833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 265933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] 266033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax + 16] 266133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 32] 266233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm0, 8 // YUYV -> UVUV 266333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 266433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm1 
266533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 266633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm5 // U 266733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm0 266833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 // V 266933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm1, xmm1 267033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq qword ptr [edx], xmm0 267133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq qword ptr [edx + edi], xmm1 267233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 8] 267333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 267433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 267533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 267633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 267733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 267833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 267933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 268033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 268133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 268233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YUY2ToYRow_Unaligned_SSE2(const uint8* src_yuy2, 268333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_y, int pix) { 268433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 268533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_yuy2 268633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_y 268733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // pix 268833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 268933cfdeb7b267ab635413797fffb046b73272f7ecHendrik 
Dahlkamp psrlw xmm5, 8 269033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 269133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 269233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 269333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm0, [eax] 269433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm1, [eax + 16] 269533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 32] 269633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm5 // even bytes are Y 269733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm1, xmm5 269833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm1 269933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 270033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx], xmm0 270133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 16] 270233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 270333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 270433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 270533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 270633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 270733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 270833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YUY2ToUVRow_Unaligned_SSE2(const uint8* src_yuy2, int stride_yuy2, 270933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_u, uint8* dst_v, int pix) { 271033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 271133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 271233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 271333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // src_yuy2 271433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, 
[esp + 8 + 8] // stride_yuy2 271533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 12] // dst_u 271633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 16] // dst_v 271733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // pix 271833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 271933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm5, 8 272033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, edx 272133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 272233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 272333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 272433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm0, [eax] 272533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm1, [eax + 16] 272633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm2, [eax + esi] 272733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm3, [eax + esi + 16] 272833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 32] 272933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm0, xmm2 273033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm1, xmm3 273133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm0, 8 // YUYV -> UVUV 273233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 273333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm1 273433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 273533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm5 // U 273633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm0 273733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 // V 273833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm1, xmm1 
273933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq qword ptr [edx], xmm0 274033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq qword ptr [edx + edi], xmm1 274133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 8] 274233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 274333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 274433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 274533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 274633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 274733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 274833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 274933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 275033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 275133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 275233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid YUY2ToUV422Row_Unaligned_SSE2(const uint8* src_yuy2, 275333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_u, uint8* dst_v, int pix) { 275433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 275533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 275633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4 + 4] // src_yuy2 275733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 4 + 8] // dst_u 275833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 4 + 12] // dst_v 275933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 4 + 16] // pix 276033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 276133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm5, 8 276233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, edx 
276333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 276433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 276533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 276633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm0, [eax] 276733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm1, [eax + 16] 276833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 32] 276933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm0, 8 // YUYV -> UVUV 277033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 277133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm1 277233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 277333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm5 // U 277433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm0 277533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 // V 277633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm1, xmm1 277733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq qword ptr [edx], xmm0 277833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq qword ptr [edx + edi], xmm1 277933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 8] 278033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 278133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 278233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 278333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 278433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 278533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 278633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 278733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 278833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 
278933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid UYVYToYRow_SSE2(const uint8* src_uyvy, 279033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_y, int pix) { 279133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 279233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_uyvy 279333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_y 279433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // pix 279533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 279633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 279733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 279833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] 279933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax + 16] 280033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 32] 280133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm0, 8 // odd bytes are Y 280233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 280333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm1 280433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 280533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm0 280633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 16] 280733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 280833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 280933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 281033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 281133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 281233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 281333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid UYVYToUVRow_SSE2(const uint8* src_uyvy, int 
stride_uyvy, 281433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_u, uint8* dst_v, int pix) { 281533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 281633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 281733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 281833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // src_yuy2 281933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // stride_yuy2 282033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 12] // dst_u 282133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 16] // dst_v 282233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // pix 282333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 282433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm5, 8 282533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, edx 282633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 282733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 282833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 282933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] 283033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax + 16] 283133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, [eax + esi] 283233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, [eax + esi + 16] 283333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 32] 283433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm0, xmm2 283533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm1, xmm3 283633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm5 // UYVY -> UVUV 283733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand 
xmm1, xmm5 283833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm1 283933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 284033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm5 // U 284133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm0 284233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 // V 284333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm1, xmm1 284433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq qword ptr [edx], xmm0 284533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq qword ptr [edx + edi], xmm1 284633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 8] 284733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 284833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 284933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 285033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 285133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 285233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 285333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 285433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 285533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 285633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 285733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid UYVYToUV422Row_SSE2(const uint8* src_uyvy, 285833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_u, uint8* dst_v, int pix) { 285933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 286033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 286133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4 + 4] // src_yuy2 286233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 4 + 8] // dst_u 
286333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 4 + 12] // dst_v 286433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 4 + 16] // pix 286533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 286633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm5, 8 286733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, edx 28687cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 286933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 287033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 287133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] 287233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax + 16] 287333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 32] 287433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm5 // UYVY -> UVUV 287533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm1, xmm5 287633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm1 287733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 287833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm5 // U 287933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm0 288033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 // V 288133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm1, xmm1 288233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq qword ptr [edx], xmm0 288333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq qword ptr [edx + edi], xmm1 288433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 8] 288533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 288633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 
288733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 288833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 28897cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde ret 28907cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde } 28917cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde} 28927cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 289333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 289433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid UYVYToYRow_Unaligned_SSE2(const uint8* src_uyvy, 289533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_y, int pix) { 289633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 289733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_uyvy 289833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_y 289933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // pix 290033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 290133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 290233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 290333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm0, [eax] 290433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm1, [eax + 16] 290533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 32] 290633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm0, 8 // odd bytes are Y 290733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 290833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm1 290933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 291033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx], xmm0 291133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 16] 291233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg 
convertloop 291333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 291433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 291533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 291633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 291733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 291833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid UYVYToUVRow_Unaligned_SSE2(const uint8* src_uyvy, int stride_uyvy, 291933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_u, uint8* dst_v, int pix) { 292033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 292133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 292233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 292333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // src_yuy2 292433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // stride_yuy2 292533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 12] // dst_u 292633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 16] // dst_v 292733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 20] // pix 292833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 292933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm5, 8 293033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, edx 293133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 293233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 293333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 293433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm0, [eax] 293533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm1, [eax + 16] 293633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm2, [eax + esi] 
293733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm3, [eax + esi + 16] 293833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 32] 293933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm0, xmm2 294033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm1, xmm3 294133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm5 // UYVY -> UVUV 294233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm1, xmm5 294333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm1 294433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 294533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm5 // U 294633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm0 294733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 // V 294833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm1, xmm1 294933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq qword ptr [edx], xmm0 295033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq qword ptr [edx + edi], xmm1 295133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 8] 295233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 295333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 295433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 295533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 295633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 295733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 295833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 295933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 296033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 296133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 296233cfdeb7b267ab635413797fffb046b73272f7ecHendrik 
Dahlkampvoid UYVYToUV422Row_Unaligned_SSE2(const uint8* src_uyvy, 296333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_u, uint8* dst_v, int pix) { 296433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 296533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 296633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4 + 4] // src_yuy2 296733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 4 + 8] // dst_u 296833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 4 + 12] // dst_v 296933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 4 + 16] // pix 297033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate mask 0x00ff00ff 297133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm5, 8 297233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, edx 297333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 297433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 297533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 297633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm0, [eax] 297733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm1, [eax + 16] 297833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 32] 297933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm5 // UYVY -> UVUV 298033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm1, xmm5 298133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm1 298233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 298333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm5 // U 298433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm0 298533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 // V 
298633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm1, xmm1 298733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq qword ptr [edx], xmm0 298833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq qword ptr [edx + edi], xmm1 298933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 8] 299033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 16 299133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 299233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 299333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 299433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 299533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 299633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 299733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_YUY2TOYROW_SSE2 299833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 299933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBBLENDROW_SSE2 300033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Blend 8 pixels at a time. 
300133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 300233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBBlendRow_SSE2(const uint8* src_argb0, const uint8* src_argb1, 300333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_argb, int width) { 300433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 300533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 300633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4 + 4] // src_argb0 300733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 4 + 8] // src_argb1 300833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 4 + 12] // dst_argb 300933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 4 + 16] // width 301033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm7, xmm7 // generate constant 1 301133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm7, 15 301233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff 301333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm6, 8 301433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 301533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psllw xmm5, 8 301633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm4, xmm4 // generate mask 0xff000000 301733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslld xmm4, 24 301833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 301933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 1 302033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp je convertloop1 // only 1 pixel? 
302133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jl convertloop1b 302233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 302333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // 1 pixel loop until destination pointer is aligned. 302433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp alignloop1: 302533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp test edx, 15 // aligned? 302633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp je alignloop1b 302733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm3, [eax] 302833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 4] 302933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, xmm3 // src argb 303033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm3, xmm4 // ~alpha 303133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm2, [esi] // _r_b 303233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm3, 8 // alpha 303333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufhw xmm3, xmm3,0F5h // 8 alpha words 303433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshuflw xmm3, xmm3,0F5h 303533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm2, xmm6 // _r_b 303633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddw xmm3, xmm7 // 256 - alpha 303733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmullw xmm2, xmm3 // _r_b * alpha 303833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm1, [esi] // _a_g 303933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea esi, [esi + 4] 304033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 // _a_g 304133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm4 // set alpha to 255 304233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmullw xmm1, xmm3 // _a_g * alpha 304333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm2, 8 // _r_b convert to 8 bits 
again 304433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddusb xmm0, xmm2 // + src argb 304533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm1, xmm5 // a_g_ convert to 8 bits again 304633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddusb xmm0, xmm1 // + src argb 304733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 1 304833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd [edx], xmm0 304933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 4] 305033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jge alignloop1 305133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 305233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp alignloop1b: 305333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp add ecx, 1 - 4 305433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jl convertloop4b 305533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 305633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // 4 pixel loop. 
305733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop4: 305833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm3, [eax] // src argb 305933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 306033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, xmm3 // src argb 306133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm3, xmm4 // ~alpha 306233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm2, [esi] // _r_b 306333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm3, 8 // alpha 306433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufhw xmm3, xmm3,0F5h // 8 alpha words 306533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshuflw xmm3, xmm3,0F5h 306633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm2, xmm6 // _r_b 306733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddw xmm3, xmm7 // 256 - alpha 306833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmullw xmm2, xmm3 // _r_b * alpha 306933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm1, [esi] // _a_g 307033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea esi, [esi + 16] 307133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 // _a_g 307233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm4 // set alpha to 255 307333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmullw xmm1, xmm3 // _a_g * alpha 307433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm2, 8 // _r_b convert to 8 bits again 307533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddusb xmm0, xmm2 // + src argb 307633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm1, xmm5 // a_g_ convert to 8 bits again 307733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddusb xmm0, xmm1 // + src argb 307833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 
307933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm0 308033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 16] 308133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jge convertloop4 308233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 308333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop4b: 308433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp add ecx, 4 - 1 308533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jl convertloop1b 308633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 308733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // 1 pixel loop. 308833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop1: 308933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm3, [eax] // src argb 309033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 4] 309133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, xmm3 // src argb 309233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm3, xmm4 // ~alpha 309333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm2, [esi] // _r_b 309433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm3, 8 // alpha 309533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufhw xmm3, xmm3,0F5h // 8 alpha words 309633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshuflw xmm3, xmm3,0F5h 309733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm2, xmm6 // _r_b 309833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddw xmm3, xmm7 // 256 - alpha 309933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmullw xmm2, xmm3 // _r_b * alpha 310033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm1, [esi] // _a_g 310133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea esi, [esi + 4] 310233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 // _a_g 
310333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm4 // set alpha to 255 310433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmullw xmm1, xmm3 // _a_g * alpha 310533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm2, 8 // _r_b convert to 8 bits again 310633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddusb xmm0, xmm2 // + src argb 310733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm1, xmm5 // a_g_ convert to 8 bits again 310833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddusb xmm0, xmm1 // + src argb 310933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 1 311033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd [edx], xmm0 311133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 4] 311233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jge convertloop1 311333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 311433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop1b: 311533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 311633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 311733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 311833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 311933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_ARGBBLENDROW_SSE2 312033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 312133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBBLENDROW_SSSE3 312233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table for isolating alpha. 
312333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleAlpha = { 312433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 3u, 0x80, 3u, 0x80, 7u, 0x80, 7u, 0x80, 312533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 11u, 0x80, 11u, 0x80, 15u, 0x80, 15u, 0x80 312633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 312733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Same as SSE2, but replaces: 312833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// psrlw xmm3, 8 // alpha 312933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// pshufhw xmm3, xmm3,0F5h // 8 alpha words 313033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// pshuflw xmm3, xmm3,0F5h 313133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// with.. 313233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// pshufb xmm3, kShuffleAlpha // alpha 313333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Blend 8 pixels at a time. 313433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 313533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 313633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBBlendRow_SSSE3(const uint8* src_argb0, const uint8* src_argb1, 313733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_argb, int width) { 313833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 313933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 314033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4 + 4] // src_argb0 314133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 4 + 8] // src_argb1 314233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 4 + 12] // dst_argb 314333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 4 + 16] // width 314433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm7, xmm7 // generate constant 
1 314533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm7, 15 314633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm6, xmm6 // generate mask 0x00ff00ff 314733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm6, 8 314833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate mask 0xff00ff00 314933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psllw xmm5, 8 315033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm4, xmm4 // generate mask 0xff000000 315133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslld xmm4, 24 315233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 315333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 1 315433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp je convertloop1 // only 1 pixel? 315533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jl convertloop1b 315633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 315733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // 1 pixel loop until destination pointer is aligned. 315833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp alignloop1: 315933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp test edx, 15 // aligned? 
316033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp je alignloop1b 316133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm3, [eax] 316233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 4] 316333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, xmm3 // src argb 316433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm3, xmm4 // ~alpha 316533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm2, [esi] // _r_b 316633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm3, kShuffleAlpha // alpha 316733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm2, xmm6 // _r_b 316833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddw xmm3, xmm7 // 256 - alpha 316933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmullw xmm2, xmm3 // _r_b * alpha 317033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm1, [esi] // _a_g 317133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea esi, [esi + 4] 317233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 // _a_g 317333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm4 // set alpha to 255 317433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmullw xmm1, xmm3 // _a_g * alpha 317533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm2, 8 // _r_b convert to 8 bits again 317633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddusb xmm0, xmm2 // + src argb 317733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm1, xmm5 // a_g_ convert to 8 bits again 317833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddusb xmm0, xmm1 // + src argb 317933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 1 318033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd [edx], xmm0 318133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 4] 318233cfdeb7b267ab635413797fffb046b73272f7ecHendrik 
Dahlkamp jge alignloop1 318333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 318433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp alignloop1b: 318533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp add ecx, 1 - 4 318633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jl convertloop4b 318733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 318833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp test eax, 15 // unaligned? 318933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jne convertuloop4 319033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp test esi, 15 // unaligned? 319133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jne convertuloop4 319233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 319333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // 4 pixel loop. 319433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop4: 319533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, [eax] // src argb 319633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 319733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, xmm3 // src argb 319833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm3, xmm4 // ~alpha 319933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, [esi] // _r_b 320033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm3, kShuffleAlpha // alpha 320133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm2, xmm6 // _r_b 320233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddw xmm3, xmm7 // 256 - alpha 320333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmullw xmm2, xmm3 // _r_b * alpha 320433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [esi] // _a_g 320533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea esi, [esi + 16] 320633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 // _a_g 
320733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm4 // set alpha to 255 320833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmullw xmm1, xmm3 // _a_g * alpha 320933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm2, 8 // _r_b convert to 8 bits again 321033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddusb xmm0, xmm2 // + src argb 321133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm1, xmm5 // a_g_ convert to 8 bits again 321233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddusb xmm0, xmm1 // + src argb 321333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 321433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm0 321533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 16] 321633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jge convertloop4 321733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jmp convertloop4b 321833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 321933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // 4 pixel unaligned loop. 
322033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertuloop4: 322133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm3, [eax] // src argb 322233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 322333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, xmm3 // src argb 322433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm3, xmm4 // ~alpha 322533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm2, [esi] // _r_b 322633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm3, kShuffleAlpha // alpha 322733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm2, xmm6 // _r_b 322833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddw xmm3, xmm7 // 256 - alpha 322933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmullw xmm2, xmm3 // _r_b * alpha 323033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm1, [esi] // _a_g 323133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea esi, [esi + 16] 323233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 // _a_g 323333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm4 // set alpha to 255 323433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmullw xmm1, xmm3 // _a_g * alpha 323533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm2, 8 // _r_b convert to 8 bits again 323633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddusb xmm0, xmm2 // + src argb 323733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm1, xmm5 // a_g_ convert to 8 bits again 323833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddusb xmm0, xmm1 // + src argb 323933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 324033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm0 324133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 16] 
324233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jge convertuloop4 324333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 324433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop4b: 324533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp add ecx, 4 - 1 324633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jl convertloop1b 324733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 324833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // 1 pixel loop. 324933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop1: 325033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm3, [eax] // src argb 325133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 4] 325233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, xmm3 // src argb 325333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm3, xmm4 // ~alpha 325433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm2, [esi] // _r_b 325533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm3, kShuffleAlpha // alpha 325633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm2, xmm6 // _r_b 325733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddw xmm3, xmm7 // 256 - alpha 325833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmullw xmm2, xmm3 // _r_b * alpha 325933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm1, [esi] // _a_g 326033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea esi, [esi + 4] 326133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 // _a_g 326233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm4 // set alpha to 255 326333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmullw xmm1, xmm3 // _a_g * alpha 326433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm2, 8 // _r_b convert to 8 bits again 326533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 
paddusb xmm0, xmm2 // + src argb 326633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm1, xmm5 // a_g_ convert to 8 bits again 326733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddusb xmm0, xmm1 // + src argb 326833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 1 326933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd [edx], xmm0 327033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 4] 327133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jge convertloop1 327233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 327333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop1b: 327433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 327533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 327633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 327733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 327833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_ARGBBLENDROW_SSSE3 327933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 328033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBATTENUATE_SSE2 328133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Attenuate 4 pixels at a time. 328233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Aligned to 16 bytes. 
328333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 328433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBAttenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width) { 328533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 328633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_argb0 328733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_argb 328833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // width 328933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edx, eax 329033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm4, xmm4 // generate mask 0xff000000 329133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslld xmm4, 24 329233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm5, xmm5 // generate mask 0x00ffffff 329333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrld xmm5, 8 329433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 329533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 329633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 329733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] // read 4 pixels 329833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm0 // first 2 329933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufhw xmm2, xmm0,0FFh // 8 alpha words 330033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshuflw xmm2, xmm2,0FFh 330133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmulhuw xmm0, xmm2 // rgb * a 330233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax] // read 4 pixels 330333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhbw xmm1, xmm1 // next 2 pixels 330433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufhw xmm2, xmm1,0FFh // 8 alpha words 
330533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshuflw xmm2, xmm2,0FFh 330633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmulhuw xmm1, xmm2 // rgb * a 330733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, [eax] // alphas 330833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm0, 8 330933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm2, xmm4 331033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 331133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm1 331233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm0, xmm5 // keep original alphas 331333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm2 331433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 331533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax + edx], xmm0 331633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 331733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 331833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 331933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 332033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 332133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 332233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_ARGBATTENUATE_SSE2 332333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 332433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBATTENUATEROW_SSSE3 332533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shuffle table duplicating alpha. 
332633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleAlpha0 = { 332733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 3u, 3u, 3u, 3u, 3u, 3u, 128u, 128u, 7u, 7u, 7u, 7u, 7u, 7u, 128u, 128u, 332833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 332933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const uvec8 kShuffleAlpha1 = { 333033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 11u, 11u, 11u, 11u, 11u, 11u, 128u, 128u, 333133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 15u, 15u, 15u, 15u, 15u, 15u, 128u, 128u, 333233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 333333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 333433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBAttenuateRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { 333533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 333633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_argb0 333733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_argb 333833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // width 333933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edx, eax 334033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm3, xmm3 // generate mask 0xff000000 334133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslld xmm3, 24 334233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, kShuffleAlpha0 334333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, kShuffleAlpha1 334433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 334533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 334633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 334733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] // read 4 pixels 
334833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm0, xmm4 // isolate first 2 alphas 334933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax] // read 4 pixels 335033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm1, xmm1 // first 2 pixel rgbs 335133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmulhuw xmm0, xmm1 // rgb * a 335233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax] // read 4 pixels 335333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufb xmm1, xmm5 // isolate next 2 alphas 335433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, [eax] // read 4 pixels 335533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhbw xmm2, xmm2 // next 2 pixel rgbs 335633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmulhuw xmm1, xmm2 // rgb * a 335733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, [eax] // mask original alpha 335833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm2, xmm3 335933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm0, 8 336033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 336133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm1 336233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm2 // copy original alpha 336333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 336433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax + edx], xmm0 336533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 336633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 336733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 336833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 336933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 337033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 
337133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_ARGBATTENUATEROW_SSSE3 337233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 337333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBUNATTENUATEROW_SSE2 337433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Unattenuate 4 pixels at a time. 337533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Aligned to 16 bytes. 337633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 337733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBUnattenuateRow_SSE2(const uint8* src_argb, uint8* dst_argb, 337833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width) { 337933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 338033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 338133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 338233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 4] // src_argb0 338333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 8] // dst_argb 338433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 12] // width 338533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edx, eax 338633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm4, xmm4 // generate mask 0xff000000 338733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslld xmm4, 24 338833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 338933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 339033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 339133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] // read 4 pixels 339233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movzx esi, byte ptr [eax + 3] // first alpha 339333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movzx edi, byte ptr [eax + 7] // second 
alpha 339433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm0 // first 2 339533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm2, dword ptr fixed_invtbl8[esi * 4] 339633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm3, dword ptr fixed_invtbl8[edi * 4] 339733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshuflw xmm2, xmm2,0C0h // first 4 inv_alpha words 339833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshuflw xmm3, xmm3,0C0h // next 4 inv_alpha words 339933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movlhps xmm2, xmm3 340033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmulhuw xmm0, xmm2 // rgb * a 340133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 340233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax] // read 4 pixels 340333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movzx esi, byte ptr [eax + 11] // third alpha 340433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movzx edi, byte ptr [eax + 15] // forth alpha 340533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhbw xmm1, xmm1 // next 2 340633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm2, dword ptr fixed_invtbl8[esi * 4] 340733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm3, dword ptr fixed_invtbl8[edi * 4] 340833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshuflw xmm2, xmm2,0C0h // first 4 inv_alpha words 340933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshuflw xmm3, xmm3,0C0h // next 4 inv_alpha words 341033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movlhps xmm2, xmm3 341133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmulhuw xmm1, xmm2 // rgb * a 341233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 341333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, [eax] // alphas 341433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm2, xmm4 
341533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm1 341633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm2 341733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 341833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax + edx], xmm0 341933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 342033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 342133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 342233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 342333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 342433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 342533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 342633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_ARGBUNATTENUATEROW_SSE2 342733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 342833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBGRAYROW_SSSE3 342933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Constant for ARGB color to gray scale: 0.11 * B + 0.59 * G + 0.30 * R 343033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kARGBToGray = { 343133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0, 14, 76, 38, 0 343233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 343333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 343433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels. 
343533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 343633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBGrayRow_SSSE3(const uint8* src_argb, uint8* dst_argb, int width) { 343733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 343833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] /* src_argb */ 343933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] /* dst_argb */ 344033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] /* width */ 344133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, kARGBToGray 344233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edx, eax 344333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 344433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 344533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 344633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] // G 344733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax + 16] 344833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm0, xmm4 344933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm1, xmm4 345033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm0, xmm1 345133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm0, 7 345233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm0 // 8 G bytes 345333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, [eax] // A 345433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, [eax + 16] 345533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrld xmm2, 24 345633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrld xmm3, 24 345733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm2, xmm3 345833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 
packuswb xmm2, xmm2 // 8 A bytes 345933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, xmm0 // Weave into GG, GA, then GGGA 346033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm0 // 8 GG words 346133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm3, xmm2 // 8 GA words 346233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 346333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm0, xmm3 // GGGA first 4 346433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm1, xmm3 // GGGA next 4 346533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 346633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax + edx], xmm0 346733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax + edx + 16], xmm1 346833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 32] 346933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 347033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 347133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 347233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 347333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_ARGBGRAYROW_SSSE3 347433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 347533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBSEPIAROW_SSSE3 347633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// b = (r * 35 + g * 68 + b * 17) >> 7 347733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// g = (r * 45 + g * 88 + b * 22) >> 7 347833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// r = (r * 50 + g * 98 + b * 24) >> 7 347933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Constant for ARGB color to sepia tone. 
348033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kARGBToSepiaB = { 348133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0, 17, 68, 35, 0 348233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 348333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 348433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kARGBToSepiaG = { 348533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0, 22, 88, 45, 0 348633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 348733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 348833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampstatic const vec8 kARGBToSepiaR = { 348933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0, 24, 98, 50, 0 349033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp}; 349133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 349233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. 
349333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 349433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBSepiaRow_SSSE3(uint8* dst_argb, int width) { 349533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 349633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] /* dst_argb */ 349733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8] /* width */ 349833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, kARGBToSepiaB 349933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, kARGBToSepiaG 350033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, kARGBToSepiaR 350133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 350233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 350333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 350433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] // B 350533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm6, [eax + 16] 350633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm0, xmm2 350733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm6, xmm2 350833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm0, xmm6 350933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm0, 7 351033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm0 // 8 B values 351133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, [eax] // G 351233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax + 16] 351333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm5, xmm3 351433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm1, xmm3 351533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm5, xmm1 351633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 
psrlw xmm5, 7 351733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm5, xmm5 // 8 G values 351833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm5 // 8 BG values 351933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, [eax] // R 352033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax + 16] 352133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm5, xmm4 352233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm1, xmm4 352333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddw xmm5, xmm1 352433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm5, 7 352533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm5, xmm5 // 8 R values 352633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm6, [eax] // A 352733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax + 16] 352833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrld xmm6, 24 352933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrld xmm1, 24 353033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm6, xmm1 353133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm6, xmm6 // 8 A values 353233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm5, xmm6 // 8 RA values 353333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 // Weave BG, RA together 353433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm0, xmm5 // BGRA first 4 353533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm1, xmm5 // BGRA next 4 353633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 353733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax], xmm0 353833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax + 16], xmm1 353933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 
32] 354033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 354133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 354233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 354333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 354433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_ARGBSEPIAROW_SSSE3 354533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 354633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBCOLORMATRIXROW_SSSE3 354733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Tranform 8 ARGB pixels (32 bytes) with color matrix. 354833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Same as Sepia except matrix is provided. 354933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// TODO(fbarchard): packuswbs only use half of the reg. To make RGBA, combine R 355033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// and B into a high and low, then G/A, unpackl/hbw and then unpckl/hwd. 
355133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 355233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBColorMatrixRow_SSSE3(uint8* dst_argb, const int8* matrix_argb, 355333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width) { 355433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 355533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] /* dst_argb */ 355633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] /* matrix_argb */ 355733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] /* width */ 355833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm2, [edx] 355933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm3, [edx + 4] 356033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm4, [edx + 8] 356133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm2, xmm2, 0 356233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm3, xmm3, 0 356333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm4, xmm4, 0 356433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 356533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 356633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 356733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] // B 356833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm6, [eax + 16] 356933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm0, xmm2 357033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm6, xmm2 357133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, [eax] // G 357233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax + 16] 357333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm5, xmm3 357433cfdeb7b267ab635413797fffb046b73272f7ecHendrik 
Dahlkamp pmaddubsw xmm1, xmm3 357533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddsw xmm0, xmm6 // B 357633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddsw xmm5, xmm1 // G 357733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psraw xmm0, 7 // B 357833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psraw xmm5, 7 // G 357933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm0 // 8 B values 358033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm5, xmm5 // 8 G values 358133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm5 // 8 BG values 358233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, [eax] // R 358333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax + 16] 358433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm5, xmm4 358533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm1, xmm4 358633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp phaddsw xmm5, xmm1 358733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psraw xmm5, 7 358833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm5, xmm5 // 8 R values 358933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm6, [eax] // A 359033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax + 16] 359133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrld xmm6, 24 359233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrld xmm1, 24 359333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm6, xmm1 359433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm6, xmm6 // 8 A values 359533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 // Weave BG, RA together 359633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm5, xmm6 // 8 RA values 359733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 
punpcklwd xmm0, xmm5 // BGRA first 4 359833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm1, xmm5 // BGRA next 4 359933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 8 360033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax], xmm0 360133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax + 16], xmm1 360233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 32] 360333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 360433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 360533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 360633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 360733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_ARGBCOLORMATRIXROW_SSSE3 360833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 360933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBCOLORTABLEROW_X86 361033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Tranform ARGB pixels with color table. 
361133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 361233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBColorTableRow_X86(uint8* dst_argb, const uint8* table_argb, 361333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width) { 361433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 361533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push ebx 361633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 361733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 361833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push ebp 361933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 16 + 4] /* dst_argb */ 362033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 16 + 8] /* table_argb */ 362133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 16 + 12] /* width */ 362233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp xor ebx, ebx 362333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp xor edx, edx 362433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 362533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 362633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 362733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ebp, dword ptr [eax] // BGRA 362833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, ebp 362933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp and ebp, 255 363033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shr esi, 8 363133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp and esi, 255 363233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov bl, [edi + ebp * 4 + 0] // B 363333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov dl, [edi + esi * 4 + 1] // G 363433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ebp, dword ptr [eax] // BGRA 
363533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, ebp 363633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shr ebp, 16 363733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shr esi, 24 363833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp and ebp, 255 363933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov [eax], bl 364033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov [eax + 1], dl 364133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov bl, [edi + ebp * 4 + 2] // R 364233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov dl, [edi + esi * 4 + 3] // A 364333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov [eax + 2], bl 364433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov [eax + 3], dl 364533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 4] 364633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 1 364733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 364833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop ebp 364933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 365033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 365133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop ebx 365233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 365333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 365433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 365533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_ARGBCOLORTABLEROW_X86 365633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 365733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBQUANTIZEROW_SSE2 365833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Quantize 4 ARGB pixels (16 bytes). 365933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Aligned to 16 bytes. 
366033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 366133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBQuantizeRow_SSE2(uint8* dst_argb, int scale, int interval_size, 366233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int interval_offset, int width) { 366333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 366433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] /* dst_argb */ 366533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm2, [esp + 8] /* scale */ 366633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm3, [esp + 12] /* interval_size */ 366733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm4, [esp + 16] /* interval_offset */ 366833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 20] /* width */ 366933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshuflw xmm2, xmm2, 040h 367033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm2, xmm2, 044h 367133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshuflw xmm3, xmm3, 040h 367233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm3, xmm3, 044h 367333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshuflw xmm4, xmm4, 040h 367433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm4, xmm4, 044h 367533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm5, xmm5 // constant 0 367633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pcmpeqb xmm6, xmm6 // generate mask 0xff000000 367733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pslld xmm6, 24 367833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 367933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 368033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 368133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] // read 4 pixels 
368233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm5 // first 2 pixels 368333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmulhuw xmm0, xmm2 // pixel * scale >> 16 368433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax] // read 4 pixels 368533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhbw xmm1, xmm5 // next 2 pixels 368633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmulhuw xmm1, xmm2 368733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmullw xmm0, xmm3 // * interval_size 368833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm7, [eax] // read 4 pixels 368933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmullw xmm1, xmm3 369033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pand xmm7, xmm6 // mask alpha 369133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddw xmm0, xmm4 // + interval_size / 2 369233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddw xmm1, xmm4 369333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm1 369433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp por xmm0, xmm7 369533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 369633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax], xmm0 369733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 369833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 369933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 370033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 370133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 370233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_ARGBQUANTIZEROW_SSE2 370333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 370433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_CUMULATIVESUMTOAVERAGE_SSE2 370533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// 
Consider float CumulativeSum. 370633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Consider calling CumulativeSum one row at time as needed. 370733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Consider circular CumulativeSum buffer of radius * 2 + 1 height. 370833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Convert cumulative sum for an area to an average for 1 pixel. 370933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// topleft is pointer to top left of CumulativeSum buffer for area. 371033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// botleft is pointer to bottom left of CumulativeSum buffer. 371133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// width is offset from left to right of area in CumulativeSum buffer measured 371233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// in number of ints. 371333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// area is the number of pixels in the area being averaged. 371433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// dst points to pixel to store result to. 371533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// count is number of averaged pixels to produce. 371633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Does 4 pixels at a time, requires CumulativeSum pointers to be 16 byte 371733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// aligned. 
371833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid CumulativeSumToAverage_SSE2(const int32* topleft, const int32* botleft, 371933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int width, int area, uint8* dst, int count) { 372033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 372133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, topleft // eax topleft 372233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, botleft // esi botleft 372333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, width 372433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm4, area 372533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, dst 372633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, count 372733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp cvtdq2ps xmm4, xmm4 372833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp rcpss xmm4, xmm4 // 1.0f / area 372933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm4, xmm4, 0 373033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 373133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jl l4b 373233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 373333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // 4 pixel loop 373433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 4 373533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp l4: 373633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // top left 373733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] 373833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, [eax + 16] 373933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, [eax + 32] 374033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, [eax + 48] 374133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 374233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 
// - top right 374333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psubd xmm0, [eax + edx * 4] 374433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psubd xmm1, [eax + edx * 4 + 16] 374533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psubd xmm2, [eax + edx * 4 + 32] 374633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psubd xmm3, [eax + edx * 4 + 48] 374733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 64] 374833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 374933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // - bottom left 375033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psubd xmm0, [esi] 375133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psubd xmm1, [esi + 16] 375233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psubd xmm2, [esi + 32] 375333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psubd xmm3, [esi + 48] 375433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 375533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // + bottom right 375633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddd xmm0, [esi + edx * 4] 375733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddd xmm1, [esi + edx * 4 + 16] 375833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddd xmm2, [esi + edx * 4 + 32] 375933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddd xmm3, [esi + edx * 4 + 48] 376033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea esi, [esi + 64] 376133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 376233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp cvtdq2ps xmm0, xmm0 // Average = Sum * 1 / Area 376333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp cvtdq2ps xmm1, xmm1 376433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mulps xmm0, xmm4 376533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mulps xmm1, xmm4 376633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 
cvtdq2ps xmm2, xmm2 376733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp cvtdq2ps xmm3, xmm3 376833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mulps xmm2, xmm4 376933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mulps xmm3, xmm4 377033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp cvtps2dq xmm0, xmm0 377133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp cvtps2dq xmm1, xmm1 377233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp cvtps2dq xmm2, xmm2 377333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp cvtps2dq xmm3, xmm3 377433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packssdw xmm0, xmm1 377533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packssdw xmm2, xmm3 377633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm2 377733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edi], xmm0 377833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edi, [edi + 16] 377933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 378033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jge l4 378133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 378233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp l4b: 378333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp add ecx, 4 - 1 378433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jl l1b 378533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 378633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // 1 pixel loop 378733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 4 378833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp l1: 378933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] 379033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psubd xmm0, [eax + edx * 4] 379133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 379233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psubd xmm0, [esi] 
379333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddd xmm0, [esi + edx * 4] 379433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea esi, [esi + 16] 379533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp cvtdq2ps xmm0, xmm0 379633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mulps xmm0, xmm4 379733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp cvtps2dq xmm0, xmm0 379833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packssdw xmm0, xmm0 379933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm0 380033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd dword ptr [edi], xmm0 380133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edi, [edi + 4] 380233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 1 380333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jge l1 380433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp l1b: 380533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 380633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 380733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_CUMULATIVESUMTOAVERAGE_SSE2 380833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 380933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_COMPUTECUMULATIVESUMROW_SSE2 381033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Creates a table of cumulative sums where each value is a sum of all values 381133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// above and to the left of the value. 
381233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ComputeCumulativeSumRow_SSE2(const uint8* row, int32* cumsum, 381333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp const int32* previous_cumsum, int width) { 381433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 381533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, row 381633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, cumsum 381733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, previous_cumsum 381833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, width 381933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub esi, edx 382033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm0, xmm0 382133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pxor xmm1, xmm1 382233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 382333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 382433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jl l4b 382533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp test edx, 15 382633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jne l4b 38277cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde 382833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // 4 pixel loop 382933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 4 383033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp l4: 383133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm2, [eax] // 4 argb pixels 16 bytes. 
383233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 383333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, xmm2 383433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 383533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm2, xmm1 383633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, xmm2 383733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm2, xmm1 383833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm3, xmm1 383933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 384033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhbw xmm4, xmm1 384133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, xmm4 384233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm4, xmm1 384333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhwd xmm5, xmm1 384433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 384533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddd xmm0, xmm2 384633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, [edx + esi] // previous row above. 
384733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddd xmm2, xmm0 384833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 384933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddd xmm0, xmm3 385033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, [edx + esi + 16] 385133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddd xmm3, xmm0 385233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 385333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddd xmm0, xmm4 385433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, [edx + esi + 32] 385533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddd xmm4, xmm0 385633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 385733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddd xmm0, xmm5 385833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm5, [edx + esi + 48] 385933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddd xmm5, xmm0 386033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 386133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx], xmm2 386233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 16], xmm3 386333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 32], xmm4 386433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [edx + 48], xmm5 386533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 386633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 64] 386733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 386833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jge l4 386933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 387033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp l4b: 387133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp add ecx, 4 - 1 387233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jl l1b 
387333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 387433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // 1 pixel loop 387533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 4 387633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp l1: 387733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm2, dword ptr [eax] // 1 argb pixel 4 bytes. 387833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 4] 387933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm2, xmm1 388033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm2, xmm1 388133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddd xmm0, xmm2 388233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu xmm2, [edx + esi] 388333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp paddd xmm2, xmm0 388433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqu [edx], xmm2 388533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 16] 388633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 1 388733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jge l1 388833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 388933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp l1b: 389033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 389133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 389233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_COMPUTECUMULATIVESUMROW_SSE2 389333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 389433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBSHADE_SSE2 389533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Shade 4 pixels at a time by specified value. 389633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Aligned to 16 bytes. 
389733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 389833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBShadeRow_SSE2(const uint8* src_argb, uint8* dst_argb, int width, 389933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint32 value) { 390033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 390133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 4] // src_argb 390233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8] // dst_argb 390333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 12] // width 390433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm2, [esp + 16] // value 390533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edx, eax 390633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm2, xmm2 390733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklqdq xmm2, xmm2 390833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 390933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 391033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp convertloop: 391133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [eax] // read 4 pixels 391233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 391333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm0 // first 2 391433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhbw xmm1, xmm1 // next 2 391533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmulhuw xmm0, xmm2 // argb * value 391633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmulhuw xmm1, xmm2 // argb * value 391733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm0, 8 391833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 8 391933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm1 
392033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 392133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [eax + edx], xmm0 392233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea eax, [eax + 16] 392333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg convertloop 392433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 392533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 392633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 392733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 392833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_ARGBSHADE_SSE2 392933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 393033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef HAS_ARGBAFFINEROW_SSE2 393133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Copy ARGB pixels from source image with slope to a row of destination. 393233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 393333cfdeb7b267ab635413797fffb046b73272f7ecHendrik DahlkampLIBYUV_API 393433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBAffineRow_SSE2(const uint8* src_argb, int src_argb_stride, 393533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp uint8* dst_argb, const float* uv_dudv, int width) { 393633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 393733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 393833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 393933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 12] // src_argb 394033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 16] // stride 394133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 20] // dst_argb 394233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 24] // pointer to uv_dudv 394333cfdeb7b267ab635413797fffb046b73272f7ecHendrik 
Dahlkamp movq xmm2, qword ptr [ecx] // uv 394433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq xmm7, qword ptr [ecx + 8] // dudv 394533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 28] // width 394633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shl esi, 16 // 4, stride 394733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp add esi, 4 394833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm5, esi 394933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 395033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jl l4b 395133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 395233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // setup for 4 pixel loop 395333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm7, xmm7, 0x44 // dup dudv 395433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm5, xmm5, 0 // dup 4, stride 395533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, xmm2 // x0, y0, x1, y1 395633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp addps xmm0, xmm7 395733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movlhps xmm2, xmm0 395833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm4, xmm7 395933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp addps xmm4, xmm4 // dudv *= 2 396033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm3, xmm2 // x2, y2, x3, y3 396133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp addps xmm3, xmm4 396233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp addps xmm4, xmm4 // dudv *= 4 396333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 396433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // 4 pixel loop 396533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 4 396633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp l4: 396733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp cvttps2dq xmm0, xmm2 // 
x, y float to int first 2 396833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp cvttps2dq xmm1, xmm3 // x, y float to int next 2 396933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packssdw xmm0, xmm1 // x, y as 8 shorts 397033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddwd xmm0, xmm5 // offsets = x * 4 + y * stride. 397133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd esi, xmm0 397233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm0, xmm0, 0x39 // shift right 397333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd edi, xmm0 397433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm0, xmm0, 0x39 // shift right 397533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm1, [eax + esi] // read pixel 0 397633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm6, [eax + edi] // read pixel 1 397733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckldq xmm1, xmm6 // combine pixel 0 and 1 397833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp addps xmm2, xmm4 // x, y += dx, dy first 2 397933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movq qword ptr [edx], xmm1 398033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd esi, xmm0 398133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm0, xmm0, 0x39 // shift right 398233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd edi, xmm0 398333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm6, [eax + esi] // read pixel 2 398433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm0, [eax + edi] // read pixel 3 398533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckldq xmm6, xmm0 // combine pixel 2 and 3 398633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp addps xmm3, xmm4 // x, y += dx, dy next 2 398733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 398833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 
movq qword ptr 8[edx], xmm6 398933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 16] 399033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jge l4 399133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 399233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp l4b: 399333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp add ecx, 4 - 1 399433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jl l1b 399533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 399633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp // 1 pixel loop 399733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 4 399833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp l1: 399933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp cvttps2dq xmm0, xmm2 // x, y float to int 400033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packssdw xmm0, xmm0 // x, y as shorts 400133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddwd xmm0, xmm5 // offset = x * 4 + y * stride 400233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp addps xmm2, xmm7 // x, y += dx, dy 400333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd esi, xmm0 400433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm0, [eax + esi] // copy a pixel 400533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 1 400633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd [edx], xmm0 400733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea edx, [edx + 4] 400833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jge l1 400933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp l1b: 401033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 401133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 401233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 401333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 401433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 
401533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // HAS_ARGBAFFINEROW_SSE2 401633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 401733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp// Bilinear row filtering combines 4x2 -> 4x1. SSSE3 version. 401833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp__declspec(naked) __declspec(align(16)) 401933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkampvoid ARGBInterpolateRow_SSSE3(uint8* dst_ptr, const uint8* src_ptr, 402033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ptrdiff_t src_stride, int dst_width, 402133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp int source_y_fraction) { 402233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp __asm { 402333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push esi 402433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp push edi 402533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edi, [esp + 8 + 4] // dst_ptr 402633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov esi, [esp + 8 + 8] // src_ptr 402733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov edx, [esp + 8 + 12] // src_stride 402833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov ecx, [esp + 8 + 16] // dst_width 402933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp mov eax, [esp + 8 + 20] // source_y_fraction (0..255) 403033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub edi, esi 403133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp shr eax, 1 403233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp cmp eax, 0 403333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp je xloop1 403433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp cmp eax, 64 403533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp je xloop2 403633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm0, eax // high fraction 0..127 403733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp neg 
eax 403833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp add eax, 128 403933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movd xmm5, eax // low fraction 128..1 404033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm5, xmm0 404133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklwd xmm5, xmm5 404233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pshufd xmm5, xmm5, 0 404333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 404433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 404533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp xloop: 404633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [esi] 404733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm2, [esi + edx] 404833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm1, xmm0 404933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpcklbw xmm0, xmm2 405033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp punpckhbw xmm1, xmm2 405133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm0, xmm5 405233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pmaddubsw xmm1, xmm5 405333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm0, 7 405433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp psrlw xmm1, 7 405533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp packuswb xmm0, xmm1 405633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 405733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [esi + edi], xmm0 405833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea esi, [esi + 16] 405933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg xloop 406033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 406133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 406233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 406333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 
406433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 406533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 406633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp xloop1: 406733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [esi] 406833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 406933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [esi + edi], xmm0 407033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea esi, [esi + 16] 407133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg xloop1 407233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 407333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 407433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 407533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 407633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 407733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp align 16 407833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp xloop2: 407933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa xmm0, [esi] 408033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pavgb xmm0, [esi + edx] 408133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp sub ecx, 4 408233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp movdqa [esi + edi], xmm0 408333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp lea esi, [esi + 16] 408433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp jg xloop2 408533cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 408633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop edi 408733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp pop esi 408833cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp ret 408933cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp } 409033cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} 409133cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 
409233cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif // _M_IX86 409333cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp 409433cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#ifdef __cplusplus 40957cd8149e2cbad8b1ff6d481c37a4775d3c8cf2faShri Borde} // extern "C" 409633cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp} // namespace libyuv 409733cfdeb7b267ab635413797fffb046b73272f7ecHendrik Dahlkamp#endif 4098