; Copyright (c) 2011 The Chromium Authors. All rights reserved.
; Use of this source code is governed by a BSD-style license that can be
; found in the LICENSE file.

%include "media/base/simd/media_export.asm"
%include "third_party/x86inc/x86inc.asm"

;
; This file uses SSE, SSE2, SSE3, and SSSE3, which are supported by all ATOM
; processors.
;
  SECTION_TEXT
  ; List every instruction-set extension named above exactly once. (SSE2 is
  ; required by movdqa, pshufd, psrldq, etc. used throughout this file.)
  CPU       SSE, SSE2, SSE3, SSSE3

;
; XMM registers representing constants. We must not use these registers as
; destination operands.
; for (int i = 0; i < 16; i += 4) {
;   xmm7.b[i] = 25;  xmm7.b[i+1] = 2;   xmm7.b[i+2] = 66;  xmm7.b[i+3] = 0;
;   xmm6.b[i] = 0;   xmm6.b[i+1] = 127; xmm6.b[i+2] = 0;   xmm6.b[i+3] = 0;
;   xmm5.b[i] = 112; xmm5.b[i+1] = -74; xmm5.b[i+2] = -38; xmm5.b[i+3] = 0;
;   xmm4.b[i] = -18; xmm4.b[i+1] = -94; xmm4.b[i+2] = 112; xmm4.b[i+3] = 0;
; }
; xmm3 (XMM_CONST_128) is added to 16-bit lanes as the rounding/offset term
; 128 by CALC_Y and CALC_UV. NOTE(review): the code that loads these registers
; is not in this file (presumably convert_rgb_to_yuv_ssse3.inc) - confirm that
; xmm3 holds 128 in every word.
;
%define XMM_CONST_Y0    xmm7
%define XMM_CONST_Y1    xmm6
%define XMM_CONST_U     xmm5
%define XMM_CONST_V     xmm4
%define XMM_CONST_128   xmm3

;
; LOAD_XMM %1 (xmm), %2 (imm32)
; Loads an immediate value to an XMM register.
;   %1.d[0] = %1.d[1] = %1.d[2] = %1.d[3] = %2;
; The immediate travels through TEMPd, so TEMPd is clobbered.
;
%macro LOAD_XMM 2
  mov TEMPd, %2
  movd %1, TEMPd
  ; Broadcast dword 0 to all four dword lanes.
  pshufd %1, %1, 0
%endmacro

;
; UNPACKRGB %1 (xmm), %2 (imm8)
; Opens a one-byte gap at byte offset %2 of the specified XMM register and
; fills it with zero; the top byte of the register is shifted out.
;   for (int i = 15; i > %2; --i) %1.b[i] = %1.b[i - 1];
;   %1.b[%2] = 0;
;   (bytes below %2 are left unchanged)
; Clobbers xmm1, so %1 must not be xmm1.
;
%macro UNPACKRGB 2
  ; xmm1 = the bytes at offset %2 and above, moved up by one byte.
  movdqa xmm1, %1
  psrldq xmm1, %2
  pslldq xmm1, (%2) + 1

  ; %1 = only the bytes below offset %2 (everything else zeroed).
  pslldq %1, 16 - (%2)
  psrldq %1, 16 - (%2)

  ; Merge the two halves; byte %2 stays zero.
  por %1, xmm1
%endmacro

;
; READ_ARGB %1 (xmm), %2 (imm)
; Read the specified number of ARGB (or RGB) pixels from the source and store
; them to the destination xmm register. If the input format is RGB, we read RGB
; pixels and convert them to ARGB pixels. (For this case, the alpha values of
; the output pixels become 0.)
; %2 must be 1, 2, or 4. The RGB (PIXELSIZE == 3) paths clobber xmm1 and the
; TEMP register, so %1 must not be xmm1 there.
;
%macro READ_ARGB 2

%if PIXELSIZE == 4

  ; Read ARGB pixels from the source. (This macro assumes the input buffer may
  ; not be aligned to a 16-byte boundary.)
%if %2 == 1
  movd %1, DWORD [ARGBq + WIDTHq * 4 * 2]
%elif %2 == 2
  movq %1, QWORD [ARGBq + WIDTHq * 4 * 2]
%elif %2 == 4
  movdqu %1, DQWORD [ARGBq + WIDTHq * 4 * 2]
%else
%error unsupported number of pixels.
%endif

%elif PIXELSIZE == 3

  ; Read RGB pixels from the source and convert them to ARGB pixels.
%if %2 == 1
  ; Read one RGB pixel and convert it to one ARGB pixel.
  ; Save the WIDTH register to xmm1. (This macro needs to break it.)
  MOVq xmm1, WIDTHq

  ; Once read three bytes from the source to TEMPd, and copy it to the
  ; destination xmm register. (Reading exactly three bytes avoids touching
  ; memory past the pixel.)
  lea WIDTHq, [WIDTHq + WIDTHq * 2]
  movzx TEMPd, BYTE [ARGBq + WIDTHq * 2 + 2]
  shl TEMPd, 16
  mov TEMPw, WORD [ARGBq + WIDTHq * 2]
  movd %1, TEMPd

  ; Restore the WIDTH register.
  MOVq WIDTHq, xmm1
%elif %2 == 2
  ; Read two RGB pixels and convert them to two ARGB pixels.
  ; Read six bytes from the source to the destination xmm register.
  mov TEMPq, WIDTHq
  lea TEMPq, [TEMPq + TEMPq * 2]
  movd %1, DWORD [ARGBq + TEMPq * 2]
  ; Source bytes 4-5 must land in register bytes 4-5, i.e. word index 2, so
  ; that the six RGB bytes are contiguous before UNPACKRGB inserts the alpha
  ; byte. (Word index 3 would leave a two-byte hole and misplace the G and R
  ; components of the second pixel.)
  pinsrw %1, WORD [ARGBq + TEMPq * 2 + 4], 2

  ; Fill the alpha values of these RGB pixels with 0 and convert them to two
  ; ARGB pixels.
  UNPACKRGB %1, 3
%elif %2 == 4
  ; Read four RGB pixels and convert them to four ARGB pixels.
  ; Read twelve bytes from the source to the destination xmm register.
  mov TEMPq, WIDTHq
  lea TEMPq, [TEMPq + TEMPq * 2]
  movq %1, QWORD [ARGBq + TEMPq * 2]
  movd xmm1, DWORD [ARGBq + TEMPq * 2 + 8]
  ; %1 = { %1.d[0], %1.d[1], xmm1.d[0], xmm1.d[1] }
  shufps %1, xmm1, 01000100B

  ; Fill the alpha values of these RGB pixels with 0 and convert them to four
  ; ARGB pixels.
  UNPACKRGB %1, 3
  UNPACKRGB %1, 4 + 3
  UNPACKRGB %1, 4 + 4 + 3
%else
%error unsupported number of pixels.
%endif

%else
%error unsupported PIXELSIZE value.
%endif

%endmacro

;
; CALC_Y %1 (xmm), %2 (xmm)
; Calculates four Y values from four ARGB pixels stored in %2.
;   %1.b[0] = ToByte((25 * B(0) + 129 * G(0) + 66 * R(0) + 128) / 256 + 16);
;   %1.b[1] = ToByte((25 * B(1) + 129 * G(1) + 66 * R(1) + 128) / 256 + 16);
;   %1.b[2] = ToByte((25 * B(2) + 129 * G(2) + 66 * R(2) + 128) / 256 + 16);
;   %1.b[3] = ToByte((25 * B(3) + 129 * G(3) + 66 * R(3) + 128) / 256 + 16);
; Clobbers xmm2. %2 is not modified.
;
%macro CALC_Y 2
  ; To avoid signed saturation, we divide this conversion formula into two
  ; formulae and store their results into two XMM registers %1 and xmm2.
  ; %1.w[0]   = 25  * %2.b[0]  + 2   * %2.b[1]  + 66  * %2.b[2]  + 0 * %2.b[3];
  ; %1.w[1]   = 25  * %2.b[4]  + 2   * %2.b[5]  + 66  * %2.b[6]  + 0 * %2.b[7];
  ; %1.w[2]   = 25  * %2.b[8]  + 2   * %2.b[9]  + 66  * %2.b[10] + 0 * %2.b[11];
  ; %1.w[3]   = 25  * %2.b[12] + 2   * %2.b[13] + 66  * %2.b[14] + 0 * %2.b[15];
  ; xmm2.w[0] = 0   * %2.b[0]  + 127 * %2.b[1]  + 0   * %2.b[2]  + 0 * %2.b[3];
  ; xmm2.w[1] = 0   * %2.b[4]  + 127 * %2.b[5]  + 0   * %2.b[6]  + 0 * %2.b[7];
  ; xmm2.w[2] = 0   * %2.b[8]  + 127 * %2.b[9]  + 0   * %2.b[10] + 0 * %2.b[11];
  ; xmm2.w[3] = 0   * %2.b[12] + 127 * %2.b[13] + 0   * %2.b[14] + 0 * %2.b[15];
  movdqa %1, %2
  pmaddubsw %1, XMM_CONST_Y0
  phaddsw %1, %1
  movdqa xmm2, %2
  pmaddubsw xmm2, XMM_CONST_Y1
  phaddsw xmm2, xmm2

  ; %1.b[0] = ToByte((%1.w[0] + xmm2.w[0] + 128) / 256 + 16);
  ; %1.b[1] = ToByte((%1.w[1] + xmm2.w[1] + 128) / 256 + 16);
  ; %1.b[2] = ToByte((%1.w[2] + xmm2.w[2] + 128) / 256 + 16);
  ; %1.b[3] = ToByte((%1.w[3] + xmm2.w[3] + 128) / 256 + 16);
  ; The combined sum is treated as an unsigned word (wrapping paddw, logical
  ; psrlw). The "+ 16" term is derived from the 128 constant: 128 >> 3 == 16.
  paddw %1, xmm2
  movdqa xmm2, XMM_CONST_128
  paddw %1, xmm2
  psrlw %1, 8
  psrlw xmm2, 3
  paddw %1, xmm2
  packuswb %1, %1
%endmacro

;
; INIT_UV %1 (r32), %2 (reg) %3 (imm)
; When SUBSAMPLING == 1 && LINE == 1, loads the byte(s) at [%2 + WIDTHq] into
; %1, zero-extended: one byte if %3 is 1 or 2, two bytes if %3 is 4. For every
; other SUBSAMPLING/LINE combination this macro expands to nothing.
; NOTE(review): %2 is presumably the U (or V) output pointer and %1 the value
; later handed to CALC_UV as %4 - confirm against the including .inc file.
;
%macro INIT_UV 3

%if SUBSAMPLING == 1 && LINE == 1
%if %3 == 1 || %3 == 2
  movzx %1, BYTE [%2 + WIDTHq]
%elif %3 == 4
  movzx %1, WORD [%2 + WIDTHq]
%else
%error unsupported number of pixels.
%endif
%endif

%endmacro

;
; CALC_UV %1 (xmm), %2 (xmm), %3 (xmm), %4 (r32)
; Calculates two U (or V) values from four ARGB pixels stored in %2.
; Let C(n) be the weighted sum for pixel n:
;   if %3 == XMM_CONST_U
;     C(n) = 112 * B(n) - 74 * G(n) - 38 * R(n);
;   if %3 == XMM_CONST_V
;     C(n) = -18 * B(n) - 94 * G(n) + 112 * R(n);
; Then
;   if (SUBSAMPLING) {
;     %1.b[0] = ToByte(((C(0) + C(1) + 1) / 2 + 128) / 256 + 128);
;     %1.b[1] = ToByte(((C(2) + C(3) + 1) / 2 + 128) / 256 + 128);
;   } else {
;     %1.b[0] = ToByte((C(0) + 128) / 256 + 128);
;     %1.b[1] = ToByte((C(2) + 128) / 256 + 128);
;   }
; If SUBSAMPLING == 1 && LINE == 1, each output byte is additionally averaged
; (rounding up) with the corresponding byte of %4.
; Clobbers xmm2 when SUBSAMPLING == 1. %2 and %3 are not modified.
;
%macro CALC_UV 4
  ; for (int i = 0; i < 4; ++i) {
  ;   %1.w[i] = 0;
  ;   for (int j = 0; j < 4; ++j)
  ;     %1.w[i] += %3.b[i * 4 + j] * %2.b[i * 4 + j];
  ; }
  ; (%2 bytes are unsigned, %3 bytes are signed; the sums are signed words.)
  movdqa %1, %2
  pmaddubsw %1, %3
  phaddsw %1, %1

%if SUBSAMPLING == 1
  ; %1.w[0] = (%1.w[0] + %1.w[1] + 1) / 2;
  ; %1.w[1] = (%1.w[1] + %1.w[0] + 1) / 2;
  ; %1.w[2] = (%1.w[2] + %1.w[3] + 1) / 2;
  ; %1.w[3] = (%1.w[3] + %1.w[2] + 1) / 2;
  ; The sums are signed, but pavgw averages unsigned words, which is wrong
  ; when the two operands have different signs. Flip the sign bit (add a bias
  ; of 0x8000) before averaging and flip it back afterwards so that the
  ; average is computed correctly for every combination of signs.
  pcmpeqw xmm2, xmm2
  psllw xmm2, 15                  ; xmm2.w[i] = 0x8000
  pxor %1, xmm2                   ; signed -> biased unsigned
  pshuflw xmm2, %1, 10110001B     ; swap words 0<->1 and 2<->3
  pavgw %1, xmm2
  pcmpeqw xmm2, xmm2
  psllw xmm2, 15                  ; rebuild 0x8000 (xmm2 was reused above)
  pxor %1, xmm2                   ; biased unsigned -> signed
%endif

  ; %1.b[0] = ToByte((%1.w[0] + 128) / 256 + 128);
  ; %1.b[1] = ToByte((%1.w[2] + 128) / 256 + 128);
  pshuflw %1, %1, 10001000B
  paddw %1, XMM_CONST_128
  psraw %1, 8
  paddw %1, XMM_CONST_128
  packuswb %1, %1

%if SUBSAMPLING == 1 && LINE == 1
  ; %1.b[0] = (%1.b[0] + %4.b[0] + 1) / 2;
  ; %1.b[1] = (%1.b[1] + %4.b[1] + 1) / 2;
  movd xmm2, %4
  pavgb %1, xmm2
%endif
%endmacro

;
; extern "C" void ConvertARGBToYUVRow_SSSE3(const uint8* argb,
;                                           uint8* y,
;                                           uint8* u,
;                                           uint8* v,
;                                           ptrdiff_t width);
;
%define SYMBOL ConvertARGBToYUVRow_SSSE3
%define PIXELSIZE 4
%define SUBSAMPLING 0
%define LINE 0
%include "convert_rgb_to_yuv_ssse3.inc"

;
; Each stanza below re-defines SYMBOL, PIXELSIZE, SUBSAMPLING and LINE and
; then includes the shared body again to emit one more function.
;   PIXELSIZE   - bytes per source pixel (4: ARGB, 3: RGB), see READ_ARGB.
;   SUBSAMPLING - 1 enables the pair averaging in CALC_UV.
;   LINE        - with SUBSAMPLING == 1, 1 additionally averages with the
;                 values fetched by INIT_UV.
;

;
; extern "C" void ConvertRGBToYUVRow_SSSE3(const uint8* rgb,
;                                          uint8* y,
;                                          uint8* u,
;                                          uint8* v,
;                                          ptrdiff_t width);
;
%define SYMBOL ConvertRGBToYUVRow_SSSE3
%define PIXELSIZE 3
%define SUBSAMPLING 0
%define LINE 0
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertARGBToYUVEven_SSSE3(const uint8* argb,
;                                            uint8* y,
;                                            uint8* u,
;                                            uint8* v,
;                                            ptrdiff_t width);
;
%define SYMBOL ConvertARGBToYUVEven_SSSE3
%define PIXELSIZE 4
%define SUBSAMPLING 1
%define LINE 0
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertARGBToYUVOdd_SSSE3(const uint8* argb,
;                                           uint8* y,
;                                           uint8* u,
;                                           uint8* v,
;                                           ptrdiff_t width);
;
%define SYMBOL ConvertARGBToYUVOdd_SSSE3
%define PIXELSIZE 4
%define SUBSAMPLING 1
%define LINE 1
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertRGBToYUVEven_SSSE3(const uint8* rgb,
;                                           uint8* y,
;                                           uint8* u,
;                                           uint8* v,
;                                           ptrdiff_t width);
;
%define SYMBOL ConvertRGBToYUVEven_SSSE3
%define PIXELSIZE 3
%define SUBSAMPLING 1
%define LINE 0
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertRGBToYUVOdd_SSSE3(const uint8* rgb,
;                                          uint8* y,
;                                          uint8* u,
;                                          uint8* v,
;                                          ptrdiff_t width);
;
%define SYMBOL ConvertRGBToYUVOdd_SSSE3
%define PIXELSIZE 3
%define SUBSAMPLING 1
%define LINE 1
%include "convert_rgb_to_yuv_ssse3.inc"