15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); Copyright (c) 2011 The Chromium Authors. All rights reserved.
25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); Use of this source code is governed by a BSD-style license that can be
35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
590dce4d38c5ff5333bea97d859d4e484e27edf0cTorne (Richard Coles)%include "media/base/simd/media_export.asm"
62a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)%include "third_party/x86inc/x86inc.asm"
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); This file uses SSE, SSE2, SSE3, and SSSE3, which are supported by all ATOM
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); processors.
; The CPU directive below names each of those four instruction sets once;
; movdqa, pshufd and the other 128-bit integer instructions used here are SSE2.
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  SECTION_TEXT
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  CPU       SSE, SSE2, SSE3, SSSE3
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); XMM registers representing constants. We must not use these registers as
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); destination operands.
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); for (int i = 0; i < 16; i += 4) {
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);   xmm7.b[i] = 25;  xmm7.b[i+1] = 2;   xmm7.b[i+2] = 66;  xmm7.b[i+3] = 0;
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);   xmm6.b[i] = 0;   xmm6.b[i+1] = 127; xmm6.b[i+2] = 0;   xmm6.b[i+3] = 0;
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);   xmm5.b[i] = 112; xmm5.b[i+1] = -74; xmm5.b[i+2] = -38; xmm5.b[i+3] = 0;
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);   xmm4.b[i] = -18; xmm4.b[i+1] = -94; xmm4.b[i+2] = 112; xmm4.b[i+3] = 0;
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); }
; The byte order within each group of four matches the B, G, R, A byte order
; that CALC_Y and CALC_UV assume for the pixels they multiply against.
; The G weight of Y (129) is split as 2 (xmm7) + 127 (xmm6) because a signed
; byte cannot hold 129; CALC_Y adds the two partial products back together.
; XMM_CONST_128 (xmm3) is not described above. CALC_Y and CALC_UV add it to
; 16-bit lanes as the "+ 128" rounding term, so it presumably holds 128 in
; every word -- TODO confirm against the code that loads it (not visible here).
; xmm1 and xmm2 are used as scratch registers by the macros below.
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define XMM_CONST_Y0    xmm7
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define XMM_CONST_Y1    xmm6
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define XMM_CONST_U     xmm5
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define XMM_CONST_V     xmm4
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define XMM_CONST_128   xmm3
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); LOAD_XMM %1 (xmm), %2 (imm32)
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); Loads an immediate value to an XMM register.
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);   %1.d[0] = %1.d[1] =  %1.d[2] =  %1.d[3] = %2;
; Clobbers TEMPd, which is used to stage the immediate.
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%macro LOAD_XMM 2
  ; There is no instruction that moves an immediate straight into an XMM
  ; register, so go through a general-purpose register.
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  mov       TEMPd, %2
  ; movd writes dword 0 of %1 and zeroes the upper dwords.
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  movd      %1, TEMPd
  ; Selector 0 for every lane: broadcast dword 0 to all four dwords.
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pshufd    %1, %1, 00000000B
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%endmacro
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); UNPACKRGB %1 (xmm), %2 (imm8)
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); Unpacks one RGB pixel in the specified XMM register.
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);   for (int i = 15; i > %2; --i) %1.b[i] = %1.b[i - 1];
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);   %1.b[%2] = 0;
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);   (Bytes %1.b[0] .. %1.b[%2 - 1] are left unchanged.)
; In other words, a zero byte is inserted at byte index %2 and everything from
; that index upwards moves up by one byte; the old %1.b[15] is shifted out.
; Clobbers xmm1.
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%macro UNPACKRGB 2
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  movdqa    xmm1, %1
  ; Shift right then left by %2 bytes: xmm1 keeps bytes %2..15 of %1 in place
  ; and has zeroes in bytes 0..%2-1.
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  psrldq    xmm1, %2
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pslldq    xmm1, %2
  ; XOR with the identical upper part clears it: %1 now holds only bytes
  ; 0..%2-1.
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pxor      %1, xmm1
  ; Move the upper part up by one byte (leaving byte %2 zero) and merge it back.
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pslldq    xmm1, 1
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  por       %1, xmm1
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%endmacro
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); READ_ARGB %1 (xmm), %2 (imm)
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); Read the specified number of ARGB (or RGB) pixels from the source and store
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); them to the destination xmm register. If the input format is RGB, we read RGB
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); pixels and convert them to ARGB pixels. (For this case, the alpha values of
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); the output pixels become 0.)
;   %1: destination xmm register.
;   %2: number of pixels to read; must be 1, 2 or 4 (anything else is an
;       assembly-time %error).
; The source address is ARGBq + WIDTHq * PIXELSIZE * 2. PIXELSIZE must be
; defined as 4 (ARGB) or 3 (RGB) before this macro is expanded.
; The PIXELSIZE == 4 paths touch only %1. The PIXELSIZE == 3 paths also clobber
; TEMP and xmm1. No path reads more source bytes than %2 * PIXELSIZE.
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%macro READ_ARGB 2
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%if PIXELSIZE == 4
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; Read ARGB pixels from the source. (This macro assumes the input buffer may
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; not be aligned to a 16-byte boundary.)
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%if %2 == 1
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  movd      %1, DWORD [ARGBq + WIDTHq * 4 * 2]
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%elif %2 == 2
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  movq      %1, QWORD [ARGBq + WIDTHq * 4 * 2]
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%elif %2 == 4
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  movdqu    %1, DQWORD [ARGBq + WIDTHq * 4 * 2]
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%else
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%error unsupported number of pixels.
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%endif
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%elif PIXELSIZE == 3
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; Read RGB pixels from the source and convert them to ARGB pixels.
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%if %2 == 1
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; Read one RGB pixel and convert it to one ARGB pixel.
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; Save the WIDTH register to xmm1. (This macro needs to break it.)
  ; TEMP cannot hold the scaled index here because it receives the pixel.
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  MOVq      xmm1, WIDTHq
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; Read three bytes from the source into TEMPd (one byte for bits 16-23, then
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; one word for bits 0-15), and copy TEMPd to the destination xmm register.
  ; WIDTHq * 3 here, times the * 2 in the addresses, gives WIDTHq * 6 bytes.
  ; movzx leaves bits 24-31 zero, which becomes the zero alpha byte.
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  lea       WIDTHq, [WIDTHq + WIDTHq * 2]
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  movzx     TEMPd, BYTE [ARGBq + WIDTHq * 2 + 2]
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  shl       TEMPd, 16
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  mov       TEMPw, WORD [ARGBq + WIDTHq * 2]
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  movd      %1, TEMPd
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; Restore the WIDTH register.
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  MOVq      WIDTHq, xmm1
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%elif %2 == 2
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; Read two RGB pixels and convert them to two ARGB pixels.
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; Read six bytes from the source to the destination xmm register.
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  mov       TEMPq, WIDTHq
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  lea       TEMPq, [TEMPq + TEMPq * 2]
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  movd      %1, DWORD [ARGBq + TEMPq * 2]
  ; movd filled words 0-1 (source bytes 0-3). Source bytes 4-5 must go into
  ; word 2 (register bytes 4-5) so that the six bytes are contiguous before
  ; UNPACKRGB inserts the alpha byte; word 3 would leave a two-byte hole.
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pinsrw    %1, WORD [ARGBq + TEMPq * 2 + 4], 2
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; Fill the alpha values of these RGB pixels with 0 and convert them to two
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; ARGB pixels.
  ; One insertion is enough: the second pixel's alpha (byte 7) is already 0.
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UNPACKRGB %1, 3
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%elif %2 == 4
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; Read four RGB pixels and convert them to four ARGB pixels.
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; Read twelve bytes from the source to the destination xmm register.
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  mov       TEMPq, WIDTHq
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  lea       TEMPq, [TEMPq + TEMPq * 2]
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  movq      %1, QWORD [ARGBq + TEMPq * 2]
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  movd      xmm1, DWORD [ARGBq + TEMPq * 2 + 8]
  ; Result dwords: %1.d[0], %1.d[1], xmm1.d[0], xmm1.d[1] -- i.e. the twelve
  ; source bytes followed by four zero bytes.
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  shufps    %1, xmm1, 01000100B
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; Fill the alpha values of these RGB pixels with 0 and convert them to four
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; ARGB pixels.
  ; Each insertion shifts the remaining pixels up by one byte, so the alpha
  ; slots are at byte 3, 7 and 11; byte 15 is already 0.
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UNPACKRGB %1, 3
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UNPACKRGB %1, 4 + 3
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  UNPACKRGB %1, 4 + 4 + 3
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%else
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%error unsupported number of pixels.
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%endif
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%else
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%error unsupported PIXELSIZE value.
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%endif
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%endmacro
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); CALC_Y %1 (xmm), %2 (xmm)
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); Calculates four Y values from four ARGB pixels stored in %2.
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);   %1.b[0] = ToByte((25 * B(0) + 129 * G(0) + 66 * R(0) + 128) / 256 + 16);
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);   %1.b[1] = ToByte((25 * B(1) + 129 * G(1) + 66 * R(1) + 128) / 256 + 16);
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);   %1.b[2] = ToByte((25 * B(2) + 129 * G(2) + 66 * R(2) + 128) / 256 + 16);
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);   %1.b[3] = ToByte((25 * B(3) + 129 * G(3) + 66 * R(3) + 128) / 256 + 16);
; %2 is only read. Clobbers xmm2. %1 and %2 must be different registers from
; each other and from xmm2, because %2 is re-read after %1 and xmm2 are written.
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%macro CALC_Y 2
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; To avoid signed saturation, we divide this conversion formula into two
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; formulae and store their results into two XMM registers %1 and xmm2.
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; %1.w[0]   = 25  * %2.b[0]  + 2   * %2.b[1]  + 66  * %2.b[2]  + 0 * %2.b[3];
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; %1.w[1]   = 25  * %2.b[4]  + 2   * %2.b[5]  + 66  * %2.b[6]  + 0 * %2.b[7];
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; %1.w[2]   = 25  * %2.b[8]  + 2   * %2.b[9]  + 66  * %2.b[10] + 0 * %2.b[11];
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; %1.w[3]   = 25  * %2.b[12] + 2   * %2.b[13] + 66  * %2.b[14] + 0 * %2.b[15];
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; xmm2.w[0] = 0   * %2.b[0]  + 127 * %2.b[1]  + 0   * %2.b[2]  + 0 * %2.b[3];
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; xmm2.w[1] = 0   * %2.b[4]  + 127 * %2.b[5]  + 0   * %2.b[6]  + 0 * %2.b[7];
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; xmm2.w[2] = 0   * %2.b[8]  + 127 * %2.b[9]  + 0   * %2.b[10] + 0 * %2.b[11];
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; xmm2.w[3] = 0   * %2.b[12] + 127 * %2.b[13] + 0   * %2.b[14] + 0 * %2.b[15];
  ; pmaddubsw yields two partial sums per pixel (bytes 0-1 and bytes 2-3);
  ; phaddsw with itself folds each pair into one word per pixel, duplicated in
  ; words 4-7.
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  movdqa    %1, %2
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pmaddubsw %1, XMM_CONST_Y0
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  phaddsw   %1, %1
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  movdqa    xmm2, %2
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pmaddubsw xmm2, XMM_CONST_Y1
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  phaddsw   xmm2, xmm2
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; %1.b[0] = ToByte((%1.w[0] + xmm2.w[0] + 128) / 256 + 16);
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; %1.b[1] = ToByte((%1.w[1] + xmm2.w[1] + 128) / 256 + 16);
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; %1.b[2] = ToByte((%1.w[2] + xmm2.w[2] + 128) / 256 + 16);
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; %1.b[3] = ToByte((%1.w[3] + xmm2.w[3] + 128) / 256 + 16);
  ; The combined sum can exceed 32767, so it is treated as unsigned from here:
  ; wrapping paddw followed by a logical (psrlw) shift.
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  paddw     %1, xmm2
  ; Work on a copy of the constant because XMM_CONST_128 must not be written.
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  movdqa    xmm2, XMM_CONST_128
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  paddw     %1, xmm2
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  psrlw     %1, 8
  ; The copy shifted right by 3 supplies the "+ 16" offset (128 >> 3 == 16,
  ; assuming XMM_CONST_128 holds 128 in every word -- TODO confirm).
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  psrlw     xmm2, 3
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  paddw     %1, xmm2
  ; Narrow the words to unsigned-saturated bytes; the four Y values end up in
  ; %1.b[0..3].
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  packuswb  %1, %1
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%endmacro
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); INIT_UV %1 (r32), %2 (reg) %3 (imm)
; Loads the U (or V) bytes currently stored at [%2 + WIDTHq] into %1,
; zero-extended, so that CALC_UV can average them with the newly computed
; values (presumably %1 is then passed to CALC_UV as its %4 -- confirm against
; the caller).
;   %1: destination 32-bit register.
;   %2: base register of the U or V plane.
;   %3: number of pixels being processed: 1 or 2 loads one byte, 4 loads two
;       bytes; anything else is an assembly-time %error.
; Expands to nothing unless SUBSAMPLING == 1 && LINE == 1.
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%macro INIT_UV 3
1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%if SUBSAMPLING == 1 && LINE == 1
1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%if %3 == 1 || %3 == 2
1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  movzx     %1, BYTE [%2 + WIDTHq]
1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%elif %3 == 4
1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  movzx     %1, WORD [%2 + WIDTHq]
1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%else
1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%error unsupported number of pixels.
1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%endif
1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%endif
1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%endmacro
1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); CALC_UV %1 (xmm), %2 (xmm), %3 (xmm), %4 (r32)
1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); Calculates two U (or V) values from four ARGB pixels stored in %2.
1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); if %3 == XMM_CONST_U
1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); if (SUBSAMPLING) {
1955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);   With U(i) = 112 * B(i) - 74 * G(i) - 38 * R(i) and
1965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);   Avg(a, b) = (a + b + 1) / 2:
1975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);   %1.b[0] = ToByte((Avg(U(0), U(1)) + 128) / 256 + 128);
1985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);   %1.b[1] = ToByte((Avg(U(2), U(3)) + 128) / 256 + 128);
1995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); } else {
2005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);   %1.b[0] = ToByte((112 * B(0) - 74 * G(0) - 38 * R(0) + 128) / 256 + 128);
2015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);   %1.b[1] = ToByte((112 * B(2) - 74 * G(2) - 38 * R(2) + 128) / 256 + 128);
2025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); }
2035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); if %3 == XMM_CONST_V
2045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);   %1.b[0] = ToByte((-18 * B(0) - 94 * G(0) + 112 * R(0) + 128) / 256 + 128);
2055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);   %1.b[1] = ToByte((-18 * B(2) - 94 * G(2) + 112 * R(2) + 128) / 256 + 128);
; (The V case is shown without subsampling; with SUBSAMPLING it averages
; pixel pairs exactly as the U case does.)
; If SUBSAMPLING == 1 && LINE == 1, each result byte is additionally averaged
; (rounding up) with the corresponding byte of %4.
;   %1: destination xmm register; the two results are in %1.b[0] and %1.b[1].
;   %2: four ARGB pixels (only read).
;   %3: coefficient register, XMM_CONST_U or XMM_CONST_V (only read).
;   %4: 32-bit register with the bytes to blend in; used only when
;       SUBSAMPLING == 1 && LINE == 1.
; Clobbers xmm2 when SUBSAMPLING == 1.
2065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
2075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%macro CALC_UV 4
2085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; for (int i = 0; i < 4; ++i) {
2095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ;   %1.w[i] = 0;
2105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ;   for (int j = 0; j < 4; ++j)
2115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ;     %1.w[i] += %3.b[i * 4 + j] * %2.b[i * 4 + j];
2125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; }
  ; (%2 bytes are taken as unsigned, %3 bytes as signed; the sums are signed
  ; words.)
2135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  movdqa    %1, %2
2145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pmaddubsw %1, %3
2155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  phaddsw   %1, %1
2165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%if SUBSAMPLING == 1
2185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; %1.w[0] = (%1.w[0] + %1.w[1] + 1) / 2;
2195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; %1.w[1] = (%1.w[1] + %1.w[0] + 1) / 2;
2205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; %1.w[2] = (%1.w[2] + %1.w[3] + 1) / 2;
2215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; %1.w[3] = (%1.w[3] + %1.w[2] + 1) / 2;
  ; pshuflw swaps the words of each pair so pavgw averages horizontal
  ; neighbours.
  ; NOTE(review): pavgw is an unsigned average, but the sums above are signed
  ; (the U/V coefficients include negative values). When the two pixels of a
  ; pair have sums of opposite sign the unsigned average looks wrong -- confirm
  ; against the reference C implementation.
2225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pshuflw   xmm2, %1, 10110001B
2235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pavgw     %1, xmm2
2245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%endif
2255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; %1.b[0] = ToByte((%1.w[0] + 128) / 256 + 128);
2275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; %1.b[1] = ToByte((%1.w[2] + 128) / 256 + 128);
  ; pshuflw gathers words 0 and 2 into words 0 and 1. The shift is arithmetic
  ; (psraw) because the sums are signed; the second add of XMM_CONST_128 is the
  ; "+ 128" bias (assuming the register holds 128 per word -- TODO confirm).
2285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pshuflw   %1, %1, 10001000B
2295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  paddw     %1, XMM_CONST_128
2305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  psraw     %1, 8
2315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  paddw     %1, XMM_CONST_128
2325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  packuswb  %1, %1
2335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%if SUBSAMPLING == 1 && LINE == 1
2355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; %1.b[0] = (%1.b[0] + %4.b[0] + 1) / 2;
2365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ; %1.b[1] = (%1.b[1] + %4.b[1] + 1) / 2;
2375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  movd      xmm2, %4
2385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  pavgb     %1, xmm2
2395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%endif
2405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%endmacro
2415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
2435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); extern "C" void ConvertARGBToYUVRow_SSSE3(const uint8* argb,
2445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);                                           uint8* y,
2455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);                                           uint8* u,
2465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);                                           uint8* v,
2472a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles);                                           ptrdiff_t width);
; Configuration: 4-byte ARGB input (READ_ARGB takes its PIXELSIZE == 4 path)
; and no chroma averaging (SUBSAMPLING == 0: CALC_UV skips its pavgw/pavgb
; steps and INIT_UV expands to nothing). The included file presumably emits
; the function body under the name SYMBOL -- it is not visible here.
2485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
2495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define SYMBOL          ConvertARGBToYUVRow_SSSE3
2505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define PIXELSIZE       4
2515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define SUBSAMPLING     0
2525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define LINE            0
2535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%include "convert_rgb_to_yuv_ssse3.inc"
2545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
2565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); extern "C" void ConvertRGBToYUVRow_SSSE3(const uint8* rgb,
2575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);                                          uint8* y,
2585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);                                          uint8* u,
2595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);                                          uint8* v,
2602a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles);                                          ptrdiff_t width);
; Configuration: 3-byte RGB input (READ_ARGB takes its PIXELSIZE == 3 path and
; inserts zero alpha bytes) and no chroma averaging (SUBSAMPLING == 0).
2615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
2625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define SYMBOL          ConvertRGBToYUVRow_SSSE3
2635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define PIXELSIZE       3
2645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define SUBSAMPLING     0
2655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define LINE            0
2665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%include "convert_rgb_to_yuv_ssse3.inc"
2675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
2695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); extern "C" void ConvertARGBToYUVEven_SSSE3(const uint8* argb,
2705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);                                            uint8* y,
2715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);                                            uint8* u,
2725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);                                            uint8* v,
2732a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles);                                            ptrdiff_t width);
; Configuration: 4-byte ARGB input with horizontal chroma averaging
; (SUBSAMPLING == 1: CALC_UV averages each pair of adjacent pixels). LINE == 0,
; so INIT_UV expands to nothing and CALC_UV does not blend with %4.
2745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
2755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define SYMBOL          ConvertARGBToYUVEven_SSSE3
2765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define PIXELSIZE       4
2775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define SUBSAMPLING     1
2785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define LINE            0
2795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%include "convert_rgb_to_yuv_ssse3.inc"
2805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
2825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); extern "C" void ConvertARGBToYUVOdd_SSSE3(const uint8* argb,
2835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);                                           uint8* y,
2845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);                                           uint8* u,
2855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);                                           uint8* v,
2862a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles);                                           ptrdiff_t width);
; Configuration: 4-byte ARGB input with horizontal chroma averaging
; (SUBSAMPLING == 1). LINE == 1, so INIT_UV loads the bytes already in the U/V
; buffers and CALC_UV averages the new values with them (pavgb).
2875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
2885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define SYMBOL          ConvertARGBToYUVOdd_SSSE3
2895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define PIXELSIZE       4
2905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define SUBSAMPLING     1
2915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define LINE            1
2925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%include "convert_rgb_to_yuv_ssse3.inc"
2935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
2945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
2955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); extern "C" void ConvertRGBToYUVEven_SSSE3(const uint8* rgb,
2965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);                                           uint8* y,
2975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);                                           uint8* u,
2985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);                                           uint8* v,
2992a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles);                                           ptrdiff_t width);
; Configuration: 3-byte RGB input with horizontal chroma averaging
; (SUBSAMPLING == 1). LINE == 0, so INIT_UV expands to nothing and CALC_UV
; does not blend with %4.
3005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
3015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define SYMBOL          ConvertRGBToYUVEven_SSSE3
3025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define PIXELSIZE       3
3035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define SUBSAMPLING     1
3045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define LINE            0
3055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%include "convert_rgb_to_yuv_ssse3.inc"
3065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
3075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
3085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles); extern "C" void ConvertRGBToYUVOdd_SSSE3(const uint8* rgb,
3095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);                                          uint8* y,
3105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);                                          uint8* u,
3115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);                                          uint8* v,
3122a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles);                                          ptrdiff_t width);
; Configuration: 3-byte RGB input with horizontal chroma averaging
; (SUBSAMPLING == 1). LINE == 1, so INIT_UV loads the bytes already in the U/V
; buffers and CALC_UV averages the new values with them (pavgb).
3135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles);
3145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define SYMBOL          ConvertRGBToYUVOdd_SSSE3
3155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define PIXELSIZE       3
3165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define SUBSAMPLING     1
3175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%define LINE            1
3185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)%include "convert_rgb_to_yuv_ssse3.inc"
319