; Copyright (c) 2011 The Chromium Authors. All rights reserved.
; Use of this source code is governed by a BSD-style license that can be
; found in the LICENSE file.

%include "media/base/simd/media_export.asm"
%include "third_party/x86inc/x86inc.asm"

;
; This file uses SSE, SSE2, SSE3, and SSSE3, which are supported by all ATOM
; processors.
;
  SECTION_TEXT
  CPU       SSE, SSE2, SSE3, SSSE3

;
; XMM registers representing constants. We must not use these registers as
; destination operands.
; for (int i = 0; i < 16; i += 4) {
;   xmm7.b[i] = 25;  xmm7.b[i+1] = 2;   xmm7.b[i+2] = 66;  xmm7.b[i+3] = 0;
;   xmm6.b[i] = 0;   xmm6.b[i+1] = 127; xmm6.b[i+2] = 0;   xmm6.b[i+3] = 0;
;   xmm5.b[i] = 112; xmm5.b[i+1] = -74; xmm5.b[i+2] = -38; xmm5.b[i+3] = 0;
;   xmm4.b[i] = -18; xmm4.b[i+1] = -94; xmm4.b[i+2] = 112; xmm4.b[i+3] = 0;
; }
; XMM_CONST_128 (xmm3) is added with paddw as the rounding term "+ 128" by the
; macros in this file, i.e. it is expected to hold 128 in every 16-bit word.
; NOTE(review): the code that loads these registers is not in this file
; (presumably the included .inc does it via LOAD_XMM) -- confirm.
;
%define XMM_CONST_Y0    xmm7
%define XMM_CONST_Y1    xmm6
%define XMM_CONST_U     xmm5
%define XMM_CONST_V     xmm4
%define XMM_CONST_128   xmm3

;
; LOAD_XMM %1 (xmm), %2 (imm32)
; Loads an immediate value to an XMM register.
;   %1.d[0] = %1.d[1] = %1.d[2] = %1.d[3] = %2;
; Clobbers the TEMP general-purpose register.
;
%macro LOAD_XMM 2
  mov       TEMPd, %2
  movd      %1, TEMPd
  pshufd    %1, %1, 00000000B     ; broadcast dword 0 to all four dwords
%endmacro

;
; UNPACKRGB %1 (xmm), %2 (imm8)
; Unpacks one RGB pixel in the specified XMM register: a zero byte is inserted
; at byte index %2 and every byte at or above that index moves up by one.
;   for (int i = 15; i > %2; --i) %1.b[i] = %1.b[i - 1];
;   %1.b[%2] = 0;
;   (bytes below index %2 are left unchanged.)
; Clobbers xmm1, so %1 must not be xmm1.
;
%macro UNPACKRGB 2
  movdqa    xmm1, %1
  psrldq    xmm1, %2              ; xmm1 = %1 with bytes [0, %2) cleared ...
  pslldq    xmm1, %2              ; ... and the remaining bytes back in place
  pxor      %1, xmm1              ; %1 keeps only bytes [0, %2)
  pslldq    xmm1, 1               ; open a one-byte gap at index %2
  por       %1, xmm1
%endmacro

;
; READ_ARGB %1 (xmm), %2 (imm)
; Read the specified number of ARGB (or RGB) pixels from the source and store
; them to the destination xmm register. If the input format is RGB, we read RGB
; pixels and convert them to ARGB pixels. (For this case, the alpha values of
; the output pixels become 0.)
;
%macro READ_ARGB 2
  ; Source address is ARGBq + WIDTHq * PIXELSIZE * 2.
  ; The RGB (PIXELSIZE == 3) paths clobber TEMP and xmm1, so %1 must not be
  ; xmm1 there.

%if PIXELSIZE == 4

  ; Read ARGB pixels from the source. (This macro assumes the input buffer may
  ; not be aligned to a 16-byte boundary.)
%if %2 == 1
  movd      %1, DWORD [ARGBq + WIDTHq * 4 * 2]
%elif %2 == 2
  movq      %1, QWORD [ARGBq + WIDTHq * 4 * 2]
%elif %2 == 4
  movdqu    %1, DQWORD [ARGBq + WIDTHq * 4 * 2]
%else
%error unsupported number of pixels.
%endif

%elif PIXELSIZE == 3

  ; Read RGB pixels from the source and convert them to ARGB pixels.
%if %2 == 1
  ; Read one RGB pixel and convert it to one ARGB pixel.
  ; Save the WIDTH register to xmm1. (This macro needs to break it.)
  MOVq      xmm1, WIDTHq

  ; Read three bytes from the source into TEMPd (third byte first, shifted to
  ; bits 16..23, then the low two bytes) and copy it to the destination xmm
  ; register. Bits 24..31 stay zero, which becomes the alpha byte.
  lea       WIDTHq, [WIDTHq + WIDTHq * 2]
  movzx     TEMPd, BYTE [ARGBq + WIDTHq * 2 + 2]
  shl       TEMPd, 16
  mov       TEMPw, WORD [ARGBq + WIDTHq * 2]
  movd      %1, TEMPd

  ; Restore the WIDTH register.
  MOVq      WIDTHq, xmm1
%elif %2 == 2
  ; Read two RGB pixels and convert them to two ARGB pixels.
  ; Read six bytes from the source to the destination xmm register. The six
  ; bytes must end up contiguous in bytes 0..5, so the trailing word goes to
  ; word index 2 (bytes 4..5); UNPACKRGB below relies on that layout.
  mov       TEMPq, WIDTHq
  lea       TEMPq, [TEMPq + TEMPq * 2]
  movd      %1, DWORD [ARGBq + TEMPq * 2]
  pinsrw    %1, WORD [ARGBq + TEMPq * 2 + 4], 2

  ; Fill the alpha values of these RGB pixels with 0 and convert them to two
  ; ARGB pixels.
  UNPACKRGB %1, 3
%elif %2 == 4
  ; Read four RGB pixels and convert them to four ARGB pixels.
  ; Read twelve bytes from the source to the destination xmm register:
  ; bytes 0..7 via movq, bytes 8..11 via xmm1 and shufps.
  mov       TEMPq, WIDTHq
  lea       TEMPq, [TEMPq + TEMPq * 2]
  movq      %1, QWORD [ARGBq + TEMPq * 2]
  movd      xmm1, DWORD [ARGBq + TEMPq * 2 + 8]
  shufps    %1, xmm1, 01000100B

  ; Fill the alpha values of these RGB pixels with 0 and convert them to four
  ; ARGB pixels.
  UNPACKRGB %1, 3
  UNPACKRGB %1, 4 + 3
  UNPACKRGB %1, 4 + 4 + 3
%else
%error unsupported number of pixels.
%endif

%else
%error unsupported PIXELSIZE value.
%endif

%endmacro

;
; CALC_Y %1 (xmm), %2 (xmm)
; Calculates four Y values from four ARGB pixels stored in %2.
;   %1.b[0] = ToByte((25 * B(0) + 129 * G(0) + 66 * R(0) + 128) / 256 + 16);
;   %1.b[1] = ToByte((25 * B(1) + 129 * G(1) + 66 * R(1) + 128) / 256 + 16);
;   %1.b[2] = ToByte((25 * B(2) + 129 * G(2) + 66 * R(2) + 128) / 256 + 16);
;   %1.b[3] = ToByte((25 * B(3) + 129 * G(3) + 66 * R(3) + 128) / 256 + 16);
; The G coefficient 129 is split as 2 + 127 across XMM_CONST_Y0 and
; XMM_CONST_Y1. Clobbers xmm2; %2 is only read.
;
%macro CALC_Y 2
  ; To avoid signed saturation, we divide this conversion formula into two
  ; formulae and store their results into two XMM registers %1 and xmm2.
  ;   %1.w[0]   = 25 * %2.b[0]  + 2 * %2.b[1]  + 66 * %2.b[2]  + 0 * %2.b[3];
  ;   %1.w[1]   = 25 * %2.b[4]  + 2 * %2.b[5]  + 66 * %2.b[6]  + 0 * %2.b[7];
  ;   %1.w[2]   = 25 * %2.b[8]  + 2 * %2.b[9]  + 66 * %2.b[10] + 0 * %2.b[11];
  ;   %1.w[3]   = 25 * %2.b[12] + 2 * %2.b[13] + 66 * %2.b[14] + 0 * %2.b[15];
  ;   xmm2.w[0] = 0 * %2.b[0]  + 127 * %2.b[1]  + 0 * %2.b[2]  + 0 * %2.b[3];
  ;   xmm2.w[1] = 0 * %2.b[4]  + 127 * %2.b[5]  + 0 * %2.b[6]  + 0 * %2.b[7];
  ;   xmm2.w[2] = 0 * %2.b[8]  + 127 * %2.b[9]  + 0 * %2.b[10] + 0 * %2.b[11];
  ;   xmm2.w[3] = 0 * %2.b[12] + 127 * %2.b[13] + 0 * %2.b[14] + 0 * %2.b[15];
  movdqa    %1, %2
  pmaddubsw %1, XMM_CONST_Y0
  phaddsw   %1, %1
  movdqa    xmm2, %2
  pmaddubsw xmm2, XMM_CONST_Y1
  phaddsw   xmm2, xmm2

  ; %1.b[0] = ToByte((%1.w[0] + xmm2.w[0] + 128) / 256 + 16);
  ; %1.b[1] = ToByte((%1.w[1] + xmm2.w[1] + 128) / 256 + 16);
  ; %1.b[2] = ToByte((%1.w[2] + xmm2.w[2] + 128) / 256 + 16);
  ; %1.b[3] = ToByte((%1.w[3] + xmm2.w[3] + 128) / 256 + 16);
  paddw     %1, xmm2
  movdqa    xmm2, XMM_CONST_128
  paddw     %1, xmm2
  psrlw     %1, 8
  psrlw     xmm2, 3               ; 128 >> 3 == 16, the Y offset
  paddw     %1, xmm2
  packuswb  %1, %1
%endmacro

;
; INIT_UV %1 (r32), %2 (reg) %3 (imm)
; When SUBSAMPLING == 1 && LINE == 1, loads the U (or V) value(s) currently
; stored at [%2 + WIDTHq] into %1, zero-extended: one byte when %3 is 1 or 2,
; two bytes when %3 is 4. For every other configuration this macro expands to
; nothing. The loaded value is what CALC_UV averages with as its %4 operand.
; NOTE(review): presumably these are the values written for the preceding
; line -- confirm against the including .inc file.
;
%macro INIT_UV 3

%if SUBSAMPLING == 1 && LINE == 1
%if %3 == 1 || %3 == 2
  movzx     %1, BYTE [%2 + WIDTHq]
%elif %3 == 4
  movzx     %1, WORD [%2 + WIDTHq]
%else
%error unsupported number of pixels.
%endif
%endif

%endmacro

;
; CALC_UV %1 (xmm), %2 (xmm), %3 (xmm), %4 (r32)
; Calculates two U (or V) values from four ARGB pixels stored in %2.
; Let C(i) be the weighted sum of pixel i using the coefficients in %3:
;   %3 == XMM_CONST_U: C(i) = 112 * B(i) - 74 * G(i) - 38 * R(i)
;   %3 == XMM_CONST_V: C(i) = -18 * B(i) - 94 * G(i) + 112 * R(i)
; Then
;   if (SUBSAMPLING) {
;     %1.b[0] = ToByte(((C(0) + C(1) + 1) / 2 + 128) / 256 + 128);
;     %1.b[1] = ToByte(((C(2) + C(3) + 1) / 2 + 128) / 256 + 128);
;   } else {
;     %1.b[0] = ToByte((C(0) + 128) / 256 + 128);
;     %1.b[1] = ToByte((C(2) + 128) / 256 + 128);
;   }
;   if (SUBSAMPLING && LINE == 1) {
;     %1.b[0] = (%1.b[0] + %4.b[0] + 1) / 2;
;     %1.b[1] = (%1.b[1] + %4.b[1] + 1) / 2;
;   }
; Clobbers xmm2 when SUBSAMPLING == 1; %2 and %3 are only read.
;
%macro CALC_UV 4
  ; for (int i = 0; i < 4; ++i) {
  ;   %1.w[i] = 0;
  ;   for (int j = 0; j < 4; ++j)
  ;     %1.w[i] += %3.b[i * 4 + j] * %2.b[i * 4 + j];
  ; }
  movdqa    %1, %2
  pmaddubsw %1, %3
  phaddsw   %1, %1

%if SUBSAMPLING == 1
  ; %1.w[0] = (%1.w[0] + %1.w[1] + 1) / 2;
  ; %1.w[1] = (%1.w[1] + %1.w[0] + 1) / 2;
  ; %1.w[2] = (%1.w[2] + %1.w[3] + 1) / 2;
  ; %1.w[3] = (%1.w[3] + %1.w[2] + 1) / 2;
  ; pavgw averages UNSIGNED words, but the sums in %1 are signed (the U/V
  ; coefficients are negative for some channels). Averaging a negative and a
  ; positive word directly would yield a value near -32768. Flip the sign bit
  ; (0x8000 == 128 << 8) before and after so that the unsigned average equals
  ; the signed one.
  movdqa    xmm2, XMM_CONST_128
  psllw     xmm2, 8               ; xmm2.w[i] = 0x8000
  pxor      %1, xmm2              ; signed -> offset binary
  pshuflw   xmm2, %1, 10110001B   ; swap words 0<->1 and 2<->3
  pavgw     %1, xmm2
  movdqa    xmm2, XMM_CONST_128
  psllw     xmm2, 8
  pxor      %1, xmm2              ; offset binary -> signed
%endif

  ; %1.b[0] = ToByte((%1.w[0] + 128) / 256 + 128);
  ; %1.b[1] = ToByte((%1.w[2] + 128) / 256 + 128);
  pshuflw   %1, %1, 10001000B     ; gather words 0 and 2 into words 0 and 1
  paddw     %1, XMM_CONST_128
  psraw     %1, 8                 ; arithmetic shift: the sums are signed
  paddw     %1, XMM_CONST_128
  packuswb  %1, %1

%if SUBSAMPLING == 1 && LINE == 1
  ; %1.b[0] = (%1.b[0] + %4.b[0] + 1) / 2;
  ; %1.b[1] = (%1.b[1] + %4.b[1] + 1) / 2;
  movd      xmm2, %4
  pavgb     %1, xmm2
%endif
%endmacro

;
; extern "C" void
;     ConvertARGBToYUVRow_SSSE3(const uint8* argb,
;                               uint8* y,
;                               uint8* u,
;                               uint8* v,
;                               ptrdiff_t width);
;
; Each section below sets the four parameters consumed by the shared
; implementation file and then includes it once:
;   PIXELSIZE    bytes per source pixel (4 = ARGB, 3 = RGB; see READ_ARGB)
;   SUBSAMPLING  1 makes CALC_UV average horizontally adjacent pixels
;   LINE         with SUBSAMPLING == 1, LINE == 1 makes INIT_UV/CALC_UV also
;                average with the U/V bytes already stored in the output
;   SYMBOL       name of the function being generated
;
; 4-byte pixels, no U/V averaging.
;
%define PIXELSIZE     4
%define SUBSAMPLING   0
%define LINE          0
%define SYMBOL        ConvertARGBToYUVRow_SSSE3
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertRGBToYUVRow_SSSE3(const uint8* rgb,
;                                          uint8* y,
;                                          uint8* u,
;                                          uint8* v,
;                                          ptrdiff_t width);
;
; 3-byte pixels, no U/V averaging.
;
%define PIXELSIZE     3
%define SUBSAMPLING   0
%define LINE          0
%define SYMBOL        ConvertRGBToYUVRow_SSSE3
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertARGBToYUVEven_SSSE3(const uint8* argb,
;                                            uint8* y,
;                                            uint8* u,
;                                            uint8* v,
;                                            ptrdiff_t width);
;
; 4-byte pixels, horizontal U/V averaging only (LINE == 0).
;
%define PIXELSIZE     4
%define SUBSAMPLING   1
%define LINE          0
%define SYMBOL        ConvertARGBToYUVEven_SSSE3
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertARGBToYUVOdd_SSSE3(const uint8* argb,
;                                           uint8* y,
;                                           uint8* u,
;                                           uint8* v,
;                                           ptrdiff_t width);
;
; 4-byte pixels, horizontal U/V averaging plus averaging with the stored
; U/V bytes (LINE == 1).
;
%define PIXELSIZE     4
%define SUBSAMPLING   1
%define LINE          1
%define SYMBOL        ConvertARGBToYUVOdd_SSSE3
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertRGBToYUVEven_SSSE3(const uint8* rgb,
;                                           uint8* y,
;                                           uint8* u,
;                                           uint8* v,
;                                           ptrdiff_t width);
;
; 3-byte pixels, horizontal U/V averaging only (LINE == 0).
;
%define PIXELSIZE     3
%define SUBSAMPLING   1
%define LINE          0
%define SYMBOL        ConvertRGBToYUVEven_SSSE3
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertRGBToYUVOdd_SSSE3(const uint8* rgb,
;                                          uint8* y,
;                                          uint8* u,
;                                          uint8* v,
;                                          ptrdiff_t width);
;
; 3-byte pixels, horizontal U/V averaging plus averaging with the stored
; U/V bytes (LINE == 1).
;
%define PIXELSIZE     3
%define SUBSAMPLING   1
%define LINE          1
%define SYMBOL        ConvertRGBToYUVOdd_SSSE3
%include "convert_rgb_to_yuv_ssse3.inc"