; convert_rgb_to_yuv_ssse3.asm revision 2a99a7e74a7f215066514fe81d2bfa6639d9eddd
; Copyright (c) 2011 The Chromium Authors. All rights reserved.
; Use of this source code is governed by a BSD-style license that can be
; found in the LICENSE file.

%include "third_party/x86inc/x86inc.asm"

;
; This file uses SSE, SSE2, SSE3, and SSSE3, which are supported by all ATOM
; processors.
;
  SECTION_TEXT
  CPU           SSE, SSE2, SSE3, SSSE3

;
; XMM registers representing constants. We must not use these registers as
; destination operands.
; for (int i = 0; i < 16; i += 4) {
;   xmm7.b[i] = 25;  xmm7.b[i+1] = 2;   xmm7.b[i+2] = 66;  xmm7.b[i+3] = 0;
;   xmm6.b[i] = 0;   xmm6.b[i+1] = 127; xmm6.b[i+2] = 0;   xmm6.b[i+3] = 0;
;   xmm5.b[i] = 112; xmm5.b[i+1] = -74; xmm5.b[i+2] = -38; xmm5.b[i+3] = 0;
;   xmm4.b[i] = -18; xmm4.b[i+1] = -94; xmm4.b[i+2] = 112; xmm4.b[i+3] = 0;
; }
; The macros below add XMM_CONST_128 with paddw to implement "+ 128", i.e.
; they expect it to hold 128 in every 16-bit lane. (It is loaded outside this
; file.)
;
%define XMM_CONST_Y0    xmm7
%define XMM_CONST_Y1    xmm6
%define XMM_CONST_U     xmm5
%define XMM_CONST_V     xmm4
%define XMM_CONST_128   xmm3

;
; LOAD_XMM %1 (xmm), %2 (imm32)
; Loads an immediate value to an XMM register.
;   %1.d[0] = %1.d[1] = %1.d[2] = %1.d[3] = %2;
; Clobbers: TEMPd.
;
%macro LOAD_XMM 2
  mov           TEMPd, %2
  movd          %1, TEMPd
  pshufd        %1, %1, 00000000B       ; broadcast dword 0 to all four lanes
%endmacro

;
; UNPACKRGB %1 (xmm), %2 (imm8)
; Unpacks one RGB pixel in the specified XMM register by inserting a zero
; byte at position %2 and moving every byte at or above %2 up by one.
;   for (int i = 15; i > %2; --i) %1.b[i] = %1.b[i - 1];
;   %1.b[%2] = 0;
;   for (int i = %2 - 1; i >= 0; --i) %1.b[i] = %1.b[i];
; Clobbers: xmm1.
;
%macro UNPACKRGB 2
  movdqa        xmm1, %1
  psrldq        xmm1, %2
  pslldq        xmm1, %2                ; xmm1 = %1 with bytes [0, %2) cleared
  pxor          %1, xmm1                ; %1 keeps only bytes [0, %2)
  pslldq        xmm1, 1                 ; open a one-byte gap at position %2
  por           %1, xmm1
%endmacro

;
; READ_ARGB %1 (xmm), %2 (imm)
; Read the specified number of ARGB (or RGB) pixels from the source and store
; them to the destination xmm register. If the input format is RGB, we read RGB
; pixels and convert them to ARGB pixels. (For this case, the alpha values of
; the output pixels become 0.)
; %2 must be 1, 2, or 4; any other value is an assembly-time error.
; Clobbers: TEMP (RGB input only) and xmm1 (RGB input only).
;
%macro READ_ARGB 2

%if PIXELSIZE == 4

  ; Read ARGB pixels from the source. (This macro assumes the input buffer may
  ; not be aligned to a 16-byte boundary.)
%if %2 == 1
  movd          %1, DWORD [ARGBq + WIDTHq * 4 * 2]
%elif %2 == 2
  movq          %1, QWORD [ARGBq + WIDTHq * 4 * 2]
%elif %2 == 4
  movdqu        %1, DQWORD [ARGBq + WIDTHq * 4 * 2]
%else
%error unsupported number of pixels.
%endif

%elif PIXELSIZE == 3

  ; Read RGB pixels from the source and convert them to ARGB pixels.
%if %2 == 1
  ; Read one RGB pixel and convert it to one ARGB pixel.
  ; Save the WIDTH register to xmm1. (This macro needs to break it because
  ; TEMP is used to assemble the pixel.)
  MOVq          xmm1, WIDTHq

  ; First read three bytes from the source into TEMPd (the third byte goes to
  ; bits 16-23, then the first two bytes to bits 0-15), and copy the result
  ; to the destination xmm register.
  lea           WIDTHq, [WIDTHq + WIDTHq * 2]
  movzx         TEMPd, BYTE [ARGBq + WIDTHq * 2 + 2]
  shl           TEMPd, 16
  mov           TEMPw, WORD [ARGBq + WIDTHq * 2]
  movd          %1, TEMPd

  ; Restore the WIDTH register.
  MOVq          WIDTHq, xmm1
%elif %2 == 2
  ; Read two RGB pixels and convert them to two ARGB pixels.
  ; Read six bytes from the source to the destination xmm register.
  mov           TEMPq, WIDTHq
  lea           TEMPq, [TEMPq + TEMPq * 2]
  movd          %1, DWORD [ARGBq + TEMPq * 2]
  ; Source bytes 4-5 must land in bytes 4-5 of %1, i.e. word lane 2, so that
  ; the six RGB bytes are contiguous before UNPACKRGB spreads them out.
  ; (Lane 3 would leave a two-byte hole and corrupt the second pixel.)
  pinsrw        %1, WORD [ARGBq + TEMPq * 2 + 4], 2

  ; Fill the alpha values of these RGB pixels with 0 and convert them to two
  ; ARGB pixels.
  UNPACKRGB     %1, 3
%elif %2 == 4
  ; Read four RGB pixels and convert them to four ARGB pixels.
  ; Read twelve bytes from the source to the destination xmm register.
  mov           TEMPq, WIDTHq
  lea           TEMPq, [TEMPq + TEMPq * 2]
  movq          %1, QWORD [ARGBq + TEMPq * 2]
  movd          xmm1, DWORD [ARGBq + TEMPq * 2 + 8]
  shufps        %1, xmm1, 01000100B     ; %1 = { %1.d[0], %1.d[1], xmm1.d[0], xmm1.d[1] }

  ; Fill the alpha values of these RGB pixels with 0 and convert them to four
  ; ARGB pixels.
  UNPACKRGB     %1, 3
  UNPACKRGB     %1, 4 + 3
  UNPACKRGB     %1, 4 + 4 + 3
%else
%error unsupported number of pixels.
%endif

%else
%error unsupported PIXELSIZE value.
%endif

%endmacro

;
; CALC_Y %1 (xmm), %2 (xmm)
; Calculates four Y values from four ARGB pixels stored in %2.
;   %1.b[0] = ToByte((25 * B(0) + 129 * G(0) + 66 * R(0) + 128) / 256 + 16);
;   %1.b[1] = ToByte((25 * B(1) + 129 * G(1) + 66 * R(1) + 128) / 256 + 16);
;   %1.b[2] = ToByte((25 * B(2) + 129 * G(2) + 66 * R(2) + 128) / 256 + 16);
;   %1.b[3] = ToByte((25 * B(3) + 129 * G(3) + 66 * R(3) + 128) / 256 + 16);
; Clobbers: xmm2.
;
%macro CALC_Y 2
  ; To avoid signed saturation, we divide this conversion formula into two
  ; formulae and store their results into two XMM registers %1 and xmm2.
  ; (The G coefficient 129 is split into 2 + 127 across the two constants.)
  ; %1.w[0]   = 25  * %2.b[0]  + 2   * %2.b[1]  + 66  * %2.b[2]  + 0 * %2.b[3];
  ; %1.w[1]   = 25  * %2.b[4]  + 2   * %2.b[5]  + 66  * %2.b[6]  + 0 * %2.b[7];
  ; %1.w[2]   = 25  * %2.b[8]  + 2   * %2.b[9]  + 66  * %2.b[10] + 0 * %2.b[11];
  ; %1.w[3]   = 25  * %2.b[12] + 2   * %2.b[13] + 66  * %2.b[14] + 0 * %2.b[15];
  ; xmm2.w[0] = 0   * %2.b[0]  + 127 * %2.b[1]  + 0   * %2.b[2]  + 0 * %2.b[3];
  ; xmm2.w[1] = 0   * %2.b[4]  + 127 * %2.b[5]  + 0   * %2.b[6]  + 0 * %2.b[7];
  ; xmm2.w[2] = 0   * %2.b[8]  + 127 * %2.b[9]  + 0   * %2.b[10] + 0 * %2.b[11];
  ; xmm2.w[3] = 0   * %2.b[12] + 127 * %2.b[13] + 0   * %2.b[14] + 0 * %2.b[15];
  movdqa        %1, %2
  pmaddubsw     %1, XMM_CONST_Y0
  phaddsw       %1, %1
  movdqa        xmm2, %2
  pmaddubsw     xmm2, XMM_CONST_Y1
  phaddsw       xmm2, xmm2

  ; %1.b[0] = ToByte((%1.w[0] + xmm2.w[0] + 128) / 256 + 16);
  ; %1.b[1] = ToByte((%1.w[1] + xmm2.w[1] + 128) / 256 + 16);
  ; %1.b[2] = ToByte((%1.w[2] + xmm2.w[2] + 128) / 256 + 16);
  ; %1.b[3] = ToByte((%1.w[3] + xmm2.w[3] + 128) / 256 + 16);
  paddw         %1, xmm2
  movdqa        xmm2, XMM_CONST_128
  paddw         %1, xmm2
  psrlw         %1, 8
  psrlw         xmm2, 3                 ; 128 >> 3 = 16, the Y offset
  paddw         %1, xmm2
  packuswb      %1, %1
%endmacro

;
; INIT_UV %1 (r32), %2 (reg), %3 (imm)
; When SUBSAMPLING == 1 && LINE == 1, loads the bytes at [%2 + WIDTHq] into %1
; (zero-extended): one byte if %3 is 1 or 2, two bytes if %3 is 4. CALC_UV
; averages the value passed as its %4 with the freshly computed U (or V)
; values. For every other SUBSAMPLING/LINE combination this macro expands to
; nothing.
;
%macro INIT_UV 3

%if SUBSAMPLING == 1 && LINE == 1
%if %3 == 1 || %3 == 2
  movzx         %1, BYTE [%2 + WIDTHq]
%elif %3 == 4
  movzx         %1, WORD [%2 + WIDTHq]
%else
%error unsupported number of pixels.
%endif
%endif

%endmacro

;
; CALC_UV %1 (xmm), %2 (xmm), %3 (xmm), %4 (r32)
; Calculates two U (or V) values from four ARGB pixels stored in %2.
; Let S(i) be the weighted sum for pixel i:
;   if %3 == XMM_CONST_U:  S(i) = 112 * B(i) - 74 * G(i) - 38 * R(i);
;   if %3 == XMM_CONST_V:  S(i) = -18 * B(i) - 94 * G(i) + 112 * R(i);
; Then
;   if (SUBSAMPLING) {
;     %1.b[0] = ToByte(((S(0) + S(1) + 1) / 2 + 128) / 256 + 128);
;     %1.b[1] = ToByte(((S(2) + S(3) + 1) / 2 + 128) / 256 + 128);
;   } else {
;     %1.b[0] = ToByte((S(0) + 128) / 256 + 128);
;     %1.b[1] = ToByte((S(2) + 128) / 256 + 128);
;   }
; and, if SUBSAMPLING == 1 && LINE == 1, the two result bytes are finally
; averaged (with rounding) with the low two bytes of %4.
; Clobbers: xmm2 (only when SUBSAMPLING == 1).
;
%macro CALC_UV 4
  ; for (int i = 0; i < 4; ++i) {
  ;   %1.w[i] = 0;
  ;   for (int j = 0; j < 4; ++j)
  ;     %1.w[i] += %3.b[i * 4 + j] * %2.b[i * 4 + j];
  ; }
  movdqa        %1, %2
  pmaddubsw     %1, %3
  phaddsw       %1, %1

%if SUBSAMPLING == 1
  ; %1.w[0] = (%1.w[0] + %1.w[1] + 1) / 2;
  ; %1.w[1] = (%1.w[1] + %1.w[0] + 1) / 2;
  ; %1.w[2] = (%1.w[2] + %1.w[3] + 1) / 2;
  ; %1.w[3] = (%1.w[3] + %1.w[2] + 1) / 2;
  ; pavgw averages UNSIGNED words, but the sums in %1 are signed. Flip the
  ; sign bit of every word (x ^ 0x8000 == x + 32768 as unsigned) before the
  ; average and flip it back afterwards, which yields the signed rounded
  ; average. 0x8000 is derived as 128 << 8 from XMM_CONST_128; this relies on
  ; that register holding 128 in every word lane, as the paddw uses below
  ; already do.
  movdqa        xmm2, XMM_CONST_128
  psllw         xmm2, 8
  pxor          %1, xmm2
  pshuflw       xmm2, %1, 10110001B     ; swap words 0<->1 and 2<->3
  pavgw         %1, xmm2
  movdqa        xmm2, XMM_CONST_128
  psllw         xmm2, 8
  pxor          %1, xmm2
%endif

  ; %1.b[0] = ToByte((%1.w[0] + 128) / 256 + 128);
  ; %1.b[1] = ToByte((%1.w[2] + 128) / 256 + 128);
  pshuflw       %1, %1, 10001000B       ; gather words 0 and 2 into words 0 and 1
  paddw         %1, XMM_CONST_128
  psraw         %1, 8
  paddw         %1, XMM_CONST_128
  packuswb      %1, %1

%if SUBSAMPLING == 1 && LINE == 1
  ; %1.b[0] = (%1.b[0] + %4.b[0] + 1) / 2;
  ; %1.b[1] = (%1.b[1] + %4.b[1] + 1) / 2;
  movd          xmm2, %4
  pavgb         %1, xmm2
%endif
%endmacro

;
; Each function below is generated by defining SYMBOL, PIXELSIZE, SUBSAMPLING,
; and LINE and then including convert_rgb_to_yuv_ssse3.inc.
;

;
; extern "C" void ConvertARGBToYUVRow_SSSE3(const uint8* argb,
;                                           uint8* y,
;                                           uint8* u,
;                                           uint8* v,
;                                           ptrdiff_t width);
;
%define SYMBOL          ConvertARGBToYUVRow_SSSE3
%define PIXELSIZE       4
%define SUBSAMPLING     0
%define LINE            0
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertRGBToYUVRow_SSSE3(const uint8* rgb,
;                                          uint8* y,
;                                          uint8* u,
;                                          uint8* v,
;                                          ptrdiff_t width);
;
%define SYMBOL          ConvertRGBToYUVRow_SSSE3
%define PIXELSIZE       3
%define SUBSAMPLING     0
%define LINE            0
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertARGBToYUVEven_SSSE3(const uint8* argb,
;                                            uint8* y,
;                                            uint8* u,
;                                            uint8* v,
;                                            ptrdiff_t width);
;
%define SYMBOL          ConvertARGBToYUVEven_SSSE3
%define PIXELSIZE       4
%define SUBSAMPLING     1
%define LINE            0
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertARGBToYUVOdd_SSSE3(const uint8* argb,
;                                           uint8* y,
;                                           uint8* u,
;                                           uint8* v,
;                                           ptrdiff_t width);
;
%define SYMBOL          ConvertARGBToYUVOdd_SSSE3
%define PIXELSIZE       4
%define SUBSAMPLING     1
%define LINE            1
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertRGBToYUVEven_SSSE3(const uint8* rgb,
;                                           uint8* y,
;                                           uint8* u,
;                                           uint8* v,
;                                           ptrdiff_t width);
;
%define SYMBOL          ConvertRGBToYUVEven_SSSE3
%define PIXELSIZE       3
%define SUBSAMPLING     1
%define LINE            0
%include "convert_rgb_to_yuv_ssse3.inc"

;
; extern "C" void ConvertRGBToYUVOdd_SSSE3(const uint8* rgb,
;                                          uint8* y,
;                                          uint8* u,
;                                          uint8* v,
;                                          ptrdiff_t width);
;
%define SYMBOL          ConvertRGBToYUVOdd_SSSE3
%define PIXELSIZE       3
%define SUBSAMPLING     1
%define LINE            1
%include "convert_rgb_to_yuv_ssse3.inc"