/* mmx_blend.S revision d60bb2fbc8b61e9748ce9c235acd4e870a2df613 */
/*
 * Written by José Fonseca <j_r_fonseca@yahoo.co.uk>
 */

#include "matypes.h"

/*
 * Make the following approximation to the division (Sree):
 *
 *   rgb*a/255 ~= (rgb*(a+1)) >> 8
 *
 * which is the fastest method that satisfies the following OpenGL criteria:
 *
 *   0*0 = 0 and 255*255 = 255
 *
 * Note this one should be used alone.
 */
#define GMBT_ALPHA_PLUS_ONE 0

/*
 * Take the geometric series approximation to the division:
 *
 *   t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 * in this case just the first two terms, to fit in 16-bit arithmetic:
 *
 *   t/255 ~= (t + (t >> 8)) >> 8
 *
 * Note that just by itself it doesn't satisfy the OpenGL criteria, as
 * 255*255 = 254, so the special case a = 255 must be accounted for, or
 * roundoff must be used.
 */
#define GMBT_GEOMETRIC_SERIES 1

/*
 * When using a geometric series division instead of truncating the result,
 * use roundoff in the approximation (Jim Blinn):
 *
 *   t = rgb*a + 0x80
 *
 * achieving the exact results.
 */
#define GMBT_ROUNDOFF 0

/* Instead of the roundoff, this adds a small correction to satisfy the
 * OpenGL criteria:
 *
 *   t/255 ~= (t + (t >> 8) + (t >> 15)) >> 8
 *
 * Note that although it is faster than rounding off, it doesn't always give
 * the exact results.
 */
#define GMBT_GEOMETRIC_CORRECTION 1

/*
 * Do
 *
 *   s = (q - p)*a + q
 *
 * instead of
 *
 *   s = p*a + q*(1-a)
 *
 * This eliminates a multiply at the expense of complicating the roundoff,
 * but is generally worth it.
 */
#define GMBT_SIGNED_ARITHMETIC 1

#if GMBT_ROUNDOFF
	SEG_DATA

/* 0x80 in each 16-bit lane: the Blinn rounding bias added to every channel. */
ALIGNDATA8
const_80:
	D_LONG 0x00800080, 0x00800080
#endif

	SEG_TEXT

ALIGNTEXT16
GLOBL GLNAME(_mesa_mmx_blend_transparency)

/*
 * void blend_transparency( GLcontext *ctx,
 *                          GLuint n,
 *                          const GLubyte mask[],
 *                          GLchan rgba[][4],
 *                          CONST GLchan dest[][4] )
 *
 * Common transparency blending mode: for each of the n pixels whose mask
 * byte is nonzero, blend rgba[i] (source, "p") over dest[i] ("q") using
 * p's alpha, writing the result back into rgba[i].
 *
 * Stack arguments (cdecl): n at 12(EBP), mask at 16(EBP), rgba at 20(EBP),
 * dest at 24(EBP).  ctx at 8(EBP) is never read.
 *
 * Structure: an optional single-pixel step to align rgba to 8 bytes, a main
 * loop processing 2 pixels per iteration in one MMX quadword, and a
 * single-pixel tail; EMMS is issued before returning.  Throughout, "p" is
 * the incoming fragment, "q" the framebuffer value, and "t" the widened
 * 16-bit-per-channel intermediate.
 */
GLNAME( _mesa_mmx_blend_transparency ):

	PUSH_L ( EBP )
	MOV_L ( ESP, EBP )
	PUSH_L ( ESI )
	PUSH_L ( EDI )
	PUSH_L ( EBX )

	MOV_L ( REGOFF(12, EBP), ECX )		/* n */
	CMP_L ( CONST(0), ECX)
	JE ( LLBL (GMBT_return) )		/* nothing to do for n == 0 */

	MOV_L ( REGOFF(16, EBP), EBX )		/* mask */
	MOV_L ( REGOFF(20, EBP), EDI )		/* rgba */
	MOV_L ( REGOFF(24, EBP), ESI )		/* dest */

	TEST_L ( CONST(4), EDI )		/* align rgba on an 8-byte boundary */
	JZ ( LLBL (GMBT_align_end) )

	CMP_B ( CONST(0), REGIND(EBX) )		/* *mask == 0 */
	JE ( LLBL (GMBT_align_continue) )

	/* --- blend one pixel to reach 8-byte alignment --- */

	PXOR ( MM0, MM0 )			/* 0x0000 | 0x0000 | 0x0000 | 0x0000 */

	MOVD ( REGIND(ESI), MM1 )		/*     |     |     |     | qa1 | qb1 | qg1 | qr1 */
	MOVD ( REGIND(EDI), MM2 )		/*     |     |     |     | pa1 | pb1 | pg1 | pr1 */

	PUNPCKLBW ( MM0, MM1 )			/* qa1 | qb1 | qg1 | qr1 */
	PUNPCKLBW ( MM0, MM2 )			/* pa1 | pb1 | pg1 | pr1 */

	MOVQ ( MM2, MM3 )

	PUNPCKHWD ( MM3, MM3 )			/* pa1 | pa1 |     |     */
	PUNPCKHDQ ( MM3, MM3 )			/* pa1 | pa1 | pa1 | pa1 */

#if GMBT_ALPHA_PLUS_ONE
	PCMPEQW ( MM4, MM4 )			/* 0xffff | 0xffff | 0xffff | 0xffff */

	PSUBW ( MM4, MM3 )			/* pa1 + 1 | pa1 + 1 | pa1 + 1 | pa1 + 1 */
#endif

#if GMBT_SIGNED_ARITHMETIC
	PSUBW ( MM1, MM2 )			/* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */

	PSLLW ( CONST(8), MM1 )			/* q1 << 8 */

#if GMBT_ROUNDOFF
	MOVQ ( MM2, MM4 )
#endif

	PMULLW ( MM3, MM2 )			/* t1 = (q1 - p1)*pa1 */

#if GMBT_ROUNDOFF
	/* sign bit of (p - q) selects a 0x100 correction when the product is negative */
	PSRLW ( CONST(15), MM4 )		/* q1 > p1 ? 1 : 0 */

	PSLLW ( CONST(8), MM4 )			/* q1 > p1 ? 0x100 : 0 */

	PSUBW ( MM4, MM2 )			/* t1 -=? 0x100 */
#endif

#else
	PCMPEQW ( MM4, MM4 )			/* 0xffff | 0xffff | 0xffff | 0xffff */
	PUNPCKLBW ( MM0, MM4 )			/* 0x00ff | 0x00ff | 0x00ff | 0x00ff */
	MOVQ ( MM4, MM0 )

	PMULLW ( MM3, MM2 )			/* p1*pa1 */

	PSUBW ( MM3, MM0 )			/* 255 - pa1 | 255 - pa1 | 255 - pa1 | 255 - pa1 */

	PMULLW ( MM0, MM1 )			/* q1*(255 - pa1) */

	PADDW ( MM1, MM2 )			/* t1 = p1*pa1 + q1*(255 - pa1) */
#endif

#if GMBT_ROUNDOFF
	MOVQ ( CONTENT(const_80), MM4 )

	PADDW ( MM4, MM2 )			/* t1 += 0x80 */
#endif

#if GMBT_GEOMETRIC_SERIES
	MOVQ ( MM2, MM3 )

	PSRLW ( CONST(8), MM3 )			/* t1 >> 8 */

	PADDW ( MM3, MM2 )			/* t1 + (t1 >> 8) ~= (t1/255) << 8 */

#if GMBT_GEOMETRIC_CORRECTION
	PSRLW ( CONST(7), MM3 )			/* t1 >> 15 */

	PADDW ( MM3, MM2 )			/* t1 + (t1 >> 8) + (t1 >> 15) ~= (t1/255) << 8 */
#endif
#endif

#if GMBT_SIGNED_ARITHMETIC
	PADDW ( MM1, MM2 )			/* (t1/255 + q1) << 8 */
#endif

	PSRLW ( CONST(8), MM2 )			/* sa1 | sb1 | sg1 | sr1 */

	PACKUSWB ( MM0, MM2 )			/*     |     |     |     | sa1 | sb1 | sg1 | sr1 */
	MOVD ( MM2, REGIND(EDI) )

LLBL (GMBT_align_continue):

	DEC_L ( ECX )				/* n -= 1 */
	INC_L ( EBX )				/* mask += 1 */
	ADD_L ( CONST(4), EDI )			/* rgba += 1 */
	ADD_L ( CONST(4), ESI )			/* dest += 1 */

LLBL (GMBT_align_end):

	CMP_L ( CONST(2), ECX)
	JB ( LLBL (GMBT_loop_end) )

/* --- main loop: two pixels per iteration --- */

ALIGNTEXT16
LLBL (GMBT_loop_begin):

	/* 16-bit compare skips the pair only when BOTH mask bytes are zero */
	CMP_W ( CONST(0), REGIND(EBX) )		/* *mask == 0 && *(mask + 1) == 0 */
	JE ( LLBL (GMBT_loop_continue) )

	/* NOTE: the instruction pairing when multiple pipelines are available must be checked */

	PXOR ( MM0, MM0 )			/* 0x0000 | 0x0000 | 0x0000 | 0x0000 */

	MOVQ ( REGIND(ESI), MM7 )		/* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */
	MOVQ ( REGIND(EDI), MM6 )		/* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */

	MOVQ ( MM7, MM1 )
	MOVQ ( MM6, MM2 )

	PUNPCKLBW ( MM0, MM1 )			/* qa1 | qb1 | qg1 | qr1 */
	PUNPCKHBW ( MM0, MM7 )			/* qa2 | qb2 | qg2 | qr2 */
	PUNPCKLBW ( MM0, MM2 )			/* pa1 | pb1 | pg1 | pr1 */
	PUNPCKHBW ( MM0, MM6 )			/* pa2 | pb2 | pg2 | pr2 */

	MOVQ ( MM2, MM3 )
	MOVQ ( MM6, MM5 )

	PUNPCKHWD ( MM3, MM3 )			/* pa1 | pa1 |     |     */
	PUNPCKHWD ( MM5, MM5 )			/* pa2 | pa2 |     |     */
	PUNPCKHDQ ( MM3, MM3 )			/* pa1 | pa1 | pa1 | pa1 */
	PUNPCKHDQ ( MM5, MM5 )			/* pa2 | pa2 | pa2 | pa2 */

#if GMBT_ALPHA_PLUS_ONE
	PCMPEQW ( MM4, MM4 )			/* 0xffff | 0xffff | 0xffff | 0xffff */

	PSUBW ( MM4, MM3 )			/* pa1 + 1 | pa1 + 1 | pa1 + 1 | pa1 + 1 */
	PSUBW ( MM4, MM5 )			/* pa2 + 1 | pa2 + 1 | pa2 + 1 | pa2 + 1 */
#endif

#if GMBT_SIGNED_ARITHMETIC
	PSUBW ( MM1, MM2 )			/* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */
	PSUBW ( MM7, MM6 )			/* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */

	PSLLW ( CONST(8), MM1 )			/* q1 << 8 */
	PSLLW ( CONST(8), MM7 )			/* q2 << 8 */

#if GMBT_ROUNDOFF
	MOVQ ( MM2, MM0 )
	MOVQ ( MM6, MM4 )
#endif

	PMULLW ( MM3, MM2 )			/* t1 = (q1 - p1)*pa1 */
	PMULLW ( MM5, MM6 )			/* t2 = (q2 - p2)*pa2 */

#if GMBT_ROUNDOFF
	PSRLW ( CONST(15), MM0 )		/* q1 > p1 ? 1 : 0 */
	PSRLW ( CONST(15), MM4 )		/* q2 > p2 ? 1 : 0 */

	PSLLW ( CONST(8), MM0 )			/* q1 > p1 ? 0x100 : 0 */
	PSLLW ( CONST(8), MM4 )			/* q2 > p2 ? 0x100 : 0 */

	PSUBW ( MM0, MM2 )			/* t1 -=? 0x100 */
	/* NOTE(review): t2 lives in MM6, yet this subtracts from MM7 (q2 << 8);
	 * the single-pixel paths use the t register here, so this looks like a
	 * copy/paste bug in this (currently disabled) GMBT_ROUNDOFF path —
	 * verify before ever enabling GMBT_ROUNDOFF. */
	PSUBW ( MM4, MM7 )			/* t2 -=? 0x100 */
#endif

#else
	PCMPEQW ( MM4, MM4 )			/* 0xffff | 0xffff | 0xffff | 0xffff */
	PUNPCKLBW ( MM0, MM4 )			/* 0x00ff | 0x00ff | 0x00ff | 0x00ff */
	MOVQ ( MM4, MM0 )

	PMULLW ( MM3, MM2 )			/* p1*pa1 */
	PMULLW ( MM5, MM6 )			/* p2*pa2 */

	PSUBW ( MM3, MM0 )			/* 255 - pa1 | 255 - pa1 | 255 - pa1 | 255 - pa1 */
	PSUBW ( MM5, MM4 )			/* 255 - pa2 | 255 - pa2 | 255 - pa2 | 255 - pa2 */

	PMULLW ( MM0, MM1 )			/* q1*(255 - pa1) */
	PMULLW ( MM4, MM7 )			/* q2*(255 - pa2) */

	PADDW ( MM1, MM2 )			/* t1 = p1*pa1 + q1*(255 - pa1) */
	PADDW ( MM7, MM6 )			/* t2 = p2*pa2 + q2*(255 - pa2) */
#endif

#if GMBT_ROUNDOFF
	MOVQ ( CONTENT(const_80), MM4 )

	PADDW ( MM4, MM2 )			/* t1 += 0x80 */
	PADDW ( MM4, MM6 )			/* t2 += 0x80 */
#endif

#if GMBT_GEOMETRIC_SERIES
	MOVQ ( MM2, MM3 )
	MOVQ ( MM6, MM5 )

	PSRLW ( CONST(8), MM3 )			/* t1 >> 8 */
	PSRLW ( CONST(8), MM5 )			/* t2 >> 8 */

	PADDW ( MM3, MM2 )			/* t1 + (t1 >> 8) ~= (t1/255) << 8 */
	PADDW ( MM5, MM6 )			/* t2 + (t2 >> 8) ~= (t2/255) << 8 */

#if GMBT_GEOMETRIC_CORRECTION
	PSRLW ( CONST(7), MM3 )			/* t1 >> 15 */
	PSRLW ( CONST(7), MM5 )			/* t2 >> 15 */

	PADDW ( MM3, MM2 )			/* t1 + (t1 >> 8) + (t1 >> 15) ~= (t1/255) << 8 */
	PADDW ( MM5, MM6 )			/* t2 + (t2 >> 8) + (t2 >> 15) ~= (t2/255) << 8 */
#endif
#endif

#if GMBT_SIGNED_ARITHMETIC
	PADDW ( MM1, MM2 )			/* (t1/255 + q1) << 8 */
	PADDW ( MM7, MM6 )			/* (t2/255 + q2) << 8 */
#endif

	PSRLW ( CONST(8), MM2 )			/* sa1 | sb1 | sg1 | sr1 */
	PSRLW ( CONST(8), MM6 )			/* sa2 | sb2 | sg2 | sr2 */

	PACKUSWB ( MM6, MM2 )			/* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */
	MOVQ ( MM2, REGIND(EDI) )

LLBL (GMBT_loop_continue):

	DEC_L ( ECX )
	DEC_L ( ECX )				/* n -= 2 */
	ADD_L ( CONST(2), EBX )			/* mask += 2 */
	ADD_L ( CONST(8), EDI )			/* rgba += 2 */
	ADD_L ( CONST(8), ESI )			/* dest += 2 */
	CMP_L ( CONST(2), ECX )
	JAE ( LLBL (GMBT_loop_begin) )

LLBL (GMBT_loop_end):

	/* --- tail: at most one pixel left --- */

	CMP_L ( CONST(1), ECX )
	JB ( LLBL (GMBT_done) )

	CMP_B ( CONST(0), REGIND(EBX) )		/* *mask == 0 */
	JE ( LLBL (GMBT_done) )

	PXOR ( MM0, MM0 )			/* 0x0000 | 0x0000 | 0x0000 | 0x0000 */

	MOVD ( REGIND(ESI), MM1 )		/*     |     |     |     | qa1 | qb1 | qg1 | qr1 */
	MOVD ( REGIND(EDI), MM2 )		/*     |     |     |     | pa1 | pb1 | pg1 | pr1 */

	PUNPCKLBW ( MM0, MM1 )			/* qa1 | qb1 | qg1 | qr1 */
	PUNPCKLBW ( MM0, MM2 )			/* pa1 | pb1 | pg1 | pr1 */

	MOVQ ( MM2, MM3 )

	PUNPCKHWD ( MM3, MM3 )			/* pa1 | pa1 |     |     */
	PUNPCKHDQ ( MM3, MM3 )			/* pa1 | pa1 | pa1 | pa1 */

#if GMBT_ALPHA_PLUS_ONE
	PCMPEQW ( MM4, MM4 )			/* 0xffff | 0xffff | 0xffff | 0xffff */

	PSUBW ( MM4, MM3 )			/* pa1 + 1 | pa1 + 1 | pa1 + 1 | pa1 + 1 */
#endif

#if GMBT_SIGNED_ARITHMETIC
	PSUBW ( MM1, MM2 )			/* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */

	PSLLW ( CONST(8), MM1 )			/* q1 << 8 */

#if GMBT_ROUNDOFF
	MOVQ ( MM2, MM4 )
#endif

	PMULLW ( MM3, MM2 )			/* t1 = (q1 - p1)*pa1 */

#if GMBT_ROUNDOFF
	PSRLW ( CONST(15), MM4 )		/* q1 > p1 ? 1 : 0 */

	PSLLW ( CONST(8), MM4 )			/* q1 > p1 ? 0x100 : 0 */

	PSUBW ( MM4, MM2 )			/* t1 -=? 0x100 */
#endif

#else
	PCMPEQW ( MM4, MM4 )			/* 0xffff | 0xffff | 0xffff | 0xffff */
	PUNPCKLBW ( MM0, MM4 )			/* 0x00ff | 0x00ff | 0x00ff | 0x00ff */
	MOVQ ( MM4, MM0 )

	PMULLW ( MM3, MM2 )			/* p1*pa1 */

	PSUBW ( MM3, MM0 )			/* 255 - pa1 | 255 - pa1 | 255 - pa1 | 255 - pa1 */

	PMULLW ( MM0, MM1 )			/* q1*(255 - pa1) */

	PADDW ( MM1, MM2 )			/* t1 = p1*pa1 + q1*(255 - pa1) */
#endif

#if GMBT_ROUNDOFF
	MOVQ ( CONTENT(const_80), MM4 )

	PADDW ( MM4, MM2 )			/* t1 += 0x80 */
#endif

#if GMBT_GEOMETRIC_SERIES
	MOVQ ( MM2, MM3 )

	PSRLW ( CONST(8), MM3 )			/* t1 >> 8 */

	PADDW ( MM3, MM2 )			/* t1 + (t1 >> 8) ~= (t1/255) << 8 */

#if GMBT_GEOMETRIC_CORRECTION
	PSRLW ( CONST(7), MM3 )			/* t1 >> 15 */

	PADDW ( MM3, MM2 )			/* t1 + (t1 >> 8) + (t1 >> 15) ~= (t1/255) << 8 */
#endif
#endif

#if GMBT_SIGNED_ARITHMETIC
	PADDW ( MM1, MM2 )			/* (t1/255 + q1) << 8 */
#endif

	PSRLW ( CONST(8), MM2 )			/* sa1 | sb1 | sg1 | sr1 */

	PACKUSWB ( MM0, MM2 )			/*     |     |     |     | sa1 | sb1 | sg1 | sr1 */
	MOVD ( MM2, REGIND(EDI) )

LLBL (GMBT_done):

	EMMS					/* leave MMX state so the FPU is usable again */

LLBL (GMBT_return):

	POP_L ( EBX )
	POP_L ( EDI )
	POP_L ( ESI )
	MOV_L ( EBP, ESP )
	POP_L ( EBP )
	RET