/* translate_sse.c revision 8808d62f608d1397ee75d0087301d0b0a0278244 */
1/* 2 * Copyright 2003 Tungsten Graphics, inc. 3 * All Rights Reserved. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * on the rights to use, copy, modify, merge, publish, distribute, sub 9 * license, and/or sell copies of the Software, and to permit persons to whom 10 * the Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 22 * USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 * 24 * Authors: 25 * Keith Whitwell <keithw@tungstengraphics.com> 26 */ 27 28 29#include "pipe/p_compiler.h" 30#include "pipe/p_util.h" 31#include "util/u_simple_list.h" 32 33#include "translate.h" 34 35 36#if defined(__i386__) || defined(__386__) || defined(i386) 37 38#include "rtasm/rtasm_cpu.h" 39#include "rtasm/rtasm_x86sse.h" 40 41 42#define X 0 43#define Y 1 44#define Z 2 45#define W 3 46 47 48typedef void (PIPE_CDECL *run_func)( struct translate *translate, 49 unsigned start, 50 unsigned count, 51 void *output_buffer ); 52 53typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate, 54 const unsigned *elts, 55 unsigned count, 56 void *output_buffer ); 57 58 59 60struct translate_sse { 61 struct translate translate; 62 63 struct x86_function linear_func; 64 struct x86_function elt_func; 65 struct x86_function *func; 66 67 boolean loaded_identity; 68 boolean loaded_255; 69 boolean loaded_inv_255; 70 71 float identity[4]; 72 float float_255[4]; 73 float inv_255[4]; 74 75 struct { 76 char *input_ptr; 77 unsigned input_stride; 78 } attrib[PIPE_MAX_ATTRIBS]; 79 80 run_func gen_run; 81 run_elts_func gen_run_elts; 82 83}; 84 85static int get_offset( const void *a, const void *b ) 86{ 87 return (const char *)b - (const char *)a; 88} 89 90 91 92static struct x86_reg get_identity( struct translate_sse *p ) 93{ 94 struct x86_reg reg = x86_make_reg(file_XMM, 6); 95 96 if (!p->loaded_identity) { 97 /* Nasty: 98 */ 99 struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI); 100 101 p->loaded_identity = TRUE; 102 p->identity[0] = 0; 103 p->identity[1] = 0; 104 p->identity[2] = 0; 105 p->identity[3] = 1; 106 107 sse_movups(p->func, reg, 108 x86_make_disp(translateESI, 109 get_offset(p, &p->identity[0]))); 110 } 111 112 return reg; 113} 114 115static struct x86_reg get_255( struct translate_sse *p ) 116{ 117 struct x86_reg reg = x86_make_reg(file_XMM, 6); 118 119 if (!p->loaded_255) { 120 struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI); 
121 122 p->loaded_255 = TRUE; 123 p->float_255[0] = 124 p->float_255[1] = 125 p->float_255[2] = 126 p->float_255[3] = 255.0f; 127 128 sse_movups(p->func, reg, 129 x86_make_disp(translateESI, 130 get_offset(p, &p->float_255[0]))); 131 } 132 133 return reg; 134 return x86_make_reg(file_XMM, 7); 135} 136 137static struct x86_reg get_inv_255( struct translate_sse *p ) 138{ 139 struct x86_reg reg = x86_make_reg(file_XMM, 5); 140 141 if (!p->loaded_inv_255) { 142 struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI); 143 144 p->loaded_inv_255 = TRUE; 145 p->inv_255[0] = 146 p->inv_255[1] = 147 p->inv_255[2] = 148 p->inv_255[3] = 1.0f / 255.0f; 149 150 sse_movups(p->func, reg, 151 x86_make_disp(translateESI, 152 get_offset(p, &p->inv_255[0]))); 153 } 154 155 return reg; 156} 157 158 159static void emit_load_R32G32B32A32( struct translate_sse *p, 160 struct x86_reg data, 161 struct x86_reg arg0 ) 162{ 163 sse_movups(p->func, data, arg0); 164} 165 166static void emit_load_R32G32B32( struct translate_sse *p, 167 struct x86_reg data, 168 struct x86_reg arg0 ) 169{ 170 /* Have to jump through some hoops: 171 * 172 * c 0 0 0 173 * c 0 0 1 174 * 0 0 c 1 175 * a b c 1 176 */ 177 sse_movss(p->func, data, x86_make_disp(arg0, 8)); 178 sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) ); 179 sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) ); 180 sse_movlps(p->func, data, arg0); 181} 182 183static void emit_load_R32G32( struct translate_sse *p, 184 struct x86_reg data, 185 struct x86_reg arg0 ) 186{ 187 /* 0 0 0 1 188 * a b 0 1 189 */ 190 sse_movups(p->func, data, get_identity(p) ); 191 sse_movlps(p->func, data, arg0); 192} 193 194 195static void emit_load_R32( struct translate_sse *p, 196 struct x86_reg data, 197 struct x86_reg arg0 ) 198{ 199 /* a 0 0 0 200 * a 0 0 1 201 */ 202 sse_movss(p->func, data, arg0); 203 sse_orps(p->func, data, get_identity(p) ); 204} 205 206 207static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p, 208 struct x86_reg data, 209 struct 
x86_reg src ) 210{ 211 212 /* Load and unpack twice: 213 */ 214 sse_movss(p->func, data, src); 215 sse2_punpcklbw(p->func, data, get_identity(p)); 216 sse2_punpcklbw(p->func, data, get_identity(p)); 217 218 /* Convert to float: 219 */ 220 sse2_cvtdq2ps(p->func, data, data); 221 222 223 /* Scale by 1/255.0 224 */ 225 sse_mulps(p->func, data, get_inv_255(p)); 226} 227 228 229 230 231static void emit_store_R32G32B32A32( struct translate_sse *p, 232 struct x86_reg dest, 233 struct x86_reg dataXMM ) 234{ 235 sse_movups(p->func, dest, dataXMM); 236} 237 238static void emit_store_R32G32B32( struct translate_sse *p, 239 struct x86_reg dest, 240 struct x86_reg dataXMM ) 241{ 242 /* Emit two, shuffle, emit one. 243 */ 244 sse_movlps(p->func, dest, dataXMM); 245 sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */ 246 sse_movss(p->func, x86_make_disp(dest,8), dataXMM); 247} 248 249static void emit_store_R32G32( struct translate_sse *p, 250 struct x86_reg dest, 251 struct x86_reg dataXMM ) 252{ 253 sse_movlps(p->func, dest, dataXMM); 254} 255 256static void emit_store_R32( struct translate_sse *p, 257 struct x86_reg dest, 258 struct x86_reg dataXMM ) 259{ 260 sse_movss(p->func, dest, dataXMM); 261} 262 263 264 265static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p, 266 struct x86_reg dest, 267 struct x86_reg dataXMM ) 268{ 269 /* Scale by 255.0 270 */ 271 sse_mulps(p->func, dataXMM, get_255(p)); 272 273 /* Pack and emit: 274 */ 275 sse2_cvtps2dq(p->func, dataXMM, dataXMM); 276 sse2_packssdw(p->func, dataXMM, dataXMM); 277 sse2_packuswb(p->func, dataXMM, dataXMM); 278 sse_movss(p->func, dest, dataXMM); 279} 280 281 282 283 284 285static void get_src_ptr( struct translate_sse *p, 286 struct x86_reg srcEAX, 287 struct x86_reg translateREG, 288 struct x86_reg eltREG, 289 unsigned a ) 290{ 291 struct x86_reg input_ptr = 292 x86_make_disp(translateREG, 293 get_offset(p, &p->attrib[a].input_ptr)); 294 295 struct x86_reg input_stride = 296 
x86_make_disp(translateREG, 297 get_offset(p, &p->attrib[a].input_stride)); 298 299 /* Calculate pointer to current attrib: 300 */ 301 x86_mov(p->func, srcEAX, input_stride); 302 x86_imul(p->func, srcEAX, eltREG); 303 x86_add(p->func, srcEAX, input_ptr); 304} 305 306 307/* Extended swizzles? Maybe later. 308 */ 309static void emit_swizzle( struct translate_sse *p, 310 struct x86_reg dest, 311 struct x86_reg src, 312 unsigned char shuffle ) 313{ 314 sse_shufps(p->func, dest, src, shuffle); 315} 316 317 318static boolean translate_attr( struct translate_sse *p, 319 const struct translate_element *a, 320 struct x86_reg srcECX, 321 struct x86_reg dstEAX) 322{ 323 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); 324 325 switch (a->input_format) { 326 case PIPE_FORMAT_R32_FLOAT: 327 emit_load_R32(p, dataXMM, srcECX); 328 break; 329 case PIPE_FORMAT_R32G32_FLOAT: 330 emit_load_R32G32(p, dataXMM, srcECX); 331 break; 332 case PIPE_FORMAT_R32G32B32_FLOAT: 333 emit_load_R32G32B32(p, dataXMM, srcECX); 334 break; 335 case PIPE_FORMAT_R32G32B32A32_FLOAT: 336 emit_load_R32G32B32A32(p, dataXMM, srcECX); 337 break; 338 case PIPE_FORMAT_B8G8R8A8_UNORM: 339 emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX); 340 emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W)); 341 break; 342 case PIPE_FORMAT_R8G8B8A8_UNORM: 343 emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX); 344 break; 345 default: 346 return FALSE; 347 } 348 349 switch (a->output_format) { 350 case PIPE_FORMAT_R32_FLOAT: 351 emit_store_R32(p, dstEAX, dataXMM); 352 break; 353 case PIPE_FORMAT_R32G32_FLOAT: 354 emit_store_R32G32(p, dstEAX, dataXMM); 355 break; 356 case PIPE_FORMAT_R32G32B32_FLOAT: 357 emit_store_R32G32B32(p, dstEAX, dataXMM); 358 break; 359 case PIPE_FORMAT_R32G32B32A32_FLOAT: 360 emit_store_R32G32B32A32(p, dstEAX, dataXMM); 361 break; 362 case PIPE_FORMAT_B8G8R8A8_UNORM: 363 emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W)); 364 emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM); 365 break; 366 case PIPE_FORMAT_R8G8B8A8_UNORM: 
367 emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM); 368 break; 369 default: 370 return FALSE; 371 } 372 373 return TRUE; 374} 375 376/* Build run( struct translate *translate, 377 * unsigned start, 378 * unsigned count, 379 * void *output_buffer ) 380 * or 381 * run_elts( struct translate *translate, 382 * unsigned *elts, 383 * unsigned count, 384 * void *output_buffer ) 385 * 386 * Lots of hardcoding 387 * 388 * EAX -- pointer to current output vertex 389 * ECX -- pointer to current attribute 390 * 391 */ 392static boolean build_vertex_emit( struct translate_sse *p, 393 struct x86_function *func, 394 boolean linear ) 395{ 396 struct x86_reg vertexECX = x86_make_reg(file_REG32, reg_AX); 397 struct x86_reg idxEBX = x86_make_reg(file_REG32, reg_BX); 398 struct x86_reg srcEAX = x86_make_reg(file_REG32, reg_CX); 399 struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP); 400 struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI); 401 int fixup, label; 402 unsigned j; 403 404 p->func = func; 405 p->loaded_inv_255 = FALSE; 406 p->loaded_255 = FALSE; 407 p->loaded_identity = FALSE; 408 409 x86_init_func(p->func); 410 411 /* Push a few regs? 412 */ 413 x86_push(p->func, countEBP); 414 x86_push(p->func, translateESI); 415 x86_push(p->func, idxEBX); 416 417 /* Get vertex count, compare to zero 418 */ 419 x86_xor(p->func, idxEBX, idxEBX); 420 x86_mov(p->func, countEBP, x86_fn_arg(p->func, 3)); 421 x86_cmp(p->func, countEBP, idxEBX); 422 fixup = x86_jcc_forward(p->func, cc_E); 423 424 /* If linear, idx is the current element, otherwise it is a pointer 425 * to the current element. 426 */ 427 x86_mov(p->func, idxEBX, x86_fn_arg(p->func, 2)); 428 429 /* Initialize destination register. 
430 */ 431 x86_mov(p->func, vertexECX, x86_fn_arg(p->func, 4)); 432 433 /* Move argument 1 (translate_sse pointer) into a reg: 434 */ 435 x86_mov(p->func, translateESI, x86_fn_arg(p->func, 1)); 436 437 438 /* always load, needed or not: 439 */ 440 441 /* Note address for loop jump */ 442 label = x86_get_label(p->func); 443 444 445 for (j = 0; j < p->translate.key.nr_elements; j++) { 446 const struct translate_element *a = &p->translate.key.element[j]; 447 448 struct x86_reg destEAX = x86_make_disp(vertexECX, 449 a->output_offset); 450 451 /* Figure out source pointer address: 452 */ 453 if (linear) { 454 get_src_ptr(p, srcEAX, translateESI, idxEBX, j); 455 } 456 else { 457 get_src_ptr(p, srcEAX, translateESI, x86_deref(idxEBX), j); 458 } 459 460 if (!translate_attr( p, a, x86_deref(srcEAX), destEAX )) 461 return FALSE; 462 } 463 464 /* Next vertex: 465 */ 466 x86_lea(p->func, vertexECX, x86_make_disp(vertexECX, p->translate.key.output_stride)); 467 468 /* Incr index 469 */ 470 if (linear) { 471 x86_inc(p->func, idxEBX); 472 } 473 else { 474 x86_lea(p->func, idxEBX, x86_make_disp(idxEBX, 4)); 475 } 476 477 /* decr count, loop if not zero 478 */ 479 x86_dec(p->func, countEBP); 480 x86_test(p->func, countEBP, countEBP); 481 x86_jcc(p->func, cc_NZ, label); 482 483 /* Exit mmx state? 
484 */ 485 if (p->func->need_emms) 486 mmx_emms(p->func); 487 488 /* Land forward jump here: 489 */ 490 x86_fixup_fwd_jump(p->func, fixup); 491 492 /* Pop regs and return 493 */ 494 495 x86_pop(p->func, idxEBX); 496 x86_pop(p->func, translateESI); 497 x86_pop(p->func, countEBP); 498 x86_ret(p->func); 499 500 return TRUE; 501} 502 503 504 505 506 507 508 509static void translate_sse_set_buffer( struct translate *translate, 510 unsigned buf, 511 const void *ptr, 512 unsigned stride ) 513{ 514 struct translate_sse *p = (struct translate_sse *)translate; 515 unsigned i; 516 517 for (i = 0; i < p->translate.key.nr_elements; i++) { 518 if (p->translate.key.element[i].input_buffer == buf) { 519 p->attrib[i].input_ptr = ((char *)ptr + 520 p->translate.key.element[i].input_offset); 521 p->attrib[i].input_stride = stride; 522 } 523 } 524} 525 526 527static void translate_sse_release( struct translate *translate ) 528{ 529 struct translate_sse *p = (struct translate_sse *)translate; 530 531 x86_release_func( &p->linear_func ); 532 x86_release_func( &p->elt_func ); 533 534 FREE(p); 535} 536 537static void PIPE_CDECL translate_sse_run_elts( struct translate *translate, 538 const unsigned *elts, 539 unsigned count, 540 void *output_buffer ) 541{ 542 struct translate_sse *p = (struct translate_sse *)translate; 543 544 p->gen_run_elts( translate, 545 elts, 546 count, 547 output_buffer ); 548} 549 550static void PIPE_CDECL translate_sse_run( struct translate *translate, 551 unsigned start, 552 unsigned count, 553 void *output_buffer ) 554{ 555 struct translate_sse *p = (struct translate_sse *)translate; 556 557 p->gen_run( translate, 558 start, 559 count, 560 output_buffer ); 561} 562 563 564struct translate *translate_sse2_create( const struct translate_key *key ) 565{ 566 struct translate_sse *p = NULL; 567 568 if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2()) 569 goto fail; 570 571 p = CALLOC_STRUCT( translate_sse ); 572 if (p == NULL) 573 goto fail; 574 575 p->translate.key = 
*key; 576 p->translate.release = translate_sse_release; 577 p->translate.set_buffer = translate_sse_set_buffer; 578 p->translate.run_elts = translate_sse_run_elts; 579 p->translate.run = translate_sse_run; 580 581 if (!build_vertex_emit(p, &p->linear_func, TRUE)) 582 goto fail; 583 584 if (!build_vertex_emit(p, &p->elt_func, FALSE)) 585 goto fail; 586 587 p->gen_run = (run_func)x86_get_func(&p->linear_func); 588 if (p->gen_run == NULL) 589 goto fail; 590 591 p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func); 592 if (p->gen_run_elts == NULL) 593 goto fail; 594 595 return &p->translate; 596 597 fail: 598 if (p) 599 translate_sse_release( &p->translate ); 600 601 return NULL; 602} 603 604 605 606#else 607 608void translate_create_sse( const struct translate_key *key ) 609{ 610 return NULL; 611} 612 613#endif 614