translate_sse.c revision 7ca0ce38340144794267609646048b3820d594ab
1/* 2 * Copyright 2003 Tungsten Graphics, inc. 3 * All Rights Reserved. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * on the rights to use, copy, modify, merge, publish, distribute, sub 9 * license, and/or sell copies of the Software, and to permit persons to whom 10 * the Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 22 * USE OR OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: 25 * Keith Whitwell <keithw@tungstengraphics.com> 26 */ 27 28 29#include "pipe/p_config.h" 30#include "pipe/p_compiler.h" 31#include "util/u_memory.h" 32#include "util/u_math.h" 33 34#include "translate.h" 35 36 37#if defined(PIPE_ARCH_X86) 38 39#include "rtasm/rtasm_cpu.h" 40#include "rtasm/rtasm_x86sse.h" 41 42 43#define X 0 44#define Y 1 45#define Z 2 46#define W 3 47 48 49typedef void (PIPE_CDECL *run_func)( struct translate *translate, 50 unsigned start, 51 unsigned count, 52 void *output_buffer ); 53 54typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate, 55 const unsigned *elts, 56 unsigned count, 57 void *output_buffer ); 58 59struct translate_buffer { 60 const void *base_ptr; 61 unsigned stride; 62 void *ptr; /* updated per vertex */ 63}; 64 65 66struct translate_sse { 67 struct translate translate; 68 69 struct x86_function linear_func; 70 struct x86_function elt_func; 71 struct x86_function *func; 72 73 boolean loaded_identity; 74 boolean loaded_255; 75 boolean loaded_inv_255; 76 77 float identity[4]; 78 float float_255[4]; 79 float inv_255[4]; 80 81 struct translate_buffer buffer[PIPE_MAX_ATTRIBS]; 82 unsigned nr_buffers; 83 84 run_func gen_run; 85 run_elts_func gen_run_elts; 86 87 /* these are actually known values, but putting them in a struct 88 * like this is helpful to keep them in sync across the file. 89 */ 90 struct x86_reg tmp_EAX; 91 struct x86_reg idx_EBX; /* either start+i or &elt[i] */ 92 struct x86_reg outbuf_ECX; 93 struct x86_reg machine_EDX; 94 struct x86_reg count_ESI; /* decrements to zero */ 95}; 96 97static int get_offset( const void *a, const void *b ) 98{ 99 return (const char *)b - (const char *)a; 100} 101 102 103 104static struct x86_reg get_identity( struct translate_sse *p ) 105{ 106 struct x86_reg reg = x86_make_reg(file_XMM, 6); 107 108 if (!p->loaded_identity) { 109 p->loaded_identity = TRUE; 110 p->identity[0] = 0; 111 p->identity[1] = 0; 112 p->identity[2] = 0; 113 p->identity[3] = 1; 114 115 sse_movups(p->func, reg, 116 x86_make_disp(p->machine_EDX, 117 get_offset(p, &p->identity[0]))); 118 } 119 120 return reg; 121} 122 123static struct x86_reg get_255( struct translate_sse *p ) 124{ 125 struct x86_reg reg = x86_make_reg(file_XMM, 7); 126 127 if (!p->loaded_255) { 128 p->loaded_255 = TRUE; 129 p->float_255[0] = 130 p->float_255[1] = 131 p->float_255[2] = 132 p->float_255[3] = 255.0f; 133 134 sse_movups(p->func, reg, 135 x86_make_disp(p->machine_EDX, 136 get_offset(p, &p->float_255[0]))); 137 } 138 139 return reg; 140} 141 142static struct x86_reg get_inv_255( struct translate_sse *p ) 143{ 144 struct x86_reg reg = x86_make_reg(file_XMM, 5); 145 146 if (!p->loaded_inv_255) { 147 p->loaded_inv_255 = TRUE; 148 p->inv_255[0] = 149 p->inv_255[1] = 150 p->inv_255[2] = 151 p->inv_255[3] = 1.0f / 255.0f; 152 153 sse_movups(p->func, reg, 154 x86_make_disp(p->machine_EDX, 155 get_offset(p, &p->inv_255[0]))); 156 } 157 158 return reg; 159} 160 161 162static void emit_load_R32G32B32A32( struct translate_sse *p, 163 struct x86_reg data, 164 struct x86_reg arg0 ) 165{ 166 sse_movups(p->func, data, arg0); 167} 168 169static void emit_load_R32G32B32( struct translate_sse *p, 170 struct x86_reg data, 171 struct x86_reg arg0 ) 172{ 173 /* Have to jump through some hoops: 174 * 175 * c 0 0 0 176 * c 0 0 1 177 * 0 0 c 1 178 * a b c 1 179 */ 180 sse_movss(p->func, data, x86_make_disp(arg0, 8)); 181 sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) ); 182 sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) ); 183 sse_movlps(p->func, data, arg0); 184} 185 186static void emit_load_R32G32( struct translate_sse *p, 187 struct x86_reg data, 188 struct x86_reg arg0 ) 189{ 190 /* 0 0 0 1 191 * a b 0 1 192 */ 193 sse_movups(p->func, data, get_identity(p) ); 194 sse_movlps(p->func, data, arg0); 195} 196 197 198static void emit_load_R32( struct translate_sse *p, 199 struct x86_reg data, 200 struct x86_reg arg0 ) 201{ 202 /* a 0 0 0 203 * a 0 0 1 204 */ 205 sse_movss(p->func, data, arg0); 206 sse_orps(p->func, data, get_identity(p) ); 207} 208 209 210static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p, 211 struct x86_reg data, 212 struct x86_reg src ) 213{ 214 215 /* Load and unpack twice: 216 */ 217 sse_movss(p->func, data, src); 218 sse2_punpcklbw(p->func, data, get_identity(p)); 219 sse2_punpcklbw(p->func, data, get_identity(p)); 220 221 /* Convert to float: 222 */ 223 sse2_cvtdq2ps(p->func, data, data); 224 225 226 /* Scale by 1/255.0 227 */ 228 sse_mulps(p->func, data, get_inv_255(p)); 229} 230 231 232 233 234static void emit_store_R32G32B32A32( struct translate_sse *p, 235 struct x86_reg dest, 236 struct x86_reg dataXMM ) 237{ 238 sse_movups(p->func, dest, dataXMM); 239} 240 241static void emit_store_R32G32B32( struct translate_sse *p, 242 struct x86_reg dest, 243 struct x86_reg dataXMM ) 244{ 245 /* Emit two, shuffle, emit one. 246 */ 247 sse_movlps(p->func, dest, dataXMM); 248 sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */ 249 sse_movss(p->func, x86_make_disp(dest,8), dataXMM); 250} 251 252static void emit_store_R32G32( struct translate_sse *p, 253 struct x86_reg dest, 254 struct x86_reg dataXMM ) 255{ 256 sse_movlps(p->func, dest, dataXMM); 257} 258 259static void emit_store_R32( struct translate_sse *p, 260 struct x86_reg dest, 261 struct x86_reg dataXMM ) 262{ 263 sse_movss(p->func, dest, dataXMM); 264} 265 266 267 268static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p, 269 struct x86_reg dest, 270 struct x86_reg dataXMM ) 271{ 272 /* Scale by 255.0 273 */ 274 sse_mulps(p->func, dataXMM, get_255(p)); 275 276 /* Pack and emit: 277 */ 278 sse2_cvtps2dq(p->func, dataXMM, dataXMM); 279 sse2_packssdw(p->func, dataXMM, dataXMM); 280 sse2_packuswb(p->func, dataXMM, dataXMM); 281 sse_movss(p->func, dest, dataXMM); 282} 283 284 285 286 287 288/* Extended swizzles? Maybe later. 289 */ 290static void emit_swizzle( struct translate_sse *p, 291 struct x86_reg dest, 292 struct x86_reg src, 293 unsigned char shuffle ) 294{ 295 sse_shufps(p->func, dest, src, shuffle); 296} 297 298 299static boolean translate_attr( struct translate_sse *p, 300 const struct translate_element *a, 301 struct x86_reg srcECX, 302 struct x86_reg dstEAX) 303{ 304 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); 305 306 switch (a->input_format) { 307 case PIPE_FORMAT_R32_FLOAT: 308 emit_load_R32(p, dataXMM, srcECX); 309 break; 310 case PIPE_FORMAT_R32G32_FLOAT: 311 emit_load_R32G32(p, dataXMM, srcECX); 312 break; 313 case PIPE_FORMAT_R32G32B32_FLOAT: 314 emit_load_R32G32B32(p, dataXMM, srcECX); 315 break; 316 case PIPE_FORMAT_R32G32B32A32_FLOAT: 317 emit_load_R32G32B32A32(p, dataXMM, srcECX); 318 break; 319 case PIPE_FORMAT_B8G8R8A8_UNORM: 320 emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX); 321 emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W)); 322 break; 323 case PIPE_FORMAT_R8G8B8A8_UNORM: 324 emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX); 325 break; 326 default: 327 return FALSE; 328 } 329 330 switch (a->output_format) { 331 case PIPE_FORMAT_R32_FLOAT: 332 emit_store_R32(p, dstEAX, dataXMM); 333 break; 334 case PIPE_FORMAT_R32G32_FLOAT: 335 emit_store_R32G32(p, dstEAX, dataXMM); 336 break; 337 case PIPE_FORMAT_R32G32B32_FLOAT: 338 emit_store_R32G32B32(p, dstEAX, dataXMM); 339 break; 340 case PIPE_FORMAT_R32G32B32A32_FLOAT: 341 emit_store_R32G32B32A32(p, dstEAX, dataXMM); 342 break; 343 case PIPE_FORMAT_B8G8R8A8_UNORM: 344 emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W)); 345 emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM); 346 break; 347 case PIPE_FORMAT_R8G8B8A8_UNORM: 348 emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM); 349 break; 350 default: 351 return FALSE; 352 } 353 354 return TRUE; 355} 356 357 358static boolean init_inputs( struct translate_sse *p, 359 boolean linear ) 360{ 361 unsigned i; 362 if (linear) { 363 for (i = 0; i < p->nr_buffers; i++) { 364 struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, 365 get_offset(p, &p->buffer[i].stride)); 366 struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, 367 get_offset(p, &p->buffer[i].ptr)); 368 struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX, 369 get_offset(p, &p->buffer[i].base_ptr)); 370 struct x86_reg elt = p->idx_EBX; 371 struct x86_reg tmp = p->tmp_EAX; 372 373 374 /* Calculate pointer to first attrib: 375 */ 376 x86_mov(p->func, tmp, buf_stride); 377 x86_imul(p->func, tmp, elt); 378 x86_add(p->func, tmp, buf_base_ptr); 379 380 381 /* In the linear case, keep the buffer pointer instead of the 382 * index number. 383 */ 384 if (p->nr_buffers == 1) 385 x86_mov( p->func, elt, tmp ); 386 else 387 x86_mov( p->func, buf_ptr, tmp ); 388 } 389 } 390 391 return TRUE; 392} 393 394 395static struct x86_reg get_buffer_ptr( struct translate_sse *p, 396 boolean linear, 397 unsigned buf_idx, 398 struct x86_reg elt ) 399{ 400 if (linear && p->nr_buffers == 1) { 401 return p->idx_EBX; 402 } 403 else if (linear) { 404 struct x86_reg ptr = p->tmp_EAX; 405 struct x86_reg buf_ptr = 406 x86_make_disp(p->machine_EDX, 407 get_offset(p, &p->buffer[buf_idx].ptr)); 408 409 x86_mov(p->func, ptr, buf_ptr); 410 return ptr; 411 } 412 else { 413 struct x86_reg ptr = p->tmp_EAX; 414 415 struct x86_reg buf_stride = 416 x86_make_disp(p->machine_EDX, 417 get_offset(p, &p->buffer[buf_idx].stride)); 418 419 struct x86_reg buf_base_ptr = 420 x86_make_disp(p->machine_EDX, 421 get_offset(p, &p->buffer[buf_idx].base_ptr)); 422 423 424 425 /* Calculate pointer to current attrib: 426 */ 427 x86_mov(p->func, ptr, buf_stride); 428 x86_imul(p->func, ptr, elt); 429 x86_add(p->func, ptr, buf_base_ptr); 430 return ptr; 431 } 432} 433 434 435 436static boolean incr_inputs( struct translate_sse *p, 437 boolean linear ) 438{ 439 if (linear && p->nr_buffers == 1) { 440 struct x86_reg stride = x86_make_disp(p->machine_EDX, 441 get_offset(p, &p->buffer[0].stride)); 442 443 x86_add(p->func, p->idx_EBX, stride); 444 sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192)); 445 } 446 else if (linear) { 447 unsigned i; 448 449 /* Is this worthwhile?? 450 */ 451 for (i = 0; i < p->nr_buffers; i++) { 452 struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, 453 get_offset(p, &p->buffer[i].ptr)); 454 struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, 455 get_offset(p, &p->buffer[i].stride)); 456 457 x86_mov(p->func, p->tmp_EAX, buf_ptr); 458 x86_add(p->func, p->tmp_EAX, buf_stride); 459 if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192)); 460 x86_mov(p->func, buf_ptr, p->tmp_EAX); 461 } 462 } 463 else { 464 x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4)); 465 } 466 467 return TRUE; 468} 469 470 471/* Build run( struct translate *machine, 472 * unsigned start, 473 * unsigned count, 474 * void *output_buffer ) 475 * or 476 * run_elts( struct translate *machine, 477 * unsigned *elts, 478 * unsigned count, 479 * void *output_buffer ) 480 * 481 * Lots of hardcoding 482 * 483 * EAX -- pointer to current output vertex 484 * ECX -- pointer to current attribute 485 * 486 */ 487static boolean build_vertex_emit( struct translate_sse *p, 488 struct x86_function *func, 489 boolean linear ) 490{ 491 int fixup, label; 492 unsigned j; 493 494 p->tmp_EAX = x86_make_reg(file_REG32, reg_AX); 495 p->idx_EBX = x86_make_reg(file_REG32, reg_BX); 496 p->outbuf_ECX = x86_make_reg(file_REG32, reg_CX); 497 p->machine_EDX = x86_make_reg(file_REG32, reg_DX); 498 p->count_ESI = x86_make_reg(file_REG32, reg_SI); 499 500 p->func = func; 501 p->loaded_inv_255 = FALSE; 502 p->loaded_255 = FALSE; 503 p->loaded_identity = FALSE; 504 505 x86_init_func(p->func); 506 507 /* Push a few regs? 508 */ 509 x86_push(p->func, p->idx_EBX); 510 x86_push(p->func, p->count_ESI); 511 512 /* Load arguments into regs: 513 */ 514 x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1)); 515 x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2)); 516 x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3)); 517 x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 4)); 518 519 /* Get vertex count, compare to zero 520 */ 521 x86_xor(p->func, p->tmp_EAX, p->tmp_EAX); 522 x86_cmp(p->func, p->count_ESI, p->tmp_EAX); 523 fixup = x86_jcc_forward(p->func, cc_E); 524 525 /* always load, needed or not: 526 */ 527 init_inputs(p, linear); 528 529 /* Note address for loop jump 530 */ 531 label = x86_get_label(p->func); 532 { 533 struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX); 534 int last_vb = -1; 535 struct x86_reg vb; 536 537 for (j = 0; j < p->translate.key.nr_elements; j++) { 538 const struct translate_element *a = &p->translate.key.element[j]; 539 540 /* Figure out source pointer address: 541 */ 542 if (a->input_buffer != last_vb) { 543 last_vb = a->input_buffer; 544 vb = get_buffer_ptr(p, linear, a->input_buffer, elt); 545 } 546 547 if (!translate_attr( p, a, 548 x86_make_disp(vb, a->input_offset), 549 x86_make_disp(p->outbuf_ECX, a->output_offset))) 550 return FALSE; 551 } 552 553 /* Next output vertex: 554 */ 555 x86_lea(p->func, 556 p->outbuf_ECX, 557 x86_make_disp(p->outbuf_ECX, 558 p->translate.key.output_stride)); 559 560 /* Incr index 561 */ 562 incr_inputs( p, linear ); 563 } 564 565 /* decr count, loop if not zero 566 */ 567 x86_dec(p->func, p->count_ESI); 568 x86_jcc(p->func, cc_NZ, label); 569 570 /* Exit mmx state? 571 */ 572 if (p->func->need_emms) 573 mmx_emms(p->func); 574 575 /* Land forward jump here: 576 */ 577 x86_fixup_fwd_jump(p->func, fixup); 578 579 /* Pop regs and return 580 */ 581 582 x86_pop(p->func, p->count_ESI); 583 x86_pop(p->func, p->idx_EBX); 584 x86_ret(p->func); 585 586 return TRUE; 587} 588 589 590 591 592 593 594 595static void translate_sse_set_buffer( struct translate *translate, 596 unsigned buf, 597 const void *ptr, 598 unsigned stride ) 599{ 600 struct translate_sse *p = (struct translate_sse *)translate; 601 602 if (buf < p->nr_buffers) { 603 p->buffer[buf].base_ptr = (char *)ptr; 604 p->buffer[buf].stride = stride; 605 } 606 607 if (0) debug_printf("%s %d/%d: %p %d\n", 608 __FUNCTION__, buf, 609 p->nr_buffers, 610 ptr, stride); 611} 612 613 614static void translate_sse_release( struct translate *translate ) 615{ 616 struct translate_sse *p = (struct translate_sse *)translate; 617 618 x86_release_func( &p->linear_func ); 619 x86_release_func( &p->elt_func ); 620 621 FREE(p); 622} 623 624static void PIPE_CDECL translate_sse_run_elts( struct translate *translate, 625 const unsigned *elts, 626 unsigned count, 627 void *output_buffer ) 628{ 629 struct translate_sse *p = (struct translate_sse *)translate; 630 631 p->gen_run_elts( translate, 632 elts, 633 count, 634 output_buffer ); 635} 636 637static void PIPE_CDECL translate_sse_run( struct translate *translate, 638 unsigned start, 639 unsigned count, 640 unsigned instance_id, 641 void *output_buffer ) 642{ 643 struct translate_sse *p = (struct translate_sse *)translate; 644 645 p->gen_run( translate, 646 start, 647 count, 648 output_buffer ); 649} 650 651 652struct translate *translate_sse2_create( const struct translate_key *key ) 653{ 654 struct translate_sse *p = NULL; 655 unsigned i; 656 657 if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2()) 658 goto fail; 659 660 p = CALLOC_STRUCT( translate_sse ); 661 if (p == NULL) 662 goto fail; 663 664 p->translate.key = *key; 665 p->translate.release = translate_sse_release; 666 p->translate.set_buffer = translate_sse_set_buffer; 667 p->translate.run_elts = translate_sse_run_elts; 668 p->translate.run = translate_sse_run; 669 670 for (i = 0; i < key->nr_elements; i++) 671 p->nr_buffers = MAX2( p->nr_buffers, key->element[i].input_buffer + 1 ); 672 673 if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers); 674 675 if (!build_vertex_emit(p, &p->linear_func, TRUE)) 676 goto fail; 677 678 if (!build_vertex_emit(p, &p->elt_func, FALSE)) 679 goto fail; 680 681 p->gen_run = (run_func)x86_get_func(&p->linear_func); 682 if (p->gen_run == NULL) 683 goto fail; 684 685 p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func); 686 if (p->gen_run_elts == NULL) 687 goto fail; 688 689 return &p->translate; 690 691 fail: 692 if (p) 693 translate_sse_release( &p->translate ); 694 695 return NULL; 696} 697 698 699 700#else 701 702struct translate *translate_sse2_create( const struct translate_key *key ) 703{ 704 return NULL; 705} 706 707#endif 708