/* translate_sse.c — revision fc431a58dc1446383edc11aec2a0b7de5b363e5e */
/*
 * Copyright 2003 Tungsten Graphics, inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *   Keith Whitwell <keithw@tungstengraphics.com>
 */

/*
 * SSE/SSE2 back-end for the translate module: generates x86 machine code
 * at runtime (via rtasm) that fetches vertex attributes from one or more
 * input buffers, converts them between a handful of float/unorm formats,
 * and writes them to a packed output vertex.  Only compiled for
 * PIPE_ARCH_X86; otherwise translate_sse2_create() returns NULL.
 */

#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_memory.h"
#include "util/u_math.h"

#include "translate.h"


#if defined(PIPE_ARCH_X86)

#include "rtasm/rtasm_cpu.h"
#include "rtasm/rtasm_x86sse.h"


/* Component indices used with the SHUF() macro for shufps. */
#define X    0
#define Y    1
#define Z    2
#define W    3


/* Signature of the generated "linear" function: processes vertices
 * start..start+count-1.
 */
typedef void (PIPE_CDECL *run_func)( struct translate *translate,
                                     unsigned start,
                                     unsigned count,
                                     unsigned instance_id,
                                     void *output_buffer);

/* Signature of the generated indexed function: processes the vertices
 * named by the elts[] array.
 */
typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
                                          const unsigned *elts,
                                          unsigned count,
                                          unsigned instance_id,
                                          void *output_buffer);

/* One input vertex buffer as supplied through set_buffer(). */
struct translate_buffer {
   const void *base_ptr;
   unsigned stride;
   unsigned max_index;
};

/* A (buffer, instance_divisor) pair.  Elements sharing both values share
 * a varient and hence a single runtime pointer.
 */
struct translate_buffer_varient {
   unsigned buffer_index;
   unsigned instance_divisor;
   void *ptr;                    /* updated either per vertex or per instance */
};


/* Sentinel varient index meaning "read the instance ID, not a buffer". */
#define ELEMENT_BUFFER_INSTANCE_ID  1001


struct translate_sse {
   struct translate translate;

   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function *func;       /* function currently being emitted */

   /* Lazily-emitted XMM constant loads; one flag per constant, reset at
    * the start of each generated function.
    */
   boolean loaded_identity;
   boolean loaded_255;
   boolean loaded_inv_255;

   /* Constant data read by the generated code, addressed relative to the
    * machine pointer held in EDX.
    */
   float identity[4];
   float float_255[4];
   float inv_255[4];

   struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer varients can map to a single buffer. */
   struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS];
   unsigned nr_buffer_varients;

   /* Multiple elements can map to a single buffer varient. */
   unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS];

   boolean use_instancing;
   unsigned instance_id;            /* written by the generated prologue */

   run_func      gen_run;
   run_elts_func gen_run_elts;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg idx_EBX;          /* either start+i or &elt[i] */
   struct x86_reg outbuf_ECX;
   struct x86_reg machine_EDX;
   struct x86_reg count_ESI;        /* decrements to zero */
};

/* Byte offset of member b within the struct starting at a.  Used to
 * address translate_sse fields relative to machine_EDX at runtime.
 */
static int get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}



/* Return XMM6 containing {0, 0, 0, 1}; the load from the machine struct
 * is emitted at most once per generated function.
 */
static struct x86_reg get_identity( struct translate_sse *p )
{
   struct x86_reg reg = x86_make_reg(file_XMM, 6);

   if (!p->loaded_identity) {
      p->loaded_identity = TRUE;
      p->identity[0] = 0;
      p->identity[1] = 0;
      p->identity[2] = 0;
      p->identity[3] = 1;

      sse_movups(p->func, reg,
                 x86_make_disp(p->machine_EDX,
                               get_offset(p, &p->identity[0])));
   }

   return reg;
}

/* Return XMM7 containing {255, 255, 255, 255}; load emitted at most once
 * per generated function.
 */
static struct x86_reg get_255( struct translate_sse *p )
{
   struct x86_reg reg = x86_make_reg(file_XMM, 7);

   if (!p->loaded_255) {
      p->loaded_255 = TRUE;
      p->float_255[0] =
         p->float_255[1] =
         p->float_255[2] =
         p->float_255[3] = 255.0f;

      sse_movups(p->func, reg,
                 x86_make_disp(p->machine_EDX,
                               get_offset(p, &p->float_255[0])));
   }

   return reg;
}

/* Return XMM5 containing {1/255, 1/255, 1/255, 1/255}; load emitted at
 * most once per generated function.
 */
static struct x86_reg get_inv_255( struct translate_sse *p )
{
   struct x86_reg reg = x86_make_reg(file_XMM, 5);

   if (!p->loaded_inv_255) {
      p->loaded_inv_255 = TRUE;
      p->inv_255[0] =
         p->inv_255[1] =
         p->inv_255[2] =
         p->inv_255[3] = 1.0f / 255.0f;

      sse_movups(p->func, reg,
                 x86_make_disp(p->machine_EDX,
                               get_offset(p, &p->inv_255[0])));
   }

   return reg;
}


/* Emit code loading four floats from arg0 into the XMM register data. */
static void emit_load_R32G32B32A32( struct translate_sse *p,
                                    struct x86_reg data,
                                    struct x86_reg arg0 )
{
   sse_movups(p->func, data, arg0);
}

/* Emit code loading three floats and filling w with 1.0. */
static void emit_load_R32G32B32( struct translate_sse *p,
                                 struct x86_reg data,
                                 struct x86_reg arg0 )
{
   /* Have to jump through some hoops:
    *
    * c 0 0 0
    * c 0 0 1
    * 0 0 c 1
    * a b c 1
    */
   sse_movss(p->func, data, x86_make_disp(arg0, 8));
   sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
   sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
   sse_movlps(p->func, data, arg0);
}

/* Emit code loading two floats; z,w become 0,1. */
static void emit_load_R32G32( struct translate_sse *p,
                              struct x86_reg data,
                              struct x86_reg arg0 )
{
   /* 0 0 0 1
    * a b 0 1
    */
   sse_movups(p->func, data, get_identity(p) );
   sse_movlps(p->func, data, arg0);
}


/* Emit code loading one float; y,z,w become 0,0,1. */
static void emit_load_R32( struct translate_sse *p,
                           struct x86_reg data,
                           struct x86_reg arg0 )
{
   /* a 0 0 0
    * a 0 0 1
    */
   sse_movss(p->func, data, arg0);
   sse_orps(p->func, data, get_identity(p) );
}


/* Emit code loading four unorm8 components and converting them to floats
 * in [0,1].
 */
static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
                                      struct x86_reg data,
                                      struct x86_reg src )
{

   /* Load and unpack twice:
    */
   sse_movss(p->func, data, src);
   sse2_punpcklbw(p->func, data, get_identity(p));
   sse2_punpcklbw(p->func, data, get_identity(p));

   /* Convert to float:
    */
   sse2_cvtdq2ps(p->func, data, data);


   /* Scale by 1/255.0
    */
   sse_mulps(p->func, data, get_inv_255(p));
}




/* Emit code storing four floats from dataXMM to dest. */
static void emit_store_R32G32B32A32( struct translate_sse *p,
                                     struct x86_reg dest,
                                     struct x86_reg dataXMM )
{
   sse_movups(p->func, dest, dataXMM);
}

/* Emit code storing the low three floats of dataXMM. */
static void emit_store_R32G32B32( struct translate_sse *p,
                                  struct x86_reg dest,
                                  struct x86_reg dataXMM )
{
   /* Emit two, shuffle, emit one.
    */
   sse_movlps(p->func, dest, dataXMM);
   sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
   sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
}

/* Emit code storing the low two floats of dataXMM. */
static void emit_store_R32G32( struct translate_sse *p,
                               struct x86_reg dest,
                               struct x86_reg dataXMM )
{
   sse_movlps(p->func, dest, dataXMM);
}

/* Emit code storing the low float of dataXMM. */
static void emit_store_R32( struct translate_sse *p,
                            struct x86_reg dest,
                            struct x86_reg dataXMM )
{
   sse_movss(p->func, dest, dataXMM);
}



/* Emit code converting four floats in [0,1] to packed unorm8 and storing
 * them.  NOTE: clobbers dataXMM.
 */
static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
                                       struct x86_reg dest,
                                       struct x86_reg dataXMM )
{
   /* Scale by 255.0
    */
   sse_mulps(p->func, dataXMM, get_255(p));

   /* Pack and emit:
    */
   sse2_cvtps2dq(p->func, dataXMM, dataXMM);
   sse2_packssdw(p->func, dataXMM, dataXMM);
   sse2_packuswb(p->func, dataXMM, dataXMM);
   sse_movss(p->func, dest, dataXMM);
}





/* Extended swizzles?  Maybe later.
 */
static void emit_swizzle( struct translate_sse *p,
                          struct x86_reg dest,
                          struct x86_reg src,
                          unsigned char shuffle )
{
   sse_shufps(p->func, dest, src, shuffle);
}


/* Emit code translating one attribute: load from srcECX in input_format,
 * store to dstEAX in output_format (both as float internally, via XMM0).
 * Returns FALSE if either format is unsupported.
 */
static boolean translate_attr( struct translate_sse *p,
                               const struct translate_element *a,
                               struct x86_reg srcECX,
                               struct x86_reg dstEAX)
{
   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);

   switch (a->input_format) {
   case PIPE_FORMAT_R32_FLOAT:
      emit_load_R32(p, dataXMM, srcECX);
      break;
   case PIPE_FORMAT_R32G32_FLOAT:
      emit_load_R32G32(p, dataXMM, srcECX);
      break;
   case PIPE_FORMAT_R32G32B32_FLOAT:
      emit_load_R32G32B32(p, dataXMM, srcECX);
      break;
   case PIPE_FORMAT_R32G32B32A32_FLOAT:
      emit_load_R32G32B32A32(p, dataXMM, srcECX);
      break;
   case PIPE_FORMAT_B8G8R8A8_UNORM:
      /* Same unorm load as RGBA, then swap r/b. */
      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
      break;
   case PIPE_FORMAT_R8G8B8A8_UNORM:
      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
      break;
   default:
      return FALSE;
   }

   switch (a->output_format) {
   case PIPE_FORMAT_R32_FLOAT:
      emit_store_R32(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_R32G32_FLOAT:
      emit_store_R32G32(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_R32G32B32_FLOAT:
      emit_store_R32G32B32(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_R32G32B32A32_FLOAT:
      emit_store_R32G32B32A32(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_B8G8R8A8_UNORM:
      /* Swap r/b before the unorm pack. */
      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_R8G8B8A8_UNORM:
      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
      break;
   default:
      return FALSE;
   }

   return TRUE;
}


/* Emit the pre-loop setup: compute the initial attrib pointer for each
 * buffer varient (base_ptr + stride * index).  For instanced varients the
 * index is instance_id / instance_divisor; otherwise it is the start
 * index/element held in EBX.
 */
static boolean init_inputs( struct translate_sse *p,
                            boolean linear )
{
   unsigned i;
   struct x86_reg instance_id = x86_make_disp(p->machine_EDX,
                                              get_offset(p, &p->instance_id));

   for (i = 0; i < p->nr_buffer_varients; i++) {
      struct translate_buffer_varient *varient = &p->buffer_varient[i];
      struct translate_buffer *buffer = &p->buffer[varient->buffer_index];

      if (linear || varient->instance_divisor) {
         struct x86_reg buf_stride   = x86_make_disp(p->machine_EDX,
                                                     get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr      = x86_make_disp(p->machine_EDX,
                                                     get_offset(p, &varient->ptr));
         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
                                                     get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_EBX;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          *   base_ptr + stride * index, where index depends on instance divisor
          */
         if (varient->instance_divisor) {
            /* Our index is instance ID divided by instance divisor.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (varient->instance_divisor != 1) {
               struct x86_reg tmp_EDX = p->machine_EDX;
               struct x86_reg tmp_ECX = p->outbuf_ECX;

               /* TODO: Add x86_shr() to rtasm and use it whenever
                *       instance divisor is power of two.
                */

               /* EDX/ECX hold live values, so preserve them around the
                * unsigned divide (which uses EDX:EAX and clobbers EDX).
                */
               x86_push(p->func, tmp_EDX);
               x86_push(p->func, tmp_ECX);
               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
               x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
               x86_pop(p->func, tmp_ECX);
               x86_pop(p->func, tmp_EDX);
            }
         } else {
            x86_mov(p->func, tmp_EAX, elt);
         }

         /*
          * TODO: Respect translate_buffer::max_index.
          */

         x86_imul(p->func, tmp_EAX, buf_stride);
         x86_add(p->func, tmp_EAX, buf_base_ptr);


         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (linear && p->nr_buffer_varients == 1)
            x86_mov(p->func, elt, tmp_EAX);
         else
            x86_mov(p->func, buf_ptr, tmp_EAX);
      }
   }

   return TRUE;
}


/* Return an operand addressing the current input vertex for the given
 * buffer varient: either a memory operand for the instance ID, the EBX
 * pointer itself (single linear varient), the cached per-varient pointer,
 * or base_ptr + stride * elt computed into EAX.
 */
static struct x86_reg get_buffer_ptr( struct translate_sse *p,
                                      boolean linear,
                                      unsigned var_idx,
                                      struct x86_reg elt )
{
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
      return x86_make_disp(p->machine_EDX,
                           get_offset(p, &p->instance_id));
   }
   if (linear && p->nr_buffer_varients == 1) {
      return p->idx_EBX;
   }
   else if (linear || p->buffer_varient[var_idx].instance_divisor) {
      struct x86_reg ptr = p->tmp_EAX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer_varient[var_idx].ptr));

      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      struct x86_reg ptr = p->tmp_EAX;
      const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];

      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer[varient->buffer_index].stride));

      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer[varient->buffer_index].base_ptr));



      /* Calculate pointer to current attrib:
       */
      x86_mov(p->func, ptr, buf_stride);
      x86_imul(p->func, ptr, elt);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}



/* Emit the bottom-of-loop pointer/index increments.  Per-instance
 * varients (instance_divisor != 0) are not advanced per vertex.
 */
static boolean incr_inputs( struct translate_sse *p,
                            boolean linear )
{
   if (linear && p->nr_buffer_varients == 1) {
      struct x86_reg stride = x86_make_disp(p->machine_EDX,
                                            get_offset(p, &p->buffer[0].stride));

      if (p->buffer_varient[0].instance_divisor == 0) {
         x86_add(p->func, p->idx_EBX, stride);
         sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
      }
   }
   else if (linear) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_varients; i++) {
         struct translate_buffer_varient *varient = &p->buffer_varient[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
                                                get_offset(p, &varient->ptr));
         struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
                                                   get_offset(p, &p->buffer[varient->buffer_index].stride));

         if (varient->instance_divisor == 0) {
            x86_mov(p->func, p->tmp_EAX, buf_ptr);
            x86_add(p->func, p->tmp_EAX, buf_stride);
            if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      /* Indexed path: step EBX to the next element in the elts[] array. */
      x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4));
   }

   return TRUE;
}


/* Build run( struct translate *machine,
 *            unsigned start,
 *            unsigned count,
 *            void *output_buffer )
 * or
 *  run_elts( struct translate *machine,
 *            unsigned *elts,
 *            unsigned count,
 *            void *output_buffer )
 *
 * Lots of hardcoding
 *
 * EAX -- pointer to current output vertex
 * ECX -- pointer to current attribute
 *
 */
static boolean build_vertex_emit( struct translate_sse *p,
                                  struct x86_function *func,
                                  boolean linear )
{
   int fixup, label;
   unsigned j;

   p->tmp_EAX     = x86_make_reg(file_REG32, reg_AX);
   p->idx_EBX     = x86_make_reg(file_REG32, reg_BX);
   p->outbuf_ECX  = x86_make_reg(file_REG32, reg_CX);
   p->machine_EDX = x86_make_reg(file_REG32, reg_DX);
   p->count_ESI   = x86_make_reg(file_REG32, reg_SI);

   p->func = func;
   p->loaded_inv_255 = FALSE;
   p->loaded_255 = FALSE;
   p->loaded_identity = FALSE;

   x86_init_func(p->func);

   /* Push a few regs?
    */
   x86_push(p->func, p->idx_EBX);
   x86_push(p->func, p->count_ESI);

   /* Load arguments into regs:
    */
   x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1));
   x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2));
   x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3));
   x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5));

   /* Load instance ID.
    */
   if (p->use_instancing) {
      x86_mov(p->func,
              p->tmp_EAX,
              x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_ESI, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, linear);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      /* Linear: EBX holds the index (or a pointer, see init_inputs);
       * indexed: EBX points into elts[], so dereference it.
       */
      struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX);
      int last_varient = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned varient = p->element_to_buffer_varient[j];

         /* Figure out source pointer address:
          */
         if (varient != last_varient) {
            last_varient = varient;
            vb = get_buffer_ptr(p, linear, varient, elt);
         }

         if (!translate_attr( p, a,
                              x86_make_disp(vb, a->input_offset),
                              x86_make_disp(p->outbuf_ECX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x86_lea(p->func,
              p->outbuf_ECX,
              x86_make_disp(p->outbuf_ECX,
                            p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs( p, linear );
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_ESI);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */

   x86_pop(p->func, p->count_ESI);
   x86_pop(p->func, p->idx_EBX);
   x86_ret(p->func);

   return TRUE;
}







/* translate::set_buffer vtable entry: record the base pointer, stride and
 * max index for one input buffer (read by the generated code through the
 * machine struct).
 */
static void translate_sse_set_buffer( struct translate *translate,
                                      unsigned buf,
                                      const void *ptr,
                                      unsigned stride,
                                      unsigned max_index )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   if (buf < p->nr_buffers) {
      p->buffer[buf].base_ptr = (char *)ptr;
      p->buffer[buf].stride = stride;
      p->buffer[buf].max_index = max_index;
   }

   if (0) debug_printf("%s %d/%d: %p %d\n",
                       __FUNCTION__, buf,
                       p->nr_buffers,
                       ptr, stride);
}


/* translate::release vtable entry: free both generated functions and the
 * machine struct itself.
 */
static void translate_sse_release( struct translate *translate )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   x86_release_func( &p->linear_func );
   x86_release_func( &p->elt_func );

   FREE(p);
}

/* translate::run_elts vtable entry: trampoline into the generated
 * indexed-fetch function.
 */
static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
                                               const unsigned *elts,
                                               unsigned count,
                                               unsigned instance_id,
                                               void *output_buffer )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   p->gen_run_elts( translate,
                    elts,
                    count,
                    instance_id,
                    output_buffer);
}

/* translate::run vtable entry: trampoline into the generated linear-fetch
 * function.
 */
static void PIPE_CDECL translate_sse_run( struct translate *translate,
                                          unsigned start,
                                          unsigned count,
                                          unsigned instance_id,
                                          void *output_buffer )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   p->gen_run( translate,
               start,
               count,
               instance_id,
               output_buffer);
}


/* Create an SSE2 translate object for the given key, or return NULL if
 * the CPU lacks SSE/SSE2 or a format in the key is unsupported.  Builds
 * the buffer-varient tables, then generates both the linear and indexed
 * fetch functions.
 */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   struct translate_sse *p = NULL;
   unsigned i;

   if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
      goto fail;

   p = CALLOC_STRUCT( translate_sse );
   if (p == NULL)
      goto fail;

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;
   p->translate.run_elts = translate_sse_run_elts;
   p->translate.run = translate_sse_run;

   for (i = 0; i < key->nr_elements; i++) {
      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
         unsigned j;

         p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);

         if (key->element[i].instance_divisor) {
            p->use_instancing = TRUE;
         }

         /*
          * Map vertex element to vertex buffer varient.
          */
         for (j = 0; j < p->nr_buffer_varients; j++) {
            if (p->buffer_varient[j].buffer_index == key->element[i].input_buffer &&
                p->buffer_varient[j].instance_divisor == key->element[i].instance_divisor) {
               break;
            }
         }
         if (j == p->nr_buffer_varients) {
            p->buffer_varient[j].buffer_index = key->element[i].input_buffer;
            p->buffer_varient[j].instance_divisor = key->element[i].instance_divisor;
            p->nr_buffer_varients++;
         }
         p->element_to_buffer_varient[i] = j;
      } else {
         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);

         p->element_to_buffer_varient[i] = ELEMENT_BUFFER_INSTANCE_ID;
      }
   }

   if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);

   if (!build_vertex_emit(p, &p->linear_func, TRUE))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, FALSE))
      goto fail;

   p->gen_run = (run_func)x86_get_func(&p->linear_func);
   if (p->gen_run == NULL)
      goto fail;

   p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func);
   if (p->gen_run_elts == NULL)
      goto fail;

   return &p->translate;

 fail:
   if (p)
      translate_sse_release( &p->translate );

   return NULL;
}



#else

/* Non-x86 build: SSE translate back-end unavailable. */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   return NULL;
}

#endif