/* translate_sse.c revision 09c0287b84725098c0b365668231ddf00487c84c */
1/* 2 * Copyright 2003 Tungsten Graphics, inc. 3 * All Rights Reserved. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * on the rights to use, copy, modify, merge, publish, distribute, sub 9 * license, and/or sell copies of the Software, and to permit persons to whom 10 * the Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 22 * USE OR OTHER DEALINGS IN THE SOFTWARE. 
23 * 24 * Authors: 25 * Keith Whitwell <keithw@tungstengraphics.com> 26 */ 27 28 29#include "pipe/p_config.h" 30#include "pipe/p_compiler.h" 31#include "util/u_memory.h" 32#include "util/u_math.h" 33 34#include "translate.h" 35 36 37#if defined(PIPE_ARCH_X86) 38 39#include "rtasm/rtasm_cpu.h" 40#include "rtasm/rtasm_x86sse.h" 41 42 43#define X 0 44#define Y 1 45#define Z 2 46#define W 3 47 48 49typedef void (PIPE_CDECL *run_func)( struct translate *translate, 50 unsigned start, 51 unsigned count, 52 unsigned instance_id, 53 void *output_buffer ); 54 55typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate, 56 const unsigned *elts, 57 unsigned count, 58 void *output_buffer ); 59 60struct translate_buffer { 61 const void *base_ptr; 62 unsigned stride; 63}; 64 65struct translate_buffer_varient { 66 unsigned buffer_index; 67 unsigned instance_divisor; 68 void *ptr; /* updated either per vertex or per instance */ 69}; 70 71 72struct translate_sse { 73 struct translate translate; 74 75 struct x86_function linear_func; 76 struct x86_function elt_func; 77 struct x86_function *func; 78 79 boolean loaded_identity; 80 boolean loaded_255; 81 boolean loaded_inv_255; 82 83 float identity[4]; 84 float float_255[4]; 85 float inv_255[4]; 86 87 struct translate_buffer buffer[PIPE_MAX_ATTRIBS]; 88 unsigned nr_buffers; 89 90 /* Multiple buffer varients can map to a single buffer. */ 91 struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS]; 92 unsigned nr_buffer_varients; 93 94 /* Multiple elements can map to a single buffer varient. */ 95 unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS]; 96 97 boolean use_instancing; 98 unsigned instance_id; 99 100 run_func gen_run; 101 run_elts_func gen_run_elts; 102 103 /* these are actually known values, but putting them in a struct 104 * like this is helpful to keep them in sync across the file. 
105 */ 106 struct x86_reg tmp_EAX; 107 struct x86_reg idx_EBX; /* either start+i or &elt[i] */ 108 struct x86_reg outbuf_ECX; 109 struct x86_reg machine_EDX; 110 struct x86_reg count_ESI; /* decrements to zero */ 111}; 112 113static int get_offset( const void *a, const void *b ) 114{ 115 return (const char *)b - (const char *)a; 116} 117 118 119 120static struct x86_reg get_identity( struct translate_sse *p ) 121{ 122 struct x86_reg reg = x86_make_reg(file_XMM, 6); 123 124 if (!p->loaded_identity) { 125 p->loaded_identity = TRUE; 126 p->identity[0] = 0; 127 p->identity[1] = 0; 128 p->identity[2] = 0; 129 p->identity[3] = 1; 130 131 sse_movups(p->func, reg, 132 x86_make_disp(p->machine_EDX, 133 get_offset(p, &p->identity[0]))); 134 } 135 136 return reg; 137} 138 139static struct x86_reg get_255( struct translate_sse *p ) 140{ 141 struct x86_reg reg = x86_make_reg(file_XMM, 7); 142 143 if (!p->loaded_255) { 144 p->loaded_255 = TRUE; 145 p->float_255[0] = 146 p->float_255[1] = 147 p->float_255[2] = 148 p->float_255[3] = 255.0f; 149 150 sse_movups(p->func, reg, 151 x86_make_disp(p->machine_EDX, 152 get_offset(p, &p->float_255[0]))); 153 } 154 155 return reg; 156} 157 158static struct x86_reg get_inv_255( struct translate_sse *p ) 159{ 160 struct x86_reg reg = x86_make_reg(file_XMM, 5); 161 162 if (!p->loaded_inv_255) { 163 p->loaded_inv_255 = TRUE; 164 p->inv_255[0] = 165 p->inv_255[1] = 166 p->inv_255[2] = 167 p->inv_255[3] = 1.0f / 255.0f; 168 169 sse_movups(p->func, reg, 170 x86_make_disp(p->machine_EDX, 171 get_offset(p, &p->inv_255[0]))); 172 } 173 174 return reg; 175} 176 177 178static void emit_load_R32G32B32A32( struct translate_sse *p, 179 struct x86_reg data, 180 struct x86_reg arg0 ) 181{ 182 sse_movups(p->func, data, arg0); 183} 184 185static void emit_load_R32G32B32( struct translate_sse *p, 186 struct x86_reg data, 187 struct x86_reg arg0 ) 188{ 189 /* Have to jump through some hoops: 190 * 191 * c 0 0 0 192 * c 0 0 1 193 * 0 0 c 1 194 * a b c 1 195 */ 
196 sse_movss(p->func, data, x86_make_disp(arg0, 8)); 197 sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) ); 198 sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) ); 199 sse_movlps(p->func, data, arg0); 200} 201 202static void emit_load_R32G32( struct translate_sse *p, 203 struct x86_reg data, 204 struct x86_reg arg0 ) 205{ 206 /* 0 0 0 1 207 * a b 0 1 208 */ 209 sse_movups(p->func, data, get_identity(p) ); 210 sse_movlps(p->func, data, arg0); 211} 212 213 214static void emit_load_R32( struct translate_sse *p, 215 struct x86_reg data, 216 struct x86_reg arg0 ) 217{ 218 /* a 0 0 0 219 * a 0 0 1 220 */ 221 sse_movss(p->func, data, arg0); 222 sse_orps(p->func, data, get_identity(p) ); 223} 224 225 226static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p, 227 struct x86_reg data, 228 struct x86_reg src ) 229{ 230 231 /* Load and unpack twice: 232 */ 233 sse_movss(p->func, data, src); 234 sse2_punpcklbw(p->func, data, get_identity(p)); 235 sse2_punpcklbw(p->func, data, get_identity(p)); 236 237 /* Convert to float: 238 */ 239 sse2_cvtdq2ps(p->func, data, data); 240 241 242 /* Scale by 1/255.0 243 */ 244 sse_mulps(p->func, data, get_inv_255(p)); 245} 246 247 248 249 250static void emit_store_R32G32B32A32( struct translate_sse *p, 251 struct x86_reg dest, 252 struct x86_reg dataXMM ) 253{ 254 sse_movups(p->func, dest, dataXMM); 255} 256 257static void emit_store_R32G32B32( struct translate_sse *p, 258 struct x86_reg dest, 259 struct x86_reg dataXMM ) 260{ 261 /* Emit two, shuffle, emit one. 262 */ 263 sse_movlps(p->func, dest, dataXMM); 264 sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! 
destructive */ 265 sse_movss(p->func, x86_make_disp(dest,8), dataXMM); 266} 267 268static void emit_store_R32G32( struct translate_sse *p, 269 struct x86_reg dest, 270 struct x86_reg dataXMM ) 271{ 272 sse_movlps(p->func, dest, dataXMM); 273} 274 275static void emit_store_R32( struct translate_sse *p, 276 struct x86_reg dest, 277 struct x86_reg dataXMM ) 278{ 279 sse_movss(p->func, dest, dataXMM); 280} 281 282 283 284static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p, 285 struct x86_reg dest, 286 struct x86_reg dataXMM ) 287{ 288 /* Scale by 255.0 289 */ 290 sse_mulps(p->func, dataXMM, get_255(p)); 291 292 /* Pack and emit: 293 */ 294 sse2_cvtps2dq(p->func, dataXMM, dataXMM); 295 sse2_packssdw(p->func, dataXMM, dataXMM); 296 sse2_packuswb(p->func, dataXMM, dataXMM); 297 sse_movss(p->func, dest, dataXMM); 298} 299 300 301 302 303 304/* Extended swizzles? Maybe later. 305 */ 306static void emit_swizzle( struct translate_sse *p, 307 struct x86_reg dest, 308 struct x86_reg src, 309 unsigned char shuffle ) 310{ 311 sse_shufps(p->func, dest, src, shuffle); 312} 313 314 315static boolean translate_attr( struct translate_sse *p, 316 const struct translate_element *a, 317 struct x86_reg srcECX, 318 struct x86_reg dstEAX) 319{ 320 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); 321 322 switch (a->input_format) { 323 case PIPE_FORMAT_R32_FLOAT: 324 emit_load_R32(p, dataXMM, srcECX); 325 break; 326 case PIPE_FORMAT_R32G32_FLOAT: 327 emit_load_R32G32(p, dataXMM, srcECX); 328 break; 329 case PIPE_FORMAT_R32G32B32_FLOAT: 330 emit_load_R32G32B32(p, dataXMM, srcECX); 331 break; 332 case PIPE_FORMAT_R32G32B32A32_FLOAT: 333 emit_load_R32G32B32A32(p, dataXMM, srcECX); 334 break; 335 case PIPE_FORMAT_B8G8R8A8_UNORM: 336 emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX); 337 emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W)); 338 break; 339 case PIPE_FORMAT_R8G8B8A8_UNORM: 340 emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX); 341 break; 342 default: 343 return FALSE; 344 } 345 346 
switch (a->output_format) { 347 case PIPE_FORMAT_R32_FLOAT: 348 emit_store_R32(p, dstEAX, dataXMM); 349 break; 350 case PIPE_FORMAT_R32G32_FLOAT: 351 emit_store_R32G32(p, dstEAX, dataXMM); 352 break; 353 case PIPE_FORMAT_R32G32B32_FLOAT: 354 emit_store_R32G32B32(p, dstEAX, dataXMM); 355 break; 356 case PIPE_FORMAT_R32G32B32A32_FLOAT: 357 emit_store_R32G32B32A32(p, dstEAX, dataXMM); 358 break; 359 case PIPE_FORMAT_B8G8R8A8_UNORM: 360 emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W)); 361 emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM); 362 break; 363 case PIPE_FORMAT_R8G8B8A8_UNORM: 364 emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM); 365 break; 366 default: 367 return FALSE; 368 } 369 370 return TRUE; 371} 372 373 374static boolean init_inputs( struct translate_sse *p, 375 boolean linear ) 376{ 377 unsigned i; 378 if (linear) { 379 struct x86_reg instance_id = x86_make_disp(p->machine_EDX, 380 get_offset(p, &p->instance_id)); 381 382 for (i = 0; i < p->nr_buffer_varients; i++) { 383 struct translate_buffer_varient *varient = &p->buffer_varient[i]; 384 struct translate_buffer *buffer = &p->buffer[varient->buffer_index]; 385 struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, 386 get_offset(p, &buffer->stride)); 387 struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, 388 get_offset(p, &varient->ptr)); 389 struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX, 390 get_offset(p, &buffer->base_ptr)); 391 struct x86_reg elt = p->idx_EBX; 392 struct x86_reg tmp_EAX = p->tmp_EAX; 393 394 /* Calculate pointer to first attrib: 395 * base_ptr + stride * index, where index depends on instance divisor 396 */ 397 if (varient->instance_divisor) { 398 /* Our index is instance ID divided by instance divisor. 
399 */ 400 x86_mov(p->func, tmp_EAX, instance_id); 401 402 if (varient->instance_divisor != 1) { 403 struct x86_reg tmp_EDX = p->machine_EDX; 404 struct x86_reg tmp_ECX = p->outbuf_ECX; 405 406 /* TODO: Add x86_shr() to rtasm and use it whenever 407 * instance divisor is power of two. 408 */ 409 410 x86_push(p->func, tmp_EDX); 411 x86_push(p->func, tmp_ECX); 412 x86_xor(p->func, tmp_EDX, tmp_EDX); 413 x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor); 414 x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */ 415 x86_pop(p->func, tmp_ECX); 416 x86_pop(p->func, tmp_EDX); 417 } 418 } else { 419 x86_mov(p->func, tmp_EAX, elt); 420 } 421 x86_imul(p->func, tmp_EAX, buf_stride); 422 x86_add(p->func, tmp_EAX, buf_base_ptr); 423 424 425 /* In the linear case, keep the buffer pointer instead of the 426 * index number. 427 */ 428 if (p->nr_buffer_varients == 1) 429 x86_mov(p->func, elt, tmp_EAX); 430 else 431 x86_mov(p->func, buf_ptr, tmp_EAX); 432 } 433 } 434 435 return TRUE; 436} 437 438 439static struct x86_reg get_buffer_ptr( struct translate_sse *p, 440 boolean linear, 441 unsigned var_idx, 442 struct x86_reg elt ) 443{ 444 if (linear && p->nr_buffer_varients == 1) { 445 return p->idx_EBX; 446 } 447 else if (linear) { 448 struct x86_reg ptr = p->tmp_EAX; 449 struct x86_reg buf_ptr = 450 x86_make_disp(p->machine_EDX, 451 get_offset(p, &p->buffer_varient[var_idx].ptr)); 452 453 x86_mov(p->func, ptr, buf_ptr); 454 return ptr; 455 } 456 else { 457 struct x86_reg ptr = p->tmp_EAX; 458 const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx]; 459 460 struct x86_reg buf_stride = 461 x86_make_disp(p->machine_EDX, 462 get_offset(p, &p->buffer[varient->buffer_index].stride)); 463 464 struct x86_reg buf_base_ptr = 465 x86_make_disp(p->machine_EDX, 466 get_offset(p, &p->buffer[varient->buffer_index].base_ptr)); 467 468 469 470 /* Calculate pointer to current attrib: 471 */ 472 x86_mov(p->func, ptr, buf_stride); 473 x86_imul(p->func, ptr, elt); 474 
x86_add(p->func, ptr, buf_base_ptr); 475 return ptr; 476 } 477} 478 479 480 481static boolean incr_inputs( struct translate_sse *p, 482 boolean linear ) 483{ 484 if (linear && p->nr_buffer_varients == 1) { 485 struct x86_reg stride = x86_make_disp(p->machine_EDX, 486 get_offset(p, &p->buffer[0].stride)); 487 488 if (p->buffer_varient[0].instance_divisor == 0) { 489 x86_add(p->func, p->idx_EBX, stride); 490 sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192)); 491 } 492 } 493 else if (linear) { 494 unsigned i; 495 496 /* Is this worthwhile?? 497 */ 498 for (i = 0; i < p->nr_buffer_varients; i++) { 499 struct translate_buffer_varient *varient = &p->buffer_varient[i]; 500 struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX, 501 get_offset(p, &varient->ptr)); 502 struct x86_reg buf_stride = x86_make_disp(p->machine_EDX, 503 get_offset(p, &p->buffer[varient->buffer_index].stride)); 504 505 if (varient->instance_divisor == 0) { 506 x86_mov(p->func, p->tmp_EAX, buf_ptr); 507 x86_add(p->func, p->tmp_EAX, buf_stride); 508 if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192)); 509 x86_mov(p->func, buf_ptr, p->tmp_EAX); 510 } 511 } 512 } 513 else { 514 x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4)); 515 } 516 517 return TRUE; 518} 519 520 521/* Build run( struct translate *machine, 522 * unsigned start, 523 * unsigned count, 524 * void *output_buffer ) 525 * or 526 * run_elts( struct translate *machine, 527 * unsigned *elts, 528 * unsigned count, 529 * void *output_buffer ) 530 * 531 * Lots of hardcoding 532 * 533 * EAX -- pointer to current output vertex 534 * ECX -- pointer to current attribute 535 * 536 */ 537static boolean build_vertex_emit( struct translate_sse *p, 538 struct x86_function *func, 539 boolean linear ) 540{ 541 int fixup, label; 542 unsigned j; 543 544 p->tmp_EAX = x86_make_reg(file_REG32, reg_AX); 545 p->idx_EBX = x86_make_reg(file_REG32, reg_BX); 546 p->outbuf_ECX = x86_make_reg(file_REG32, reg_CX); 547 p->machine_EDX = 
x86_make_reg(file_REG32, reg_DX); 548 p->count_ESI = x86_make_reg(file_REG32, reg_SI); 549 550 p->func = func; 551 p->loaded_inv_255 = FALSE; 552 p->loaded_255 = FALSE; 553 p->loaded_identity = FALSE; 554 555 x86_init_func(p->func); 556 557 /* Push a few regs? 558 */ 559 x86_push(p->func, p->idx_EBX); 560 x86_push(p->func, p->count_ESI); 561 562 /* Load arguments into regs: 563 */ 564 x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1)); 565 x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2)); 566 x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3)); 567 x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5)); 568 569 /* Load instance ID. 570 */ 571 if (p->use_instancing) { 572 x86_mov(p->func, 573 p->tmp_EAX, 574 x86_fn_arg(p->func, 4)); 575 x86_mov(p->func, 576 x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)), 577 p->tmp_EAX); 578 } 579 580 /* Get vertex count, compare to zero 581 */ 582 x86_xor(p->func, p->tmp_EAX, p->tmp_EAX); 583 x86_cmp(p->func, p->count_ESI, p->tmp_EAX); 584 fixup = x86_jcc_forward(p->func, cc_E); 585 586 /* always load, needed or not: 587 */ 588 init_inputs(p, linear); 589 590 /* Note address for loop jump 591 */ 592 label = x86_get_label(p->func); 593 { 594 struct x86_reg elt = linear ? 
p->idx_EBX : x86_deref(p->idx_EBX); 595 int last_varient = -1; 596 struct x86_reg vb; 597 598 for (j = 0; j < p->translate.key.nr_elements; j++) { 599 const struct translate_element *a = &p->translate.key.element[j]; 600 unsigned varient = p->element_to_buffer_varient[j]; 601 602 /* Figure out source pointer address: 603 */ 604 if (varient != last_varient) { 605 last_varient = varient; 606 vb = get_buffer_ptr(p, linear, varient, elt); 607 } 608 609 if (!translate_attr( p, a, 610 x86_make_disp(vb, a->input_offset), 611 x86_make_disp(p->outbuf_ECX, a->output_offset))) 612 return FALSE; 613 } 614 615 /* Next output vertex: 616 */ 617 x86_lea(p->func, 618 p->outbuf_ECX, 619 x86_make_disp(p->outbuf_ECX, 620 p->translate.key.output_stride)); 621 622 /* Incr index 623 */ 624 incr_inputs( p, linear ); 625 } 626 627 /* decr count, loop if not zero 628 */ 629 x86_dec(p->func, p->count_ESI); 630 x86_jcc(p->func, cc_NZ, label); 631 632 /* Exit mmx state? 633 */ 634 if (p->func->need_emms) 635 mmx_emms(p->func); 636 637 /* Land forward jump here: 638 */ 639 x86_fixup_fwd_jump(p->func, fixup); 640 641 /* Pop regs and return 642 */ 643 644 x86_pop(p->func, p->count_ESI); 645 x86_pop(p->func, p->idx_EBX); 646 x86_ret(p->func); 647 648 return TRUE; 649} 650 651 652 653 654 655 656 657static void translate_sse_set_buffer( struct translate *translate, 658 unsigned buf, 659 const void *ptr, 660 unsigned stride ) 661{ 662 struct translate_sse *p = (struct translate_sse *)translate; 663 664 if (buf < p->nr_buffers) { 665 p->buffer[buf].base_ptr = (char *)ptr; 666 p->buffer[buf].stride = stride; 667 } 668 669 if (0) debug_printf("%s %d/%d: %p %d\n", 670 __FUNCTION__, buf, 671 p->nr_buffers, 672 ptr, stride); 673} 674 675 676static void translate_sse_release( struct translate *translate ) 677{ 678 struct translate_sse *p = (struct translate_sse *)translate; 679 680 x86_release_func( &p->linear_func ); 681 x86_release_func( &p->elt_func ); 682 683 FREE(p); 684} 685 686static void 
PIPE_CDECL translate_sse_run_elts( struct translate *translate, 687 const unsigned *elts, 688 unsigned count, 689 void *output_buffer ) 690{ 691 struct translate_sse *p = (struct translate_sse *)translate; 692 693 p->gen_run_elts( translate, 694 elts, 695 count, 696 output_buffer ); 697} 698 699static void PIPE_CDECL translate_sse_run( struct translate *translate, 700 unsigned start, 701 unsigned count, 702 unsigned instance_id, 703 void *output_buffer ) 704{ 705 struct translate_sse *p = (struct translate_sse *)translate; 706 707 p->gen_run( translate, 708 start, 709 count, 710 instance_id, 711 output_buffer ); 712} 713 714 715struct translate *translate_sse2_create( const struct translate_key *key ) 716{ 717 struct translate_sse *p = NULL; 718 unsigned i; 719 720 if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2()) 721 goto fail; 722 723 p = CALLOC_STRUCT( translate_sse ); 724 if (p == NULL) 725 goto fail; 726 727 p->translate.key = *key; 728 p->translate.release = translate_sse_release; 729 p->translate.set_buffer = translate_sse_set_buffer; 730 p->translate.run_elts = translate_sse_run_elts; 731 p->translate.run = translate_sse_run; 732 733 for (i = 0; i < key->nr_elements; i++) { 734 unsigned j; 735 736 p->nr_buffers = MAX2( p->nr_buffers, key->element[i].input_buffer + 1 ); 737 738 if (key->element[i].instance_divisor) { 739 p->use_instancing = TRUE; 740 } 741 742 /* 743 * Map vertex element to vertex buffer varient. 
744 */ 745 for (j = 0; j < p->nr_buffer_varients; j++) { 746 if (p->buffer_varient[j].buffer_index == key->element[i].input_buffer && 747 p->buffer_varient[j].instance_divisor == key->element[i].instance_divisor) { 748 break; 749 } 750 } 751 if (j == p->nr_buffer_varients) { 752 p->buffer_varient[j].buffer_index = key->element[i].input_buffer; 753 p->buffer_varient[j].instance_divisor = key->element[i].instance_divisor; 754 p->nr_buffer_varients++; 755 } 756 p->element_to_buffer_varient[i] = j; 757 } 758 759 if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers); 760 761 if (!build_vertex_emit(p, &p->linear_func, TRUE)) 762 goto fail; 763 764 if (!build_vertex_emit(p, &p->elt_func, FALSE)) 765 goto fail; 766 767 p->gen_run = (run_func)x86_get_func(&p->linear_func); 768 if (p->gen_run == NULL) 769 goto fail; 770 771 p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func); 772 if (p->gen_run_elts == NULL) 773 goto fail; 774 775 return &p->translate; 776 777 fail: 778 if (p) 779 translate_sse_release( &p->translate ); 780 781 return NULL; 782} 783 784 785 786#else 787 788struct translate *translate_sse2_create( const struct translate_key *key ) 789{ 790 return NULL; 791} 792 793#endif 794