translate_sse.c revision fb3623b235f5caa9d76e656b1e5eda797c7c73eb
1/* 2 * Copyright 2003 Tungsten Graphics, inc. 3 * All Rights Reserved. 4 * 5 * Permission is hereby granted, free of charge, to any person obtaining a 6 * copy of this software and associated documentation files (the "Software"), 7 * to deal in the Software without restriction, including without limitation 8 * on the rights to use, copy, modify, merge, publish, distribute, sub 9 * license, and/or sell copies of the Software, and to permit persons to whom 10 * the Software is furnished to do so, subject to the following conditions: 11 * 12 * The above copyright notice and this permission notice (including the next 13 * paragraph) shall be included in all copies or substantial portions of the 14 * Software. 15 * 16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL 19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, 20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 22 * USE OR OTHER DEALINGS IN THE SOFTWARE. 23 * 24 * Authors: 25 * Keith Whitwell <keithw@tungstengraphics.com> 26 */ 27 28 29#include "pipe/p_compiler.h" 30#include "pipe/p_util.h" 31#include "util/u_simple_list.h" 32 33#include "translate.h" 34 35 36#if defined(__i386__) || defined(__386__) || defined(i386) 37 38#include "rtasm/rtasm_cpu.h" 39#include "rtasm/rtasm_x86sse.h" 40 41 42#define X 0 43#define Y 1 44#define Z 2 45#define W 3 46 47 48#ifdef WIN32 49#define RTASM __cdecl 50#else 51#define RTASM 52#endif 53 54typedef void (RTASM *run_func)( struct translate *translate, 55 unsigned start, 56 unsigned count, 57 void *output_buffer ); 58 59typedef void (RTASM *run_elts_func)( struct translate *translate, 60 const unsigned *elts, 61 unsigned count, 62 void *output_buffer ); 63 64 65 66struct translate_sse { 67 struct translate translate; 68 69 struct x86_function linear_func; 70 struct x86_function elt_func; 71 struct x86_function *func; 72 73 boolean loaded_identity; 74 boolean loaded_255; 75 boolean loaded_inv_255; 76 77 float identity[4]; 78 float float_255[4]; 79 float inv_255[4]; 80 81 struct { 82 char *input_ptr; 83 unsigned input_stride; 84 } attrib[PIPE_MAX_ATTRIBS]; 85 86 run_func gen_run; 87 run_elts_func gen_run_elts; 88 89}; 90 91static int get_offset( const void *a, const void *b ) 92{ 93 return (const char *)b - (const char *)a; 94} 95 96 97 98static struct x86_reg get_identity( struct translate_sse *p ) 99{ 100 struct x86_reg reg = x86_make_reg(file_XMM, 6); 101 102 if (!p->loaded_identity) { 103 /* Nasty: 104 */ 105 struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI); 106 107 p->loaded_identity = TRUE; 108 p->identity[0] = 0; 109 p->identity[1] = 0; 110 p->identity[2] = 0; 111 p->identity[3] = 1; 112 113 sse_movups(p->func, reg, 114 x86_make_disp(translateESI, 115 get_offset(p, &p->identity[0]))); 116 } 117 118 return reg; 119} 120 121static struct x86_reg get_255( struct translate_sse *p ) 122{ 123 struct x86_reg reg = x86_make_reg(file_XMM, 6); 124 125 if (!p->loaded_255) { 126 struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI); 127 128 p->loaded_255 = TRUE; 129 p->float_255[0] = 130 p->float_255[1] = 131 p->float_255[2] = 132 p->float_255[3] = 255.0f; 133 134 sse_movups(p->func, reg, 135 x86_make_disp(translateESI, 136 get_offset(p, &p->float_255[0]))); 137 } 138 139 return reg; 140 return x86_make_reg(file_XMM, 7); 141} 142 143static struct x86_reg get_inv_255( struct translate_sse *p ) 144{ 145 struct x86_reg reg = x86_make_reg(file_XMM, 5); 146 147 if (!p->loaded_inv_255) { 148 struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI); 149 150 p->loaded_inv_255 = TRUE; 151 p->inv_255[0] = 152 p->inv_255[1] = 153 p->inv_255[2] = 154 p->inv_255[3] = 1.0f / 255.0f; 155 156 sse_movups(p->func, reg, 157 x86_make_disp(translateESI, 158 get_offset(p, &p->inv_255[0]))); 159 } 160 161 return reg; 162} 163 164 165static void emit_load_R32G32B32A32( struct translate_sse *p, 166 struct x86_reg data, 167 struct x86_reg arg0 ) 168{ 169 sse_movups(p->func, data, arg0); 170} 171 172static void emit_load_R32G32B32( struct translate_sse *p, 173 struct x86_reg data, 174 struct x86_reg arg0 ) 175{ 176 /* Have to jump through some hoops: 177 * 178 * c 0 0 0 179 * c 0 0 1 180 * 0 0 c 1 181 * a b c 1 182 */ 183 sse_movss(p->func, data, x86_make_disp(arg0, 8)); 184 sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) ); 185 sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) ); 186 sse_movlps(p->func, data, arg0); 187} 188 189static void emit_load_R32G32( struct translate_sse *p, 190 struct x86_reg data, 191 struct x86_reg arg0 ) 192{ 193 /* 0 0 0 1 194 * a b 0 1 195 */ 196 sse_movups(p->func, data, get_identity(p) ); 197 sse_movlps(p->func, data, arg0); 198} 199 200 201static void emit_load_R32( struct translate_sse *p, 202 struct x86_reg data, 203 struct x86_reg arg0 ) 204{ 205 /* a 0 0 0 206 * a 0 0 1 207 */ 208 sse_movss(p->func, data, arg0); 209 sse_orps(p->func, data, get_identity(p) ); 210} 211 212 213static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p, 214 struct x86_reg data, 215 struct x86_reg src ) 216{ 217 218 /* Load and unpack twice: 219 */ 220 sse_movss(p->func, data, src); 221 sse2_punpcklbw(p->func, data, get_identity(p)); 222 sse2_punpcklbw(p->func, data, get_identity(p)); 223 224 /* Convert to float: 225 */ 226 sse2_cvtdq2ps(p->func, data, data); 227 228 229 /* Scale by 1/255.0 230 */ 231 sse_mulps(p->func, data, get_inv_255(p)); 232} 233 234 235 236 237static void emit_store_R32G32B32A32( struct translate_sse *p, 238 struct x86_reg dest, 239 struct x86_reg dataXMM ) 240{ 241 sse_movups(p->func, dest, dataXMM); 242} 243 244static void emit_store_R32G32B32( struct translate_sse *p, 245 struct x86_reg dest, 246 struct x86_reg dataXMM ) 247{ 248 /* Emit two, shuffle, emit one. 249 */ 250 sse_movlps(p->func, dest, dataXMM); 251 sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */ 252 sse_movss(p->func, x86_make_disp(dest,8), dataXMM); 253} 254 255static void emit_store_R32G32( struct translate_sse *p, 256 struct x86_reg dest, 257 struct x86_reg dataXMM ) 258{ 259 sse_movlps(p->func, dest, dataXMM); 260} 261 262static void emit_store_R32( struct translate_sse *p, 263 struct x86_reg dest, 264 struct x86_reg dataXMM ) 265{ 266 sse_movss(p->func, dest, dataXMM); 267} 268 269 270 271static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p, 272 struct x86_reg dest, 273 struct x86_reg dataXMM ) 274{ 275 /* Scale by 255.0 276 */ 277 sse_mulps(p->func, dataXMM, get_255(p)); 278 279 /* Pack and emit: 280 */ 281 sse2_cvtps2dq(p->func, dataXMM, dataXMM); 282 sse2_packssdw(p->func, dataXMM, dataXMM); 283 sse2_packuswb(p->func, dataXMM, dataXMM); 284 sse_movss(p->func, dest, dataXMM); 285} 286 287 288 289 290 291static void get_src_ptr( struct translate_sse *p, 292 struct x86_reg srcEAX, 293 struct x86_reg translateREG, 294 struct x86_reg eltREG, 295 unsigned a ) 296{ 297 struct x86_reg input_ptr = 298 x86_make_disp(translateREG, 299 get_offset(p, &p->attrib[a].input_ptr)); 300 301 struct x86_reg input_stride = 302 x86_make_disp(translateREG, 303 get_offset(p, &p->attrib[a].input_stride)); 304 305 /* Calculate pointer to current attrib: 306 */ 307 x86_mov(p->func, srcEAX, input_stride); 308 x86_imul(p->func, srcEAX, eltREG); 309 x86_add(p->func, srcEAX, input_ptr); 310} 311 312 313/* Extended swizzles? Maybe later. 314 */ 315static void emit_swizzle( struct translate_sse *p, 316 struct x86_reg dest, 317 struct x86_reg src, 318 unsigned shuffle ) 319{ 320 sse_shufps(p->func, dest, src, shuffle); 321} 322 323 324static boolean translate_attr( struct translate_sse *p, 325 const struct translate_element *a, 326 struct x86_reg srcECX, 327 struct x86_reg dstEAX) 328{ 329 struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); 330 331 switch (a->input_format) { 332 case PIPE_FORMAT_R32_FLOAT: 333 emit_load_R32(p, dataXMM, srcECX); 334 break; 335 case PIPE_FORMAT_R32G32_FLOAT: 336 emit_load_R32G32(p, dataXMM, srcECX); 337 break; 338 case PIPE_FORMAT_R32G32B32_FLOAT: 339 emit_load_R32G32B32(p, dataXMM, srcECX); 340 break; 341 case PIPE_FORMAT_R32G32B32A32_FLOAT: 342 emit_load_R32G32B32A32(p, dataXMM, srcECX); 343 break; 344 case PIPE_FORMAT_B8G8R8A8_UNORM: 345 emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX); 346 emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W)); 347 break; 348 case PIPE_FORMAT_R8G8B8A8_UNORM: 349 emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX); 350 break; 351 default: 352 return FALSE; 353 } 354 355 switch (a->output_format) { 356 case PIPE_FORMAT_R32_FLOAT: 357 emit_store_R32(p, dstEAX, dataXMM); 358 break; 359 case PIPE_FORMAT_R32G32_FLOAT: 360 emit_store_R32G32(p, dstEAX, dataXMM); 361 break; 362 case PIPE_FORMAT_R32G32B32_FLOAT: 363 emit_store_R32G32B32(p, dstEAX, dataXMM); 364 break; 365 case PIPE_FORMAT_R32G32B32A32_FLOAT: 366 emit_store_R32G32B32A32(p, dstEAX, dataXMM); 367 break; 368 case PIPE_FORMAT_B8G8R8A8_UNORM: 369 emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W)); 370 emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM); 371 break; 372 case PIPE_FORMAT_R8G8B8A8_UNORM: 373 emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM); 374 break; 375 default: 376 return FALSE; 377 } 378 379 return TRUE; 380} 381 382/* Build run( struct translate *translate, 383 * unsigned start, 384 * unsigned count, 385 * void *output_buffer ) 386 * or 387 * run_elts( struct translate *translate, 388 * unsigned *elts, 389 * unsigned count, 390 * void *output_buffer ) 391 * 392 * Lots of hardcoding 393 * 394 * EAX -- pointer to current output vertex 395 * ECX -- pointer to current attribute 396 * 397 */ 398static boolean build_vertex_emit( struct translate_sse *p, 399 struct x86_function *func, 400 boolean linear ) 401{ 402 struct x86_reg vertexECX = x86_make_reg(file_REG32, reg_AX); 403 struct x86_reg idxEBX = x86_make_reg(file_REG32, reg_BX); 404 struct x86_reg srcEAX = x86_make_reg(file_REG32, reg_CX); 405 struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP); 406 struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI); 407 int fixup, label; 408 unsigned j; 409 410 p->func = func; 411 p->loaded_inv_255 = FALSE; 412 p->loaded_255 = FALSE; 413 p->loaded_identity = FALSE; 414 415 x86_init_func(p->func); 416 417 /* Push a few regs? 418 */ 419 x86_push(p->func, countEBP); 420 x86_push(p->func, translateESI); 421 x86_push(p->func, idxEBX); 422 423 /* Get vertex count, compare to zero 424 */ 425 x86_xor(p->func, idxEBX, idxEBX); 426 x86_mov(p->func, countEBP, x86_fn_arg(p->func, 3)); 427 x86_cmp(p->func, countEBP, idxEBX); 428 fixup = x86_jcc_forward(p->func, cc_E); 429 430 /* If linear, idx is the current element, otherwise it is a pointer 431 * to the current element. 432 */ 433 x86_mov(p->func, idxEBX, x86_fn_arg(p->func, 2)); 434 435 /* Initialize destination register. 436 */ 437 x86_mov(p->func, vertexECX, x86_fn_arg(p->func, 4)); 438 439 /* Move argument 1 (translate_sse pointer) into a reg: 440 */ 441 x86_mov(p->func, translateESI, x86_fn_arg(p->func, 1)); 442 443 444 /* always load, needed or not: 445 */ 446 447 /* Note address for loop jump */ 448 label = x86_get_label(p->func); 449 450 451 for (j = 0; j < p->translate.key.nr_elements; j++) { 452 const struct translate_element *a = &p->translate.key.element[j]; 453 454 struct x86_reg destEAX = x86_make_disp(vertexECX, 455 a->output_offset); 456 457 /* Figure out source pointer address: 458 */ 459 if (linear) { 460 get_src_ptr(p, srcEAX, translateESI, idxEBX, j); 461 } 462 else { 463 get_src_ptr(p, srcEAX, translateESI, x86_deref(idxEBX), j); 464 } 465 466 if (!translate_attr( p, a, x86_deref(srcEAX), destEAX )) 467 return FALSE; 468 } 469 470 /* Next vertex: 471 */ 472 x86_lea(p->func, vertexECX, x86_make_disp(vertexECX, p->translate.key.output_stride)); 473 474 /* Incr index 475 */ /* Emit code for each of the attributes. Currently routes 476 * everything through SSE registers, even when it might be more 477 * efficient to stick with regular old x86. No optimization or 478 * other tricks - enough new ground to cover here just getting 479 * things working. 480 */ 481 482 if (linear) { 483 x86_inc(p->func, idxEBX); 484 } 485 else { 486 x86_lea(p->func, idxEBX, x86_make_disp(idxEBX, 4)); 487 } 488 489 /* decr count, loop if not zero 490 */ 491 x86_dec(p->func, countEBP); 492 x86_test(p->func, countEBP, countEBP); 493 x86_jcc(p->func, cc_NZ, label); 494 495 /* Exit mmx state? 496 */ 497 if (p->func->need_emms) 498 mmx_emms(p->func); 499 500 /* Land forward jump here: 501 */ 502 x86_fixup_fwd_jump(p->func, fixup); 503 504 /* Pop regs and return 505 */ 506 507 x86_pop(p->func, idxEBX); 508 x86_pop(p->func, translateESI); 509 x86_pop(p->func, countEBP); 510 x86_ret(p->func); 511 512 return TRUE; 513} 514 515 516 517 518 519 520 521static void translate_sse_set_buffer( struct translate *translate, 522 unsigned buf, 523 const void *ptr, 524 unsigned stride ) 525{ 526 struct translate_sse *p = (struct translate_sse *)translate; 527 unsigned i; 528 529 for (i = 0; i < p->translate.key.nr_elements; i++) { 530 if (p->translate.key.element[i].input_buffer == buf) { 531 p->attrib[i].input_ptr = ((char *)ptr + 532 p->translate.key.element[i].input_offset); 533 p->attrib[i].input_stride = stride; 534 } 535 } 536} 537 538 539static void translate_sse_release( struct translate *translate ) 540{ 541 struct translate_sse *p = (struct translate_sse *)translate; 542 543 x86_release_func( &p->linear_func ); 544 x86_release_func( &p->elt_func ); 545 546 FREE(p); 547} 548 549static void translate_sse_run_elts( struct translate *translate, 550 const unsigned *elts, 551 unsigned count, 552 void *output_buffer ) 553{ 554 struct translate_sse *p = (struct translate_sse *)translate; 555 556 p->gen_run_elts( translate, 557 elts, 558 count, 559 output_buffer ); 560} 561 562static void translate_sse_run( struct translate *translate, 563 unsigned start, 564 unsigned count, 565 void *output_buffer ) 566{ 567 struct translate_sse *p = (struct translate_sse *)translate; 568 569 p->gen_run( translate, 570 start, 571 count, 572 output_buffer ); 573} 574 575 576struct translate *translate_sse2_create( const struct translate_key *key ) 577{ 578 struct translate_sse *p = NULL; 579 580 if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2()) 581 goto fail; 582 583 p = CALLOC_STRUCT( translate_sse ); 584 if (p == NULL) 585 goto fail; 586 587 p->translate.key = *key; 588 p->translate.release = translate_sse_release; 589 p->translate.set_buffer = translate_sse_set_buffer; 590 p->translate.run_elts = translate_sse_run_elts; 591 p->translate.run = translate_sse_run; 592 593 if (!build_vertex_emit(p, &p->linear_func, TRUE)) 594 goto fail; 595 596 if (!build_vertex_emit(p, &p->elt_func, FALSE)) 597 goto fail; 598 599 p->gen_run = (run_func)x86_get_func(&p->linear_func); 600 if (p->gen_run == NULL) 601 goto fail; 602 603 p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func); 604 if (p->gen_run_elts == NULL) 605 goto fail; 606 607 return &p->translate; 608 609 fail: 610 if (p) 611 translate_sse_release( &p->translate ); 612 613 return NULL; 614} 615 616 617 618#else 619 620void translate_create_sse( const struct translate_key *key ) 621{ 622 return NULL; 623} 624 625#endif 626