translate_sse.c revision 7ca0ce38340144794267609646048b3820d594ab
1/*
2 * Copyright 2003 Tungsten Graphics, inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors:
25 *    Keith Whitwell <keithw@tungstengraphics.com>
26 */
27
28
29#include "pipe/p_config.h"
30#include "pipe/p_compiler.h"
31#include "util/u_memory.h"
32#include "util/u_math.h"
33
34#include "translate.h"
35
36
37#if defined(PIPE_ARCH_X86)
38
39#include "rtasm/rtasm_cpu.h"
40#include "rtasm/rtasm_x86sse.h"
41
42
43#define X    0
44#define Y    1
45#define Z    2
46#define W    3
47
48
49typedef void (PIPE_CDECL *run_func)( struct translate *translate,
50                                     unsigned start,
51                                     unsigned count,
52                                     void *output_buffer );
53
54typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
55                                          const unsigned *elts,
56                                          unsigned count,
57                                          void *output_buffer );
58
/* Per-input-buffer state.  The generated code reaches these fields
 * through displacements off the translate_sse pointer.
 */
struct translate_buffer {
   /* Start of the vertex data, as supplied through set_buffer(). */
   const void *base_ptr;

   /* Multiplied by the vertex index and added to base_ptr to locate a
    * vertex.
    */
   unsigned stride;

   /* Pointer to the current vertex; written and advanced per vertex by
    * the generated code on the multi-buffer linear path.
    */
   void *ptr;
};
64
65
66struct translate_sse {
67   struct translate translate;
68
69   struct x86_function linear_func;
70   struct x86_function elt_func;
71   struct x86_function *func;
72
73   boolean loaded_identity;
74   boolean loaded_255;
75   boolean loaded_inv_255;
76
77   float identity[4];
78   float float_255[4];
79   float inv_255[4];
80
81   struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
82   unsigned nr_buffers;
83
84   run_func      gen_run;
85   run_elts_func gen_run_elts;
86
87   /* these are actually known values, but putting them in a struct
88    * like this is helpful to keep them in sync across the file.
89    */
90   struct x86_reg tmp_EAX;
91   struct x86_reg idx_EBX;     /* either start+i or &elt[i] */
92   struct x86_reg outbuf_ECX;
93   struct x86_reg machine_EDX;
94   struct x86_reg count_ESI;    /* decrements to zero */
95};
96
/* Byte distance from `a` to `b` (positive when `b` lies after `a`).
 * Used to turn the address of a struct member into a displacement
 * from the struct's base.
 */
static int get_offset( const void *a, const void *b )
{
   const char *from = (const char *)a;
   const char *to = (const char *)b;

   return to - from;
}
101
102
103
104static struct x86_reg get_identity( struct translate_sse *p )
105{
106   struct x86_reg reg = x86_make_reg(file_XMM, 6);
107
108   if (!p->loaded_identity) {
109      p->loaded_identity = TRUE;
110      p->identity[0] = 0;
111      p->identity[1] = 0;
112      p->identity[2] = 0;
113      p->identity[3] = 1;
114
115      sse_movups(p->func, reg,
116		 x86_make_disp(p->machine_EDX,
117			       get_offset(p, &p->identity[0])));
118   }
119
120   return reg;
121}
122
123static struct x86_reg get_255( struct translate_sse *p )
124{
125   struct x86_reg reg = x86_make_reg(file_XMM, 7);
126
127   if (!p->loaded_255) {
128      p->loaded_255 = TRUE;
129      p->float_255[0] =
130	 p->float_255[1] =
131	 p->float_255[2] =
132	 p->float_255[3] = 255.0f;
133
134      sse_movups(p->func, reg,
135		 x86_make_disp(p->machine_EDX,
136			       get_offset(p, &p->float_255[0])));
137   }
138
139   return reg;
140}
141
142static struct x86_reg get_inv_255( struct translate_sse *p )
143{
144   struct x86_reg reg = x86_make_reg(file_XMM, 5);
145
146   if (!p->loaded_inv_255) {
147      p->loaded_inv_255 = TRUE;
148      p->inv_255[0] =
149	 p->inv_255[1] =
150	 p->inv_255[2] =
151	 p->inv_255[3] = 1.0f / 255.0f;
152
153      sse_movups(p->func, reg,
154		 x86_make_disp(p->machine_EDX,
155			       get_offset(p, &p->inv_255[0])));
156   }
157
158   return reg;
159}
160
161
162static void emit_load_R32G32B32A32( struct translate_sse *p,
163				    struct x86_reg data,
164				    struct x86_reg arg0 )
165{
166   sse_movups(p->func, data, arg0);
167}
168
169static void emit_load_R32G32B32( struct translate_sse *p,
170				 struct x86_reg data,
171				 struct x86_reg arg0 )
172{
173   /* Have to jump through some hoops:
174    *
175    * c 0 0 0
176    * c 0 0 1
177    * 0 0 c 1
178    * a b c 1
179    */
180   sse_movss(p->func, data, x86_make_disp(arg0, 8));
181   sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
182   sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
183   sse_movlps(p->func, data, arg0);
184}
185
186static void emit_load_R32G32( struct translate_sse *p,
187			   struct x86_reg data,
188			   struct x86_reg arg0 )
189{
190   /* 0 0 0 1
191    * a b 0 1
192    */
193   sse_movups(p->func, data, get_identity(p) );
194   sse_movlps(p->func, data, arg0);
195}
196
197
198static void emit_load_R32( struct translate_sse *p,
199			   struct x86_reg data,
200			   struct x86_reg arg0 )
201{
202   /* a 0 0 0
203    * a 0 0 1
204    */
205   sse_movss(p->func, data, arg0);
206   sse_orps(p->func, data, get_identity(p) );
207}
208
209
210static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
211				       struct x86_reg data,
212				       struct x86_reg src )
213{
214
215   /* Load and unpack twice:
216    */
217   sse_movss(p->func, data, src);
218   sse2_punpcklbw(p->func, data, get_identity(p));
219   sse2_punpcklbw(p->func, data, get_identity(p));
220
221   /* Convert to float:
222    */
223   sse2_cvtdq2ps(p->func, data, data);
224
225
226   /* Scale by 1/255.0
227    */
228   sse_mulps(p->func, data, get_inv_255(p));
229}
230
231
232
233
234static void emit_store_R32G32B32A32( struct translate_sse *p,
235				     struct x86_reg dest,
236				     struct x86_reg dataXMM )
237{
238   sse_movups(p->func, dest, dataXMM);
239}
240
241static void emit_store_R32G32B32( struct translate_sse *p,
242				  struct x86_reg dest,
243				  struct x86_reg dataXMM )
244{
245   /* Emit two, shuffle, emit one.
246    */
247   sse_movlps(p->func, dest, dataXMM);
248   sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
249   sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
250}
251
252static void emit_store_R32G32( struct translate_sse *p,
253			       struct x86_reg dest,
254			       struct x86_reg dataXMM )
255{
256   sse_movlps(p->func, dest, dataXMM);
257}
258
259static void emit_store_R32( struct translate_sse *p,
260			    struct x86_reg dest,
261			    struct x86_reg dataXMM )
262{
263   sse_movss(p->func, dest, dataXMM);
264}
265
266
267
268static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
269				       struct x86_reg dest,
270				       struct x86_reg dataXMM )
271{
272   /* Scale by 255.0
273    */
274   sse_mulps(p->func, dataXMM, get_255(p));
275
276   /* Pack and emit:
277    */
278   sse2_cvtps2dq(p->func, dataXMM, dataXMM);
279   sse2_packssdw(p->func, dataXMM, dataXMM);
280   sse2_packuswb(p->func, dataXMM, dataXMM);
281   sse_movss(p->func, dest, dataXMM);
282}
283
284
285
286
287
288/* Extended swizzles?  Maybe later.
289 */
290static void emit_swizzle( struct translate_sse *p,
291			  struct x86_reg dest,
292			  struct x86_reg src,
293			  unsigned char shuffle )
294{
295   sse_shufps(p->func, dest, src, shuffle);
296}
297
298
299static boolean translate_attr( struct translate_sse *p,
300			       const struct translate_element *a,
301			       struct x86_reg srcECX,
302			       struct x86_reg dstEAX)
303{
304   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
305
306   switch (a->input_format) {
307   case PIPE_FORMAT_R32_FLOAT:
308      emit_load_R32(p, dataXMM, srcECX);
309      break;
310   case PIPE_FORMAT_R32G32_FLOAT:
311      emit_load_R32G32(p, dataXMM, srcECX);
312      break;
313   case PIPE_FORMAT_R32G32B32_FLOAT:
314      emit_load_R32G32B32(p, dataXMM, srcECX);
315      break;
316   case PIPE_FORMAT_R32G32B32A32_FLOAT:
317      emit_load_R32G32B32A32(p, dataXMM, srcECX);
318      break;
319   case PIPE_FORMAT_B8G8R8A8_UNORM:
320      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
321      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
322      break;
323   case PIPE_FORMAT_R8G8B8A8_UNORM:
324      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
325      break;
326   default:
327      return FALSE;
328   }
329
330   switch (a->output_format) {
331   case PIPE_FORMAT_R32_FLOAT:
332      emit_store_R32(p, dstEAX, dataXMM);
333      break;
334   case PIPE_FORMAT_R32G32_FLOAT:
335      emit_store_R32G32(p, dstEAX, dataXMM);
336      break;
337   case PIPE_FORMAT_R32G32B32_FLOAT:
338      emit_store_R32G32B32(p, dstEAX, dataXMM);
339      break;
340   case PIPE_FORMAT_R32G32B32A32_FLOAT:
341      emit_store_R32G32B32A32(p, dstEAX, dataXMM);
342      break;
343   case PIPE_FORMAT_B8G8R8A8_UNORM:
344      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
345      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
346      break;
347   case PIPE_FORMAT_R8G8B8A8_UNORM:
348      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
349      break;
350   default:
351      return FALSE;
352   }
353
354   return TRUE;
355}
356
357
358static boolean init_inputs( struct translate_sse *p,
359                            boolean linear )
360{
361   unsigned i;
362   if (linear) {
363      for (i = 0; i < p->nr_buffers; i++) {
364         struct x86_reg buf_stride   = x86_make_disp(p->machine_EDX,
365                                                     get_offset(p, &p->buffer[i].stride));
366         struct x86_reg buf_ptr      = x86_make_disp(p->machine_EDX,
367                                                     get_offset(p, &p->buffer[i].ptr));
368         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
369                                                     get_offset(p, &p->buffer[i].base_ptr));
370         struct x86_reg elt = p->idx_EBX;
371         struct x86_reg tmp = p->tmp_EAX;
372
373
374         /* Calculate pointer to first attrib:
375          */
376         x86_mov(p->func, tmp, buf_stride);
377         x86_imul(p->func, tmp, elt);
378         x86_add(p->func, tmp, buf_base_ptr);
379
380
381         /* In the linear case, keep the buffer pointer instead of the
382          * index number.
383          */
384         if (p->nr_buffers == 1)
385            x86_mov( p->func, elt, tmp );
386         else
387            x86_mov( p->func, buf_ptr, tmp );
388      }
389   }
390
391   return TRUE;
392}
393
394
395static struct x86_reg get_buffer_ptr( struct translate_sse *p,
396                                      boolean linear,
397                                      unsigned buf_idx,
398                                      struct x86_reg elt )
399{
400   if (linear && p->nr_buffers == 1) {
401      return p->idx_EBX;
402   }
403   else if (linear) {
404      struct x86_reg ptr = p->tmp_EAX;
405      struct x86_reg buf_ptr =
406         x86_make_disp(p->machine_EDX,
407                       get_offset(p, &p->buffer[buf_idx].ptr));
408
409      x86_mov(p->func, ptr, buf_ptr);
410      return ptr;
411   }
412   else {
413      struct x86_reg ptr = p->tmp_EAX;
414
415      struct x86_reg buf_stride =
416         x86_make_disp(p->machine_EDX,
417                       get_offset(p, &p->buffer[buf_idx].stride));
418
419      struct x86_reg buf_base_ptr =
420         x86_make_disp(p->machine_EDX,
421                       get_offset(p, &p->buffer[buf_idx].base_ptr));
422
423
424
425      /* Calculate pointer to current attrib:
426       */
427      x86_mov(p->func, ptr, buf_stride);
428      x86_imul(p->func, ptr, elt);
429      x86_add(p->func, ptr, buf_base_ptr);
430      return ptr;
431   }
432}
433
434
435
436static boolean incr_inputs( struct translate_sse *p,
437                            boolean linear )
438{
439   if (linear && p->nr_buffers == 1) {
440      struct x86_reg stride = x86_make_disp(p->machine_EDX,
441                                            get_offset(p, &p->buffer[0].stride));
442
443      x86_add(p->func, p->idx_EBX, stride);
444      sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
445   }
446   else if (linear) {
447      unsigned i;
448
449      /* Is this worthwhile??
450       */
451      for (i = 0; i < p->nr_buffers; i++) {
452         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
453                                                get_offset(p, &p->buffer[i].ptr));
454         struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
455                                                   get_offset(p, &p->buffer[i].stride));
456
457         x86_mov(p->func, p->tmp_EAX, buf_ptr);
458         x86_add(p->func, p->tmp_EAX, buf_stride);
459         if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
460         x86_mov(p->func, buf_ptr, p->tmp_EAX);
461      }
462   }
463   else {
464      x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4));
465   }
466
467   return TRUE;
468}
469
470
471/* Build run( struct translate *machine,
472 *            unsigned start,
473 *            unsigned count,
474 *            void *output_buffer )
475 * or
476 *  run_elts( struct translate *machine,
477 *            unsigned *elts,
478 *            unsigned count,
479 *            void *output_buffer )
480 *
481 *  Lots of hardcoding
482 *
483 * EAX -- pointer to current output vertex
484 * ECX -- pointer to current attribute
485 *
486 */
487static boolean build_vertex_emit( struct translate_sse *p,
488				  struct x86_function *func,
489				  boolean linear )
490{
491   int fixup, label;
492   unsigned j;
493
494   p->tmp_EAX       = x86_make_reg(file_REG32, reg_AX);
495   p->idx_EBX       = x86_make_reg(file_REG32, reg_BX);
496   p->outbuf_ECX    = x86_make_reg(file_REG32, reg_CX);
497   p->machine_EDX   = x86_make_reg(file_REG32, reg_DX);
498   p->count_ESI     = x86_make_reg(file_REG32, reg_SI);
499
500   p->func = func;
501   p->loaded_inv_255 = FALSE;
502   p->loaded_255 = FALSE;
503   p->loaded_identity = FALSE;
504
505   x86_init_func(p->func);
506
507   /* Push a few regs?
508    */
509   x86_push(p->func, p->idx_EBX);
510   x86_push(p->func, p->count_ESI);
511
512   /* Load arguments into regs:
513    */
514   x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1));
515   x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2));
516   x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3));
517   x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 4));
518
519   /* Get vertex count, compare to zero
520    */
521   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
522   x86_cmp(p->func, p->count_ESI, p->tmp_EAX);
523   fixup = x86_jcc_forward(p->func, cc_E);
524
525   /* always load, needed or not:
526    */
527   init_inputs(p, linear);
528
529   /* Note address for loop jump
530    */
531   label = x86_get_label(p->func);
532   {
533      struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX);
534      int last_vb = -1;
535      struct x86_reg vb;
536
537      for (j = 0; j < p->translate.key.nr_elements; j++) {
538         const struct translate_element *a = &p->translate.key.element[j];
539
540         /* Figure out source pointer address:
541          */
542         if (a->input_buffer != last_vb) {
543            last_vb = a->input_buffer;
544            vb = get_buffer_ptr(p, linear, a->input_buffer, elt);
545         }
546
547         if (!translate_attr( p, a,
548                              x86_make_disp(vb, a->input_offset),
549                              x86_make_disp(p->outbuf_ECX, a->output_offset)))
550            return FALSE;
551      }
552
553      /* Next output vertex:
554       */
555      x86_lea(p->func,
556              p->outbuf_ECX,
557              x86_make_disp(p->outbuf_ECX,
558                            p->translate.key.output_stride));
559
560      /* Incr index
561       */
562      incr_inputs( p, linear );
563   }
564
565   /* decr count, loop if not zero
566    */
567   x86_dec(p->func, p->count_ESI);
568   x86_jcc(p->func, cc_NZ, label);
569
570   /* Exit mmx state?
571    */
572   if (p->func->need_emms)
573      mmx_emms(p->func);
574
575   /* Land forward jump here:
576    */
577   x86_fixup_fwd_jump(p->func, fixup);
578
579   /* Pop regs and return
580    */
581
582   x86_pop(p->func, p->count_ESI);
583   x86_pop(p->func, p->idx_EBX);
584   x86_ret(p->func);
585
586   return TRUE;
587}
588
589
590
591
592
593
594
595static void translate_sse_set_buffer( struct translate *translate,
596				unsigned buf,
597				const void *ptr,
598				unsigned stride )
599{
600   struct translate_sse *p = (struct translate_sse *)translate;
601
602   if (buf < p->nr_buffers) {
603      p->buffer[buf].base_ptr = (char *)ptr;
604      p->buffer[buf].stride = stride;
605   }
606
607   if (0) debug_printf("%s %d/%d: %p %d\n",
608                       __FUNCTION__, buf,
609                       p->nr_buffers,
610                       ptr, stride);
611}
612
613
614static void translate_sse_release( struct translate *translate )
615{
616   struct translate_sse *p = (struct translate_sse *)translate;
617
618   x86_release_func( &p->linear_func );
619   x86_release_func( &p->elt_func );
620
621   FREE(p);
622}
623
624static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
625			      const unsigned *elts,
626			      unsigned count,
627			      void *output_buffer )
628{
629   struct translate_sse *p = (struct translate_sse *)translate;
630
631   p->gen_run_elts( translate,
632		    elts,
633		    count,
634		    output_buffer );
635}
636
637static void PIPE_CDECL translate_sse_run( struct translate *translate,
638			 unsigned start,
639			 unsigned count,
640                         unsigned instance_id,
641			 void *output_buffer )
642{
643   struct translate_sse *p = (struct translate_sse *)translate;
644
645   p->gen_run( translate,
646	       start,
647	       count,
648	       output_buffer );
649}
650
651
652struct translate *translate_sse2_create( const struct translate_key *key )
653{
654   struct translate_sse *p = NULL;
655   unsigned i;
656
657   if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
658      goto fail;
659
660   p = CALLOC_STRUCT( translate_sse );
661   if (p == NULL)
662      goto fail;
663
664   p->translate.key = *key;
665   p->translate.release = translate_sse_release;
666   p->translate.set_buffer = translate_sse_set_buffer;
667   p->translate.run_elts = translate_sse_run_elts;
668   p->translate.run = translate_sse_run;
669
670   for (i = 0; i < key->nr_elements; i++)
671      p->nr_buffers = MAX2( p->nr_buffers, key->element[i].input_buffer + 1 );
672
673   if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);
674
675   if (!build_vertex_emit(p, &p->linear_func, TRUE))
676      goto fail;
677
678   if (!build_vertex_emit(p, &p->elt_func, FALSE))
679      goto fail;
680
681   p->gen_run = (run_func)x86_get_func(&p->linear_func);
682   if (p->gen_run == NULL)
683      goto fail;
684
685   p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func);
686   if (p->gen_run_elts == NULL)
687      goto fail;
688
689   return &p->translate;
690
691 fail:
692   if (p)
693      translate_sse_release( &p->translate );
694
695   return NULL;
696}
697
698
699
700#else
701
702struct translate *translate_sse2_create( const struct translate_key *key )
703{
704   return NULL;
705}
706
707#endif
708