translate_sse.c revision 8618e6aa16bdba2c8b08124261bbaedaf7e22447
1/*
2 * Copyright 2003 Tungsten Graphics, inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors:
25 *    Keith Whitwell <keithw@tungstengraphics.com>
26 */
27
28
29#include "pipe/p_compiler.h"
30#include "pipe/p_util.h"
31#include "util/u_simple_list.h"
32
33#include "translate.h"
34
35
36#if defined(__i386__) || defined(__386__) || defined(i386)
37
38#include "rtasm/rtasm_cpu.h"
39#include "rtasm/rtasm_x86sse.h"
40
41
/* Vector component indices, used when building SHUF() selectors.
 */
#define X    0
#define Y    1
#define Z    2
#define W    3


/* Calling-convention qualifier for the generated-code entry points:
 * __cdecl on WIN32 builds, nothing elsewhere.
 */
#if defined(WIN32)
#define RTASM __cdecl
#else
#define RTASM
#endif

/* Signature of the generated "linear" routine (see translate_sse_run).
 */
typedef void (RTASM *run_func)(
   struct translate *translate,
   unsigned start,
   unsigned count,
   void *output_buffer );

/* Signature of the generated "indexed" routine (see
 * translate_sse_run_elts).
 */
typedef void (RTASM *run_elts_func)(
   struct translate *translate,
   const unsigned *elts,
   unsigned count,
   void *output_buffer );
63
64
65
66struct translate_sse {
67   struct translate translate;
68
69   struct x86_function linear_func;
70   struct x86_function elt_func;
71   struct x86_function *func;
72
73   boolean loaded_identity;
74   boolean loaded_255;
75   boolean loaded_inv_255;
76
77   float identity[4];
78   float float_255[4];
79   float inv_255[4];
80
81   struct {
82      char *input_ptr;
83      unsigned input_stride;
84   } attrib[PIPE_MAX_ATTRIBS];
85
86   run_func      gen_run;
87   run_elts_func gen_run_elts;
88
89};
90
/* Byte distance from `a` to `b` (positive when `b` lies above `a`).
 * Used to turn the address of a struct member into a displacement
 * relative to the start of the struct.
 */
static int get_offset( const void *a, const void *b )
{
   const char *from = (const char *)a;
   const char *to = (const char *)b;

   return to - from;
}
95
96
97
98static struct x86_reg get_identity( struct translate_sse *p )
99{
100   struct x86_reg reg = x86_make_reg(file_XMM, 6);
101
102   if (!p->loaded_identity) {
103      /* Nasty:
104       */
105      struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
106
107      p->loaded_identity = TRUE;
108      p->identity[0] = 0;
109      p->identity[1] = 0;
110      p->identity[2] = 0;
111      p->identity[3] = 1;
112
113      sse_movups(p->func, reg,
114		 x86_make_disp(translateESI,
115			       get_offset(p, &p->identity[0])));
116   }
117
118   return reg;
119}
120
121static struct x86_reg get_255( struct translate_sse *p )
122{
123   struct x86_reg reg = x86_make_reg(file_XMM, 6);
124
125   if (!p->loaded_255) {
126      struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
127
128      p->loaded_255 = TRUE;
129      p->float_255[0] =
130	 p->float_255[1] =
131	 p->float_255[2] =
132	 p->float_255[3] = 255.0f;
133
134      sse_movups(p->func, reg,
135		 x86_make_disp(translateESI,
136			       get_offset(p, &p->float_255[0])));
137   }
138
139   return reg;
140   return x86_make_reg(file_XMM, 7);
141}
142
143static struct x86_reg get_inv_255( struct translate_sse *p )
144{
145   struct x86_reg reg = x86_make_reg(file_XMM, 5);
146
147   if (!p->loaded_inv_255) {
148      struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
149
150      p->loaded_inv_255 = TRUE;
151      p->inv_255[0] =
152	 p->inv_255[1] =
153	 p->inv_255[2] =
154	 p->inv_255[3] = 1.0f / 255.0f;
155
156      sse_movups(p->func, reg,
157		 x86_make_disp(translateESI,
158			       get_offset(p, &p->inv_255[0])));
159   }
160
161   return reg;
162}
163
164
165static void emit_load_R32G32B32A32( struct translate_sse *p,
166				    struct x86_reg data,
167				    struct x86_reg arg0 )
168{
169   sse_movups(p->func, data, arg0);
170}
171
172static void emit_load_R32G32B32( struct translate_sse *p,
173				 struct x86_reg data,
174				 struct x86_reg arg0 )
175{
176   /* Have to jump through some hoops:
177    *
178    * c 0 0 0
179    * c 0 0 1
180    * 0 0 c 1
181    * a b c 1
182    */
183   sse_movss(p->func, data, x86_make_disp(arg0, 8));
184   sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
185   sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
186   sse_movlps(p->func, data, arg0);
187}
188
189static void emit_load_R32G32( struct translate_sse *p,
190			   struct x86_reg data,
191			   struct x86_reg arg0 )
192{
193   /* 0 0 0 1
194    * a b 0 1
195    */
196   sse_movups(p->func, data, get_identity(p) );
197   sse_movlps(p->func, data, arg0);
198}
199
200
201static void emit_load_R32( struct translate_sse *p,
202			   struct x86_reg data,
203			   struct x86_reg arg0 )
204{
205   /* a 0 0 0
206    * a 0 0 1
207    */
208   sse_movss(p->func, data, arg0);
209   sse_orps(p->func, data, get_identity(p) );
210}
211
212
213static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
214				       struct x86_reg data,
215				       struct x86_reg src )
216{
217
218   /* Load and unpack twice:
219    */
220   sse_movss(p->func, data, src);
221   sse2_punpcklbw(p->func, data, get_identity(p));
222   sse2_punpcklbw(p->func, data, get_identity(p));
223
224   /* Convert to float:
225    */
226   sse2_cvtdq2ps(p->func, data, data);
227
228
229   /* Scale by 1/255.0
230    */
231   sse_mulps(p->func, data, get_inv_255(p));
232}
233
234
235
236
237static void emit_store_R32G32B32A32( struct translate_sse *p,
238				     struct x86_reg dest,
239				     struct x86_reg dataXMM )
240{
241   sse_movups(p->func, dest, dataXMM);
242}
243
244static void emit_store_R32G32B32( struct translate_sse *p,
245				  struct x86_reg dest,
246				  struct x86_reg dataXMM )
247{
248   /* Emit two, shuffle, emit one.
249    */
250   sse_movlps(p->func, dest, dataXMM);
251   sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
252   sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
253}
254
255static void emit_store_R32G32( struct translate_sse *p,
256			       struct x86_reg dest,
257			       struct x86_reg dataXMM )
258{
259   sse_movlps(p->func, dest, dataXMM);
260}
261
262static void emit_store_R32( struct translate_sse *p,
263			    struct x86_reg dest,
264			    struct x86_reg dataXMM )
265{
266   sse_movss(p->func, dest, dataXMM);
267}
268
269
270
271static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
272				       struct x86_reg dest,
273				       struct x86_reg dataXMM )
274{
275   /* Scale by 255.0
276    */
277   sse_mulps(p->func, dataXMM, get_255(p));
278
279   /* Pack and emit:
280    */
281   sse2_cvtps2dq(p->func, dataXMM, dataXMM);
282   sse2_packssdw(p->func, dataXMM, dataXMM);
283   sse2_packuswb(p->func, dataXMM, dataXMM);
284   sse_movss(p->func, dest, dataXMM);
285}
286
287
288
289
290
291static void get_src_ptr( struct translate_sse *p,
292			 struct x86_reg srcEAX,
293			 struct x86_reg translateREG,
294			 struct x86_reg eltREG,
295			 unsigned a )
296{
297   struct x86_reg input_ptr =
298      x86_make_disp(translateREG,
299		    get_offset(p, &p->attrib[a].input_ptr));
300
301   struct x86_reg input_stride =
302      x86_make_disp(translateREG,
303		    get_offset(p, &p->attrib[a].input_stride));
304
305   /* Calculate pointer to current attrib:
306    */
307   x86_mov(p->func, srcEAX, input_stride);
308   x86_imul(p->func, srcEAX, eltREG);
309   x86_add(p->func, srcEAX, input_ptr);
310}
311
312
313/* Extended swizzles?  Maybe later.
314 */
315static void emit_swizzle( struct translate_sse *p,
316			  struct x86_reg dest,
317			  struct x86_reg src,
318			  unsigned shuffle )
319{
320   sse_shufps(p->func, dest, src, shuffle);
321}
322
323
324static boolean translate_attr( struct translate_sse *p,
325			       const struct translate_element *a,
326			       struct x86_reg srcECX,
327			       struct x86_reg dstEAX)
328{
329   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
330
331   switch (a->input_format) {
332   case PIPE_FORMAT_R32_FLOAT:
333      emit_load_R32(p, dataXMM, srcECX);
334      break;
335   case PIPE_FORMAT_R32G32_FLOAT:
336      emit_load_R32G32(p, dataXMM, srcECX);
337      break;
338   case PIPE_FORMAT_R32G32B32_FLOAT:
339      emit_load_R32G32B32(p, dataXMM, srcECX);
340      break;
341   case PIPE_FORMAT_R32G32B32A32_FLOAT:
342      emit_load_R32G32B32A32(p, dataXMM, srcECX);
343      break;
344   case PIPE_FORMAT_B8G8R8A8_UNORM:
345      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
346      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
347      break;
348   case PIPE_FORMAT_R8G8B8A8_UNORM:
349      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
350      break;
351   default:
352      return FALSE;
353   }
354
355   switch (a->output_format) {
356   case PIPE_FORMAT_R32_FLOAT:
357      emit_store_R32(p, dstEAX, dataXMM);
358      break;
359   case PIPE_FORMAT_R32G32_FLOAT:
360      emit_store_R32G32(p, dstEAX, dataXMM);
361      break;
362   case PIPE_FORMAT_R32G32B32_FLOAT:
363      emit_store_R32G32B32(p, dstEAX, dataXMM);
364      break;
365   case PIPE_FORMAT_R32G32B32A32_FLOAT:
366      emit_store_R32G32B32A32(p, dstEAX, dataXMM);
367      break;
368   case PIPE_FORMAT_B8G8R8A8_UNORM:
369      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
370      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
371      break;
372   case PIPE_FORMAT_R8G8B8A8_UNORM:
373      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
374      break;
375   default:
376      return FALSE;
377   }
378
379   return TRUE;
380}
381
382/* Build run( struct translate *translate,
383 *            unsigned start,
384 *            unsigned count,
385 *            void *output_buffer )
386 * or
387 *  run_elts( struct translate *translate,
388 *            unsigned *elts,
389 *            unsigned count,
390 *            void *output_buffer )
391 *
392 *  Lots of hardcoding
393 *
394 * EAX -- pointer to current output vertex
395 * ECX -- pointer to current attribute
396 *
397 */
398static boolean build_vertex_emit( struct translate_sse *p,
399				  struct x86_function *func,
400				  boolean linear )
401{
402   struct x86_reg vertexECX    = x86_make_reg(file_REG32, reg_AX);
403   struct x86_reg idxEBX       = x86_make_reg(file_REG32, reg_BX);
404   struct x86_reg srcEAX       = x86_make_reg(file_REG32, reg_CX);
405   struct x86_reg countEBP     = x86_make_reg(file_REG32, reg_BP);
406   struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
407   int fixup, label;
408   unsigned j;
409
410   p->func = func;
411   p->loaded_inv_255 = FALSE;
412   p->loaded_255 = FALSE;
413   p->loaded_identity = FALSE;
414
415   x86_init_func(p->func);
416
417   /* Push a few regs?
418    */
419   x86_push(p->func, countEBP);
420   x86_push(p->func, translateESI);
421   x86_push(p->func, idxEBX);
422
423   /* Get vertex count, compare to zero
424    */
425   x86_xor(p->func, idxEBX, idxEBX);
426   x86_mov(p->func, countEBP, x86_fn_arg(p->func, 3));
427   x86_cmp(p->func, countEBP, idxEBX);
428   fixup = x86_jcc_forward(p->func, cc_E);
429
430   /* If linear, idx is the current element, otherwise it is a pointer
431    * to the current element.
432    */
433   x86_mov(p->func, idxEBX, x86_fn_arg(p->func, 2));
434
435   /* Initialize destination register.
436    */
437   x86_mov(p->func, vertexECX, x86_fn_arg(p->func, 4));
438
439   /* Move argument 1 (translate_sse pointer) into a reg:
440    */
441   x86_mov(p->func, translateESI, x86_fn_arg(p->func, 1));
442
443
444   /* always load, needed or not:
445    */
446
447   /* Note address for loop jump */
448   label = x86_get_label(p->func);
449
450
451   for (j = 0; j < p->translate.key.nr_elements; j++) {
452      const struct translate_element *a = &p->translate.key.element[j];
453
454      struct x86_reg destEAX = x86_make_disp(vertexECX,
455					     a->output_offset);
456
457      /* Figure out source pointer address:
458       */
459      if (linear) {
460	 get_src_ptr(p, srcEAX, translateESI, idxEBX, j);
461      }
462      else {
463	 get_src_ptr(p, srcEAX, translateESI, x86_deref(idxEBX), j);
464      }
465
466      if (!translate_attr( p, a, x86_deref(srcEAX), destEAX ))
467	 return FALSE;
468   }
469
470   /* Next vertex:
471    */
472   x86_lea(p->func, vertexECX, x86_make_disp(vertexECX, p->translate.key.output_stride));
473
474   /* Incr index
475    */
476   if (linear) {
477      x86_inc(p->func, idxEBX);
478   }
479   else {
480      x86_lea(p->func, idxEBX, x86_make_disp(idxEBX, 4));
481   }
482
483   /* decr count, loop if not zero
484    */
485   x86_dec(p->func, countEBP);
486   x86_test(p->func, countEBP, countEBP);
487   x86_jcc(p->func, cc_NZ, label);
488
489   /* Exit mmx state?
490    */
491   if (p->func->need_emms)
492      mmx_emms(p->func);
493
494   /* Land forward jump here:
495    */
496   x86_fixup_fwd_jump(p->func, fixup);
497
498   /* Pop regs and return
499    */
500
501   x86_pop(p->func, idxEBX);
502   x86_pop(p->func, translateESI);
503   x86_pop(p->func, countEBP);
504   x86_ret(p->func);
505
506   return TRUE;
507}
508
509
510
511
512
513
514
515static void translate_sse_set_buffer( struct translate *translate,
516				unsigned buf,
517				const void *ptr,
518				unsigned stride )
519{
520   struct translate_sse *p = (struct translate_sse *)translate;
521   unsigned i;
522
523   for (i = 0; i < p->translate.key.nr_elements; i++) {
524      if (p->translate.key.element[i].input_buffer == buf) {
525	 p->attrib[i].input_ptr = ((char *)ptr +
526				    p->translate.key.element[i].input_offset);
527	 p->attrib[i].input_stride = stride;
528      }
529   }
530}
531
532
533static void translate_sse_release( struct translate *translate )
534{
535   struct translate_sse *p = (struct translate_sse *)translate;
536
537   x86_release_func( &p->linear_func );
538   x86_release_func( &p->elt_func );
539
540   FREE(p);
541}
542
543static void translate_sse_run_elts( struct translate *translate,
544			      const unsigned *elts,
545			      unsigned count,
546			      void *output_buffer )
547{
548   struct translate_sse *p = (struct translate_sse *)translate;
549
550   p->gen_run_elts( translate,
551		    elts,
552		    count,
553		    output_buffer );
554}
555
556static void translate_sse_run( struct translate *translate,
557			 unsigned start,
558			 unsigned count,
559			 void *output_buffer )
560{
561   struct translate_sse *p = (struct translate_sse *)translate;
562
563   p->gen_run( translate,
564	       start,
565	       count,
566	       output_buffer );
567}
568
569
570struct translate *translate_sse2_create( const struct translate_key *key )
571{
572   struct translate_sse *p = NULL;
573
574   if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
575      goto fail;
576
577   p = CALLOC_STRUCT( translate_sse );
578   if (p == NULL)
579      goto fail;
580
581   p->translate.key = *key;
582   p->translate.release = translate_sse_release;
583   p->translate.set_buffer = translate_sse_set_buffer;
584   p->translate.run_elts = translate_sse_run_elts;
585   p->translate.run = translate_sse_run;
586
587   if (!build_vertex_emit(p, &p->linear_func, TRUE))
588      goto fail;
589
590   if (!build_vertex_emit(p, &p->elt_func, FALSE))
591      goto fail;
592
593   p->gen_run = (run_func)x86_get_func(&p->linear_func);
594   if (p->gen_run == NULL)
595      goto fail;
596
597   p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func);
598   if (p->gen_run_elts == NULL)
599      goto fail;
600
601   return &p->translate;
602
603 fail:
604   if (p)
605      translate_sse_release( &p->translate );
606
607   return NULL;
608}
609
610
611
612#else
613
/* Fallback for builds that are not 32-bit x86: no SSE translate object
 * can be generated, so creation always fails.
 *
 * Uses the same name and signature as the x86 implementation in the other
 * branch of this #if, so callers link against one symbol either way.
 */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   (void) key;

   return NULL;
}
618
619#endif
620