translate_sse.c revision fc431a58dc1446383edc11aec2a0b7de5b363e5e
1/*
2 * Copyright 2003 Tungsten Graphics, inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors:
25 *    Keith Whitwell <keithw@tungstengraphics.com>
26 */
27
28
29#include "pipe/p_config.h"
30#include "pipe/p_compiler.h"
31#include "util/u_memory.h"
32#include "util/u_math.h"
33
34#include "translate.h"
35
36
37#if defined(PIPE_ARCH_X86)
38
39#include "rtasm/rtasm_cpu.h"
40#include "rtasm/rtasm_x86sse.h"
41
42
/* Vector component indices.  They are passed to SHUF() to build the
 * selector argument of the shuffle instructions emitted in this file.
 */
#define X    0
#define Y    1
#define Z    2
#define W    3
47
48
/* Signature of the generated "linear" entry point.  The code built into
 * translate_sse::linear_func is cast to this type and called from
 * translate_sse_run().  The generated prologue reads these five arguments
 * by position (x86_fn_arg 1..5), so the order here must match it.
 */
typedef void (PIPE_CDECL *run_func)( struct translate *translate,
                                     unsigned start,
                                     unsigned count,
                                     unsigned instance_id,
                                     void *output_buffer);
54
/* Signature of the generated "indexed" entry point.  The code built into
 * translate_sse::elt_func is cast to this type and called from
 * translate_sse_run_elts().  Same argument positions as run_func, with the
 * start index replaced by a pointer to the index array.
 */
typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
                                          const unsigned *elts,
                                          unsigned count,
                                          unsigned instance_id,
                                          void *output_buffer);
60
/* One source vertex buffer, as supplied through translate_sse_set_buffer().
 * The generated code reads base_ptr and stride from here at run time.
 */
struct translate_buffer {
   const void *base_ptr;         /* start of the vertex data */
   unsigned stride;              /* multiplied by the index to step between vertices */
   unsigned max_index;           /* stored, but not yet honoured by the generated code */
};
66
/* A (buffer, instance divisor) pair.  Elements that share both values share
 * one variant, and therefore one running source pointer.
 */
struct translate_buffer_varient {
   unsigned buffer_index;        /* index into translate_sse::buffer[] */
   unsigned instance_divisor;    /* 0 = advance per vertex, otherwise per instance */
   void *ptr;                    /* updated either per vertex or per instance */
};
72
73
/* Sentinel stored in translate_sse::element_to_buffer_varient[] for elements
 * of type TRANSLATE_ELEMENT_INSTANCE_ID.  get_buffer_ptr() recognises it and
 * returns the address of translate_sse::instance_id instead of a buffer
 * pointer.
 * NOTE(review): must never collide with a real variant index -- presumably
 * PIPE_MAX_ATTRIBS is well below 1001; confirm.
 */
#define ELEMENT_BUFFER_INSTANCE_ID  1001
75
76
/* SSE implementation of struct translate.
 *
 * The generated code receives a pointer to this struct as its first
 * argument and addresses the constants, buffers and instance_id below as
 * displacements from it (see get_offset()).
 */
struct translate_sse {
   struct translate translate;   /* base object; must stay first, pointers are cast both ways */

   struct x86_function linear_func;   /* generated code behind gen_run */
   struct x86_function elt_func;      /* generated code behind gen_run_elts */
   struct x86_function *func;         /* function currently being built */

   /* Whether the load of the matching constant has already been emitted
    * into the function currently being built.
    */
   boolean loaded_identity;
   boolean loaded_255;
   boolean loaded_inv_255;

   /* Constant vectors read by the generated code. */
   float identity[4];            /* {0, 0, 0, 1} */
   float float_255[4];           /* 255.0 in every lane */
   float inv_255[4];             /* 1/255.0 in every lane */

   struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer variants can map to a single buffer. */
   struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS];
   unsigned nr_buffer_varients;

   /* Multiple elements can map to a single buffer variant. */
   unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS];

   boolean use_instancing;       /* generated prologue stores the instance_id argument below */
   unsigned instance_id;         /* written by the generated code, not by C code */

   run_func      gen_run;
   run_elts_func gen_run_elts;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg idx_EBX;     /* linear: start index, or the source pointer when
                                * there is a single buffer variant; indexed: &elt[i] */
   struct x86_reg outbuf_ECX;
   struct x86_reg machine_EDX;
   struct x86_reg count_ESI;    /* decrements to zero */
};
117
/* Byte distance from a to b (b - a).  Used to turn the address of a
 * translate_sse member into a displacement from the struct's base.
 */
static int get_offset( const void *a, const void *b )
{
   const char *from = (const char *)a;
   const char *to = (const char *)b;

   return (int)(to - from);
}
122
123
124
125static struct x86_reg get_identity( struct translate_sse *p )
126{
127   struct x86_reg reg = x86_make_reg(file_XMM, 6);
128
129   if (!p->loaded_identity) {
130      p->loaded_identity = TRUE;
131      p->identity[0] = 0;
132      p->identity[1] = 0;
133      p->identity[2] = 0;
134      p->identity[3] = 1;
135
136      sse_movups(p->func, reg,
137		 x86_make_disp(p->machine_EDX,
138			       get_offset(p, &p->identity[0])));
139   }
140
141   return reg;
142}
143
144static struct x86_reg get_255( struct translate_sse *p )
145{
146   struct x86_reg reg = x86_make_reg(file_XMM, 7);
147
148   if (!p->loaded_255) {
149      p->loaded_255 = TRUE;
150      p->float_255[0] =
151	 p->float_255[1] =
152	 p->float_255[2] =
153	 p->float_255[3] = 255.0f;
154
155      sse_movups(p->func, reg,
156		 x86_make_disp(p->machine_EDX,
157			       get_offset(p, &p->float_255[0])));
158   }
159
160   return reg;
161}
162
163static struct x86_reg get_inv_255( struct translate_sse *p )
164{
165   struct x86_reg reg = x86_make_reg(file_XMM, 5);
166
167   if (!p->loaded_inv_255) {
168      p->loaded_inv_255 = TRUE;
169      p->inv_255[0] =
170	 p->inv_255[1] =
171	 p->inv_255[2] =
172	 p->inv_255[3] = 1.0f / 255.0f;
173
174      sse_movups(p->func, reg,
175		 x86_make_disp(p->machine_EDX,
176			       get_offset(p, &p->inv_255[0])));
177   }
178
179   return reg;
180}
181
182
183static void emit_load_R32G32B32A32( struct translate_sse *p,
184				    struct x86_reg data,
185				    struct x86_reg arg0 )
186{
187   sse_movups(p->func, data, arg0);
188}
189
190static void emit_load_R32G32B32( struct translate_sse *p,
191				 struct x86_reg data,
192				 struct x86_reg arg0 )
193{
194   /* Have to jump through some hoops:
195    *
196    * c 0 0 0
197    * c 0 0 1
198    * 0 0 c 1
199    * a b c 1
200    */
201   sse_movss(p->func, data, x86_make_disp(arg0, 8));
202   sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
203   sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
204   sse_movlps(p->func, data, arg0);
205}
206
207static void emit_load_R32G32( struct translate_sse *p,
208			   struct x86_reg data,
209			   struct x86_reg arg0 )
210{
211   /* 0 0 0 1
212    * a b 0 1
213    */
214   sse_movups(p->func, data, get_identity(p) );
215   sse_movlps(p->func, data, arg0);
216}
217
218
219static void emit_load_R32( struct translate_sse *p,
220			   struct x86_reg data,
221			   struct x86_reg arg0 )
222{
223   /* a 0 0 0
224    * a 0 0 1
225    */
226   sse_movss(p->func, data, arg0);
227   sse_orps(p->func, data, get_identity(p) );
228}
229
230
231static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
232				       struct x86_reg data,
233				       struct x86_reg src )
234{
235
236   /* Load and unpack twice:
237    */
238   sse_movss(p->func, data, src);
239   sse2_punpcklbw(p->func, data, get_identity(p));
240   sse2_punpcklbw(p->func, data, get_identity(p));
241
242   /* Convert to float:
243    */
244   sse2_cvtdq2ps(p->func, data, data);
245
246
247   /* Scale by 1/255.0
248    */
249   sse_mulps(p->func, data, get_inv_255(p));
250}
251
252
253
254
255static void emit_store_R32G32B32A32( struct translate_sse *p,
256				     struct x86_reg dest,
257				     struct x86_reg dataXMM )
258{
259   sse_movups(p->func, dest, dataXMM);
260}
261
262static void emit_store_R32G32B32( struct translate_sse *p,
263				  struct x86_reg dest,
264				  struct x86_reg dataXMM )
265{
266   /* Emit two, shuffle, emit one.
267    */
268   sse_movlps(p->func, dest, dataXMM);
269   sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
270   sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
271}
272
273static void emit_store_R32G32( struct translate_sse *p,
274			       struct x86_reg dest,
275			       struct x86_reg dataXMM )
276{
277   sse_movlps(p->func, dest, dataXMM);
278}
279
280static void emit_store_R32( struct translate_sse *p,
281			    struct x86_reg dest,
282			    struct x86_reg dataXMM )
283{
284   sse_movss(p->func, dest, dataXMM);
285}
286
287
288
289static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
290				       struct x86_reg dest,
291				       struct x86_reg dataXMM )
292{
293   /* Scale by 255.0
294    */
295   sse_mulps(p->func, dataXMM, get_255(p));
296
297   /* Pack and emit:
298    */
299   sse2_cvtps2dq(p->func, dataXMM, dataXMM);
300   sse2_packssdw(p->func, dataXMM, dataXMM);
301   sse2_packuswb(p->func, dataXMM, dataXMM);
302   sse_movss(p->func, dest, dataXMM);
303}
304
305
306
307
308
309/* Extended swizzles?  Maybe later.
310 */
311static void emit_swizzle( struct translate_sse *p,
312			  struct x86_reg dest,
313			  struct x86_reg src,
314			  unsigned char shuffle )
315{
316   sse_shufps(p->func, dest, src, shuffle);
317}
318
319
320static boolean translate_attr( struct translate_sse *p,
321			       const struct translate_element *a,
322			       struct x86_reg srcECX,
323			       struct x86_reg dstEAX)
324{
325   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
326
327   switch (a->input_format) {
328   case PIPE_FORMAT_R32_FLOAT:
329      emit_load_R32(p, dataXMM, srcECX);
330      break;
331   case PIPE_FORMAT_R32G32_FLOAT:
332      emit_load_R32G32(p, dataXMM, srcECX);
333      break;
334   case PIPE_FORMAT_R32G32B32_FLOAT:
335      emit_load_R32G32B32(p, dataXMM, srcECX);
336      break;
337   case PIPE_FORMAT_R32G32B32A32_FLOAT:
338      emit_load_R32G32B32A32(p, dataXMM, srcECX);
339      break;
340   case PIPE_FORMAT_B8G8R8A8_UNORM:
341      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
342      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
343      break;
344   case PIPE_FORMAT_R8G8B8A8_UNORM:
345      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
346      break;
347   default:
348      return FALSE;
349   }
350
351   switch (a->output_format) {
352   case PIPE_FORMAT_R32_FLOAT:
353      emit_store_R32(p, dstEAX, dataXMM);
354      break;
355   case PIPE_FORMAT_R32G32_FLOAT:
356      emit_store_R32G32(p, dstEAX, dataXMM);
357      break;
358   case PIPE_FORMAT_R32G32B32_FLOAT:
359      emit_store_R32G32B32(p, dstEAX, dataXMM);
360      break;
361   case PIPE_FORMAT_R32G32B32A32_FLOAT:
362      emit_store_R32G32B32A32(p, dstEAX, dataXMM);
363      break;
364   case PIPE_FORMAT_B8G8R8A8_UNORM:
365      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
366      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
367      break;
368   case PIPE_FORMAT_R8G8B8A8_UNORM:
369      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
370      break;
371   default:
372      return FALSE;
373   }
374
375   return TRUE;
376}
377
378
379static boolean init_inputs( struct translate_sse *p,
380                            boolean linear )
381{
382   unsigned i;
383   struct x86_reg instance_id = x86_make_disp(p->machine_EDX,
384                                              get_offset(p, &p->instance_id));
385
386   for (i = 0; i < p->nr_buffer_varients; i++) {
387      struct translate_buffer_varient *varient = &p->buffer_varient[i];
388      struct translate_buffer *buffer = &p->buffer[varient->buffer_index];
389
390      if (linear || varient->instance_divisor) {
391         struct x86_reg buf_stride   = x86_make_disp(p->machine_EDX,
392                                                     get_offset(p, &buffer->stride));
393         struct x86_reg buf_ptr      = x86_make_disp(p->machine_EDX,
394                                                     get_offset(p, &varient->ptr));
395         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
396                                                     get_offset(p, &buffer->base_ptr));
397         struct x86_reg elt = p->idx_EBX;
398         struct x86_reg tmp_EAX = p->tmp_EAX;
399
400         /* Calculate pointer to first attrib:
401          *   base_ptr + stride * index, where index depends on instance divisor
402          */
403         if (varient->instance_divisor) {
404            /* Our index is instance ID divided by instance divisor.
405             */
406            x86_mov(p->func, tmp_EAX, instance_id);
407
408            if (varient->instance_divisor != 1) {
409               struct x86_reg tmp_EDX = p->machine_EDX;
410               struct x86_reg tmp_ECX = p->outbuf_ECX;
411
412               /* TODO: Add x86_shr() to rtasm and use it whenever
413                *       instance divisor is power of two.
414                */
415
416               x86_push(p->func, tmp_EDX);
417               x86_push(p->func, tmp_ECX);
418               x86_xor(p->func, tmp_EDX, tmp_EDX);
419               x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
420               x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
421               x86_pop(p->func, tmp_ECX);
422               x86_pop(p->func, tmp_EDX);
423            }
424         } else {
425            x86_mov(p->func, tmp_EAX, elt);
426         }
427
428         /*
429          * TODO: Respect translate_buffer::max_index.
430          */
431
432         x86_imul(p->func, tmp_EAX, buf_stride);
433         x86_add(p->func, tmp_EAX, buf_base_ptr);
434
435
436         /* In the linear case, keep the buffer pointer instead of the
437          * index number.
438          */
439         if (linear && p->nr_buffer_varients == 1)
440            x86_mov(p->func, elt, tmp_EAX);
441         else
442            x86_mov(p->func, buf_ptr, tmp_EAX);
443      }
444   }
445
446   return TRUE;
447}
448
449
450static struct x86_reg get_buffer_ptr( struct translate_sse *p,
451                                      boolean linear,
452                                      unsigned var_idx,
453                                      struct x86_reg elt )
454{
455   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
456      return x86_make_disp(p->machine_EDX,
457                           get_offset(p, &p->instance_id));
458   }
459   if (linear && p->nr_buffer_varients == 1) {
460      return p->idx_EBX;
461   }
462   else if (linear || p->buffer_varient[var_idx].instance_divisor) {
463      struct x86_reg ptr = p->tmp_EAX;
464      struct x86_reg buf_ptr =
465         x86_make_disp(p->machine_EDX,
466                       get_offset(p, &p->buffer_varient[var_idx].ptr));
467
468      x86_mov(p->func, ptr, buf_ptr);
469      return ptr;
470   }
471   else {
472      struct x86_reg ptr = p->tmp_EAX;
473      const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];
474
475      struct x86_reg buf_stride =
476         x86_make_disp(p->machine_EDX,
477                       get_offset(p, &p->buffer[varient->buffer_index].stride));
478
479      struct x86_reg buf_base_ptr =
480         x86_make_disp(p->machine_EDX,
481                       get_offset(p, &p->buffer[varient->buffer_index].base_ptr));
482
483
484
485      /* Calculate pointer to current attrib:
486       */
487      x86_mov(p->func, ptr, buf_stride);
488      x86_imul(p->func, ptr, elt);
489      x86_add(p->func, ptr, buf_base_ptr);
490      return ptr;
491   }
492}
493
494
495
496static boolean incr_inputs( struct translate_sse *p,
497                            boolean linear )
498{
499   if (linear && p->nr_buffer_varients == 1) {
500      struct x86_reg stride = x86_make_disp(p->machine_EDX,
501                                            get_offset(p, &p->buffer[0].stride));
502
503      if (p->buffer_varient[0].instance_divisor == 0) {
504         x86_add(p->func, p->idx_EBX, stride);
505         sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
506      }
507   }
508   else if (linear) {
509      unsigned i;
510
511      /* Is this worthwhile??
512       */
513      for (i = 0; i < p->nr_buffer_varients; i++) {
514         struct translate_buffer_varient *varient = &p->buffer_varient[i];
515         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
516                                                get_offset(p, &varient->ptr));
517         struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
518                                                   get_offset(p, &p->buffer[varient->buffer_index].stride));
519
520         if (varient->instance_divisor == 0) {
521            x86_mov(p->func, p->tmp_EAX, buf_ptr);
522            x86_add(p->func, p->tmp_EAX, buf_stride);
523            if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
524            x86_mov(p->func, buf_ptr, p->tmp_EAX);
525         }
526      }
527   }
528   else {
529      x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4));
530   }
531
532   return TRUE;
533}
534
535
536/* Build run( struct translate *machine,
537 *            unsigned start,
538 *            unsigned count,
539 *            void *output_buffer )
540 * or
541 *  run_elts( struct translate *machine,
542 *            unsigned *elts,
543 *            unsigned count,
544 *            void *output_buffer )
545 *
546 *  Lots of hardcoding
547 *
548 * EAX -- pointer to current output vertex
549 * ECX -- pointer to current attribute
550 *
551 */
552static boolean build_vertex_emit( struct translate_sse *p,
553				  struct x86_function *func,
554				  boolean linear )
555{
556   int fixup, label;
557   unsigned j;
558
559   p->tmp_EAX       = x86_make_reg(file_REG32, reg_AX);
560   p->idx_EBX       = x86_make_reg(file_REG32, reg_BX);
561   p->outbuf_ECX    = x86_make_reg(file_REG32, reg_CX);
562   p->machine_EDX   = x86_make_reg(file_REG32, reg_DX);
563   p->count_ESI     = x86_make_reg(file_REG32, reg_SI);
564
565   p->func = func;
566   p->loaded_inv_255 = FALSE;
567   p->loaded_255 = FALSE;
568   p->loaded_identity = FALSE;
569
570   x86_init_func(p->func);
571
572   /* Push a few regs?
573    */
574   x86_push(p->func, p->idx_EBX);
575   x86_push(p->func, p->count_ESI);
576
577   /* Load arguments into regs:
578    */
579   x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1));
580   x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2));
581   x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3));
582   x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5));
583
584   /* Load instance ID.
585    */
586   if (p->use_instancing) {
587      x86_mov(p->func,
588              p->tmp_EAX,
589              x86_fn_arg(p->func, 4));
590      x86_mov(p->func,
591              x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)),
592              p->tmp_EAX);
593   }
594
595   /* Get vertex count, compare to zero
596    */
597   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
598   x86_cmp(p->func, p->count_ESI, p->tmp_EAX);
599   fixup = x86_jcc_forward(p->func, cc_E);
600
601   /* always load, needed or not:
602    */
603   init_inputs(p, linear);
604
605   /* Note address for loop jump
606    */
607   label = x86_get_label(p->func);
608   {
609      struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX);
610      int last_varient = -1;
611      struct x86_reg vb;
612
613      for (j = 0; j < p->translate.key.nr_elements; j++) {
614         const struct translate_element *a = &p->translate.key.element[j];
615         unsigned varient = p->element_to_buffer_varient[j];
616
617         /* Figure out source pointer address:
618          */
619         if (varient != last_varient) {
620            last_varient = varient;
621            vb = get_buffer_ptr(p, linear, varient, elt);
622         }
623
624         if (!translate_attr( p, a,
625                              x86_make_disp(vb, a->input_offset),
626                              x86_make_disp(p->outbuf_ECX, a->output_offset)))
627            return FALSE;
628      }
629
630      /* Next output vertex:
631       */
632      x86_lea(p->func,
633              p->outbuf_ECX,
634              x86_make_disp(p->outbuf_ECX,
635                            p->translate.key.output_stride));
636
637      /* Incr index
638       */
639      incr_inputs( p, linear );
640   }
641
642   /* decr count, loop if not zero
643    */
644   x86_dec(p->func, p->count_ESI);
645   x86_jcc(p->func, cc_NZ, label);
646
647   /* Exit mmx state?
648    */
649   if (p->func->need_emms)
650      mmx_emms(p->func);
651
652   /* Land forward jump here:
653    */
654   x86_fixup_fwd_jump(p->func, fixup);
655
656   /* Pop regs and return
657    */
658
659   x86_pop(p->func, p->count_ESI);
660   x86_pop(p->func, p->idx_EBX);
661   x86_ret(p->func);
662
663   return TRUE;
664}
665
666
667
668
669
670
671
672static void translate_sse_set_buffer( struct translate *translate,
673				unsigned buf,
674				const void *ptr,
675				unsigned stride,
676				unsigned max_index )
677{
678   struct translate_sse *p = (struct translate_sse *)translate;
679
680   if (buf < p->nr_buffers) {
681      p->buffer[buf].base_ptr = (char *)ptr;
682      p->buffer[buf].stride = stride;
683      p->buffer[buf].max_index = max_index;
684   }
685
686   if (0) debug_printf("%s %d/%d: %p %d\n",
687                       __FUNCTION__, buf,
688                       p->nr_buffers,
689                       ptr, stride);
690}
691
692
693static void translate_sse_release( struct translate *translate )
694{
695   struct translate_sse *p = (struct translate_sse *)translate;
696
697   x86_release_func( &p->linear_func );
698   x86_release_func( &p->elt_func );
699
700   FREE(p);
701}
702
703static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
704			      const unsigned *elts,
705			      unsigned count,
706                              unsigned instance_id,
707			      void *output_buffer )
708{
709   struct translate_sse *p = (struct translate_sse *)translate;
710
711   p->gen_run_elts( translate,
712		    elts,
713		    count,
714                    instance_id,
715                    output_buffer);
716}
717
718static void PIPE_CDECL translate_sse_run( struct translate *translate,
719			 unsigned start,
720			 unsigned count,
721                         unsigned instance_id,
722			 void *output_buffer )
723{
724   struct translate_sse *p = (struct translate_sse *)translate;
725
726   p->gen_run( translate,
727	       start,
728	       count,
729               instance_id,
730               output_buffer);
731}
732
733
734struct translate *translate_sse2_create( const struct translate_key *key )
735{
736   struct translate_sse *p = NULL;
737   unsigned i;
738
739   if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
740      goto fail;
741
742   p = CALLOC_STRUCT( translate_sse );
743   if (p == NULL)
744      goto fail;
745
746   p->translate.key = *key;
747   p->translate.release = translate_sse_release;
748   p->translate.set_buffer = translate_sse_set_buffer;
749   p->translate.run_elts = translate_sse_run_elts;
750   p->translate.run = translate_sse_run;
751
752   for (i = 0; i < key->nr_elements; i++) {
753      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
754         unsigned j;
755
756         p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);
757
758         if (key->element[i].instance_divisor) {
759            p->use_instancing = TRUE;
760         }
761
762         /*
763          * Map vertex element to vertex buffer varient.
764          */
765         for (j = 0; j < p->nr_buffer_varients; j++) {
766            if (p->buffer_varient[j].buffer_index == key->element[i].input_buffer &&
767                p->buffer_varient[j].instance_divisor == key->element[i].instance_divisor) {
768               break;
769            }
770         }
771         if (j == p->nr_buffer_varients) {
772            p->buffer_varient[j].buffer_index = key->element[i].input_buffer;
773            p->buffer_varient[j].instance_divisor = key->element[i].instance_divisor;
774            p->nr_buffer_varients++;
775         }
776         p->element_to_buffer_varient[i] = j;
777      } else {
778         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);
779
780         p->element_to_buffer_varient[i] = ELEMENT_BUFFER_INSTANCE_ID;
781      }
782   }
783
784   if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);
785
786   if (!build_vertex_emit(p, &p->linear_func, TRUE))
787      goto fail;
788
789   if (!build_vertex_emit(p, &p->elt_func, FALSE))
790      goto fail;
791
792   p->gen_run = (run_func)x86_get_func(&p->linear_func);
793   if (p->gen_run == NULL)
794      goto fail;
795
796   p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func);
797   if (p->gen_run_elts == NULL)
798      goto fail;
799
800   return &p->translate;
801
802 fail:
803   if (p)
804      translate_sse_release( &p->translate );
805
806   return NULL;
807}
808
809
810
811#else
812
813struct translate *translate_sse2_create( const struct translate_key *key )
814{
815   return NULL;
816}
817
818#endif
819