/* translate_sse.c revision 4a4e29a9ab96d44fca9bb25064e12715aac85cbd */
1/*
2 * Copyright 2003 Tungsten Graphics, inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors:
25 *    Keith Whitwell <keithw@tungstengraphics.com>
26 */
27
28
29#include "pipe/p_config.h"
30#include "pipe/p_compiler.h"
31#include "util/u_memory.h"
32#include "util/u_math.h"
33
34#include "translate.h"
35
36
37#if defined(PIPE_ARCH_X86)
38
39#include "rtasm/rtasm_cpu.h"
40#include "rtasm/rtasm_x86sse.h"
41
42
43#define X    0
44#define Y    1
45#define Z    2
46#define W    3
47
48
/* One source vertex buffer as supplied via translate::set_buffer().
 * The generated code addresses these fields through EDX-relative
 * displacements, so the field layout must not change lightly.
 */
struct translate_buffer {
   const void *base_ptr;   /* start of the buffer in memory */
   unsigned stride;        /* byte distance between successive vertices */
   unsigned max_index;     /* largest valid index; currently not enforced
                            * by the generated code (see init_inputs TODO) */
};
54
/* A (buffer, instance_divisor) pairing.  Several vertex elements may
 * read from the same buffer with the same divisor and then share one
 * varient.  ("varient" is a historical misspelling of "variant",
 * preserved for consistency with the rest of the module.)
 */
struct translate_buffer_varient {
   unsigned buffer_index;        /* index into translate_sse::buffer[] */
   unsigned instance_divisor;    /* 0 = advance per vertex, N = per N instances */
   void *ptr;                    /* updated either per vertex or per instance */
};
60
61
62#define ELEMENT_BUFFER_INSTANCE_ID  1001
63
64
/* Complete state for the SSE translate backend: the four generated
 * functions, lazily-materialized XMM constants, buffer descriptions and
 * the register assignments used during code generation.  At run time
 * the generated code receives a pointer to this struct (in EDX) and
 * reaches the embedded data via get_offset() displacements.
 */
struct translate_sse {
   struct translate translate;

   struct x86_function linear_func;   /* run(): linear range of vertices */
   struct x86_function elt_func;      /* run_elts(): 32-bit indices */
   struct x86_function elt16_func;    /* run_elts16(): 16-bit indices */
   struct x86_function elt8_func;     /* run_elts8(): 8-bit indices */
   struct x86_function *func;         /* function currently being emitted */

   /* Whether the code emitted so far for the current function has
    * already loaded each XMM-resident constant (reset per function).
    */
   boolean loaded_identity;
   boolean loaded_255;
   boolean loaded_inv_255;

   /* Constant vectors; written at code-generation time, read by the
    * generated code through EDX-relative loads.
    */
   float identity[4];    /* { 0, 0, 0, 1 } */
   float float_255[4];   /* { 255, 255, 255, 255 } */
   float inv_255[4];     /* { 1/255, ... } */

   struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer varients can map to a single buffer. */
   struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS];
   unsigned nr_buffer_varients;

   /* Multiple elements can map to a single buffer varient. */
   unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS];

   boolean use_instancing;
   unsigned instance_id;   /* copied from function argument 4 at run time */

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg idx_EBX;     /* either start+i or &elt[i] */
   struct x86_reg outbuf_ECX;
   struct x86_reg machine_EDX;
   struct x86_reg count_ESI;    /* decrements to zero */
};
104
/* Byte offset of b relative to a.
 *
 * Used to build EDX-relative displacements addressing members of the
 * translate_sse struct, so the result may legitimately be negative.
 */
static int get_offset( const void *a, const void *b )
{
   const char *base = (const char *)a;
   const char *member = (const char *)b;

   return member - base;
}
109
110
111
112static struct x86_reg get_identity( struct translate_sse *p )
113{
114   struct x86_reg reg = x86_make_reg(file_XMM, 6);
115
116   if (!p->loaded_identity) {
117      p->loaded_identity = TRUE;
118      p->identity[0] = 0;
119      p->identity[1] = 0;
120      p->identity[2] = 0;
121      p->identity[3] = 1;
122
123      sse_movups(p->func, reg,
124		 x86_make_disp(p->machine_EDX,
125			       get_offset(p, &p->identity[0])));
126   }
127
128   return reg;
129}
130
131static struct x86_reg get_255( struct translate_sse *p )
132{
133   struct x86_reg reg = x86_make_reg(file_XMM, 7);
134
135   if (!p->loaded_255) {
136      p->loaded_255 = TRUE;
137      p->float_255[0] =
138	 p->float_255[1] =
139	 p->float_255[2] =
140	 p->float_255[3] = 255.0f;
141
142      sse_movups(p->func, reg,
143		 x86_make_disp(p->machine_EDX,
144			       get_offset(p, &p->float_255[0])));
145   }
146
147   return reg;
148}
149
150static struct x86_reg get_inv_255( struct translate_sse *p )
151{
152   struct x86_reg reg = x86_make_reg(file_XMM, 5);
153
154   if (!p->loaded_inv_255) {
155      p->loaded_inv_255 = TRUE;
156      p->inv_255[0] =
157	 p->inv_255[1] =
158	 p->inv_255[2] =
159	 p->inv_255[3] = 1.0f / 255.0f;
160
161      sse_movups(p->func, reg,
162		 x86_make_disp(p->machine_EDX,
163			       get_offset(p, &p->inv_255[0])));
164   }
165
166   return reg;
167}
168
169
/* Load four floats straight into an XMM register.
 * movups is used, so the source need not be 16-byte aligned.
 */
static void emit_load_R32G32B32A32( struct translate_sse *p,
				    struct x86_reg data,
				    struct x86_reg arg0 )
{
   sse_movups(p->func, data, arg0);
}
176
/* Load three floats as {a, b, c, 1} without reading past the end of
 * the source attribute (a straight movups would over-read).
 */
static void emit_load_R32G32B32( struct translate_sse *p,
				 struct x86_reg data,
				 struct x86_reg arg0 )
{
   /* Have to jump through some hoops:
    *
    * c 0 0 0      movss loads the third component only
    * c 0 0 1      shufps merges w=1 in from the identity constant
    * 0 0 c 1      rotate c into the z slot
    * a b c 1      movlps fills in the low two components
    */
   sse_movss(p->func, data, x86_make_disp(arg0, 8));
   sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
   sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
   sse_movlps(p->func, data, arg0);
}
193
/* Load two floats as {a, b, 0, 1}: start from the identity constant
 * and overwrite the low half with the source data.
 */
static void emit_load_R32G32( struct translate_sse *p,
			   struct x86_reg data,
			   struct x86_reg arg0 )
{
   /* 0 0 0 1
    * a b 0 1
    */
   sse_movups(p->func, data, get_identity(p) );
   sse_movlps(p->func, data, arg0);
}
204
205
/* Load one float as {a, 0, 0, 1}.  ORing with the identity constant
 * {0,0,0,1.0f} leaves the loaded bit pattern in x untouched (x|0 == x)
 * while setting y=z=0, w=1.
 */
static void emit_load_R32( struct translate_sse *p,
			   struct x86_reg data,
			   struct x86_reg arg0 )
{
   /* a 0 0 0
    * a 0 0 1
    */
   sse_movss(p->func, data, arg0);
   sse_orps(p->func, data, get_identity(p) );
}
216
217
/* Load four unorm bytes and expand them to floats in [0, 1].
 *
 * The identity constant doubles as a zero source here: its low eight
 * bytes are all zero (two 0.0f floats), so each punpcklbw effectively
 * zero-extends the data lanes.
 */
static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
				       struct x86_reg data,
				       struct x86_reg src )
{

   /* Load and unpack twice (bytes -> words -> dwords):
    */
   sse_movss(p->func, data, src);
   sse2_punpcklbw(p->func, data, get_identity(p));
   sse2_punpcklbw(p->func, data, get_identity(p));

   /* Convert to float:
    */
   sse2_cvtdq2ps(p->func, data, data);


   /* Scale by 1/255.0
    */
   sse_mulps(p->func, data, get_inv_255(p));
}
238
239
240
241
/* Store all four floats (unaligned store, destination alignment not
 * required).
 */
static void emit_store_R32G32B32A32( struct translate_sse *p,
				     struct x86_reg dest,
				     struct x86_reg dataXMM )
{
   sse_movups(p->func, dest, dataXMM);
}
248
/* Store only the first three floats, without writing past the
 * attribute's 12 bytes.
 */
static void emit_store_R32G32B32( struct translate_sse *p,
				  struct x86_reg dest,
				  struct x86_reg dataXMM )
{
   /* Emit two, shuffle, emit one.
    */
   sse_movlps(p->func, dest, dataXMM);
   sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
   sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
}
259
/* Store the low two floats only. */
static void emit_store_R32G32( struct translate_sse *p,
			       struct x86_reg dest,
			       struct x86_reg dataXMM )
{
   sse_movlps(p->func, dest, dataXMM);
}
266
/* Store the lowest float only. */
static void emit_store_R32( struct translate_sse *p,
			    struct x86_reg dest,
			    struct x86_reg dataXMM )
{
   sse_movss(p->func, dest, dataXMM);
}
273
274
275
/* Convert four floats in [0, 1] to packed unorm bytes and store them.
 * NOTE: destructive on dataXMM.
 *
 * cvtps2dq rounds according to MXCSR (round-to-nearest by default);
 * the two pack steps saturate, clamping out-of-range values.
 */
static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
				       struct x86_reg dest,
				       struct x86_reg dataXMM )
{
   /* Scale by 255.0
    */
   sse_mulps(p->func, dataXMM, get_255(p));

   /* Pack and emit:
    */
   sse2_cvtps2dq(p->func, dataXMM, dataXMM);
   sse2_packssdw(p->func, dataXMM, dataXMM);
   sse2_packuswb(p->func, dataXMM, dataXMM);
   sse_movss(p->func, dest, dataXMM);
}
291
292
293
294
295
296/* Extended swizzles?  Maybe later.
297 */
/* Emit a shufps: low half of the result comes from dest, high half
 * from src, selected by 'shuffle'.  With dest == src this is a plain
 * component swizzle.
 */
static void emit_swizzle( struct translate_sse *p,
			  struct x86_reg dest,
			  struct x86_reg src,
			  unsigned char shuffle )
{
   sse_shufps(p->func, dest, src, shuffle);
}
305
306
307static boolean translate_attr( struct translate_sse *p,
308			       const struct translate_element *a,
309			       struct x86_reg srcECX,
310			       struct x86_reg dstEAX)
311{
312   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
313
314   switch (a->input_format) {
315   case PIPE_FORMAT_R32_FLOAT:
316      emit_load_R32(p, dataXMM, srcECX);
317      break;
318   case PIPE_FORMAT_R32G32_FLOAT:
319      emit_load_R32G32(p, dataXMM, srcECX);
320      break;
321   case PIPE_FORMAT_R32G32B32_FLOAT:
322      emit_load_R32G32B32(p, dataXMM, srcECX);
323      break;
324   case PIPE_FORMAT_R32G32B32A32_FLOAT:
325      emit_load_R32G32B32A32(p, dataXMM, srcECX);
326      break;
327   case PIPE_FORMAT_B8G8R8A8_UNORM:
328      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
329      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
330      break;
331   case PIPE_FORMAT_R8G8B8A8_UNORM:
332      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
333      break;
334   default:
335      return FALSE;
336   }
337
338   switch (a->output_format) {
339   case PIPE_FORMAT_R32_FLOAT:
340      emit_store_R32(p, dstEAX, dataXMM);
341      break;
342   case PIPE_FORMAT_R32G32_FLOAT:
343      emit_store_R32G32(p, dstEAX, dataXMM);
344      break;
345   case PIPE_FORMAT_R32G32B32_FLOAT:
346      emit_store_R32G32B32(p, dstEAX, dataXMM);
347      break;
348   case PIPE_FORMAT_R32G32B32A32_FLOAT:
349      emit_store_R32G32B32A32(p, dstEAX, dataXMM);
350      break;
351   case PIPE_FORMAT_B8G8R8A8_UNORM:
352      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
353      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
354      break;
355   case PIPE_FORMAT_R8G8B8A8_UNORM:
356      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
357      break;
358   default:
359      return FALSE;
360   }
361
362   return TRUE;
363}
364
365
/* Emit prologue code that points each buffer varient at its first
 * attribute.  Runs once before the main vertex loop.
 *
 * Only varients advanced arithmetically need this: the whole linear
 * path (index_size == 0), and any instanced varient.  The indexed
 * per-vertex path instead recomputes source pointers from the element
 * list on every iteration (see get_buffer_ptr).
 */
static boolean init_inputs( struct translate_sse *p,
                            unsigned index_size )
{
   unsigned i;
   struct x86_reg instance_id = x86_make_disp(p->machine_EDX,
                                              get_offset(p, &p->instance_id));

   for (i = 0; i < p->nr_buffer_varients; i++) {
      struct translate_buffer_varient *varient = &p->buffer_varient[i];
      struct translate_buffer *buffer = &p->buffer[varient->buffer_index];

      if (!index_size || varient->instance_divisor) {
         struct x86_reg buf_stride   = x86_make_disp(p->machine_EDX,
                                                     get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr      = x86_make_disp(p->machine_EDX,
                                                     get_offset(p, &varient->ptr));
         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
                                                     get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_EBX;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          *   base_ptr + stride * index, where index depends on instance divisor
          */
         if (varient->instance_divisor) {
            /* Our index is instance ID divided by instance divisor.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (varient->instance_divisor != 1) {
               struct x86_reg tmp_EDX = p->machine_EDX;
               struct x86_reg tmp_ECX = p->outbuf_ECX;

               /* TODO: Add x86_shr() to rtasm and use it whenever
                *       instance divisor is power of two.
                */

               /* div clobbers EDX:EAX and reads the divisor from ECX;
                * both registers carry live values (machine pointer and
                * output pointer), so preserve them around the divide.
                * Zeroing EDX makes it an unsigned 64/32 divide.
                */
               x86_push(p->func, tmp_EDX);
               x86_push(p->func, tmp_ECX);
               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
               x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
               x86_pop(p->func, tmp_ECX);
               x86_pop(p->func, tmp_EDX);
            }
         } else {
            /* Linear path: the start index is already in EBX. */
            x86_mov(p->func, tmp_EAX, elt);
         }

         /*
          * TODO: Respect translate_buffer::max_index.
          */

         x86_imul(p->func, tmp_EAX, buf_stride);
         x86_add(p->func, tmp_EAX, buf_base_ptr);


         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (!index_size && p->nr_buffer_varients == 1)
            x86_mov(p->func, elt, tmp_EAX);
         else
            x86_mov(p->func, buf_ptr, tmp_EAX);
      }
   }

   return TRUE;
}
435
436
/* Emit code yielding a register (or memory reference) that holds the
 * source pointer of the given buffer varient for the current loop
 * iteration.
 *
 * index_size: 0 on the linear path, else the element size in bytes.
 * var_idx:    buffer varient index, or ELEMENT_BUFFER_INSTANCE_ID to
 *             source the instance id value itself.
 * elt:        current index (linear) or dereferenced element pointer.
 *
 * NOTE: the returned register is frequently EAX and is only valid
 * until the next emitted instruction that clobbers it.
 */
static struct x86_reg get_buffer_ptr( struct translate_sse *p,
                                      unsigned index_size,
                                      unsigned var_idx,
                                      struct x86_reg elt )
{
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
      /* The "attribute" is the instance id stored in the machine struct. */
      return x86_make_disp(p->machine_EDX,
                           get_offset(p, &p->instance_id));
   }
   if (!index_size && p->nr_buffer_varients == 1) {
      /* Single linear buffer: EBX already holds the advancing pointer
       * (see init_inputs / incr_inputs).
       */
      return p->idx_EBX;
   }
   else if (!index_size || p->buffer_varient[var_idx].instance_divisor) {
      /* Linear or instanced: the pointer was precomputed and is kept
       * in the varient's ptr slot; fetch it into EAX.
       */
      struct x86_reg ptr = p->tmp_EAX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer_varient[var_idx].ptr));

      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      /* Indexed per-vertex: compute base_ptr + elt * stride from the
       * element list each iteration.
       */
      struct x86_reg ptr = p->tmp_EAX;
      const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];

      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer[varient->buffer_index].stride));

      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer[varient->buffer_index].base_ptr));



      /* Calculate pointer to current attrib:
       */
      switch(index_size)
      {
      case 1:
         x86_movzx8(p->func, ptr, elt);
         break;
      case 2:
         x86_movzx16(p->func, ptr, elt);
         break;
      case 4:
         x86_mov(p->func, ptr, elt);
         break;
      }
      /* index_size is always 1, 2 or 4 here (set by build_vertex_emit),
       * so no default case is needed.
       */
      x86_imul(p->func, ptr, buf_stride);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}
491
492
493
/* Emit code advancing the input pointers (or the index pointer) at the
 * bottom of the vertex loop.
 *
 * Instanced varients (instance_divisor != 0) are deliberately left
 * untouched: they only move per instance and were positioned once by
 * init_inputs().
 */
static boolean incr_inputs( struct translate_sse *p,
                            unsigned index_size )
{
   if (!index_size && p->nr_buffer_varients == 1) {
      /* Single linear buffer: EBX itself is the buffer pointer;
       * bump it by the stride and prefetch ahead.
       */
      struct x86_reg stride = x86_make_disp(p->machine_EDX,
                                            get_offset(p, &p->buffer[0].stride));

      if (p->buffer_varient[0].instance_divisor == 0) {
         x86_add(p->func, p->idx_EBX, stride);
         sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
      }
   }
   else if (!index_size) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_varients; i++) {
         struct translate_buffer_varient *varient = &p->buffer_varient[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
                                                get_offset(p, &varient->ptr));
         struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
                                                   get_offset(p, &p->buffer[varient->buffer_index].stride));

         if (varient->instance_divisor == 0) {
            /* ptr += stride, staged through EAX since both operands
             * are in memory.
             */
            x86_mov(p->func, p->tmp_EAX, buf_ptr);
            x86_add(p->func, p->tmp_EAX, buf_stride);
            if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      /* Indexed path: step EBX to the next element. */
      x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, index_size));
   }

   return TRUE;
}
532
533
534/* Build run( struct translate *machine,
535 *            unsigned start,
536 *            unsigned count,
537 *            void *output_buffer )
538 * or
539 *  run_elts( struct translate *machine,
540 *            unsigned *elts,
541 *            unsigned count,
542 *            void *output_buffer )
543 *
544 *  Lots of hardcoding
545 *
546 * EAX -- pointer to current output vertex
547 * ECX -- pointer to current attribute
548 *
549 */
/* Generate one of the four translate entry points.
 *
 * index_size is 0 for the linear run() variant, otherwise the size in
 * bytes (1, 2 or 4) of each entry in the element list.
 *
 * Register usage in the generated loop:
 *   EAX -- scratch / source-attribute pointer
 *   EBX -- start+i (linear) or &elt[i] (indexed)
 *   ECX -- current output vertex pointer
 *   EDX -- pointer to the translate_sse machine struct
 *   ESI -- remaining vertex count
 *
 * NOTE(review): the generated function actually takes five arguments
 * (machine, start/elts, count, instance_id, output_buffer) — one more
 * than the prototype sketched in the comment above this function.
 */
static boolean build_vertex_emit( struct translate_sse *p,
				  struct x86_function *func,
				  unsigned index_size )
{
   int fixup, label;
   unsigned j;

   p->tmp_EAX       = x86_make_reg(file_REG32, reg_AX);
   p->idx_EBX       = x86_make_reg(file_REG32, reg_BX);
   p->outbuf_ECX    = x86_make_reg(file_REG32, reg_CX);
   p->machine_EDX   = x86_make_reg(file_REG32, reg_DX);
   p->count_ESI     = x86_make_reg(file_REG32, reg_SI);

   p->func = func;
   /* Constants must be re-emitted per generated function: */
   p->loaded_inv_255 = FALSE;
   p->loaded_255 = FALSE;
   p->loaded_identity = FALSE;

   x86_init_func(p->func);

   /* Push a few regs?
    */
   x86_push(p->func, p->idx_EBX);
   x86_push(p->func, p->count_ESI);

   /* Load arguments into regs:
    */
   x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1));
   x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2));
   x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3));
   x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5));

   /* Load instance ID.
    */
   if (p->use_instancing) {
      x86_mov(p->func,
              p->tmp_EAX,
              x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_ESI, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, index_size);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      struct x86_reg elt = !index_size ? p->idx_EBX : x86_deref(p->idx_EBX);
      int last_varient = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned varient = p->element_to_buffer_varient[j];

         /* Figure out source pointer address; reuse the previous one
          * when consecutive elements share a varient:
          */
         if (varient != last_varient) {
            last_varient = varient;
            vb = get_buffer_ptr(p, index_size, varient, elt);
         }

         if (!translate_attr( p, a,
                              x86_make_disp(vb, a->input_offset),
                              x86_make_disp(p->outbuf_ECX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x86_lea(p->func,
              p->outbuf_ECX,
              x86_make_disp(p->outbuf_ECX,
                            p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs( p, index_size );
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_ESI);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */

   x86_pop(p->func, p->count_ESI);
   x86_pop(p->func, p->idx_EBX);
   x86_ret(p->func);

   return TRUE;
}
663
664
665
666
667
668
669
670static void translate_sse_set_buffer( struct translate *translate,
671				unsigned buf,
672				const void *ptr,
673				unsigned stride,
674				unsigned max_index )
675{
676   struct translate_sse *p = (struct translate_sse *)translate;
677
678   if (buf < p->nr_buffers) {
679      p->buffer[buf].base_ptr = (char *)ptr;
680      p->buffer[buf].stride = stride;
681      p->buffer[buf].max_index = max_index;
682   }
683
684   if (0) debug_printf("%s %d/%d: %p %d\n",
685                       __FUNCTION__, buf,
686                       p->nr_buffers,
687                       ptr, stride);
688}
689
690
691static void translate_sse_release( struct translate *translate )
692{
693   struct translate_sse *p = (struct translate_sse *)translate;
694
695   x86_release_func( &p->linear_func );
696   x86_release_func( &p->elt_func );
697
698   FREE(p);
699}
700
701
/* Create an SSE2-based translate instance for the given key.
 *
 * Returns NULL if the CPU lacks SSE/SSE2, allocation fails, or any
 * element uses a format unsupported by translate_attr(); callers are
 * expected to fall back to a generic implementation.
 */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   struct translate_sse *p = NULL;
   unsigned i;

   if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
      goto fail;

   p = CALLOC_STRUCT( translate_sse );
   if (p == NULL)
      goto fail;

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;

   for (i = 0; i < key->nr_elements; i++) {
      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
         unsigned j;

         p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);

         if (key->element[i].instance_divisor) {
            p->use_instancing = TRUE;
         }

         /*
          * Map vertex element to vertex buffer varient.
          */
         for (j = 0; j < p->nr_buffer_varients; j++) {
            if (p->buffer_varient[j].buffer_index == key->element[i].input_buffer &&
                p->buffer_varient[j].instance_divisor == key->element[i].instance_divisor) {
               break;
            }
         }
         /* No matching varient found: create a new one. */
         if (j == p->nr_buffer_varients) {
            p->buffer_varient[j].buffer_index = key->element[i].input_buffer;
            p->buffer_varient[j].instance_divisor = key->element[i].instance_divisor;
            p->nr_buffer_varients++;
         }
         p->element_to_buffer_varient[i] = j;
      } else {
         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);

         p->element_to_buffer_varient[i] = ELEMENT_BUFFER_INSTANCE_ID;
      }
   }

   if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);

   /* Generate all four entry points up front, one per index width: */
   if (!build_vertex_emit(p, &p->linear_func, 0))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, 4))
      goto fail;

   if (!build_vertex_emit(p, &p->elt16_func, 2))
      goto fail;

   if (!build_vertex_emit(p, &p->elt8_func, 1))
      goto fail;

   p->translate.run = (void*)x86_get_func(&p->linear_func);
   if (p->translate.run == NULL)
      goto fail;

   p->translate.run_elts = (void*)x86_get_func(&p->elt_func);
   if (p->translate.run_elts == NULL)
      goto fail;

   p->translate.run_elts16 = (void*)x86_get_func(&p->elt16_func);
   if (p->translate.run_elts16 == NULL)
      goto fail;

   p->translate.run_elts8 = (void*)x86_get_func(&p->elt8_func);
   if (p->translate.run_elts8 == NULL)
      goto fail;

   return &p->translate;

 fail:
   if (p)
      translate_sse_release( &p->translate );

   return NULL;
}
788
789
790
791#else
792
/* Non-x86 build fallback: the SSE path is unavailable, so report
 * failure and let the caller select a generic implementation.
 */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   return NULL;
}
797
798#endif
799