translate_sse.c revision 09c0287b84725098c0b365668231ddf00487c84c
/*
 * Copyright 2003 Tungsten Graphics, inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <keithw@tungstengraphics.com>
 */


#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_memory.h"
#include "util/u_math.h"

#include "translate.h"


#if defined(PIPE_ARCH_X86)

#include "rtasm/rtasm_cpu.h"
#include "rtasm/rtasm_x86sse.h"


#define X    0
#define Y    1
#define Z    2
#define W    3


typedef void (PIPE_CDECL *run_func)( struct translate *translate,
                                     unsigned start,
                                     unsigned count,
                                     unsigned instance_id,
                                     void *output_buffer );

typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
                                          const unsigned *elts,
                                          unsigned count,
                                          void *output_buffer );
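
/* Both entry points use the cdecl convention (PIPE_CDECL), so the
 * generated code can fetch its arguments off the stack with
 * x86_fn_arg(); see build_vertex_emit() below.
 */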

struct translate_buffer {
   const void *base_ptr;
   unsigned stride;
};

struct translate_buffer_varient {
   unsigned buffer_index;
   unsigned instance_divisor;
   void *ptr;                    /* updated either per vertex or per instance */
};


struct translate_sse {
   struct translate translate;

   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function *func;

   boolean loaded_identity;
   boolean loaded_255;
   boolean loaded_inv_255;

   float identity[4];
   float float_255[4];
   float inv_255[4];

   struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer varients can map to a single buffer. */
   struct translate_buffer_varient buffer_varient[PIPE_MAX_ATTRIBS];
   unsigned nr_buffer_varients;

   /* Multiple elements can map to a single buffer varient. */
   unsigned element_to_buffer_varient[PIPE_MAX_ATTRIBS];

   boolean use_instancing;
   unsigned instance_id;

   run_func      gen_run;
   run_elts_func gen_run_elts;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg idx_EBX;     /* either start+i or &elt[i] */
   struct x86_reg outbuf_ECX;
   struct x86_reg machine_EDX;
   struct x86_reg count_ESI;    /* decrements to zero */
};

static int get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}
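
/* The generated code keeps a pointer to this struct translate_sse in
 * EDX, so constants, buffer state and the instance id can all be
 * addressed as plain displacements from EDX, e.g.
 *
 *    x86_make_disp(p->machine_EDX, get_offset(p, &p->identity[0]))
 *
 * get_offset() simply computes that displacement at codegen time.
 */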



static struct x86_reg get_identity( struct translate_sse *p )
{
   struct x86_reg reg = x86_make_reg(file_XMM, 6);

   if (!p->loaded_identity) {
      p->loaded_identity = TRUE;
      p->identity[0] = 0;
      p->identity[1] = 0;
      p->identity[2] = 0;
      p->identity[3] = 1;

      sse_movups(p->func, reg,
		 x86_make_disp(p->machine_EDX,
			       get_offset(p, &p->identity[0])));
   }

   return reg;
}

static struct x86_reg get_255( struct translate_sse *p )
{
   struct x86_reg reg = x86_make_reg(file_XMM, 7);

   if (!p->loaded_255) {
      p->loaded_255 = TRUE;
      p->float_255[0] =
	 p->float_255[1] =
	 p->float_255[2] =
	 p->float_255[3] = 255.0f;

      sse_movups(p->func, reg,
		 x86_make_disp(p->machine_EDX,
			       get_offset(p, &p->float_255[0])));
   }

   return reg;
}

static struct x86_reg get_inv_255( struct translate_sse *p )
{
   struct x86_reg reg = x86_make_reg(file_XMM, 5);

   if (!p->loaded_inv_255) {
      p->loaded_inv_255 = TRUE;
      p->inv_255[0] =
	 p->inv_255[1] =
	 p->inv_255[2] =
	 p->inv_255[3] = 1.0f / 255.0f;

      sse_movups(p->func, reg,
		 x86_make_disp(p->machine_EDX,
			       get_offset(p, &p->inv_255[0])));
   }

   return reg;
}
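
/* The helpers above load their constants lazily, at most once per
 * generated function, into XMM6, XMM7 and XMM5 respectively.  XMM0 is
 * left free as the scratch data register used by the load/store
 * helpers below.
 */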


static void emit_load_R32G32B32A32( struct translate_sse *p,
				    struct x86_reg data,
				    struct x86_reg arg0 )
{
   sse_movups(p->func, data, arg0);
}

static void emit_load_R32G32B32( struct translate_sse *p,
				 struct x86_reg data,
				 struct x86_reg arg0 )
{
   /* Have to jump through some hoops:
    *
    * c 0 0 0
    * c 0 0 1
    * 0 0 c 1
    * a b c 1
    */
   sse_movss(p->func, data, x86_make_disp(arg0, 8));
   sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
   sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
   sse_movlps(p->func, data, arg0);
}

static void emit_load_R32G32( struct translate_sse *p,
			   struct x86_reg data,
			   struct x86_reg arg0 )
{
   /* 0 0 0 1
    * a b 0 1
    */
   sse_movups(p->func, data, get_identity(p) );
   sse_movlps(p->func, data, arg0);
}


static void emit_load_R32( struct translate_sse *p,
			   struct x86_reg data,
			   struct x86_reg arg0 )
{
   /* a 0 0 0
    * a 0 0 1
    */
   sse_movss(p->func, data, arg0);
   sse_orps(p->func, data, get_identity(p) );
}


static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
				       struct x86_reg data,
				       struct x86_reg src )
{

   /* Load and unpack twice:
    */
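   /* punpcklbw against a zero source widens each byte to a word; doing
    * it a second time widens the words to dwords, because the freshly
    * inserted zero bytes just interleave with more zeros.  get_identity()
    * serves as the zero source here since its low quadword is all zero
    * bits.
    */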
   sse_movss(p->func, data, src);
   sse2_punpcklbw(p->func, data, get_identity(p));
   sse2_punpcklbw(p->func, data, get_identity(p));

   /* Convert to float:
    */
   sse2_cvtdq2ps(p->func, data, data);


   /* Scale by 1/255.0
    */
   sse_mulps(p->func, data, get_inv_255(p));
}




static void emit_store_R32G32B32A32( struct translate_sse *p,
				     struct x86_reg dest,
				     struct x86_reg dataXMM )
{
   sse_movups(p->func, dest, dataXMM);
}

static void emit_store_R32G32B32( struct translate_sse *p,
				  struct x86_reg dest,
				  struct x86_reg dataXMM )
{
   /* Emit two, shuffle, emit one.
    */
   sse_movlps(p->func, dest, dataXMM);
   sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
   sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
}

static void emit_store_R32G32( struct translate_sse *p,
			       struct x86_reg dest,
			       struct x86_reg dataXMM )
{
   sse_movlps(p->func, dest, dataXMM);
}

static void emit_store_R32( struct translate_sse *p,
			    struct x86_reg dest,
			    struct x86_reg dataXMM )
{
   sse_movss(p->func, dest, dataXMM);
}



static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
				       struct x86_reg dest,
				       struct x86_reg dataXMM )
{
   /* Scale by 255.0
    */
   sse_mulps(p->func, dataXMM, get_255(p));

   /* Pack and emit:
    */
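   /* cvtps2dq yields four signed dwords; packssdw and packuswb then
    * narrow them, with saturation, to four unsigned bytes in the low
    * 32 bits of the register, which the final movss writes out as a
    * single R8G8B8A8 value.
    */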
   sse2_cvtps2dq(p->func, dataXMM, dataXMM);
   sse2_packssdw(p->func, dataXMM, dataXMM);
   sse2_packuswb(p->func, dataXMM, dataXMM);
   sse_movss(p->func, dest, dataXMM);
}





/* Extended swizzles?  Maybe later.
 */
static void emit_swizzle( struct translate_sse *p,
			  struct x86_reg dest,
			  struct x86_reg src,
			  unsigned char shuffle )
{
   sse_shufps(p->func, dest, src, shuffle);
}
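
/* With dest == src, shufps performs an arbitrary four-component
 * swizzle; translate_attr() uses SHUF(Z,Y,X,W) this way to swap the
 * red and blue channels of the B8G8R8A8 formats.
 */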


static boolean translate_attr( struct translate_sse *p,
			       const struct translate_element *a,
			       struct x86_reg srcECX,
			       struct x86_reg dstEAX)
{
   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);

   switch (a->input_format) {
   case PIPE_FORMAT_R32_FLOAT:
      emit_load_R32(p, dataXMM, srcECX);
      break;
   case PIPE_FORMAT_R32G32_FLOAT:
      emit_load_R32G32(p, dataXMM, srcECX);
      break;
   case PIPE_FORMAT_R32G32B32_FLOAT:
      emit_load_R32G32B32(p, dataXMM, srcECX);
      break;
   case PIPE_FORMAT_R32G32B32A32_FLOAT:
      emit_load_R32G32B32A32(p, dataXMM, srcECX);
      break;
   case PIPE_FORMAT_B8G8R8A8_UNORM:
      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
      break;
   case PIPE_FORMAT_R8G8B8A8_UNORM:
      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
      break;
   default:
      return FALSE;
   }

   switch (a->output_format) {
   case PIPE_FORMAT_R32_FLOAT:
      emit_store_R32(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_R32G32_FLOAT:
      emit_store_R32G32(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_R32G32B32_FLOAT:
      emit_store_R32G32B32(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_R32G32B32A32_FLOAT:
      emit_store_R32G32B32A32(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_B8G8R8A8_UNORM:
      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
      break;
   case PIPE_FORMAT_R8G8B8A8_UNORM:
      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
      break;
   default:
      return FALSE;
   }

   return TRUE;
}


static boolean init_inputs( struct translate_sse *p,
                            boolean linear )
{
   unsigned i;
   if (linear) {
      struct x86_reg instance_id = x86_make_disp(p->machine_EDX,
                                                 get_offset(p, &p->instance_id));

      for (i = 0; i < p->nr_buffer_varients; i++) {
         struct translate_buffer_varient *varient = &p->buffer_varient[i];
         struct translate_buffer *buffer = &p->buffer[varient->buffer_index];
         struct x86_reg buf_stride   = x86_make_disp(p->machine_EDX,
                                                     get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr      = x86_make_disp(p->machine_EDX,
                                                     get_offset(p, &varient->ptr));
         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
                                                     get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_EBX;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          *   base_ptr + stride * index, where index depends on instance divisor
          */
         if (varient->instance_divisor) {
            /* Our index is instance ID divided by instance divisor.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (varient->instance_divisor != 1) {
               struct x86_reg tmp_EDX = p->machine_EDX;
               struct x86_reg tmp_ECX = p->outbuf_ECX;

               /* TODO: Add x86_shr() to rtasm and use it whenever
                *       instance divisor is power of two.
                */

               x86_push(p->func, tmp_EDX);
               x86_push(p->func, tmp_ECX);
               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, varient->instance_divisor);
               x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
               x86_pop(p->func, tmp_ECX);
               x86_pop(p->func, tmp_EDX);
            }
         } else {
            x86_mov(p->func, tmp_EAX, elt);
         }
         x86_imul(p->func, tmp_EAX, buf_stride);
         x86_add(p->func, tmp_EAX, buf_base_ptr);


         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (p->nr_buffer_varients == 1)
            x86_mov(p->func, elt, tmp_EAX);
         else
            x86_mov(p->func, buf_ptr, tmp_EAX);
      }
   }

   return TRUE;
}
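
/* For linear runs, init_inputs() above precomputes each buffer
 * varient's first-vertex pointer: base_ptr + stride * start for normal
 * attributes, or base_ptr + stride * (instance_id / divisor) for
 * instanced ones.  incr_inputs() then steps those pointers as the main
 * loop in build_vertex_emit() walks the vertices.
 */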


static struct x86_reg get_buffer_ptr( struct translate_sse *p,
                                      boolean linear,
                                      unsigned var_idx,
                                      struct x86_reg elt )
{
   if (linear && p->nr_buffer_varients == 1) {
      return p->idx_EBX;
   }
   else if (linear) {
      struct x86_reg ptr = p->tmp_EAX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer_varient[var_idx].ptr));

      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      struct x86_reg ptr = p->tmp_EAX;
      const struct translate_buffer_varient *varient = &p->buffer_varient[var_idx];

      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer[varient->buffer_index].stride));

      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDX,
                       get_offset(p, &p->buffer[varient->buffer_index].base_ptr));



      /* Calculate pointer to current attrib:
       */
      x86_mov(p->func, ptr, buf_stride);
      x86_imul(p->func, ptr, elt);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}
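
/* get_buffer_ptr() covers three cases:
 *  - linear run, single buffer varient: EBX already holds the pointer.
 *  - linear run, several varients: reload the pointer stashed in the
 *    varient's ptr field by init_inputs()/incr_inputs().
 *  - indexed run: recompute base_ptr + stride * elt, with elt fetched
 *    from the element list.
 */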



static boolean incr_inputs( struct translate_sse *p,
                            boolean linear )
{
   if (linear && p->nr_buffer_varients == 1) {
      struct x86_reg stride = x86_make_disp(p->machine_EDX,
                                            get_offset(p, &p->buffer[0].stride));

      if (p->buffer_varient[0].instance_divisor == 0) {
         x86_add(p->func, p->idx_EBX, stride);
         sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
      }
   }
   else if (linear) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_varients; i++) {
         struct translate_buffer_varient *varient = &p->buffer_varient[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
                                                get_offset(p, &varient->ptr));
         struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
                                                   get_offset(p, &p->buffer[varient->buffer_index].stride));

         if (varient->instance_divisor == 0) {
            x86_mov(p->func, p->tmp_EAX, buf_ptr);
            x86_add(p->func, p->tmp_EAX, buf_stride);
            if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4));
   }

   return TRUE;
}


/* Build run( struct translate *machine,
 *            unsigned start,
 *            unsigned count,
 *            unsigned instance_id,
 *            void *output_buffer )
 * or
 *  run_elts( struct translate *machine,
 *            unsigned *elts,
 *            unsigned count,
 *            void *output_buffer )
 *
 *  Lots of hardcoding
 *
 * EAX -- scratch, typically the current source attribute pointer
 * EBX -- start index, or pointer into the element list
 * ECX -- pointer to the current output vertex
 * EDX -- pointer to the struct translate_sse machine
 * ESI -- vertex countdown
 *
 */
static boolean build_vertex_emit( struct translate_sse *p,
				  struct x86_function *func,
				  boolean linear )
{
   int fixup, label;
   unsigned j;

   p->tmp_EAX       = x86_make_reg(file_REG32, reg_AX);
   p->idx_EBX       = x86_make_reg(file_REG32, reg_BX);
   p->outbuf_ECX    = x86_make_reg(file_REG32, reg_CX);
   p->machine_EDX   = x86_make_reg(file_REG32, reg_DX);
   p->count_ESI     = x86_make_reg(file_REG32, reg_SI);

   p->func = func;
   p->loaded_inv_255 = FALSE;
   p->loaded_255 = FALSE;
   p->loaded_identity = FALSE;

   x86_init_func(p->func);

   /* Push a few regs?
    */
   x86_push(p->func, p->idx_EBX);
   x86_push(p->func, p->count_ESI);

   /* Load arguments into regs:
    */
   x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1));
   x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2));
   x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3));
   x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 5));
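   /* The moves above fetch the machine pointer, the start index (or
    * element pointer), the vertex count and the output pointer;
    * argument 4, the instance id, is only read below when the key
    * actually uses instancing.
    */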

   /* Load instance ID.
    */
   if (p->use_instancing) {
      x86_mov(p->func,
              p->tmp_EAX,
              x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDX, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_ESI, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, linear);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX);
      int last_varient = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned varient = p->element_to_buffer_varient[j];

         /* Figure out source pointer address:
          */
         if (varient != last_varient) {
            last_varient = varient;
            vb = get_buffer_ptr(p, linear, varient, elt);
         }

         if (!translate_attr( p, a,
                              x86_make_disp(vb, a->input_offset),
                              x86_make_disp(p->outbuf_ECX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x86_lea(p->func,
              p->outbuf_ECX,
              x86_make_disp(p->outbuf_ECX,
                            p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs( p, linear );
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_ESI);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */

   x86_pop(p->func, p->count_ESI);
   x86_pop(p->func, p->idx_EBX);
   x86_ret(p->func);

   return TRUE;
}







static void translate_sse_set_buffer( struct translate *translate,
				unsigned buf,
				const void *ptr,
				unsigned stride )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   if (buf < p->nr_buffers) {
      p->buffer[buf].base_ptr = (char *)ptr;
      p->buffer[buf].stride = stride;
   }

   if (0) debug_printf("%s %d/%d: %p %d\n",
                       __FUNCTION__, buf,
                       p->nr_buffers,
                       ptr, stride);
}


static void translate_sse_release( struct translate *translate )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   x86_release_func( &p->linear_func );
   x86_release_func( &p->elt_func );

   FREE(p);
}

static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
			      const unsigned *elts,
			      unsigned count,
			      void *output_buffer )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   p->gen_run_elts( translate,
		    elts,
		    count,
		    output_buffer );
}

static void PIPE_CDECL translate_sse_run( struct translate *translate,
			 unsigned start,
			 unsigned count,
                         unsigned instance_id,
			 void *output_buffer )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   p->gen_run( translate,
	       start,
	       count,
               instance_id,
	       output_buffer );
}


struct translate *translate_sse2_create( const struct translate_key *key )
{
   struct translate_sse *p = NULL;
   unsigned i;

   if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
      goto fail;

   p = CALLOC_STRUCT( translate_sse );
   if (p == NULL)
      goto fail;

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;
   p->translate.run_elts = translate_sse_run_elts;
   p->translate.run = translate_sse_run;

   for (i = 0; i < key->nr_elements; i++) {
      unsigned j;

      p->nr_buffers = MAX2( p->nr_buffers, key->element[i].input_buffer + 1 );

      if (key->element[i].instance_divisor) {
         p->use_instancing = TRUE;
      }

      /*
       * Map vertex element to vertex buffer varient.
       */
      for (j = 0; j < p->nr_buffer_varients; j++) {
         if (p->buffer_varient[j].buffer_index == key->element[i].input_buffer &&
             p->buffer_varient[j].instance_divisor == key->element[i].instance_divisor) {
            break;
         }
      }
      if (j == p->nr_buffer_varients) {
         p->buffer_varient[j].buffer_index = key->element[i].input_buffer;
         p->buffer_varient[j].instance_divisor = key->element[i].instance_divisor;
         p->nr_buffer_varients++;
      }
      p->element_to_buffer_varient[i] = j;
   }
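
   /* E.g. two elements reading interleaved data from buffer 0 plus one
    * per-instance element reading buffer 1 give two buffer varients:
    * (buffer 0, divisor 0) shared by the first pair, and (buffer 1,
    * divisor N) for the instanced element.
    */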

   if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);

   if (!build_vertex_emit(p, &p->linear_func, TRUE))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, FALSE))
      goto fail;

   p->gen_run = (run_func)x86_get_func(&p->linear_func);
   if (p->gen_run == NULL)
      goto fail;

   p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func);
   if (p->gen_run_elts == NULL)
      goto fail;

   return &p->translate;

 fail:
   if (p)
      translate_sse_release( &p->translate );

   return NULL;
}
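
/* Rough usage sketch (illustrative only -- the buffer layout, formats
 * and local names below are made up for the example; they are not part
 * of this file):
 *
 *    struct translate_key key;
 *    struct translate *t;
 *
 *    memset(&key, 0, sizeof key);
 *    key.output_stride = 32;
 *    key.nr_elements = 2;
 *    key.element[0].input_format  = PIPE_FORMAT_R32G32B32_FLOAT;
 *    key.element[0].input_buffer  = 0;
 *    key.element[0].input_offset  = 0;
 *    key.element[0].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
 *    key.element[0].output_offset = 0;
 *    key.element[1].input_format  = PIPE_FORMAT_B8G8R8A8_UNORM;
 *    key.element[1].input_buffer  = 0;
 *    key.element[1].input_offset  = 12;
 *    key.element[1].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
 *    key.element[1].output_offset = 16;
 *
 *    t = translate_sse2_create(&key);      -- NULL if SSE/SSE2 absent
 *    t->set_buffer(t, 0, vertices, 16);    -- buffer 0, stride 16 bytes
 *    t->run(t, 0, count, 0, output);       -- or t->run_elts(...)
 *    t->release(t);
 */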



#else

struct translate *translate_sse2_create( const struct translate_key *key )
{
   return NULL;
}

#endif
