translate_sse.c revision f4dd0991719ef3e2606920c5100b372181c60899
1/*
2 * Copyright 2003 Tungsten Graphics, inc.
3 * All Rights Reserved.
4 *
5 * Permission is hereby granted, free of charge, to any person obtaining a
6 * copy of this software and associated documentation files (the "Software"),
7 * to deal in the Software without restriction, including without limitation
8 * on the rights to use, copy, modify, merge, publish, distribute, sub
9 * license, and/or sell copies of the Software, and to permit persons to whom
10 * the Software is furnished to do so, subject to the following conditions:
11 *
12 * The above copyright notice and this permission notice (including the next
13 * paragraph) shall be included in all copies or substantial portions of the
14 * Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
19 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
20 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
21 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
22 * USE OR OTHER DEALINGS IN THE SOFTWARE.
23 *
24 * Authors:
25 *    Keith Whitwell <keithw@tungstengraphics.com>
26 */
27
28
29#include "pipe/p_config.h"
30#include "pipe/p_compiler.h"
31#include "util/u_memory.h"
32#include "util/u_math.h"
33#include "util/u_format.h"
34
35#include "translate.h"
36
37
38#if defined(PIPE_ARCH_X86) || (defined(PIPE_ARCH_X86_64) && !defined(__MINGW32__))
39
40#include "rtasm/rtasm_cpu.h"
41#include "rtasm/rtasm_x86sse.h"
42
43
44#define X    0
45#define Y    1
46#define Z    2
47#define W    3
48
49
/*
 * Per-buffer record kept in translate_sse::buffer[].
 *
 * NOTE(review): the code that fills and reads these fields is outside this
 * section; the per-field notes are inferred from the names - confirm against
 * the users before relying on them.
 */
struct translate_buffer {
   const void *base_ptr;   /* presumably the start of the buffer's data */
   uintptr_t   stride;     /* presumably the byte step between consecutive entries */
   unsigned    max_index;  /* presumably the highest index that may be fetched */
};
55
/*
 * One "variant" of a buffer; several variants may refer to the same
 * translate_buffer.
 *
 * NOTE(review): buffer_index presumably selects an entry of
 * translate_sse::buffer[] and instance_divisor presumably controls how often
 * ptr advances - confirm against the code that consumes this struct.
 */
struct translate_buffer_variant {
   unsigned buffer_index;
   unsigned instance_divisor;

   /* updated either per vertex or per instance */
   void *ptr;
};
61
62
/* NOTE(review): presumably a pseudo buffer index standing for "the instance
 * id" rather than a real vertex buffer - confirm against its users.
 */
#define ELEMENT_BUFFER_INSTANCE_ID  1001

/* Number of rows in the constant table; must match the enum below. */
#define NUM_CONSTS 7

/* Row indices into the constant table (and the ids accepted by get_const()). */
enum
{
   CONST_IDENTITY       = 0,
   CONST_INV_127        = 1,
   CONST_INV_255        = 2,
   CONST_INV_32767      = 3,
   CONST_INV_65535      = 4,
   CONST_INV_2147483647 = 5,
   CONST_255            = 6
};

/* Replicate one scalar into all four lanes of a row. */
#define CONST_SPLAT(v) {(float)(v), (float)(v), (float)(v), (float)(v)}

/* Master copy of the four-lane constants, one row per enum value above. */
static float consts[NUM_CONSTS][4] = {
   {0, 0, 0, 1},                       /* CONST_IDENTITY */
   CONST_SPLAT(1.0 / 127.0),           /* CONST_INV_127 */
   CONST_SPLAT(1.0 / 255.0),           /* CONST_INV_255 */
   CONST_SPLAT(1.0 / 32767.0),         /* CONST_INV_32767 */
   CONST_SPLAT(1.0 / 65535.0),         /* CONST_INV_65535 */
   CONST_SPLAT(1.0 / 2147483647.0),    /* CONST_INV_2147483647 */
   CONST_SPLAT(255.0)                  /* CONST_255 */
};

#undef CONST_SPLAT
89
90struct translate_sse {
91   struct translate translate;
92
93   struct x86_function linear_func;
94   struct x86_function elt_func;
95   struct x86_function elt16_func;
96   struct x86_function elt8_func;
97   struct x86_function *func;
98
99   PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
100   int8_t reg_to_const[16];
101   int8_t const_to_reg[NUM_CONSTS];
102
103   struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
104   unsigned nr_buffers;
105
106   /* Multiple buffer variants can map to a single buffer. */
107   struct translate_buffer_variant buffer_variant[PIPE_MAX_ATTRIBS];
108   unsigned nr_buffer_variants;
109
110   /* Multiple elements can map to a single buffer variant. */
111   unsigned element_to_buffer_variant[PIPE_MAX_ATTRIBS];
112
113   boolean use_instancing;
114   unsigned instance_id;
115
116   /* these are actually known values, but putting them in a struct
117    * like this is helpful to keep them in sync across the file.
118    */
119   struct x86_reg tmp_EAX;
120   struct x86_reg tmp2_EDX;
121   struct x86_reg src_ECX;
122   struct x86_reg idx_ESI;     /* either start+i or &elt[i] */
123   struct x86_reg machine_EDI;
124   struct x86_reg outbuf_EBX;
125   struct x86_reg count_EBP;    /* decrements to zero */
126};
127
/*
 * Byte distance from address `a` to address `b` (negative when `b` lies
 * below `a`).  The result is narrowed to int, so it is only meant for
 * small distances such as member offsets within one object.
 */
static int get_offset( const void *a, const void *b )
{
   const char *from = (const char *)a;
   const char *to = (const char *)b;

   return to - from;
}
132
133static struct x86_reg get_const( struct translate_sse *p, unsigned id)
134{
135   struct x86_reg reg;
136   unsigned i;
137
138   if(p->const_to_reg[id] >= 0)
139      return x86_make_reg(file_XMM, p->const_to_reg[id]);
140
141   for(i = 2; i < 8; ++i)
142   {
143      if(p->reg_to_const[i] < 0)
144         break;
145   }
146
147   /* TODO: be smarter here */
148   if(i == 8)
149      --i;
150
151   reg = x86_make_reg(file_XMM, i);
152
153   if(p->reg_to_const[i] >= 0)
154      p->const_to_reg[p->reg_to_const[i]] = -1;
155
156   p->reg_to_const[i] = id;
157   p->const_to_reg[id] = i;
158
159   /* TODO: this should happen outside the loop, if possible */
160   sse_movaps(p->func, reg,
161         x86_make_disp(p->machine_EDI,
162               get_offset(p, &p->consts[id][0])));
163
164   return reg;
165}
166
167/* load the data in a SSE2 register, padding with zeros */
168static boolean emit_load_sse2( struct translate_sse *p,
169				       struct x86_reg data,
170				       struct x86_reg src,
171				       unsigned size)
172{
173   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
174   struct x86_reg tmp = p->tmp_EAX;
175   switch(size)
176   {
177   case 1:
178      x86_movzx8(p->func, tmp, src);
179      sse2_movd(p->func, data, tmp);
180      break;
181   case 2:
182      x86_movzx16(p->func, tmp, src);
183      sse2_movd(p->func, data, tmp);
184      break;
185   case 3:
186      x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
187      x86_shl_imm(p->func, tmp, 16);
188      x86_mov16(p->func, tmp, src);
189      sse2_movd(p->func, data, tmp);
190      break;
191   case 4:
192      sse2_movd(p->func, data, src);
193      break;
194   case 6:
195      sse2_movd(p->func, data, src);
196      x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
197      sse2_movd(p->func, tmpXMM, tmp);
198      sse2_punpckldq(p->func, data, tmpXMM);
199      break;
200   case 8:
201      sse2_movq(p->func, data, src);
202      break;
203   case 12:
204      sse2_movq(p->func, data, src);
205      sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
206      sse2_punpcklqdq(p->func, data, tmpXMM);
207      break;
208   case 16:
209      sse2_movdqu(p->func, data, src);
210      break;
211   default:
212      return FALSE;
213   }
214   return TRUE;
215}
216
217/* this value can be passed for the out_chans argument */
218#define CHANNELS_0001 5
219
220/* this function will load #chans float values, and will
221 * pad the register with zeroes at least up to out_chans.
222 *
223 * If out_chans is set to CHANNELS_0001, then the fourth
224 * value will be padded with 1. Only pass this value if
225 * chans < 4 or results are undefined.
226 */
227static void emit_load_float32( struct translate_sse *p,
228                                       struct x86_reg data,
229                                       struct x86_reg arg0,
230                                       unsigned out_chans,
231                                       unsigned chans)
232{
233   switch(chans)
234   {
235   case 1:
236      /* a 0 0 0
237       * a 0 0 1
238       */
239      sse_movss(p->func, data, arg0);
240      if(out_chans == CHANNELS_0001)
241         sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
242      break;
243   case 2:
244      /* 0 0 0 1
245       * a b 0 1
246       */
247      if(out_chans == CHANNELS_0001)
248         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
249      else if(out_chans > 2)
250         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
251      sse_movlps(p->func, data, arg0);
252      break;
253   case 3:
254      /* Have to jump through some hoops:
255       *
256       * c 0 0 0
257       * c 0 0 1 if out_chans == CHANNELS_0001
258       * 0 0 c 0/1
259       * a b c 0/1
260       */
261      sse_movss(p->func, data, x86_make_disp(arg0, 8));
262      if(out_chans == CHANNELS_0001)
263         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X,Y,Z,W) );
264      sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
265      sse_movlps(p->func, data, arg0);
266      break;
267   case 4:
268      sse_movups(p->func, data, arg0);
269      break;
270   }
271}
272
273/* this function behaves like emit_load_float32, but loads
274   64-bit floating point numbers, converting them to 32-bit
275  ones */
276static void emit_load_float64to32( struct translate_sse *p,
277                                       struct x86_reg data,
278                                       struct x86_reg arg0,
279                                       unsigned out_chans,
280                                       unsigned chans)
281{
282   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
283   switch(chans)
284   {
285   case 1:
286      sse2_movsd(p->func, data, arg0);
287      if(out_chans > 1)
288         sse2_cvtpd2ps(p->func, data, data);
289      else
290         sse2_cvtsd2ss(p->func, data, data);
291      if(out_chans == CHANNELS_0001)
292         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W)  );
293      break;
294   case 2:
295      sse2_movupd(p->func, data, arg0);
296      sse2_cvtpd2ps(p->func, data, data);
297      if(out_chans == CHANNELS_0001)
298         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
299      else if(out_chans > 2)
300         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
301       break;
302   case 3:
303      sse2_movupd(p->func, data, arg0);
304      sse2_cvtpd2ps(p->func, data, data);
305      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
306      if(out_chans > 3)
307         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
308      else
309         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
310      sse_movlhps(p->func, data, tmpXMM);
311      if(out_chans == CHANNELS_0001)
312         sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
313      break;
314   case 4:
315      sse2_movupd(p->func, data, arg0);
316      sse2_cvtpd2ps(p->func, data, data);
317      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
318      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
319      sse_movlhps(p->func, data, tmpXMM);
320      break;
321   }
322}
323
324static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr,  struct x86_reg src_xmm)
325{
326   if(x86_target(p->func) != X86_32)
327      x64_mov64(p->func, dst_gpr, src_gpr);
328   else
329   {
330      /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
331      if(x86_target_caps(p->func) & X86_SSE2)
332         sse2_movq(p->func, dst_xmm, src_xmm);
333      else
334         sse_movlps(p->func, dst_xmm, src_xmm);
335   }
336}
337
338static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src)
339{
340   emit_mov64(p, dst_gpr, dst_xmm, src, src);
341}
342
343static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm)
344{
345   emit_mov64(p, dst, dst, src_gpr, src_xmm);
346}
347
348static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
349{
350   if(x86_target_caps(p->func) & X86_SSE2)
351      sse2_movdqu(p->func, dst, src);
352   else
353      sse_movups(p->func, dst, src);
354}
355
356/* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
357 * but may or may not be good on older processors
358 * TODO: may perhaps want to use non-temporal stores here if possible
359 */
360static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size)
361{
362   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
363   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
364   struct x86_reg dataGPR = p->tmp_EAX;
365   struct x86_reg dataGPR2 = p->tmp2_EDX;
366
367   if(size < 8)
368   {
369      switch (size)
370      {
371      case 1:
372         x86_mov8(p->func, dataGPR, src);
373         x86_mov8(p->func, dst, dataGPR);
374         break;
375      case 2:
376         x86_mov16(p->func, dataGPR, src);
377         x86_mov16(p->func, dst, dataGPR);
378         break;
379      case 3:
380         x86_mov16(p->func, dataGPR, src);
381         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
382         x86_mov16(p->func, dst, dataGPR);
383         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
384         break;
385      case 4:
386         x86_mov(p->func, dataGPR, src);
387         x86_mov(p->func, dst, dataGPR);
388         break;
389      case 6:
390         x86_mov(p->func, dataGPR, src);
391         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
392         x86_mov(p->func, dst, dataGPR);
393         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
394         break;
395      }
396   }
397   else if(!(x86_target_caps(p->func) & X86_SSE))
398   {
399      unsigned i = 0;
400      assert((size & 3) == 0);
401      for(i = 0; i < size; i += 4)
402      {
403         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
404         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
405      }
406   }
407   else
408   {
409      switch(size)
410      {
411      case 8:
412         emit_load64(p, dataGPR, dataXMM, src);
413         emit_store64(p, dst, dataGPR, dataXMM);
414         break;
415      case 12:
416         emit_load64(p, dataGPR2, dataXMM, src);
417         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
418         emit_store64(p, dst, dataGPR2, dataXMM);
419         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
420         break;
421      case 16:
422         emit_mov128(p, dataXMM, src);
423         emit_mov128(p, dst, dataXMM);
424         break;
425      case 24:
426         emit_mov128(p, dataXMM, src);
427         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
428         emit_mov128(p, dst, dataXMM);
429         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
430         break;
431      case 32:
432         emit_mov128(p, dataXMM, src);
433         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
434         emit_mov128(p, dst, dataXMM);
435         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
436         break;
437      default:
438         assert(0);
439      }
440   }
441}
442
443static boolean translate_attr_convert( struct translate_sse *p,
444                               const struct translate_element *a,
445                               struct x86_reg src,
446                               struct x86_reg dst)
447
448{
449   const struct util_format_description* input_desc = util_format_description(a->input_format);
450   const struct util_format_description* output_desc = util_format_description(a->output_format);
451   unsigned i;
452   boolean id_swizzle = TRUE;
453   unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE};
454   unsigned needed_chans = 0;
455   unsigned imms[2] = {0, 0x3f800000};
456
457   if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE)
458      return FALSE;
459
460   if(input_desc->channel[0].size & 7)
461      return FALSE;
462
463   if(input_desc->colorspace != output_desc->colorspace)
464      return FALSE;
465
466   for(i = 1; i < input_desc->nr_channels; ++i)
467   {
468      if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0])))
469         return FALSE;
470   }
471
472   for(i = 1; i < output_desc->nr_channels; ++i)
473   {
474      if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0])))
475         return FALSE;
476   }
477
478   for(i = 0; i < output_desc->nr_channels; ++i)
479   {
480      if(output_desc->swizzle[i] < 4)
481         swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
482   }
483
484   if((x86_target_caps(p->func) & X86_SSE) && (0
485         || a->output_format == PIPE_FORMAT_R32_FLOAT
486         || a->output_format == PIPE_FORMAT_R32G32_FLOAT
487         || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
488         || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT))
489   {
490      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
491
492      for(i = 0; i < output_desc->nr_channels; ++i)
493      {
494         if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
495            swizzle[i] = i;
496      }
497
498      for(i = 0; i < output_desc->nr_channels; ++i)
499      {
500         if(swizzle[i] < 4)
501            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
502         if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
503            id_swizzle = FALSE;
504      }
505
506      if(needed_chans > 0)
507      {
508         switch(input_desc->channel[0].type)
509         {
510         case UTIL_FORMAT_TYPE_UNSIGNED:
511            if(!(x86_target_caps(p->func) & X86_SSE2))
512               return FALSE;
513            emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
514
515            /* TODO: add support for SSE4.1 pmovzx */
516            switch(input_desc->channel[0].size)
517            {
518            case 8:
519               /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */
520               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
521               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
522               break;
523            case 16:
524               sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
525               break;
526            case 32: /* we lose precision here */
527               sse2_psrld_imm(p->func, dataXMM, 1);
528               break;
529            default:
530               return FALSE;
531            }
532            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
533            if(input_desc->channel[0].normalized)
534            {
535               struct x86_reg factor;
536               switch(input_desc->channel[0].size)
537               {
538               case 8:
539                  factor = get_const(p, CONST_INV_255);
540                  break;
541               case 16:
542                  factor = get_const(p, CONST_INV_65535);
543                  break;
544               case 32:
545                  factor = get_const(p, CONST_INV_2147483647);
546                  break;
547               default:
548                  assert(0);
549                  factor.disp = 0;
550                  factor.file = 0;
551                  factor.idx = 0;
552                  factor.mod = 0;
553                  break;
554               }
555               sse_mulps(p->func, dataXMM, factor);
556            }
557            else if(input_desc->channel[0].size == 32)
558               sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */
559            break;
560         case UTIL_FORMAT_TYPE_SIGNED:
561            if(!(x86_target_caps(p->func) & X86_SSE2))
562               return FALSE;
563            emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
564
565            /* TODO: add support for SSE4.1 pmovsx */
566            switch(input_desc->channel[0].size)
567            {
568            case 8:
569               sse2_punpcklbw(p->func, dataXMM, dataXMM);
570               sse2_punpcklbw(p->func, dataXMM, dataXMM);
571               sse2_psrad_imm(p->func, dataXMM, 24);
572               break;
573            case 16:
574               sse2_punpcklwd(p->func, dataXMM, dataXMM);
575               sse2_psrad_imm(p->func, dataXMM, 16);
576               break;
577            case 32: /* we lose precision here */
578               break;
579            default:
580               return FALSE;
581            }
582            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
583            if(input_desc->channel[0].normalized)
584            {
585               struct x86_reg factor;
586               switch(input_desc->channel[0].size)
587               {
588               case 8:
589                  factor = get_const(p, CONST_INV_127);
590                  break;
591               case 16:
592                  factor = get_const(p, CONST_INV_32767);
593                  break;
594               case 32:
595                  factor = get_const(p, CONST_INV_2147483647);
596                  break;
597               default:
598                  assert(0);
599                  factor.disp = 0;
600                  factor.file = 0;
601                  factor.idx = 0;
602                  factor.mod = 0;
603                  break;
604               }
605               sse_mulps(p->func, dataXMM, factor);
606            }
607            break;
608
609            break;
610         case UTIL_FORMAT_TYPE_FLOAT:
611            if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64)
612               return FALSE;
613            if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3)
614            {
615               swizzle[3] = UTIL_FORMAT_SWIZZLE_W;
616               needed_chans = CHANNELS_0001;
617            }
618            switch(input_desc->channel[0].size)
619            {
620            case 32:
621               emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
622               break;
623            case 64: /* we lose precision here */
624               if(!(x86_target_caps(p->func) & X86_SSE2))
625                  return FALSE;
626               emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
627               break;
628            default:
629               return FALSE;
630            }
631            break;
632         default:
633            return FALSE;
634         }
635
636         if(!id_swizzle)
637            sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) );
638      }
639
640      if(output_desc->nr_channels >= 4
641            && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
642            && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
643            && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
644            && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
645            )
646         sse_movups(p->func, dst, dataXMM);
647      else
648      {
649         if(output_desc->nr_channels >= 2
650               && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
651               && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
652            sse_movlps(p->func, dst, dataXMM);
653         else
654         {
655            if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
656               sse_movss(p->func, dst, dataXMM);
657            else
658               x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
659
660            if(output_desc->nr_channels >= 2)
661            {
662               if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
663               {
664                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
665                  sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
666               }
667               else
668                  x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
669            }
670         }
671
672         if(output_desc->nr_channels >= 3)
673         {
674            if(output_desc->nr_channels >= 4
675                  && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
676                  && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
677               sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
678            else
679            {
680               if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
681               {
682                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
683                  sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
684               }
685               else
686                  x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
687
688               if(output_desc->nr_channels >= 4)
689               {
690                  if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
691                  {
692                     sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
693                     sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
694                  }
695                  else
696                     x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
697               }
698            }
699         }
700      }
701      return TRUE;
702   }
703   else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16
704         && output_desc->channel[0].normalized == input_desc->channel[0].normalized
705         && (0
706               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
707               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
708               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
709               ))
710   {
711      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
712      struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
713      struct x86_reg tmp = p->tmp_EAX;
714      unsigned imms[2] = {0, 1};
715
716      for(i = 0; i < output_desc->nr_channels; ++i)
717      {
718         if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
719            swizzle[i] = i;
720      }
721
722      for(i = 0; i < output_desc->nr_channels; ++i)
723      {
724         if(swizzle[i] < 4)
725            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
726         if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
727            id_swizzle = FALSE;
728      }
729
730      if(needed_chans > 0)
731      {
732         emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);
733
734         switch(input_desc->channel[0].type)
735         {
736         case UTIL_FORMAT_TYPE_UNSIGNED:
737            if(input_desc->channel[0].normalized)
738            {
739               sse2_punpcklbw(p->func, dataXMM, dataXMM);
740               if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
741        	       sse2_psrlw_imm(p->func, dataXMM, 1);
742            }
743            else
744               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
745            break;
746         case UTIL_FORMAT_TYPE_SIGNED:
747            if(input_desc->channel[0].normalized)
748            {
749               sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
750               sse2_punpcklbw(p->func, tmpXMM, dataXMM);
751               sse2_psllw_imm(p->func, dataXMM, 9);
752               sse2_psrlw_imm(p->func, dataXMM, 8);
753               sse2_por(p->func, tmpXMM, dataXMM);
754               sse2_psrlw_imm(p->func, dataXMM, 7);
755               sse2_por(p->func, tmpXMM, dataXMM);
756               {
757                  struct x86_reg t = dataXMM;
758                  dataXMM = tmpXMM;
759                  tmpXMM = t;
760               }
761            }
762            else
763            {
764               sse2_punpcklbw(p->func, dataXMM, dataXMM);
765               sse2_psraw_imm(p->func, dataXMM, 8);
766            }
767            break;
768         default:
769            assert(0);
770         }
771
772         if(output_desc->channel[0].normalized)
773            imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff;
774
775         if(!id_swizzle)
776            sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
777      }
778
779      if(output_desc->nr_channels >= 4
780            && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
781            && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
782            && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
783            && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
784            )
785         sse2_movq(p->func, dst, dataXMM);
786      else
787      {
788         if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
789         {
790            if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
791               sse2_movd(p->func, dst, dataXMM);
792            else
793            {
794               sse2_movd(p->func, tmp, dataXMM);
795               x86_mov16(p->func, dst, tmp);
796               if(output_desc->nr_channels >= 2)
797                  x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
798            }
799         }
800         else
801         {
802            if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0)
803               x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
804            else
805            {
806               x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
807               if(output_desc->nr_channels >= 2)
808               {
809                  sse2_movd(p->func, tmp, dataXMM);
810                  x86_shr_imm(p->func, tmp, 16);
811                  x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
812               }
813            }
814         }
815
816         if(output_desc->nr_channels >= 3)
817         {
818            if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
819            {
820               if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
821               {
822                  sse2_psrlq_imm(p->func, dataXMM, 32);
823                  sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
824               }
825               else
826               {
827                  sse2_psrlq_imm(p->func, dataXMM, 32);
828                  sse2_movd(p->func, tmp, dataXMM);
829                  x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
830                  if(output_desc->nr_channels >= 4)
831                  {
832                     x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
833                  }
834               }
835            }
836            else
837            {
838               if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0)
839                  x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
840               else
841               {
842                  x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
843
844                  if(output_desc->nr_channels >= 4)
845                  {
846                     sse2_psrlq_imm(p->func, dataXMM, 48);
847                     sse2_movd(p->func, tmp, dataXMM);
848                     x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
849                  }
850               }
851            }
852         }
853      }
854      return TRUE;
855   }
856   else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0])))
857   {
858      struct x86_reg tmp = p->tmp_EAX;
859      unsigned i;
860      if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4
861                     && swizzle[0] == UTIL_FORMAT_SWIZZLE_W
862                     && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z
863                     && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y
864                     && swizzle[3] == UTIL_FORMAT_SWIZZLE_X)
865      {
866         /* TODO: support movbe */
867         x86_mov(p->func, tmp, src);
868         x86_bswap(p->func, tmp);
869         x86_mov(p->func, dst, tmp);
870         return TRUE;
871      }
872
873      for(i = 0; i < output_desc->nr_channels; ++i)
874      {
875         switch(output_desc->channel[0].size)
876         {
877         case 8:
878            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
879            {
880               unsigned v = 0;
881               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
882               {
883                  switch(output_desc->channel[0].type)
884                  {
885                  case UTIL_FORMAT_TYPE_UNSIGNED:
886                     v = output_desc->channel[0].normalized ? 0xff : 1;
887                     break;
888                  case UTIL_FORMAT_TYPE_SIGNED:
889                     v = output_desc->channel[0].normalized ? 0x7f : 1;
890                     break;
891                  default:
892                     return FALSE;
893                  }
894               }
895               x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
896            }
897            else
898            {
899               x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
900               x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
901            }
902            break;
903         case 16:
904            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
905            {
906               unsigned v = 0;
907               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
908               {
909                  switch(output_desc->channel[1].type)
910                  {
911                  case UTIL_FORMAT_TYPE_UNSIGNED:
912                     v = output_desc->channel[1].normalized ? 0xffff : 1;
913                     break;
914                  case UTIL_FORMAT_TYPE_SIGNED:
915                     v = output_desc->channel[1].normalized ? 0x7fff : 1;
916                     break;
917                  case UTIL_FORMAT_TYPE_FLOAT:
918                     v = 0x3c00;
919                     break;
920                  default:
921                     return FALSE;
922                  }
923               }
924               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
925            }
926            else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0)
927               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
928            else
929            {
930               x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
931               x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
932            }
933            break;
934         case 32:
935            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
936            {
937               unsigned v = 0;
938               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
939               {
940                  switch(output_desc->channel[1].type)
941                  {
942                  case UTIL_FORMAT_TYPE_UNSIGNED:
943                     v = output_desc->channel[1].normalized ? 0xffffffff : 1;
944                     break;
945                  case UTIL_FORMAT_TYPE_SIGNED:
946                     v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
947                     break;
948                  case UTIL_FORMAT_TYPE_FLOAT:
949                     v = 0x3f800000;
950                     break;
951                  default:
952                     return FALSE;
953                  }
954               }
955               x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
956            }
957            else
958            {
959               x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
960               x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
961            }
962            break;
963         case 64:
964            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
965            {
966               unsigned l = 0;
967               unsigned h = 0;
968               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
969               {
970                  switch(output_desc->channel[1].type)
971                  {
972                  case UTIL_FORMAT_TYPE_UNSIGNED:
973                     h = output_desc->channel[1].normalized ? 0xffffffff : 0;
974                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
975                     break;
976                  case UTIL_FORMAT_TYPE_SIGNED:
977                     h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
978                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
979                     break;
980                  case UTIL_FORMAT_TYPE_FLOAT:
981                     h = 0x3ff00000;
982                     l = 0;
983                     break;
984                  default:
985                     return FALSE;
986                  }
987               }
988               x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
989               x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
990            }
991            else
992            {
993               if(x86_target_caps(p->func) & X86_SSE)
994               {
995                  struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
996                  emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8));
997                  emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
998               }
999               else
1000               {
1001                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
1002                  x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
1003                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4));
1004                  x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
1005               }
1006            }
1007            break;
1008         default:
1009            return FALSE;
1010         }
1011      }
1012      return TRUE;
1013   }
1014   /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
1015   else if((x86_target_caps(p->func) & X86_SSE2) &&
1016         a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && (0
1017               || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
1018               || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM
1019         ))
1020   {
1021      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
1022
1023      /* load */
1024      sse_movups(p->func, dataXMM, src);
1025
1026      if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM)
1027         sse_shufps(p->func, dataXMM, dataXMM, SHUF(2,1,0,3));
1028
1029      /* scale by 255.0 */
1030      sse_mulps(p->func, dataXMM, get_const(p, CONST_255));
1031
1032      /* pack and emit */
1033      sse2_cvtps2dq(p->func, dataXMM, dataXMM);
1034      sse2_packssdw(p->func, dataXMM, dataXMM);
1035      sse2_packuswb(p->func, dataXMM, dataXMM);
1036      sse2_movd(p->func, dst, dataXMM);
1037
1038      return TRUE;
1039   }
1040
1041   return FALSE;
1042}
1043
1044static boolean translate_attr( struct translate_sse *p,
1045			       const struct translate_element *a,
1046			       struct x86_reg src,
1047			       struct x86_reg dst)
1048{
1049   if(a->input_format == a->output_format)
1050   {
1051      emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
1052      return TRUE;
1053   }
1054
1055   return translate_attr_convert(p, a, src, dst);
1056}
1057
1058static boolean init_inputs( struct translate_sse *p,
1059                            unsigned index_size )
1060{
1061   unsigned i;
1062   struct x86_reg instance_id = x86_make_disp(p->machine_EDI,
1063                                              get_offset(p, &p->instance_id));
1064
1065   for (i = 0; i < p->nr_buffer_variants; i++) {
1066      struct translate_buffer_variant *variant = &p->buffer_variant[i];
1067      struct translate_buffer *buffer = &p->buffer[variant->buffer_index];
1068
1069      if (!index_size || variant->instance_divisor) {
1070         struct x86_reg buf_max_index = x86_make_disp(p->machine_EDI,
1071                                                     get_offset(p, &buffer->max_index));
1072         struct x86_reg buf_stride   = x86_make_disp(p->machine_EDI,
1073                                                     get_offset(p, &buffer->stride));
1074         struct x86_reg buf_ptr      = x86_make_disp(p->machine_EDI,
1075                                                     get_offset(p, &variant->ptr));
1076         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI,
1077                                                     get_offset(p, &buffer->base_ptr));
1078         struct x86_reg elt = p->idx_ESI;
1079         struct x86_reg tmp_EAX = p->tmp_EAX;
1080
1081         /* Calculate pointer to first attrib:
1082          *   base_ptr + stride * index, where index depends on instance divisor
1083          */
1084         if (variant->instance_divisor) {
1085            /* Our index is instance ID divided by instance divisor.
1086             */
1087            x86_mov(p->func, tmp_EAX, instance_id);
1088
1089            if (variant->instance_divisor != 1) {
1090               struct x86_reg tmp_EDX = p->tmp2_EDX;
1091               struct x86_reg tmp_ECX = p->src_ECX;
1092
1093               /* TODO: Add x86_shr() to rtasm and use it whenever
1094                *       instance divisor is power of two.
1095                */
1096
1097               x86_xor(p->func, tmp_EDX, tmp_EDX);
1098               x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
1099               x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */
1100            }
1101
1102            /* XXX we need to clamp the index here too, but to a
1103             * per-array max value, not the draw->pt.max_index value
1104             * that's being given to us via translate->set_buffer().
1105             */
1106         } else {
1107            x86_mov(p->func, tmp_EAX, elt);
1108
1109            /* Clamp to max_index
1110             */
1111            x86_cmp(p->func, tmp_EAX, buf_max_index);
1112            x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);
1113         }
1114
1115         x86_imul(p->func, tmp_EAX, buf_stride);
1116         x64_rexw(p->func);
1117         x86_add(p->func, tmp_EAX, buf_base_ptr);
1118
1119         x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
1120
1121         /* In the linear case, keep the buffer pointer instead of the
1122          * index number.
1123          */
1124         if (!index_size && p->nr_buffer_variants == 1)
1125         {
1126            x64_rexw(p->func);
1127            x86_mov(p->func, elt, tmp_EAX);
1128         }
1129         else
1130         {
1131            x64_rexw(p->func);
1132            x86_mov(p->func, buf_ptr, tmp_EAX);
1133         }
1134      }
1135   }
1136
1137   return TRUE;
1138}
1139
1140
1141static struct x86_reg get_buffer_ptr( struct translate_sse *p,
1142                                      unsigned index_size,
1143                                      unsigned var_idx,
1144                                      struct x86_reg elt )
1145{
1146   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
1147      return x86_make_disp(p->machine_EDI,
1148                           get_offset(p, &p->instance_id));
1149   }
1150   if (!index_size && p->nr_buffer_variants == 1) {
1151      return p->idx_ESI;
1152   }
1153   else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
1154      struct x86_reg ptr = p->src_ECX;
1155      struct x86_reg buf_ptr =
1156         x86_make_disp(p->machine_EDI,
1157                       get_offset(p, &p->buffer_variant[var_idx].ptr));
1158
1159      x64_rexw(p->func);
1160      x86_mov(p->func, ptr, buf_ptr);
1161      return ptr;
1162   }
1163   else {
1164      struct x86_reg ptr = p->src_ECX;
1165      const struct translate_buffer_variant *variant = &p->buffer_variant[var_idx];
1166
1167      struct x86_reg buf_stride =
1168         x86_make_disp(p->machine_EDI,
1169                       get_offset(p, &p->buffer[variant->buffer_index].stride));
1170
1171      struct x86_reg buf_base_ptr =
1172         x86_make_disp(p->machine_EDI,
1173                       get_offset(p, &p->buffer[variant->buffer_index].base_ptr));
1174
1175      struct x86_reg buf_max_index =
1176         x86_make_disp(p->machine_EDI,
1177                       get_offset(p, &p->buffer[variant->buffer_index].max_index));
1178
1179
1180
1181      /* Calculate pointer to current attrib:
1182       */
1183      switch(index_size)
1184      {
1185      case 1:
1186         x86_movzx8(p->func, ptr, elt);
1187         break;
1188      case 2:
1189         x86_movzx16(p->func, ptr, elt);
1190         break;
1191      case 4:
1192         x86_mov(p->func, ptr, elt);
1193         break;
1194      }
1195
1196      /* Clamp to max_index
1197       */
1198      x86_cmp(p->func, ptr, buf_max_index);
1199      x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);
1200
1201      x86_imul(p->func, ptr, buf_stride);
1202      x64_rexw(p->func);
1203      x86_add(p->func, ptr, buf_base_ptr);
1204      return ptr;
1205   }
1206}
1207
1208
1209
1210static boolean incr_inputs( struct translate_sse *p,
1211                            unsigned index_size )
1212{
1213   if (!index_size && p->nr_buffer_variants == 1) {
1214      struct x86_reg stride = x86_make_disp(p->machine_EDI,
1215                                            get_offset(p, &p->buffer[0].stride));
1216
1217      if (p->buffer_variant[0].instance_divisor == 0) {
1218         x64_rexw(p->func);
1219         x86_add(p->func, p->idx_ESI, stride);
1220         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
1221      }
1222   }
1223   else if (!index_size) {
1224      unsigned i;
1225
1226      /* Is this worthwhile??
1227       */
1228      for (i = 0; i < p->nr_buffer_variants; i++) {
1229         struct translate_buffer_variant *variant = &p->buffer_variant[i];
1230         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
1231                                                get_offset(p, &variant->ptr));
1232         struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
1233                                                   get_offset(p, &p->buffer[variant->buffer_index].stride));
1234
1235         if (variant->instance_divisor == 0) {
1236            x86_mov(p->func, p->tmp_EAX, buf_stride);
1237            x64_rexw(p->func);
1238            x86_add(p->func, p->tmp_EAX, buf_ptr);
1239            if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
1240            x64_rexw(p->func);
1241            x86_mov(p->func, buf_ptr, p->tmp_EAX);
1242         }
1243      }
1244   }
1245   else {
1246      x64_rexw(p->func);
1247      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
1248   }
1249
1250   return TRUE;
1251}
1252
1253
1254/* Build run( struct translate *machine,
1255 *            unsigned start,
1256 *            unsigned count,
1257 *            void *output_buffer )
1258 * or
1259 *  run_elts( struct translate *machine,
1260 *            unsigned *elts,
1261 *            unsigned count,
1262 *            void *output_buffer )
1263 *
1264 *  Lots of hardcoding
1265 *
1266 * EAX -- pointer to current output vertex
1267 * ECX -- pointer to current attribute
1268 *
1269 */
1270static boolean build_vertex_emit( struct translate_sse *p,
1271				  struct x86_function *func,
1272				  unsigned index_size )
1273{
1274   int fixup, label;
1275   unsigned j;
1276
1277   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
1278   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));
1279
1280   p->tmp_EAX       = x86_make_reg(file_REG32, reg_AX);
1281   p->idx_ESI       = x86_make_reg(file_REG32, reg_SI);
1282   p->outbuf_EBX    = x86_make_reg(file_REG32, reg_BX);
1283   p->machine_EDI   = x86_make_reg(file_REG32, reg_DI);
1284   p->count_EBP     = x86_make_reg(file_REG32, reg_BP);
1285   p->tmp2_EDX     = x86_make_reg(file_REG32, reg_DX);
1286   p->src_ECX     = x86_make_reg(file_REG32, reg_CX);
1287
1288   p->func = func;
1289
1290   x86_init_func(p->func);
1291
1292   if(x86_target(p->func) == X86_64_WIN64_ABI)
1293   {
1294	   /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */
1295	   sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6));
1296	   sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7));
1297   }
1298
1299   x86_push(p->func, p->outbuf_EBX);
1300   x86_push(p->func, p->count_EBP);
1301
1302/* on non-Win64 x86-64, these are already in the right registers */
1303   if(x86_target(p->func) != X86_64_STD_ABI)
1304   {
1305      x86_push(p->func, p->machine_EDI);
1306      x86_push(p->func, p->idx_ESI);
1307
1308      x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
1309      x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
1310   }
1311
1312   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));
1313
1314   if(x86_target(p->func) != X86_32)
1315      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
1316   else
1317      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 5));
1318
1319   /* Load instance ID.
1320    */
1321   if (p->use_instancing) {
1322      x86_mov(p->func,
1323              p->tmp_EAX,
1324              x86_fn_arg(p->func, 4));
1325      x86_mov(p->func,
1326              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
1327              p->tmp_EAX);
1328   }
1329
1330   /* Get vertex count, compare to zero
1331    */
1332   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
1333   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
1334   fixup = x86_jcc_forward(p->func, cc_E);
1335
1336   /* always load, needed or not:
1337    */
1338   init_inputs(p, index_size);
1339
1340   /* Note address for loop jump
1341    */
1342   label = x86_get_label(p->func);
1343   {
1344      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
1345      int last_variant = -1;
1346      struct x86_reg vb;
1347
1348      for (j = 0; j < p->translate.key.nr_elements; j++) {
1349         const struct translate_element *a = &p->translate.key.element[j];
1350         unsigned variant = p->element_to_buffer_variant[j];
1351
1352         /* Figure out source pointer address:
1353          */
1354         if (variant != last_variant) {
1355            last_variant = variant;
1356            vb = get_buffer_ptr(p, index_size, variant, elt);
1357         }
1358
1359         if (!translate_attr( p, a,
1360                              x86_make_disp(vb, a->input_offset),
1361                              x86_make_disp(p->outbuf_EBX, a->output_offset)))
1362            return FALSE;
1363      }
1364
1365      /* Next output vertex:
1366       */
1367      x64_rexw(p->func);
1368      x86_lea(p->func,
1369              p->outbuf_EBX,
1370              x86_make_disp(p->outbuf_EBX,
1371                            p->translate.key.output_stride));
1372
1373      /* Incr index
1374       */
1375      incr_inputs( p, index_size );
1376   }
1377
1378   /* decr count, loop if not zero
1379    */
1380   x86_dec(p->func, p->count_EBP);
1381   x86_jcc(p->func, cc_NZ, label);
1382
1383   /* Exit mmx state?
1384    */
1385   if (p->func->need_emms)
1386      mmx_emms(p->func);
1387
1388   /* Land forward jump here:
1389    */
1390   x86_fixup_fwd_jump(p->func, fixup);
1391
1392   /* Pop regs and return
1393    */
1394
1395   if(x86_target(p->func) != X86_64_STD_ABI)
1396   {
1397      x86_pop(p->func, p->idx_ESI);
1398      x86_pop(p->func, p->machine_EDI);
1399   }
1400
1401   x86_pop(p->func, p->count_EBP);
1402   x86_pop(p->func, p->outbuf_EBX);
1403
1404   if(x86_target(p->func) == X86_64_WIN64_ABI)
1405   {
1406	   sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
1407	   sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
1408   }
1409   x86_ret(p->func);
1410
1411   return TRUE;
1412}
1413
1414
1415
1416
1417
1418
1419
1420static void translate_sse_set_buffer( struct translate *translate,
1421				unsigned buf,
1422				const void *ptr,
1423				unsigned stride,
1424				unsigned max_index )
1425{
1426   struct translate_sse *p = (struct translate_sse *)translate;
1427
1428   if (buf < p->nr_buffers) {
1429      p->buffer[buf].base_ptr = (char *)ptr;
1430      p->buffer[buf].stride = stride;
1431      p->buffer[buf].max_index = max_index;
1432   }
1433
1434   if (0) debug_printf("%s %d/%d: %p %d\n",
1435                       __FUNCTION__, buf,
1436                       p->nr_buffers,
1437                       ptr, stride);
1438}
1439
1440
1441static void translate_sse_release( struct translate *translate )
1442{
1443   struct translate_sse *p = (struct translate_sse *)translate;
1444
1445   x86_release_func( &p->linear_func );
1446   x86_release_func( &p->elt_func );
1447
1448   os_free_aligned(p);
1449}
1450
1451
1452struct translate *translate_sse2_create( const struct translate_key *key )
1453{
1454   struct translate_sse *p = NULL;
1455   unsigned i;
1456
1457   /* this is misnamed, it actually refers to whether rtasm is enabled or not */
1458   if (!rtasm_cpu_has_sse())
1459      goto fail;
1460
1461   p = os_malloc_aligned(sizeof(struct translate_sse), 16);
1462   if (p == NULL)
1463      goto fail;
1464   memset(p, 0, sizeof(*p));
1465   memcpy(p->consts, consts, sizeof(consts));
1466
1467   p->translate.key = *key;
1468   p->translate.release = translate_sse_release;
1469   p->translate.set_buffer = translate_sse_set_buffer;
1470
1471   for (i = 0; i < key->nr_elements; i++) {
1472      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
1473         unsigned j;
1474
1475         p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);
1476
1477         if (key->element[i].instance_divisor) {
1478            p->use_instancing = TRUE;
1479         }
1480
1481         /*
1482          * Map vertex element to vertex buffer variant.
1483          */
1484         for (j = 0; j < p->nr_buffer_variants; j++) {
1485            if (p->buffer_variant[j].buffer_index == key->element[i].input_buffer &&
1486                p->buffer_variant[j].instance_divisor == key->element[i].instance_divisor) {
1487               break;
1488            }
1489         }
1490         if (j == p->nr_buffer_variants) {
1491            p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
1492            p->buffer_variant[j].instance_divisor = key->element[i].instance_divisor;
1493            p->nr_buffer_variants++;
1494         }
1495         p->element_to_buffer_variant[i] = j;
1496      } else {
1497         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);
1498
1499         p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
1500      }
1501   }
1502
1503   if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);
1504
1505   if (!build_vertex_emit(p, &p->linear_func, 0))
1506      goto fail;
1507
1508   if (!build_vertex_emit(p, &p->elt_func, 4))
1509      goto fail;
1510
1511   if (!build_vertex_emit(p, &p->elt16_func, 2))
1512      goto fail;
1513
1514   if (!build_vertex_emit(p, &p->elt8_func, 1))
1515      goto fail;
1516
1517   p->translate.run = (run_func) x86_get_func(&p->linear_func);
1518   if (p->translate.run == NULL)
1519      goto fail;
1520
1521   p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
1522   if (p->translate.run_elts == NULL)
1523      goto fail;
1524
1525   p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
1526   if (p->translate.run_elts16 == NULL)
1527      goto fail;
1528
1529   p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
1530   if (p->translate.run_elts8 == NULL)
1531      goto fail;
1532
1533   return &p->translate;
1534
1535 fail:
1536   if (p)
1537      translate_sse_release( &p->translate );
1538
1539   return NULL;
1540}
1541
1542
1543
1544#else
1545
/**
 * Fallback used when the run-time assembler path is compiled out: no
 * translate object can be created, so NULL is always returned.
 */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   (void) key;   /* unused in this configuration */

   return NULL;
}
1550
1551#endif
1552