/* translate_sse.c revision 7400bc4b6fb0c20a935cd108afa92814eeafec6d */
/*
 * Copyright 2003 Tungsten Graphics, inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <keithw@tungstengraphics.com>
 */
27
28
29#include "pipe/p_compiler.h"
30#include "pipe/p_util.h"
31#include "util/u_simple_list.h"
32
33#include "translate.h"
34
35
36#if defined(__i386__) || defined(__386__) || defined(i386)
37
38#include "rtasm/rtasm_cpu.h"
39#include "rtasm/rtasm_x86sse.h"
40
41
/* Vector component indices; used below as the arguments of SHUF()
 * when building shufps selectors.
 */
#define X 0
#define Y 1
#define Z 2
#define W 3
46
47
/* Entry point of the generated routine for consecutive vertices; it
 * takes the same arguments as translate_sse_run().
 */
typedef void (*run_func)( struct translate *translate,
                          unsigned start,
                          unsigned count,
                          void *output_buffer );

/* Entry point of the generated routine for indexed vertices; it takes
 * the same arguments as translate_sse_run_elts().
 */
typedef void (*run_elts_func)( struct translate *translate,
                               const unsigned *elts,
                               unsigned count,
                               void *output_buffer );
57
58
59
60struct translate_sse {
61   struct translate translate;
62
63   struct x86_function linear_func;
64   struct x86_function elt_func;
65   struct x86_function *func;
66
67   boolean loaded_identity;
68   boolean loaded_255;
69   boolean loaded_inv_255;
70
71   float identity[4];
72   float float_255[4];
73   float inv_255[4];
74
75   struct {
76      char *input_ptr;
77      unsigned input_stride;
78   } attrib[PIPE_MAX_ATTRIBS];
79
80   run_func      gen_run;
81   run_elts_func gen_run_elts;
82
83};
84
/* Return the distance in bytes from a to b (that is, b - a), as an int.
 */
static int get_offset( const void *a, const void *b )
{
   const char *from = (const char *)a;
   const char *to = (const char *)b;

   return (int)(to - from);
}
89
90
91
92static struct x86_reg get_identity( struct translate_sse *p )
93{
94   struct x86_reg reg = x86_make_reg(file_XMM, 6);
95
96   if (!p->loaded_identity) {
97      /* Nasty:
98       */
99      struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
100
101      p->loaded_identity = TRUE;
102      p->identity[0] = 0;
103      p->identity[1] = 0;
104      p->identity[2] = 0;
105      p->identity[3] = 1;
106
107      sse_movups(p->func, reg,
108		 x86_make_disp(translateESI,
109			       get_offset(p, &p->identity[0])));
110   }
111
112   return reg;
113}
114
115static struct x86_reg get_255( struct translate_sse *p )
116{
117   struct x86_reg reg = x86_make_reg(file_XMM, 6);
118
119   if (!p->loaded_255) {
120      struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
121
122      p->loaded_255 = TRUE;
123      p->float_255[0] =
124	 p->float_255[1] =
125	 p->float_255[2] =
126	 p->float_255[3] = 255.0f;
127
128      sse_movups(p->func, reg,
129		 x86_make_disp(translateESI,
130			       get_offset(p, &p->float_255[0])));
131   }
132
133   return reg;
134   return x86_make_reg(file_XMM, 7);
135}
136
137static struct x86_reg get_inv_255( struct translate_sse *p )
138{
139   struct x86_reg reg = x86_make_reg(file_XMM, 5);
140
141   if (!p->loaded_inv_255) {
142      struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
143
144      p->loaded_inv_255 = TRUE;
145      p->inv_255[0] =
146	 p->inv_255[1] =
147	 p->inv_255[2] =
148	 p->inv_255[3] = 1.0 / 255.0f;
149
150      sse_movups(p->func, reg,
151		 x86_make_disp(translateESI,
152			       get_offset(p, &p->inv_255[0])));
153   }
154
155   return reg;
156}
157
158
159static void emit_load_R32G32B32A32( struct translate_sse *p,
160				    struct x86_reg data,
161				    struct x86_reg arg0 )
162{
163   sse_movups(p->func, data, arg0);
164}
165
166static void emit_load_R32G32B32( struct translate_sse *p,
167				 struct x86_reg data,
168				 struct x86_reg arg0 )
169{
170   /* Have to jump through some hoops:
171    *
172    * c 0 0 0
173    * c 0 0 1
174    * 0 0 c 1
175    * a b c 1
176    */
177   sse_movss(p->func, data, x86_make_disp(arg0, 8));
178   sse_shufps(p->func, data, get_identity(p), SHUF(X,Y,Z,W) );
179   sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
180   sse_movlps(p->func, data, arg0);
181}
182
183static void emit_load_R32G32( struct translate_sse *p,
184			   struct x86_reg data,
185			   struct x86_reg arg0 )
186{
187   /* 0 0 0 1
188    * a b 0 1
189    */
190   sse_movups(p->func, data, get_identity(p) );
191   sse_movlps(p->func, data, arg0);
192}
193
194
195static void emit_load_R32( struct translate_sse *p,
196			   struct x86_reg data,
197			   struct x86_reg arg0 )
198{
199   /* a 0 0 0
200    * a 0 0 1
201    */
202   sse_movss(p->func, data, arg0);
203   sse_orps(p->func, data, get_identity(p) );
204}
205
206
207static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
208				       struct x86_reg data,
209				       struct x86_reg src )
210{
211
212   /* Load and unpack twice:
213    */
214   sse_movss(p->func, data, src);
215   sse2_punpcklbw(p->func, src, get_identity(p));
216   sse2_punpcklbw(p->func, src, get_identity(p));
217
218   /* Convert to float:
219    */
220   sse2_cvtdq2ps(p->func, src, src);
221
222
223   /* Scale by 1/255.0
224    */
225   sse_mulps(p->func, src, get_inv_255(p));
226}
227
228
229
230
231static void emit_store_R32G32B32A32( struct translate_sse *p,
232				     struct x86_reg dest,
233				     struct x86_reg dataXMM )
234{
235   sse_movups(p->func, dest, dataXMM);
236}
237
238static void emit_store_R32G32B32( struct translate_sse *p,
239				  struct x86_reg dest,
240				  struct x86_reg dataXMM )
241{
242   /* Emit two, shuffle, emit one.
243    */
244   sse_movlps(p->func, dest, dataXMM);
245   sse_shufps(p->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
246   sse_movss(p->func, x86_make_disp(dest,8), dataXMM);
247}
248
249static void emit_store_R32G32( struct translate_sse *p,
250			       struct x86_reg dest,
251			       struct x86_reg dataXMM )
252{
253   sse_movlps(p->func, dest, dataXMM);
254}
255
256static void emit_store_R32( struct translate_sse *p,
257			    struct x86_reg dest,
258			    struct x86_reg dataXMM )
259{
260   sse_movss(p->func, dest, dataXMM);
261}
262
263
264
265static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
266				       struct x86_reg dest,
267				       struct x86_reg dataXMM )
268{
269   /* Scale by 255.0
270    */
271   sse_mulps(p->func, dataXMM, get_255(p));
272
273   /* Pack and emit:
274    */
275   sse2_cvtps2dq(p->func, dataXMM, dataXMM);
276   sse2_packssdw(p->func, dataXMM, dataXMM);
277   sse2_packuswb(p->func, dataXMM, dataXMM);
278   sse_movss(p->func, dest, dataXMM);
279}
280
281
282
283
284
285static void get_src_ptr( struct translate_sse *p,
286			 struct x86_reg srcEAX,
287			 struct x86_reg translateREG,
288			 struct x86_reg eltREG,
289			 unsigned a )
290{
291   struct x86_reg input_ptr =
292      x86_make_disp(translateREG,
293		    get_offset(p, &p->attrib[a].input_ptr));
294
295   struct x86_reg input_stride =
296      x86_make_disp(translateREG,
297		    get_offset(p, &p->attrib[a].input_stride));
298
299   /* Calculate pointer to current attrib:
300    */
301   x86_mov(p->func, srcEAX, input_stride);
302   x86_imul(p->func, srcEAX, eltREG);
303   x86_add(p->func, srcEAX, input_ptr);
304}
305
306
307/* Extended swizzles?  Maybe later.
308 */
309static void emit_swizzle( struct translate_sse *p,
310			  struct x86_reg dest,
311			  struct x86_reg src,
312			  unsigned shuffle )
313{
314   sse_shufps(p->func, dest, src, shuffle);
315}
316
317
318static boolean translate_attr( struct translate_sse *p,
319			       const struct translate_element *a,
320			       struct x86_reg srcECX,
321			       struct x86_reg dstEAX)
322{
323   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
324
325   switch (a->input_format) {
326   case PIPE_FORMAT_R32_FLOAT:
327      emit_load_R32(p, dataXMM, srcECX);
328      break;
329   case PIPE_FORMAT_R32G32_FLOAT:
330      emit_load_R32G32(p, dataXMM, srcECX);
331      break;
332   case PIPE_FORMAT_R32G32B32_FLOAT:
333      emit_load_R32G32B32(p, dataXMM, srcECX);
334      break;
335   case PIPE_FORMAT_R32G32B32A32_FLOAT:
336      emit_load_R32G32B32A32(p, dataXMM, srcECX);
337      break;
338   case PIPE_FORMAT_B8G8R8A8_UNORM:
339      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
340      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
341      break;
342   case PIPE_FORMAT_R8G8B8A8_UNORM:
343      emit_load_R8G8B8A8_UNORM(p, dataXMM, srcECX);
344      break;
345   default:
346      return FALSE;
347   }
348
349   switch (a->output_format) {
350   case PIPE_FORMAT_R32_FLOAT:
351      emit_store_R32(p, dstEAX, dataXMM);
352      break;
353   case PIPE_FORMAT_R32G32_FLOAT:
354      emit_store_R32G32(p, dstEAX, dataXMM);
355      break;
356   case PIPE_FORMAT_R32G32B32_FLOAT:
357      emit_store_R32G32B32(p, dstEAX, dataXMM);
358      break;
359   case PIPE_FORMAT_R32G32B32A32_FLOAT:
360      emit_store_R32G32B32A32(p, dstEAX, dataXMM);
361      break;
362   case PIPE_FORMAT_B8G8R8A8_UNORM:
363      emit_swizzle(p, dataXMM, dataXMM, SHUF(Z,Y,X,W));
364      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
365      break;
366   case PIPE_FORMAT_R8G8B8A8_UNORM:
367      emit_store_R8G8B8A8_UNORM(p, dstEAX, dataXMM);
368      break;
369   default:
370      return FALSE;
371   }
372
373   return TRUE;
374}
375
376/* Build run( struct translate *translate,
377 *            unsigned start,
378 *            unsigned count,
379 *            void *output_buffer )
380 * or
381 *  run_elts( struct translate *translate,
382 *            unsigned *elts,
383 *            unsigned count,
384 *            void *output_buffer )
385 *
386 *  Lots of hardcoding
387 *
388 * EAX -- pointer to current output vertex
389 * ECX -- pointer to current attribute
390 *
391 */
392static boolean build_vertex_emit( struct translate_sse *p,
393				  struct x86_function *func,
394				  boolean linear )
395{
396   struct x86_reg vertexECX    = x86_make_reg(file_REG32, reg_AX);
397   struct x86_reg idxEBX       = x86_make_reg(file_REG32, reg_BX);
398   struct x86_reg srcEAX       = x86_make_reg(file_REG32, reg_CX);
399   struct x86_reg countEBP     = x86_make_reg(file_REG32, reg_BP);
400   struct x86_reg translateESI = x86_make_reg(file_REG32, reg_SI);
401   uint8_t *fixup, *label;
402   unsigned j;
403
404   p->func = func;
405   p->loaded_inv_255 = FALSE;
406   p->loaded_255 = FALSE;
407   p->loaded_identity = FALSE;
408
409   x86_init_func(p->func);
410
411   /* Push a few regs?
412    */
413   x86_push(p->func, countEBP);
414   x86_push(p->func, translateESI);
415   x86_push(p->func, idxEBX);
416
417   /* Get vertex count, compare to zero
418    */
419   x86_xor(p->func, idxEBX, idxEBX);
420   x86_mov(p->func, countEBP, x86_fn_arg(p->func, 3));
421   x86_cmp(p->func, countEBP, idxEBX);
422   fixup = x86_jcc_forward(p->func, cc_E);
423
424   /* If linear, idx is the current element, otherwise it is a pointer
425    * to the current element.
426    */
427   x86_mov(p->func, idxEBX, x86_fn_arg(p->func, 2));
428
429   /* Initialize destination register.
430    */
431   x86_mov(p->func, vertexECX, x86_fn_arg(p->func, 4));
432
433   /* Move argument 1 (translate_sse pointer) into a reg:
434    */
435   x86_mov(p->func, translateESI, x86_fn_arg(p->func, 1));
436
437
438   /* always load, needed or not:
439    */
440
441   /* Note address for loop jump */
442   label = x86_get_label(p->func);
443
444
445   for (j = 0; j < p->translate.key.nr_elements; j++) {
446      const struct translate_element *a = &p->translate.key.element[j];
447
448      struct x86_reg destEAX = x86_make_disp(vertexECX,
449					     a->output_offset);
450
451      /* Figure out source pointer address:
452       */
453      if (linear) {
454	 get_src_ptr(p, srcEAX, translateESI, idxEBX, j);
455      }
456      else {
457	 get_src_ptr(p, srcEAX, translateESI, x86_deref(idxEBX), j);
458      }
459
460      if (!translate_attr( p, a, x86_deref(srcEAX), destEAX ))
461	 return FALSE;
462   }
463
464   /* Next vertex:
465    */
466   x86_lea(p->func, vertexECX, x86_make_disp(vertexECX, p->translate.key.output_stride));
467
468   /* Incr index
469    */   /* Emit code for each of the attributes.  Currently routes
470    * everything through SSE registers, even when it might be more
471    * efficient to stick with regular old x86.  No optimization or
472    * other tricks - enough new ground to cover here just getting
473    * things working.
474    */
475
476   if (linear) {
477      x86_inc(p->func, idxEBX);
478   }
479   else {
480      x86_lea(p->func, idxEBX, x86_make_disp(idxEBX, 4));
481   }
482
483   /* decr count, loop if not zero
484    */
485   x86_dec(p->func, countEBP);
486   x86_test(p->func, countEBP, countEBP);
487   x86_jcc(p->func, cc_NZ, label);
488
489   /* Exit mmx state?
490    */
491   if (p->func->need_emms)
492      mmx_emms(p->func);
493
494   /* Land forward jump here:
495    */
496   x86_fixup_fwd_jump(p->func, fixup);
497
498   /* Pop regs and return
499    */
500
501   x86_pop(p->func, idxEBX);
502   x86_pop(p->func, translateESI);
503   x86_pop(p->func, countEBP);
504   x86_ret(p->func);
505
506   return TRUE;
507}
508
509
510
511
512
513
514
515static void translate_sse_set_buffer( struct translate *translate,
516				unsigned buf,
517				const void *ptr,
518				unsigned stride )
519{
520   struct translate_sse *p = (struct translate_sse *)translate;
521   unsigned i;
522
523   for (i = 0; i < p->translate.key.nr_elements; i++) {
524      if (p->translate.key.element[i].input_buffer == buf) {
525	 p->attrib[i].input_ptr = ((char *)ptr +
526				    p->translate.key.element[i].input_offset);
527	 p->attrib[i].input_stride = stride;
528      }
529   }
530}
531
532
533static void translate_sse_release( struct translate *translate )
534{
535   struct translate_sse *p = (struct translate_sse *)translate;
536
537   x86_release_func( &p->linear_func );
538   x86_release_func( &p->elt_func );
539
540   FREE(p);
541}
542
543static void translate_sse_run_elts( struct translate *translate,
544			      const unsigned *elts,
545			      unsigned count,
546			      void *output_buffer )
547{
548   struct translate_sse *p = (struct translate_sse *)translate;
549
550   p->gen_run_elts( translate,
551		    elts,
552		    count,
553		    output_buffer );
554
555}
556
557static void translate_sse_run( struct translate *translate,
558			 unsigned start,
559			 unsigned count,
560			 void *output_buffer )
561{
562   struct translate_sse *p = (struct translate_sse *)translate;
563
564   p->gen_run( translate,
565	       start,
566	       count,
567	       output_buffer );
568}
569
570
571struct translate *translate_sse2_create( const struct translate_key *key )
572{
573   struct translate_sse *p = CALLOC_STRUCT( translate_sse );
574
575   if (p == NULL)
576      goto fail;
577
578   if (!rtasm_cpu_has_sse() || !rtasm_cpu_has_sse2())
579      goto fail;
580
581
582   p->translate.key = *key;
583   p->translate.release = translate_sse_release;
584   p->translate.set_buffer = translate_sse_set_buffer;
585   p->translate.run_elts = translate_sse_run_elts;
586   p->translate.run = translate_sse_run;
587
588   if (!build_vertex_emit(p, &p->linear_func, TRUE))
589      goto fail;
590
591   if (!build_vertex_emit(p, &p->elt_func, FALSE))
592      goto fail;
593
594   p->gen_run = (run_func)x86_get_func(&p->linear_func);
595   p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func);
596
597   return &p->translate;
598
599 fail:
600   if (p)
601      p->translate.release( &p->translate );
602
603   return NULL;
604}
605
606
607
608#else
609
/* Stub for targets other than 32-bit x86: no SSE translate object can
 * be created, so this always returns NULL.  It has the same name and
 * signature as the x86 implementation so callers build either way.
 */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   (void) key;

   return NULL;
}
614
615#endif
616