mmx_blend.S revision d60bb2fbc8b61e9748ce9c235acd4e870a2df613
/*
 * Written by José Fonseca <j_r_fonseca@yahoo.co.uk>
 */
4
5#include "matypes.h"
6
/*
 * make the following approximation to the division (Sree)
 *
 *   rgb*a/255 ~= (rgb*(a+1)) >> 8
 *
 * which is the fastest method that satisfies the following OpenGL criteria
 *
 *   0*0 = 0 and 255*255 = 255
 *
 * note this one should be used alone
 */
18#define GMBT_ALPHA_PLUS_ONE	0
19
/*
 * take the geometric series approximation to the division
 *
 *   t/255 = (t >> 8) + (t >> 16) + (t >> 24) ..
 *
 * in this case just the first two terms to fit in 16bit arithmetic
 *
 *   t/255 ~= (t + (t >> 8)) >> 8
 *
 * note that just by itself it doesn't satisfy the OpenGL criteria, as 255*255 = 254,
 * so the special case a = 255 must be accounted for or roundoff must be used
 */
32#define GMBT_GEOMETRIC_SERIES	1
33
34/*
35 * when using a geometric series division instead of truncating the result
36 * use roundoff in the approximation (Jim Blinn)
37 *
38 *   t = rgb*a + 0x80
39 *
40 * achieving the exact results
41 */
42#define GMBT_ROUNDOFF		0
43
/* instead of the roundoff this adds a small correction to satisfy the OpenGL criteria
 *
 *   t/255 ~= (t + (t >> 8) + (t >> 15)) >> 8
 *
 * note that although this is faster than rounding off, it doesn't always give the exact results
 */
50#define GMBT_GEOMETRIC_CORRECTION	1
51
/*
 * do
 *
 *   s = (p - q)*a + q
 *
 * instead of
 *
 *   s = p*a + q*(1-a)
 *
 * this eliminates a multiply at the expense of
 * complicating the roundoff but is generally worth it
 */
64#define GMBT_SIGNED_ARITHMETIC	1
65
#if GMBT_ROUNDOFF
    SEG_DATA

/*
 * 0x0080 in each of the four 16-bit lanes: the rounding bias added to t
 * before the division by 255.  Only assembled when GMBT_ROUNDOFF is set.
 */
ALIGNDATA8
const_80:
	D_LONG 0x00800080, 0x00800080
#endif

    SEG_TEXT

ALIGNTEXT16
GLOBL GLNAME(_mesa_mmx_blend_transparency)

/*
 * void blend_transparency( GLcontext *ctx,
 *                          GLuint n,
 *                          const GLubyte mask[],
 *                          GLchan rgba[][4],
 *                          CONST GLchan dest[][4] )
 *
 * Common transparency blending mode.
 *
 * For every pixel each 8-bit channel is computed as
 *
 *   s = p*pa/255 + q*(255 - pa)/255
 *
 * where p is the rgba[] pixel, q the dest[] pixel and pa the alpha channel
 * of p.  The result is written back in place to rgba[]; dest[] is only read.
 * How the division by 255 is approximated is selected by the GMBT_* macros.
 *
 * Arguments are read from the stack relative to EBP:
 *
 *    8(EBP)  ctx    (never read by this routine)
 *   12(EBP)  n
 *   16(EBP)  mask
 *   20(EBP)  rgba
 *   24(EBP)  dest
 *
 * Register usage:
 *
 *   ECX  pixels left to process
 *   EBX  current mask pointer
 *   EDI  current rgba pointer (source p and destination s)
 *   ESI  current dest pointer (source q)
 *   MM0-MM7 are used as scratch; EMMS is executed before returning
 *   whenever the n != 0 path has been taken.
 *
 * EBP, ESI, EDI and EBX are saved on entry and restored on exit.
 *
 * Structure:
 *
 *   1. if rgba is not 8-byte aligned, process one pixel to align it
 *   2. process pixels two at a time with 64-bit loads/stores
 *   3. process the odd trailing pixel, if any
 *
 * NOTE: the two-pixel loop tests the mask one pair at a time.  A pair is
 * skipped only when both mask bytes are zero; when either one is non-zero
 * both pixels are blended and stored.  The single-pixel paths honour the
 * mask byte exactly.
 */
GLNAME( _mesa_mmx_blend_transparency ):

    PUSH_L     ( EBP )
    MOV_L      ( ESP, EBP )
    PUSH_L     ( ESI )
    PUSH_L     ( EDI )
    PUSH_L     ( EBX )

    MOV_L      ( REGOFF(12, EBP), ECX )		/* n */
    CMP_L      ( CONST(0), ECX)
    JE         ( LLBL (GMBT_return) )		/* nothing to do; no MMX state touched, so skip EMMS */

    MOV_L      ( REGOFF(16, EBP), EBX )		/* mask */
    MOV_L      ( REGOFF(20, EBP), EDI )         /* rgba */
    MOV_L      ( REGOFF(24, EBP), ESI )         /* dest */

    /*
     * 1. Alignment prologue: one pixel, if needed.
     */

    TEST_L     ( CONST(4), EDI )		/* align rgba on an 8-byte boundary */
    JZ         ( LLBL (GMBT_align_end) )

    CMP_B      ( CONST(0), REGIND(EBX) )	/* *mask == 0 */
    JE         ( LLBL (GMBT_align_continue) )

    PXOR       ( MM0, MM0 )			/*   0x0000  |   0x0000  |   0x0000  |   0x0000  */

    MOVD       ( REGIND(ESI), MM1 )		/*     |     |     |     | qa1 | qb1 | qg1 | qr1 */
    MOVD       ( REGIND(EDI), MM2 )		/*     |     |     |     | pa1 | pb1 | pg1 | pr1 */

    PUNPCKLBW  ( MM0, MM1 )			/*    qa1    |    qb1    |    qg1    |    qr1    */
    PUNPCKLBW  ( MM0, MM2 )			/*    pa1    |    pb1    |    pg1    |    pr1    */

    MOVQ       ( MM2, MM3 )

    PUNPCKHWD  ( MM3, MM3 )			/*    pa1    |    pa1    |           |           */
    PUNPCKHDQ  ( MM3, MM3 )                     /*    pa1    |    pa1    |    pa1    |    pa1    */

#if GMBT_ALPHA_PLUS_ONE
    PCMPEQW    ( MM4, MM4 )			/*   0xffff  |   0xffff  |   0xffff  |   0xffff  */

    PSUBW      ( MM4, MM3 )                     /*   pa1 + 1 |   pa1 + 1 |   pa1 + 1 |   pa1 + 1 */
#endif

#if GMBT_SIGNED_ARITHMETIC
    PSUBW      ( MM1, MM2 )                     /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */

    PSLLW      ( CONST(8), MM1 )		/*                    q1 << 8                    */

#if GMBT_ROUNDOFF
    MOVQ       ( MM2, MM4 )			/* keep p1 - q1: its sign bit drives the correction */
#endif

    PMULLW     ( MM3, MM2 )			/*              t1 = (p1 - q1)*pa1               */

#if GMBT_ROUNDOFF
    PSRLW      ( CONST(15), MM4 )		/*                 q1 > p1 ? 1 : 0               */

    PSLLW      ( CONST(8), MM4 )		/*             q1 > p1 ? 0x100 : 0               */

    PSUBW      ( MM4, MM2 )                     /*                  t1 -=? 0x100                 */
#endif

#else
    PCMPEQW    ( MM4, MM4 )			/*   0xffff  |   0xffff  |   0xffff  |   0xffff  */
    PUNPCKLBW  ( MM0, MM4 )			/*   0x00ff  |   0x00ff  |   0x00ff  |   0x00ff  */
    MOVQ       ( MM4, MM0 )

    PMULLW     ( MM3, MM2 )			/*                     p1*pa1                    */

    PSUBW      ( MM3, MM0 )			/* 255 - pa1 | 255 - pa1 | 255 - pa1 | 255 - pa1 */

    PMULLW     ( MM0, MM1 )			/*                  q1*(255 - pa1)               */

    PADDW      ( MM1, MM2 )			/*           t1 = p1*pa1 + q1*(255 - pa1)        */
#endif

#if GMBT_ROUNDOFF
    MOVQ       ( CONTENT(const_80), MM4 )

    PADDW      ( MM4, MM2 )                     /*                 t1 += 0x80                    */
#endif

#if GMBT_GEOMETRIC_SERIES
    MOVQ       ( MM2, MM3 )

    PSRLW      ( CONST(8), MM3 )		/*                    t1 >> 8                    */

    PADDW      ( MM3, MM2 )			/*        t1 + (t1 >> 8) ~= (t1/255) << 8        */

#if GMBT_GEOMETRIC_CORRECTION
    PSRLW      ( CONST(7), MM3 )		/*                    t1 >> 15                   */

    PADDW      ( MM3, MM2 )			/*  t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8  */
#endif
#endif

#if GMBT_SIGNED_ARITHMETIC
    PADDW      ( MM1, MM2 )			/*              (t1/255 + q1) << 8               */
#endif

    PSRLW      ( CONST(8), MM2 )		/*    sa1    |    sb1    |    sg1    |    sr1    */

    PACKUSWB   ( MM0, MM2 )			/*     |     |     |     | sa1 | sb1 | sg1 | sr1 */
    MOVD       ( MM2, REGIND(EDI) )		/* only the low 32 bits are stored */

LLBL (GMBT_align_continue):

    DEC_L      ( ECX )				/* n -= 1 */
    INC_L      ( EBX )		                /* mask += 1 */
    ADD_L      ( CONST(4), EDI )		/* rgba += 1 */
    ADD_L      ( CONST(4), ESI )		/* dest += 1 */

LLBL (GMBT_align_end):

    /*
     * 2. Main loop: two pixels per iteration.
     */

    CMP_L      ( CONST(2), ECX)
    JB         ( LLBL (GMBT_loop_end) )

ALIGNTEXT16
LLBL (GMBT_loop_begin):

    CMP_W      ( CONST(0), REGIND(EBX) )	/* *mask == 0 && *(mask + 1) == 0 */
    JE         ( LLBL (GMBT_loop_continue) )

    /* NOTE: the instruction pairing when multiple pipelines are available must be checked */

    PXOR       ( MM0, MM0 )			/*   0x0000  |   0x0000  |   0x0000  |   0x0000  */

    MOVQ       ( REGIND(ESI), MM7 )		/* qa2 | qb2 | qg2 | qr2 | qa1 | qb1 | qg1 | qr1 */
    MOVQ       ( REGIND(EDI), MM6 )		/* pa2 | pb2 | pg2 | pr2 | pa1 | pb1 | pg1 | pr1 */

    MOVQ       ( MM7, MM1 )
    MOVQ       ( MM6, MM2 )

    PUNPCKLBW  ( MM0, MM1 )			/*    qa1    |    qb1    |    qg1    |    qr1    */
    PUNPCKHBW  ( MM0, MM7 )                     /*    qa2    |    qb2    |    qg2    |    qr2    */
    PUNPCKLBW  ( MM0, MM2 )			/*    pa1    |    pb1    |    pg1    |    pr1    */
    PUNPCKHBW  ( MM0, MM6 )                     /*    pa2    |    pb2    |    pg2    |    pr2    */

    MOVQ       ( MM2, MM3 )
    MOVQ       ( MM6, MM5 )

    PUNPCKHWD  ( MM3, MM3 )			/*    pa1    |    pa1    |           |           */
    PUNPCKHWD  ( MM5, MM5 )			/*    pa2    |    pa2    |           |           */
    PUNPCKHDQ  ( MM3, MM3 )                     /*    pa1    |    pa1    |    pa1    |    pa1    */
    PUNPCKHDQ  ( MM5, MM5 )                     /*    pa2    |    pa2    |    pa2    |    pa2    */

#if GMBT_ALPHA_PLUS_ONE
    PCMPEQW    ( MM4, MM4 )			/*   0xffff  |   0xffff  |   0xffff  |   0xffff  */

    PSUBW      ( MM4, MM3 )                     /*   pa1 + 1 |   pa1 + 1 |   pa1 + 1 |   pa1 + 1 */
    PSUBW      ( MM4, MM5 )                     /*   pa2 + 1 |   pa2 + 1 |   pa2 + 1 |   pa2 + 1 */
#endif

#if GMBT_SIGNED_ARITHMETIC
    PSUBW      ( MM1, MM2 )                     /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */
    PSUBW      ( MM7, MM6 )                     /* pa2 - qa2 | pb2 - qb2 | pg2 - qg2 | pr2 - qr2 */

    PSLLW      ( CONST(8), MM1 )		/*                    q1 << 8                    */
    PSLLW      ( CONST(8), MM7 )		/*                    q2 << 8                    */

#if GMBT_ROUNDOFF
    MOVQ       ( MM2, MM0 )			/* keep p1 - q1 (MM0 is no longer needed as zero) */
    MOVQ       ( MM6, MM4 )			/* keep p2 - q2 */
#endif

    PMULLW     ( MM3, MM2 )			/*              t1 = (p1 - q1)*pa1               */
    PMULLW     ( MM5, MM6 )			/*              t2 = (p2 - q2)*pa2               */

#if GMBT_ROUNDOFF
    PSRLW      ( CONST(15), MM0 )		/*                 q1 > p1 ? 1 : 0               */
    PSRLW      ( CONST(15), MM4 )		/*                 q2 > p2 ? 1 : 0               */

    PSLLW      ( CONST(8), MM0 )		/*             q1 > p1 ? 0x100 : 0               */
    PSLLW      ( CONST(8), MM4 )		/*             q2 > p2 ? 0x100 : 0               */

    /* the correction belongs to t (MM2/MM6), not to q << 8 (MM1/MM7) */
    PSUBW      ( MM0, MM2 )                     /*                  t1 -=? 0x100                 */
    PSUBW      ( MM4, MM6 )                     /*                  t2 -=? 0x100                 */
#endif

#else
    PCMPEQW    ( MM4, MM4 )			/*   0xffff  |   0xffff  |   0xffff  |   0xffff  */
    PUNPCKLBW  ( MM0, MM4 )			/*   0x00ff  |   0x00ff  |   0x00ff  |   0x00ff  */
    MOVQ       ( MM4, MM0 )

    PMULLW     ( MM3, MM2 )			/*                     p1*pa1                    */
    PMULLW     ( MM5, MM6 )			/*                     p2*pa2                    */

    PSUBW      ( MM3, MM0 )			/* 255 - pa1 | 255 - pa1 | 255 - pa1 | 255 - pa1 */
    PSUBW      ( MM5, MM4 )			/* 255 - pa2 | 255 - pa2 | 255 - pa2 | 255 - pa2 */

    PMULLW     ( MM0, MM1 )			/*                  q1*(255 - pa1)               */
    PMULLW     ( MM4, MM7 )			/*                  q2*(255 - pa2)               */

    PADDW      ( MM1, MM2 )			/*           t1 = p1*pa1 + q1*(255 - pa1)        */
    PADDW      ( MM7, MM6 )			/*           t2 = p2*pa2 + q2*(255 - pa2)        */
#endif

#if GMBT_ROUNDOFF
    MOVQ       ( CONTENT(const_80), MM4 )

    PADDW      ( MM4, MM2 )                     /*                 t1 += 0x80                    */
    PADDW      ( MM4, MM6 )                     /*                 t2 += 0x80                    */
#endif

#if GMBT_GEOMETRIC_SERIES
    MOVQ       ( MM2, MM3 )
    MOVQ       ( MM6, MM5 )

    PSRLW      ( CONST(8), MM3 )		/*                    t1 >> 8                    */
    PSRLW      ( CONST(8), MM5 )		/*                    t2 >> 8                    */

    PADDW      ( MM3, MM2 )			/*        t1 + (t1 >> 8) ~= (t1/255) << 8        */
    PADDW      ( MM5, MM6 )			/*        t2 + (t2 >> 8) ~= (t2/255) << 8        */

#if GMBT_GEOMETRIC_CORRECTION
    PSRLW      ( CONST(7), MM3 )		/*                    t1 >> 15                   */
    PSRLW      ( CONST(7), MM5 )		/*                    t2 >> 15                   */

    PADDW      ( MM3, MM2 )			/*  t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8  */
    PADDW      ( MM5, MM6 )			/*  t2 + (t2 >> 8) + (t2 >>15) ~= (t2/255) << 8  */
#endif
#endif

#if GMBT_SIGNED_ARITHMETIC
    PADDW      ( MM1, MM2 )			/*              (t1/255 + q1) << 8               */
    PADDW      ( MM7, MM6 )			/*              (t2/255 + q2) << 8               */
#endif

    PSRLW      ( CONST(8), MM2 )		/*    sa1    |    sb1    |    sg1    |    sr1    */
    PSRLW      ( CONST(8), MM6 )		/*    sa2    |    sb2    |    sg2    |    sr2    */

    PACKUSWB   ( MM6, MM2 )			/* sa2 | sb2 | sg2 | sr2 | sa1 | sb1 | sg1 | sr1 */
    MOVQ       ( MM2, REGIND(EDI) )

LLBL (GMBT_loop_continue):

    DEC_L      ( ECX )
    DEC_L      ( ECX )				/* n -= 2 */
    ADD_L      ( CONST(2), EBX )		/* mask += 2 */
    ADD_L      ( CONST(8), EDI )		/* rgba += 2 */
    ADD_L      ( CONST(8), ESI )		/* dest += 2 */
    CMP_L      ( CONST(2), ECX )
    JAE        ( LLBL (GMBT_loop_begin) )

LLBL (GMBT_loop_end):

    /*
     * 3. Tail: the last pixel when an odd one is left over.
     */

    CMP_L      ( CONST(1), ECX )
    JB         ( LLBL (GMBT_done) )

    CMP_B      ( CONST(0), REGIND(EBX) )	/* *mask == 0 */
    JE         ( LLBL (GMBT_done) )

    PXOR       ( MM0, MM0 )			/*   0x0000  |   0x0000  |   0x0000  |   0x0000  */

    MOVD       ( REGIND(ESI), MM1 )		/*     |     |     |     | qa1 | qb1 | qg1 | qr1 */
    MOVD       ( REGIND(EDI), MM2 )		/*     |     |     |     | pa1 | pb1 | pg1 | pr1 */

    PUNPCKLBW  ( MM0, MM1 )			/*    qa1    |    qb1    |    qg1    |    qr1    */
    PUNPCKLBW  ( MM0, MM2 )			/*    pa1    |    pb1    |    pg1    |    pr1    */

    MOVQ       ( MM2, MM3 )

    PUNPCKHWD  ( MM3, MM3 )			/*    pa1    |    pa1    |           |           */
    PUNPCKHDQ  ( MM3, MM3 )                     /*    pa1    |    pa1    |    pa1    |    pa1    */

#if GMBT_ALPHA_PLUS_ONE
    PCMPEQW    ( MM4, MM4 )			/*   0xffff  |   0xffff  |   0xffff  |   0xffff  */

    PSUBW      ( MM4, MM3 )                     /*   pa1 + 1 |   pa1 + 1 |   pa1 + 1 |   pa1 + 1 */
#endif

#if GMBT_SIGNED_ARITHMETIC
    PSUBW      ( MM1, MM2 )                     /* pa1 - qa1 | pb1 - qb1 | pg1 - qg1 | pr1 - qr1 */

    PSLLW      ( CONST(8), MM1 )		/*                    q1 << 8                    */

#if GMBT_ROUNDOFF
    MOVQ       ( MM2, MM4 )			/* keep p1 - q1: its sign bit drives the correction */
#endif

    PMULLW     ( MM3, MM2 )			/*              t1 = (p1 - q1)*pa1               */

#if GMBT_ROUNDOFF
    PSRLW      ( CONST(15), MM4 )		/*                 q1 > p1 ? 1 : 0               */

    PSLLW      ( CONST(8), MM4 )		/*             q1 > p1 ? 0x100 : 0               */

    PSUBW      ( MM4, MM2 )                     /*                  t1 -=? 0x100                 */
#endif

#else
    PCMPEQW    ( MM4, MM4 )			/*   0xffff  |   0xffff  |   0xffff  |   0xffff  */
    PUNPCKLBW  ( MM0, MM4 )			/*   0x00ff  |   0x00ff  |   0x00ff  |   0x00ff  */
    MOVQ       ( MM4, MM0 )

    PMULLW     ( MM3, MM2 )			/*                     p1*pa1                    */

    PSUBW      ( MM3, MM0 )			/* 255 - pa1 | 255 - pa1 | 255 - pa1 | 255 - pa1 */

    PMULLW     ( MM0, MM1 )			/*                  q1*(255 - pa1)               */

    PADDW      ( MM1, MM2 )			/*           t1 = p1*pa1 + q1*(255 - pa1)        */
#endif

#if GMBT_ROUNDOFF
    MOVQ       ( CONTENT(const_80), MM4 )

    PADDW      ( MM4, MM2 )                     /*                 t1 += 0x80                    */
#endif

#if GMBT_GEOMETRIC_SERIES
    MOVQ       ( MM2, MM3 )

    PSRLW      ( CONST(8), MM3 )		/*                    t1 >> 8                    */

    PADDW      ( MM3, MM2 )			/*        t1 + (t1 >> 8) ~= (t1/255) << 8        */

#if GMBT_GEOMETRIC_CORRECTION
    PSRLW      ( CONST(7), MM3 )		/*                    t1 >> 15                   */

    PADDW      ( MM3, MM2 )			/*  t1 + (t1 >> 8) + (t1 >>15) ~= (t1/255) << 8  */
#endif
#endif

#if GMBT_SIGNED_ARITHMETIC
    PADDW      ( MM1, MM2 )			/*              (t1/255 + q1) << 8               */
#endif

    PSRLW      ( CONST(8), MM2 )		/*    sa1    |    sb1    |    sg1    |    sr1    */

    PACKUSWB   ( MM0, MM2 )			/*     |     |     |     | sa1 | sb1 | sg1 | sr1 */
    MOVD       ( MM2, REGIND(EDI) )		/* only the low 32 bits are stored */

LLBL (GMBT_done):

    EMMS					/* leave MMX state so the FPU is usable again */

LLBL (GMBT_return):

    POP_L      ( EBX )
    POP_L      ( EDI )
    POP_L      ( ESI )
    MOV_L      ( EBP, ESP )
    POP_L      ( EBP )
    RET
431