1;
2; pII-optimised MMX format converters for HERMES
3; Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
4;   and (c) 1999 Jonathan Matthew (jmatthew@uq.net.au)
5; This source code is licensed under the GNU LGPL
6; 
7; Please refer to the file COPYING.LIB contained in the distribution for
8; licensing conditions		
9;
10; COPYRIGHT NOTICE
11; 
12; This file partly contains code that is (c) Intel Corporation, specifically
13; the mode detection routine, and the converter to 15 bit (8 pixel
14; conversion routine from the mmx programming tutorial pages).
15;
16;
17; These routines aren't exactly pII optimised - it's just that as they
18; are, they're terrible on p5 MMXs, but less so on pIIs.  Someone needs to
19; optimise them for p5 MMXs..
20
21BITS 32
22
23%include "common.inc"
24	
25SDL_FUNC _ConvertMMXpII32_24RGB888
26SDL_FUNC _ConvertMMXpII32_16RGB565
27SDL_FUNC _ConvertMMXpII32_16BGR565
28SDL_FUNC _ConvertMMXpII32_16RGB555
29SDL_FUNC _ConvertMMXpII32_16BGR555
30
31;; Macros for conversion routines
32
; _push_immq_mask imm32
;   Pushes the same 32-bit immediate twice, so the 8 bytes at [esp] hold
;   imm32 in both the low and the high dword.  ESP drops by 8; the space is
;   given back later with CLEANUP_IMMQ_LOADS.
%macro _push_immq_mask 1
        push    dword %1
        push    dword %1
%endmacro

; load_immq mmreg, imm32
;   mmreg = imm32 replicated into both dwords.
;   Leaves 8 bytes on the stack (see CLEANUP_IMMQ_LOADS).
%macro load_immq 2
        _push_immq_mask %2
        movq    %1, [esp]
%endmacro

; pand_immq mmreg, imm32
;   mmreg &= imm32 replicated into both dwords.
;   Leaves 8 bytes on the stack (see CLEANUP_IMMQ_LOADS).
%macro pand_immq 2
        _push_immq_mask %2
        pand    %1, [esp]
%endmacro

; CLEANUP_IMMQ_LOADS(n)
;   Releases the stack space left behind by n uses of load_immq / pand_immq
;   (8 bytes each).  Modifies the flags, since it expands to an ADD.
;   The argument is parenthesised so that an expression such as 1+1 is
;   scaled as a whole instead of only its first term.
%define CLEANUP_IMMQ_LOADS(num) add esp, byte 8 * (num)
50
; Constants below are 32-bit values; load_immq / pand_immq replicate them
; into both dwords of an MMX register.  Source pixels are annotated in the
; routines as A R G B from the most significant byte down.

; 32 -> 24 bit: keep the three low bytes of every pixel
%define mmx32_rgb888_mask 0x00ffffff

; 32 -> 16 bit (5:6:5): top 5 bits of byte 0, top 6 of byte 1, top 5 of byte 2
%define mmx32_rgb565_b 0x000000f8
%define mmx32_rgb565_g 0x0000fc00
%define mmx32_rgb565_r 0x00f80000

; 32 -> 15 bit (5:5:5): top 5 bits of bytes 0 and 2 together, and of byte 1
%define mmx32_rgb555_rb 0x00f800f8
%define mmx32_rgb555_g 0x0000f800
; pmaddwd multipliers, one 16-bit factor per word of the r/b-masked pixel:
;   rgb555: low word * 0x0008, high word * 0x2000
;   bgr555: low word * 0x2000, high word * 0x0008
%define mmx32_rgb555_mul 0x20000008
%define mmx32_bgr555_mul 0x00082000
60
61SECTION .text
62
;-----------------------------------------------------------------------
; _ConvertMMXpII32_24RGB888
;   Packs 32-bit pixels into 24-bit pixels by dropping the top byte of
;   each pixel.  Four pixels (16 bytes in, 12 bytes out) per MMX pass,
;   leftovers one at a time.
; In:    esi = source, edi = destination, ecx = pixel count
; Out:   esi, edi advanced past the converted pixels
; Clobb: al, bl, ecx, edx, mm0-mm4, mm6, mm7, flags
; Note:  no emms is issued here.
;
; Lane comments list bytes from most to least significant; "|" separates
; the two dwords and the digit is the pixel index within the group.
;-----------------------------------------------------------------------
_ConvertMMXpII32_24RGB888:

        load_immq mm6, mmx32_rgb888_mask ; mm6 = 00ffffff : 00ffffff
        CLEANUP_IMMQ_LOADS(1)
        pxor    mm7, mm7                ; mm7 = 0, source of zero dwords for unpacking

        mov     edx, ecx                ; edx = full count, needed again for the tail
        and     ecx, 0fffffffch         ; ecx = count rounded down to a multiple of 4
        jz      .tail                   ; fewer than 4 pixels: skip the MMX pass

.quad:
        movq    mm0, [esi]              ; A1 R1 G1 B1 | A0 R0 G0 B0
        pand    mm0, mm6                ;  0 R1 G1 B1 |  0 R0 G0 B0
        movq    mm1, [esi+8]            ; A3 R3 G3 B3 | A2 R2 G2 B2
        pand    mm1, mm6                ;  0 R3 G3 B3 |  0 R2 G2 B2

        ; output bytes 0..7: pixel 0, pixel 1, then G2 B2
        movq    mm2, mm0
        punpckhdq mm2, mm7              ;  0  0  0  0 |  0 R1 G1 B1
        punpckldq mm0, mm7              ;  0  0  0  0 |  0 R0 G0 B0
        psllq   mm2, 24                 ;  0  0 R1 G1 | B1  0  0  0
        por     mm0, mm2                ;  0  0 R1 G1 | B1 R0 G0 B0

        movq    mm3, mm1
        psllq   mm3, 48                 ; G2 B2  0  0 |  0  0  0  0
        por     mm0, mm3                ; G2 B2 R1 G1 | B1 R0 G0 B0

        ; output bytes 8..11: R2, then pixel 3
        movq    mm4, mm1
        punpckhdq mm4, mm7              ;  0  0  0  0 |  0 R3 G3 B3
        punpckldq mm1, mm7              ;  0  0  0  0 |  0 R2 G2 B2
        psrlq   mm1, 16                 ;  0  0  0  0 |  0  0  0 R2
        psllq   mm4, 8                  ;  0  0  0  0 | R3 G3 B3  0
        por     mm1, mm4                ;  0  0  0  0 | R3 G3 B3 R2

        movq    [edi], mm0              ; 8 bytes
        add     esi, byte 16
        movd    [edi+8], mm1            ; + 4 bytes = 12 bytes for 4 pixels
        add     edi, byte 12
        sub     ecx, byte 4
        jnz     .quad

.tail:
        mov     ecx, edx
        and     ecx, byte 3             ; ecx = leftover pixels (0..3)
        jz      .done

.single:
        ; copy the three low bytes; edx is free again, so dl is scratch
        mov     al, [esi]
        mov     bl, [esi+1]
        mov     dl, [esi+2]
        mov     [edi], al
        mov     [edi+1], bl
        mov     [edi+2], dl
        add     esi, byte 4
        add     edi, byte 3
        dec     ecx
        jnz     .single

.done:
        retn
123
124
125
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16RGB565
;   Converts 32-bit pixels to 16-bit 5:6:5 pixels: source byte 2 supplies
;   bits 15..11, byte 1 bits 10..5 and byte 0 bits 4..0.  Four pixels per
;   MMX pass, leftovers one at a time.
; In:    esi = source, edi = destination, ecx = pixel count
; Out:   esi, edi advanced past the converted pixels
; Clobb: eax, ebx, ecx, edx, mm0-mm7, flags
; Note:  no emms is issued here.
;-----------------------------------------------------------------------
_ConvertMMXpII32_16RGB565:

        ; mm5 / mm6 / mm7 = blue / green / red mask, one copy per dword
        load_immq mm5, mmx32_rgb565_b
        load_immq mm6, mmx32_rgb565_g
        load_immq mm7, mmx32_rgb565_r
        CLEANUP_IMMQ_LOADS(3)

        mov     edx, ecx                ; edx = full count, needed again for the tail
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail

.quad:
        ; pixels 0 and 1: green+blue collected in mm0, red parked in mm3
        movq    mm0, [esi]
        movq    mm1, mm0
        pand    mm0, mm6                ; green, bits 15..10 of each dword
        movq    mm3, mm1
        pand    mm1, mm5                ; blue, bits 7..3
        pand    mm3, mm7                ; red, bits 23..19
        pslld   mm1, 2                  ; blue -> bits 9..5, directly below green
        por     mm0, mm1
        psrld   mm0, 5                  ; green -> bits 10..5, blue -> bits 4..0

        ; pixels 2 and 3: green+blue collected in mm4, red parked in mm1
        movq    mm4, [esi+8]
        movq    mm2, mm4
        pand    mm4, mm6
        movq    mm1, mm2
        pand    mm2, mm5
        pand    mm1, mm7
        pslld   mm2, 2
        por     mm4, mm2
        psrld   mm4, 5

        ; Each masked red dword is the word pair (00F8h-masked red, 0000h).
        ; Packing words to bytes puts red in the high byte of a 16-bit slot,
        ; i.e. in bits 15..11 of the matching output pixel.
        packuswb mm3, mm1
        packssdw mm0, mm4               ; green+blue: four dwords -> four words
        por     mm0, mm3
        movq    [edi], mm0              ; four finished pixels

        add     esi, byte 16
        add     edi, byte 8
        dec     ecx
        jnz     .quad

.tail:
        mov     ecx, edx
        and     ecx, byte 3             ; ecx = leftover pixels (0..3)
        jz      .done

.single:
        mov     al, [esi]               ; byte 0 (blue)
        mov     bh, [esi+1]             ; byte 1 (green) into bits 15..8 of ebx
        mov     ah, [esi+2]             ; byte 2 (red)
        shr     al, 3
        and     eax, 0F81Fh             ; red in bits 15..11, blue in bits 4..0
        shr     ebx, 5
        and     ebx, 07E0h              ; green in bits 10..5; other ebx bits are discarded
        add     eax, ebx
        mov     [edi], al
        mov     [edi+1], ah
        add     esi, byte 4
        add     edi, byte 2
        dec     ecx
        jnz     .single

.done:
        retn
192
193	
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR565
;   Converts 32-bit pixels to 16-bit 5:6:5 pixels with the outer channels
;   swapped: source byte 0 supplies bits 15..11, byte 1 bits 10..5 and
;   byte 2 bits 4..0.  Four pixels per MMX pass, leftovers one at a time.
; In:    esi = source, edi = destination, ecx = pixel count
; Out:   esi, edi advanced past the converted pixels
; Clobb: eax, ebx, ecx, edx, mm0-mm7, flags
; Note:  no emms is issued here.
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR565:

        ; mm5 / mm6 / mm7 = red / green / blue mask, one copy per dword
        load_immq mm5, mmx32_rgb565_r
        load_immq mm6, mmx32_rgb565_g
        load_immq mm7, mmx32_rgb565_b
        CLEANUP_IMMQ_LOADS(3)

        mov     edx, ecx                ; edx = full count, needed again for the tail
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail

.quad:
        ; pixels 0 and 1: green+red collected in mm0, blue parked in mm3
        movq    mm0, [esi]
        movq    mm1, mm0
        pand    mm0, mm6                ; green, bits 15..10 of each dword
        movq    mm3, mm1
        pand    mm1, mm5                ; red, bits 23..19
        pand    mm3, mm7                ; blue, bits 7..3

        psllq   mm3, 16                 ; blue -> bits 23..19 (stays inside its dword)
        psrld   mm1, 14                 ; red -> bits 9..5, directly below green
        por     mm0, mm1
        psrld   mm0, 5                  ; green -> bits 10..5, red -> bits 4..0

        ; pixels 2 and 3: green+red collected in mm4, blue parked in mm1
        movq    mm4, [esi+8]
        movq    mm2, mm4
        pand    mm4, mm6
        movq    mm1, mm2
        pand    mm2, mm5
        pand    mm1, mm7

        psllq   mm1, 16
        psrld   mm2, 14
        por     mm4, mm2
        psrld   mm4, 5

        ; Packing words to bytes puts each shifted blue value in the high
        ; byte of a 16-bit slot, i.e. in bits 15..11 of the output pixel.
        packuswb mm3, mm1
        packssdw mm0, mm4               ; green+red: four dwords -> four words
        por     mm0, mm3
        movq    [edi], mm0              ; four finished pixels

        add     esi, byte 16
        add     edi, byte 8
        dec     ecx
        jnz     .quad

.tail:
        and     edx, byte 3             ; edx = leftover pixels (0..3), counted in place
        jz      .done

.single:
        mov     al, [esi+2]             ; byte 2 (red)
        mov     bh, [esi+1]             ; byte 1 (green) into bits 15..8 of ebx
        mov     ah, [esi]               ; byte 0 (blue)
        shr     al, 3
        and     eax, 0F81Fh             ; blue in bits 15..11, red in bits 4..0
        shr     ebx, 5
        and     ebx, 07E0h              ; green in bits 10..5; other ebx bits are discarded
        add     eax, ebx
        mov     [edi], al
        mov     [edi+1], ah
        add     esi, byte 4
        add     edi, byte 2
        dec     edx
        jnz     .single

.done:
        retn
262
;-----------------------------------------------------------------------
; _ConvertMMXpII32_16BGR555 / _ConvertMMXpII32_16RGB555
;   Convert 32-bit pixels to 15-bit 5:5:5 pixels (2 bytes each).
; In:    esi = source, edi = destination, ecx = pixel count
; Out:   esi, edi advanced past the converted pixels
; Clobb: eax, ecx, edx, mm0-mm3, mm5-mm7, flags
; Note:  no emms is issued here.
;
; The two entry points share one body and differ only in the pmaddwd
; multiplier they place in mm7.  Per 32-bit pixel the body computes
;
;       (pmaddwd(pixel & rb_mask, mul) | (pixel & g_mask)) >> 6
;
; pmaddwd multiplies the two 16-bit halves of the masked pixel by the two
; halves of the multiplier and adds the products, so the multiplier alone
; decides which of byte 0 / byte 2 lands in bits 14..10 and which in bits
; 4..0 of the result:
;       rgb555_mul: byte 2 -> bits 14..10, byte 0 -> bits 4..0
;       bgr555_mul: byte 0 -> bits 14..10, byte 2 -> bits 4..0
; Byte 1 always supplies bits 9..5.  The multiply-add trick is the one
; credited to Intel in the copyright notice at the top of this file.
;
; Leftover pixels go through exactly the same mm7-driven formula as the
; main loop, so both entry points produce their own channel order for
; every pixel, whatever the count.  Each pass loads only the pixels it
; converts; nothing beyond the last source pixel is read.
;-----------------------------------------------------------------------
_ConvertMMXpII32_16BGR555:

        load_immq mm7, mmx32_bgr555_mul
        jmp     _convert_bgr555_cheat

_ConvertMMXpII32_16RGB555:

        load_immq mm7, mmx32_rgb555_mul
_convert_bgr555_cheat:
        ; mm7 = multiplier (set by the entry point), mm6 = green mask,
        ; mm5 = red/blue mask; all stay constant for the whole call.
        load_immq mm6, mmx32_rgb555_g
        load_immq mm5, mmx32_rgb555_rb
        CLEANUP_IMMQ_LOADS(3)           ; releases the mm7, mm6 and mm5 loads

        mov     edx, ecx                ; edx = full count, needed again for the tail
        shr     ecx, 2                  ; ecx = number of 4-pixel groups
        jz      .tail

.quad:
        movq    mm0, [esi]              ; pixels 0 and 1
        movq    mm2, [esi+8]            ; pixels 2 and 3
        movq    mm1, mm0
        movq    mm3, mm2

        pand    mm1, mm5                ; bytes 0 and 2, top 5 bits each
        pand    mm3, mm5
        pmaddwd mm1, mm7                ; one channel -> bits 20..16, the other -> bits 10..6
        pmaddwd mm3, mm7

        pand    mm0, mm6                ; byte 1, top 5 bits = bits 15..11
        pand    mm2, mm6
        por     mm1, mm0
        por     mm3, mm2

        psrld   mm1, 6                  ; 15-bit pixel in the low word of each dword
        psrld   mm3, 6
        packssdw mm1, mm3               ; four dwords -> four words (values fit in 15 bits)
        movq    [edi], mm1              ; four finished pixels

        add     esi, byte 16
        add     edi, byte 8
        dec     ecx
        jnz     .quad

.tail:
        and     edx, byte 3             ; edx = leftover pixels (0..3)
        jz      .done

.single:
        movd    mm0, [esi]              ; one pixel, upper dword cleared
        movq    mm1, mm0
        pand    mm1, mm5
        pmaddwd mm1, mm7                ; same multiplier as the main loop
        pand    mm0, mm6
        por     mm1, mm0
        psrld   mm1, 6
        movd    eax, mm1
        mov     [edi], ax

        add     esi, byte 4
        add     edi, byte 2
        dec     edx
        jnz     .single

.done:
        retn
402
; For elf32 output only: emit an empty .note.GNU-stack section, declared
; non-allocated, non-executable and non-writable.
%ifidn __OUTPUT_FORMAT__, elf32
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
406