/*
 *    Stack-less Just-In-Time compiler
 *
 *    Copyright 2009-2012 Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are
 * permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright notice, this list of
 *      conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright notice, this list
 *      of conditions and the following disclaimer in the documentation and/or other materials
 *      provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
 * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

SLJIT_API_FUNC_ATTRIBUTE SLJIT_CONST char* sljit_get_platform_name(void)
{
	return "x86" SLJIT_CPUINFO;
}

/*
   32b register indexes:
     0 - EAX
     1 - ECX
     2 - EDX
     3 - EBX
     4 - none
     5 - EBP
     6 - ESI
     7 - EDI
*/

/*
   64b register indexes:
     0 - RAX
     1 - RCX
     2 - RDX
     3 - RBX
     4 - none
     5 - RBP
     6 - RSI
     7 - RDI
     8 - R8   - From now on REX prefix is required
     9 - R9
    10 - R10
    11 - R11
    12 - R12
    13 - R13
    14 - R14
    15 - R15
*/
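
/* Illustrative encoding note (not used by the compiler): "mov rax, r8" needs
   a REX prefix both for the 64 bit operand size and for the extended source
   register: REX.W|REX.B (0x49), MOV_r_rm (0x8b), then ModRM 0xc0 where
   mod=11 (register), reg=000 (rax) and rm=000 (r8 & 0x7). Without a REX
   prefix only the first eight registers (and 32 bit operands) are
   reachable. */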

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)

/* Last register + 1. */
#define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)

static SLJIT_CONST sljit_ub reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = {
	0, 0, 2, 1, 0, 0, 0, 0, 7, 6, 3, 4, 5
};

#define CHECK_EXTRA_REGS(p, w, do) \
	if (p >= SLJIT_R3 && p <= SLJIT_R6) { \
		w = FIXED_LOCALS_OFFSET + ((p) - (SLJIT_R3 + 4)) * sizeof(sljit_sw); \
		p = SLJIT_MEM1(SLJIT_SP); \
		do; \
	}

#else /* SLJIT_CONFIG_X86_32 */

/* Last register + 1. */
#define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
#define TMP_REG2	(SLJIT_NUMBER_OF_REGISTERS + 3)
#define TMP_REG3	(SLJIT_NUMBER_OF_REGISTERS + 4)

/* Note: r12 & 0x7 == 0b100, which is decoded as a SIB byte being present.
   Note: avoid using r12 and r13 for memory addressing;
   therefore r12 is better used for SAVED_EREG than SAVED_REG. */
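
/* A worked example of the note above, assuming standard x86-64 encoding
   rules: in a ModRM byte with a memory operand, rm == 0b100 means "a SIB
   byte follows" and mod == 0b00 with rm == 0b101 means RIP-relative
   addressing. Since r12 & 0x7 == 0b100 and r13 & 0x7 == 0b101, [r12] needs
   an extra SIB byte and [r13] needs a forced zero displacement, so both are
   more expensive as base registers than as plain value holders. */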
#ifndef _WIN64
/* 1st argument passed in rdi, 2nd in rsi, 3rd in rdx. */
static SLJIT_CONST sljit_ub reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
	0, 0, 6, 1, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 7, 9
};
/* low-map. reg_map & 0x7. */
static SLJIT_CONST sljit_ub reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
	0, 0, 6, 1, 0, 3,  2,  4,  5,  5,  6,  7, 3, 4, 2, 7, 1
};
#else
/* 1st argument passed in rcx, 2nd in rdx, 3rd in r8. */
static SLJIT_CONST sljit_ub reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
	0, 0, 2, 1, 11, 12, 5, 13, 14, 15, 7, 6, 3, 4, 10, 8, 9
};
/* low-map. reg_map & 0x7. */
static SLJIT_CONST sljit_ub reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
	0, 0, 2, 1, 3,  4,  5,  5, 6,  7,  7, 6, 3, 4, 2,  0, 1
};
#endif

#define REX_W		0x48
#define REX_R		0x44
#define REX_X		0x42
#define REX_B		0x41
#define REX		0x40

#ifndef _WIN64
#define HALFWORD_MAX 0x7fffffffl
#define HALFWORD_MIN -0x80000000l
#else
#define HALFWORD_MAX 0x7fffffffll
#define HALFWORD_MIN -0x80000000ll
#endif

#define IS_HALFWORD(x)		((x) <= HALFWORD_MAX && (x) >= HALFWORD_MIN)
#define NOT_HALFWORD(x)		((x) > HALFWORD_MAX || (x) < HALFWORD_MIN)
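
/* Usage sketch for the macros above: an immediate is directly encodable on
   x86-64 only if it fits in a sign-extended 32 bit field, e.g.
   IS_HALFWORD(0x7fffffff) is nonzero while IS_HALFWORD((sljit_sw)1 << 31)
   is zero. The emitters below use NOT_HALFWORD() to route such large
   constants through emit_load_imm64() and a temporary register. */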

#define CHECK_EXTRA_REGS(p, w, do)

#endif /* SLJIT_CONFIG_X86_32 */

#define TMP_FREG	(0)

/* Size flags for emit_x86_instruction: */
#define EX86_BIN_INS		0x0010
#define EX86_SHIFT_INS		0x0020
#define EX86_REX		0x0040
#define EX86_NO_REXW		0x0080
#define EX86_BYTE_ARG		0x0100
#define EX86_HALF_ARG		0x0200
#define EX86_PREF_66		0x0400
#define EX86_PREF_F2		0x0800
#define EX86_PREF_F3		0x1000
#define EX86_SSE2_OP1		0x2000
#define EX86_SSE2_OP2		0x4000
#define EX86_SSE2		(EX86_SSE2_OP1 | EX86_SSE2_OP2)

/* --------------------------------------------------------------------- */
/*  Instruction forms                                                    */
/* --------------------------------------------------------------------- */

#define ADD		(/* BINARY */ 0 << 3)
#define ADD_EAX_i32	0x05
#define ADD_r_rm	0x03
#define ADD_rm_r	0x01
#define ADDSD_x_xm	0x58
#define ADC		(/* BINARY */ 2 << 3)
#define ADC_EAX_i32	0x15
#define ADC_r_rm	0x13
#define ADC_rm_r	0x11
#define AND		(/* BINARY */ 4 << 3)
#define AND_EAX_i32	0x25
#define AND_r_rm	0x23
#define AND_rm_r	0x21
#define ANDPD_x_xm	0x54
#define BSR_r_rm	(/* GROUP_0F */ 0xbd)
#define CALL_i32	0xe8
#define CALL_rm		(/* GROUP_FF */ 2 << 3)
#define CDQ		0x99
#define CMOVNE_r_rm	(/* GROUP_0F */ 0x45)
#define CMP		(/* BINARY */ 7 << 3)
#define CMP_EAX_i32	0x3d
#define CMP_r_rm	0x3b
#define CMP_rm_r	0x39
#define CVTPD2PS_x_xm	0x5a
#define CVTSI2SD_x_rm	0x2a
#define CVTTSD2SI_r_xm	0x2c
#define DIV		(/* GROUP_F7 */ 6 << 3)
#define DIVSD_x_xm	0x5e
#define INT3		0xcc
#define IDIV		(/* GROUP_F7 */ 7 << 3)
#define IMUL		(/* GROUP_F7 */ 5 << 3)
#define IMUL_r_rm	(/* GROUP_0F */ 0xaf)
#define IMUL_r_rm_i8	0x6b
#define IMUL_r_rm_i32	0x69
#define JE_i8		0x74
#define JMP_i8		0xeb
#define JMP_i32		0xe9
#define JMP_rm		(/* GROUP_FF */ 4 << 3)
#define LEA_r_m		0x8d
#define MOV_r_rm	0x8b
#define MOV_r_i32	0xb8
#define MOV_rm_r	0x89
#define MOV_rm_i32	0xc7
#define MOV_rm8_i8	0xc6
#define MOV_rm8_r8	0x88
#define MOVSD_x_xm	0x10
#define MOVSD_xm_x	0x11
#define MOVSXD_r_rm	0x63
#define MOVSX_r_rm8	(/* GROUP_0F */ 0xbe)
#define MOVSX_r_rm16	(/* GROUP_0F */ 0xbf)
#define MOVZX_r_rm8	(/* GROUP_0F */ 0xb6)
#define MOVZX_r_rm16	(/* GROUP_0F */ 0xb7)
#define MUL		(/* GROUP_F7 */ 4 << 3)
#define MULSD_x_xm	0x59
#define NEG_rm		(/* GROUP_F7 */ 3 << 3)
#define NOP		0x90
#define NOT_rm		(/* GROUP_F7 */ 2 << 3)
#define OR		(/* BINARY */ 1 << 3)
#define OR_r_rm		0x0b
#define OR_EAX_i32	0x0d
#define OR_rm_r		0x09
#define OR_rm8_r8	0x08
#define POP_r		0x58
#define POP_rm		0x8f
#define POPF		0x9d
#define PUSH_i32	0x68
#define PUSH_r		0x50
#define PUSH_rm		(/* GROUP_FF */ 6 << 3)
#define PUSHF		0x9c
#define RET_near	0xc3
#define RET_i16		0xc2
#define SBB		(/* BINARY */ 3 << 3)
#define SBB_EAX_i32	0x1d
#define SBB_r_rm	0x1b
#define SBB_rm_r	0x19
#define SAR		(/* SHIFT */ 7 << 3)
#define SHL		(/* SHIFT */ 4 << 3)
#define SHR		(/* SHIFT */ 5 << 3)
#define SUB		(/* BINARY */ 5 << 3)
#define SUB_EAX_i32	0x2d
#define SUB_r_rm	0x2b
#define SUB_rm_r	0x29
#define SUBSD_x_xm	0x5c
#define TEST_EAX_i32	0xa9
#define TEST_rm_r	0x85
#define UCOMISD_x_xm	0x2e
#define UNPCKLPD_x_xm	0x14
#define XCHG_EAX_r	0x90
#define XCHG_r_rm	0x87
#define XOR		(/* BINARY */ 6 << 3)
#define XOR_EAX_i32	0x35
#define XOR_r_rm	0x33
#define XOR_rm_r	0x31
#define XORPD_x_xm	0x57

#define GROUP_0F	0x0f
#define GROUP_F7	0xf7
#define GROUP_FF	0xff
#define GROUP_BINARY_81	0x81
#define GROUP_BINARY_83	0x83
#define GROUP_SHIFT_1	0xd1
#define GROUP_SHIFT_N	0xc1
#define GROUP_SHIFT_CL	0xd3

#define MOD_REG		0xc0
#define MOD_DISP8	0x40

#define INC_SIZE(s)			(*inst++ = (s), compiler->size += (s))

#define PUSH_REG(r)			(*inst++ = (PUSH_r + (r)))
#define POP_REG(r)			(*inst++ = (POP_r + (r)))
#define RET()				(*inst++ = (RET_near))
#define RET_I16(n)			(*inst++ = (RET_i16), *inst++ = n, *inst++ = 0)
/* r32, r/m32 */
#define MOV_RM(mod, reg, rm)		(*inst++ = (MOV_r_rm), *inst++ = (mod) << 6 | (reg) << 3 | (rm))
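
/* Illustrative expansion, assuming the x86-32 reg_map above:
   MOV_RM(0x3, reg_map[SLJIT_R0], reg_map[SLJIT_R1]) emits the bytes
   0x8b 0xc2, i.e. "mov eax, edx" (mod=11 register direct, reg=000 EAX,
   rm=010 EDX). */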

/* Multithreading does not affect these static variables, since they store
   built-in CPU features. Therefore they can safely be overwritten by
   different threads if they detect the CPU features at the same time. */
#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
static sljit_si cpu_has_sse2 = -1;
#endif
static sljit_si cpu_has_cmov = -1;

#if defined(_MSC_VER) && _MSC_VER >= 1400
#include <intrin.h>
#endif

static void get_cpu_features(void)
{
	sljit_ui features;

#if defined(_MSC_VER) && _MSC_VER >= 1400

	int CPUInfo[4];
	__cpuid(CPUInfo, 1);
	features = (sljit_ui)CPUInfo[3];

#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C)

	/* AT&T syntax. */
	__asm__ (
		"movl $0x1, %%eax\n"
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		/* On x86-32, there is no red zone, so this
		   should work (no need for a local variable). */
		"push %%ebx\n"
#endif
		"cpuid\n"
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		"pop %%ebx\n"
#endif
		"movl %%edx, %0\n"
		: "=g" (features)
		:
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		: "%eax", "%ecx", "%edx"
#else
		: "%rax", "%rbx", "%rcx", "%rdx"
#endif
	);

#else /* _MSC_VER && _MSC_VER >= 1400 */

	/* Intel syntax. */
	__asm {
		mov eax, 1
		cpuid
		mov features, edx
	}

#endif /* _MSC_VER && _MSC_VER >= 1400 */

#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
	cpu_has_sse2 = (features >> 26) & 0x1;
#endif
	cpu_has_cmov = (features >> 15) & 0x1;
}
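
/* The CPUID(EAX=1) feature bits of EDX consulted above are bit 15 (CMOV)
   and bit 26 (SSE2). The cached values use -1 as "not probed yet", so the
   typical lazy consumer pattern elsewhere in this file is:

	if (cpu_has_cmov == -1)
		get_cpu_features();
	if (cpu_has_cmov) {
		... emit cmovne ...
	} else {
		... emit a short conditional branch instead ...
	}
*/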

static sljit_ub get_jump_code(sljit_si type)
{
	switch (type) {
	case SLJIT_C_EQUAL:
	case SLJIT_C_FLOAT_EQUAL:
		return 0x84 /* je */;

	case SLJIT_C_NOT_EQUAL:
	case SLJIT_C_FLOAT_NOT_EQUAL:
		return 0x85 /* jne */;

	case SLJIT_C_LESS:
	case SLJIT_C_FLOAT_LESS:
		return 0x82 /* jc */;

	case SLJIT_C_GREATER_EQUAL:
	case SLJIT_C_FLOAT_GREATER_EQUAL:
		return 0x83 /* jae */;

	case SLJIT_C_GREATER:
	case SLJIT_C_FLOAT_GREATER:
		return 0x87 /* jnbe */;

	case SLJIT_C_LESS_EQUAL:
	case SLJIT_C_FLOAT_LESS_EQUAL:
		return 0x86 /* jbe */;

	case SLJIT_C_SIG_LESS:
		return 0x8c /* jl */;

	case SLJIT_C_SIG_GREATER_EQUAL:
		return 0x8d /* jnl */;

	case SLJIT_C_SIG_GREATER:
		return 0x8f /* jnle */;

	case SLJIT_C_SIG_LESS_EQUAL:
		return 0x8e /* jle */;

	case SLJIT_C_OVERFLOW:
	case SLJIT_C_MUL_OVERFLOW:
		return 0x80 /* jo */;

	case SLJIT_C_NOT_OVERFLOW:
	case SLJIT_C_MUL_NOT_OVERFLOW:
		return 0x81 /* jno */;

	case SLJIT_C_FLOAT_UNORDERED:
		return 0x8a /* jp */;

	case SLJIT_C_FLOAT_ORDERED:
		return 0x8b /* jpo */;
	}
	return 0;
}
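
/* The values returned above are the second opcode byte of the near form
   "0F 8x rel32". The short form "7x rel8" uses the same condition encoding,
   so generate_near_jump_code() below derives it by subtracting 0x10:
   e.g. 0x84 (je rel32) - 0x10 == 0x74 (je rel8). */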

static sljit_ub* generate_far_jump_code(struct sljit_jump *jump, sljit_ub *code_ptr, sljit_si type);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
static sljit_ub* generate_fixed_jump(sljit_ub *code_ptr, sljit_sw addr, sljit_si type);
#endif

static sljit_ub* generate_near_jump_code(struct sljit_jump *jump, sljit_ub *code_ptr, sljit_ub *code, sljit_si type)
{
	sljit_si short_jump;
	sljit_uw label_addr;

	if (jump->flags & JUMP_LABEL)
		label_addr = (sljit_uw)(code + jump->u.label->size);
	else
		label_addr = jump->u.target;
	short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if ((sljit_sw)(label_addr - (jump->addr + 1)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 1)) < HALFWORD_MIN)
		return generate_far_jump_code(jump, code_ptr, type);
#endif

	if (type == SLJIT_JUMP) {
		if (short_jump)
			*code_ptr++ = JMP_i8;
		else
			*code_ptr++ = JMP_i32;
		jump->addr++;
	}
	else if (type >= SLJIT_FAST_CALL) {
		short_jump = 0;
		*code_ptr++ = CALL_i32;
		jump->addr++;
	}
	else if (short_jump) {
		*code_ptr++ = get_jump_code(type) - 0x10;
		jump->addr++;
	}
	else {
		*code_ptr++ = GROUP_0F;
		*code_ptr++ = get_jump_code(type);
		jump->addr += 2;
	}

	if (short_jump) {
		jump->flags |= PATCH_MB;
		code_ptr += sizeof(sljit_sb);
	} else {
		jump->flags |= PATCH_MW;
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		code_ptr += sizeof(sljit_sw);
#else
		code_ptr += sizeof(sljit_si);
#endif
	}

	return code_ptr;
}

SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
{
	struct sljit_memory_fragment *buf;
	sljit_ub *code;
	sljit_ub *code_ptr;
	sljit_ub *buf_ptr;
	sljit_ub *buf_end;
	sljit_ub len;

	struct sljit_label *label;
	struct sljit_jump *jump;
	struct sljit_const *const_;

	CHECK_ERROR_PTR();
	check_sljit_generate_code(compiler);
	reverse_buf(compiler);

	/* Second code generation pass. */
	code = (sljit_ub*)SLJIT_MALLOC_EXEC(compiler->size);
	PTR_FAIL_WITH_EXEC_IF(code);
	buf = compiler->buf;

	code_ptr = code;
	label = compiler->labels;
	jump = compiler->jumps;
	const_ = compiler->consts;
	do {
		buf_ptr = buf->memory;
		buf_end = buf_ptr + buf->used_size;
		do {
			len = *buf_ptr++;
			if (len > 0) {
				/* The code is already generated. */
				SLJIT_MEMMOVE(code_ptr, buf_ptr, len);
				code_ptr += len;
				buf_ptr += len;
			}
			else {
				if (*buf_ptr >= 4) {
					jump->addr = (sljit_uw)code_ptr;
					if (!(jump->flags & SLJIT_REWRITABLE_JUMP))
						code_ptr = generate_near_jump_code(jump, code_ptr, code, *buf_ptr - 4);
					else
						code_ptr = generate_far_jump_code(jump, code_ptr, *buf_ptr - 4);
					jump = jump->next;
				}
				else if (*buf_ptr == 0) {
					label->addr = (sljit_uw)code_ptr;
					label->size = code_ptr - code;
					label = label->next;
				}
				else if (*buf_ptr == 1) {
					const_->addr = ((sljit_uw)code_ptr) - sizeof(sljit_sw);
					const_ = const_->next;
				}
				else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
					*code_ptr++ = (*buf_ptr == 2) ? CALL_i32 : JMP_i32;
					buf_ptr++;
					*(sljit_sw*)code_ptr = *(sljit_sw*)buf_ptr - ((sljit_sw)code_ptr + sizeof(sljit_sw));
					code_ptr += sizeof(sljit_sw);
					buf_ptr += sizeof(sljit_sw) - 1;
#else
					code_ptr = generate_fixed_jump(code_ptr, *(sljit_sw*)(buf_ptr + 1), *buf_ptr);
					buf_ptr += sizeof(sljit_sw);
#endif
				}
				buf_ptr++;
			}
		} while (buf_ptr < buf_end);
		SLJIT_ASSERT(buf_ptr == buf_end);
		buf = buf->next;
	} while (buf);

	SLJIT_ASSERT(!label);
	SLJIT_ASSERT(!jump);
	SLJIT_ASSERT(!const_);

	jump = compiler->jumps;
	while (jump) {
		if (jump->flags & PATCH_MB) {
			SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sb))) >= -128 && (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sb))) <= 127);
			*(sljit_ub*)jump->addr = (sljit_ub)(jump->u.label->addr - (jump->addr + sizeof(sljit_sb)));
		} else if (jump->flags & PATCH_MW) {
			if (jump->flags & JUMP_LABEL) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
				*(sljit_sw*)jump->addr = (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sw)));
#else
				SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_si))) >= HALFWORD_MIN && (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_si))) <= HALFWORD_MAX);
				*(sljit_si*)jump->addr = (sljit_si)(jump->u.label->addr - (jump->addr + sizeof(sljit_si)));
#endif
			}
			else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
				*(sljit_sw*)jump->addr = (sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_sw)));
#else
				SLJIT_ASSERT((sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_si))) >= HALFWORD_MIN && (sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_si))) <= HALFWORD_MAX);
				*(sljit_si*)jump->addr = (sljit_si)(jump->u.target - (jump->addr + sizeof(sljit_si)));
#endif
			}
		}
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		else if (jump->flags & PATCH_MD)
			*(sljit_sw*)jump->addr = jump->u.label->addr;
#endif

		jump = jump->next;
	}

	/* Some space may be wasted because of short jumps. */
	SLJIT_ASSERT(code_ptr <= code + compiler->size);
	compiler->error = SLJIT_ERR_COMPILED;
	compiler->executable_size = code_ptr - code;
	return (void*)code;
}
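
/* A summary of the intermediate buffer format decoded by the generation
   loop above (reconstructed from the code, each record starting with a
   length byte):
     len > 0  : len bytes of already generated machine code, copied verbatim
     len == 0 : a tag byte follows;
                0 marks a label, 1 a constant,
                2/3 a fixed call/jump to an absolute address,
                and tag >= 4 a jump whose sljit type is (tag - 4). */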

/* --------------------------------------------------------------------- */
/*  Operators                                                            */
/* --------------------------------------------------------------------- */

static sljit_si emit_cum_binary(struct sljit_compiler *compiler,
	sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
	sljit_si dst, sljit_sw dstw,
	sljit_si src1, sljit_sw src1w,
	sljit_si src2, sljit_sw src2w);

static sljit_si emit_non_cum_binary(struct sljit_compiler *compiler,
	sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
	sljit_si dst, sljit_sw dstw,
	sljit_si src1, sljit_sw src1w,
	sljit_si src2, sljit_sw src2w);

static sljit_si emit_mov(struct sljit_compiler *compiler,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw);

static SLJIT_INLINE sljit_si emit_save_flags(struct sljit_compiler *compiler)
{
	sljit_ub *inst;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
	FAIL_IF(!inst);
	INC_SIZE(5);
#else
	inst = (sljit_ub*)ensure_buf(compiler, 1 + 6);
	FAIL_IF(!inst);
	INC_SIZE(6);
	*inst++ = REX_W;
#endif
	*inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp + sizeof(sljit_sw)] */
	*inst++ = 0x64;
	*inst++ = 0x24;
	*inst++ = (sljit_ub)sizeof(sljit_sw);
	*inst++ = PUSHF;
	compiler->flags_saved = 1;
	return SLJIT_SUCCESS;
}
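
/* The sequence emitted above is (x86-32 form shown):
     lea esp, [esp + 4]
     pushfd
   The lea raises ESP by one word and pushfd lowers it back, so ESP ends up
   unchanged and EFLAGS lands in the word at the stack top, which the
   surrounding code is expected to keep free for this purpose.
   emit_restore_flags() below mirrors it with "popfd" followed by
   "lea esp, [esp - 4]". */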

static SLJIT_INLINE sljit_si emit_restore_flags(struct sljit_compiler *compiler, sljit_si keep_flags)
{
	sljit_ub *inst;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
	FAIL_IF(!inst);
	INC_SIZE(5);
	*inst++ = POPF;
#else
	inst = (sljit_ub*)ensure_buf(compiler, 1 + 6);
	FAIL_IF(!inst);
	INC_SIZE(6);
	*inst++ = POPF;
	*inst++ = REX_W;
#endif
	*inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp - sizeof(sljit_sw)] */
	*inst++ = 0x64;
	*inst++ = 0x24;
	*inst++ = (sljit_ub)-(sljit_sb)sizeof(sljit_sw);
	compiler->flags_saved = keep_flags;
	return SLJIT_SUCCESS;
}

#ifdef _WIN32
#include <malloc.h>

static void SLJIT_CALL sljit_grow_stack(sljit_sw local_size)
{
	/* Workaround for calling the internal _chkstk() function on Windows.
	This function touches all 4k pages belonging to the requested stack space,
	whose size is passed in local_size. This is necessary on Windows, where
	the stack can only grow in 4k steps. However, this function just burns
	CPU cycles if the stack is already large enough. Since that cannot be
	known in advance, it must always be called. I think this is a bad design
	in general, even if it has some reasons. */
	*(volatile sljit_si*)alloca(local_size) = 0;
}

#endif

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
#include "sljitNativeX86_32.c"
#else
#include "sljitNativeX86_64.c"
#endif

static sljit_si emit_mov(struct sljit_compiler *compiler,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw)
{
	sljit_ub* inst;

	if (dst == SLJIT_UNUSED) {
		/* No destination: there is no need to set up flags. */
		if (src & SLJIT_MEM) {
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
			FAIL_IF(!inst);
			*inst = MOV_r_rm;
		}
		return SLJIT_SUCCESS;
	}
	if (FAST_IS_REG(src)) {
		inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm_r;
		return SLJIT_SUCCESS;
	}
	if (src & SLJIT_IMM) {
		if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
			if (!compiler->mode32) {
				if (NOT_HALFWORD(srcw))
					return emit_load_imm64(compiler, dst, srcw);
			}
			else
				return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, MOV_r_i32 + reg_lmap[dst], srcw);
#endif
		}
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		if (!compiler->mode32 && NOT_HALFWORD(srcw)) {
			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, srcw));
			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = MOV_rm_r;
			return SLJIT_SUCCESS;
		}
#endif
		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm_i32;
		return SLJIT_SUCCESS;
	}
	if (FAST_IS_REG(dst)) {
		inst = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
		FAIL_IF(!inst);
		*inst = MOV_r_rm;
		return SLJIT_SUCCESS;
	}

	/* Memory to memory move. Requires two instructions. */
	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
	FAIL_IF(!inst);
	*inst = MOV_r_rm;
	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
	FAIL_IF(!inst);
	*inst = MOV_rm_r;
	return SLJIT_SUCCESS;
}

#define EMIT_MOV(compiler, dst, dstw, src, srcw) \
	FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));

SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op0(struct sljit_compiler *compiler, sljit_si op)
{
	sljit_ub *inst;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	sljit_si size;
#endif

	CHECK_ERROR();
	check_sljit_emit_op0(compiler, op);

	switch (GET_OPCODE(op)) {
	case SLJIT_BREAKPOINT:
		inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
		FAIL_IF(!inst);
		INC_SIZE(1);
		*inst = INT3;
		break;
	case SLJIT_NOP:
		inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
		FAIL_IF(!inst);
		INC_SIZE(1);
		*inst = NOP;
		break;
	case SLJIT_UMUL:
	case SLJIT_SMUL:
	case SLJIT_UDIV:
	case SLJIT_SDIV:
		compiler->flags_saved = 0;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
#ifdef _WIN64
		SLJIT_COMPILE_ASSERT(
			reg_map[SLJIT_R0] == 0
			&& reg_map[SLJIT_R1] == 2
			&& reg_map[TMP_REG1] > 7,
			invalid_register_assignment_for_div_mul);
#else
		SLJIT_COMPILE_ASSERT(
			reg_map[SLJIT_R0] == 0
			&& reg_map[SLJIT_R1] < 7
			&& reg_map[TMP_REG1] == 2,
			invalid_register_assignment_for_div_mul);
#endif
		compiler->mode32 = op & SLJIT_INT_OP;
#endif

		op = GET_OPCODE(op);
		if (op == SLJIT_UDIV) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
			inst = emit_x86_instruction(compiler, 1, SLJIT_R1, 0, SLJIT_R1, 0);
#else
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
#endif
			FAIL_IF(!inst);
			*inst = XOR_r_rm;
		}

		if (op == SLJIT_SDIV) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
#endif

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
			FAIL_IF(!inst);
			INC_SIZE(1);
			*inst = CDQ;
#else
			if (compiler->mode32) {
				inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
				FAIL_IF(!inst);
				INC_SIZE(1);
				*inst = CDQ;
			} else {
				inst = (sljit_ub*)ensure_buf(compiler, 1 + 2);
				FAIL_IF(!inst);
				INC_SIZE(2);
				*inst++ = REX_W;
				*inst = CDQ;
			}
#endif
		}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		inst = (sljit_ub*)ensure_buf(compiler, 1 + 2);
		FAIL_IF(!inst);
		INC_SIZE(2);
		*inst++ = GROUP_F7;
		*inst = MOD_REG | ((op >= SLJIT_UDIV) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
#else
#ifdef _WIN64
		size = (!compiler->mode32 || op >= SLJIT_UDIV) ? 3 : 2;
#else
		size = (!compiler->mode32) ? 3 : 2;
#endif
		inst = (sljit_ub*)ensure_buf(compiler, 1 + size);
		FAIL_IF(!inst);
		INC_SIZE(size);
#ifdef _WIN64
		if (!compiler->mode32)
			*inst++ = REX_W | ((op >= SLJIT_UDIV) ? REX_B : 0);
		else if (op >= SLJIT_UDIV)
			*inst++ = REX_B;
		*inst++ = GROUP_F7;
		*inst = MOD_REG | ((op >= SLJIT_UDIV) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
#else
		if (!compiler->mode32)
			*inst++ = REX_W;
		*inst++ = GROUP_F7;
		*inst = MOD_REG | reg_map[SLJIT_R1];
#endif
#endif
		switch (op) {
		case SLJIT_UMUL:
			*inst |= MUL;
			break;
		case SLJIT_SMUL:
			*inst |= IMUL;
			break;
		case SLJIT_UDIV:
			*inst |= DIV;
			break;
		case SLJIT_SDIV:
			*inst |= IDIV;
			break;
		}
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
		EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
#endif
		break;
	}

	return SLJIT_SUCCESS;
}

#define ENCODE_PREFIX(prefix) \
	do { \
		inst = (sljit_ub*)ensure_buf(compiler, 1 + 1); \
		FAIL_IF(!inst); \
		INC_SIZE(1); \
		*inst = (prefix); \
	} while (0)

static sljit_si emit_mov_byte(struct sljit_compiler *compiler, sljit_si sign,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw)
{
	sljit_ub* inst;
	sljit_si dst_r;
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	sljit_si work_r;
#endif

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 0;
#endif

	if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
		return SLJIT_SUCCESS; /* Empty instruction. */

	if (src & SLJIT_IMM) {
		if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
			FAIL_IF(!inst);
			*inst = MOV_rm_i32;
			return SLJIT_SUCCESS;
#endif
		}
		inst = emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, SLJIT_IMM, srcw, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm8_i8;
		return SLJIT_SUCCESS;
	}

	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

	if ((dst & SLJIT_MEM) && FAST_IS_REG(src)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		if (reg_map[src] >= 4) {
			SLJIT_ASSERT(dst_r == TMP_REG1);
			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
		} else
			dst_r = src;
#else
		dst_r = src;
#endif
	}
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	else if (FAST_IS_REG(src) && reg_map[src] >= 4) {
		/* src, dst are registers. */
		SLJIT_ASSERT(SLOW_IS_REG(dst));
		if (reg_map[dst] < 4) {
			if (dst != src)
				EMIT_MOV(compiler, dst, 0, src, 0);
			inst = emit_x86_instruction(compiler, 2, dst, 0, dst, 0);
			FAIL_IF(!inst);
			*inst++ = GROUP_0F;
			*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
		}
		else {
			if (dst != src)
				EMIT_MOV(compiler, dst, 0, src, 0);
			if (sign) {
				/* shl reg, 24 */
				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
				FAIL_IF(!inst);
				*inst |= SHL;
				/* sar reg, 24 */
				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
				FAIL_IF(!inst);
				*inst |= SAR;
			}
			else {
				inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
				FAIL_IF(!inst);
				*(inst + 1) |= AND;
			}
		}
		return SLJIT_SUCCESS;
	}
#endif
	else {
		/* src is either a memory operand or a register with reg_map[src] < 4 on x86-32. */
		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
		FAIL_IF(!inst);
		*inst++ = GROUP_0F;
		*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
	}

	if (dst & SLJIT_MEM) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		if (dst_r == TMP_REG1) {
			/* Find an unused register whose reg_map value is < 4. */
			if ((dst & REG_MASK) == SLJIT_R0) {
				if ((dst & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_R1))
					work_r = SLJIT_R2;
				else
					work_r = SLJIT_R1;
			}
			else {
				if ((dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
					work_r = SLJIT_R0;
				else if ((dst & REG_MASK) == SLJIT_R1)
					work_r = SLJIT_R2;
				else
					work_r = SLJIT_R1;
			}

			if (work_r == SLJIT_R0) {
				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
			}
			else {
				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
				FAIL_IF(!inst);
				*inst = XCHG_r_rm;
			}

			inst = emit_x86_instruction(compiler, 1, work_r, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = MOV_rm8_r8;

			if (work_r == SLJIT_R0) {
				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
			}
			else {
				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
				FAIL_IF(!inst);
				*inst = XCHG_r_rm;
			}
		}
		else {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = MOV_rm8_r8;
		}
#else
		inst = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm8_r8;
#endif
	}

	return SLJIT_SUCCESS;
}
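
/* Background for the x86-32 special cases above: register encodings 0-3
   select the byte registers AL, CL, DL and BL, while encodings 4-7 select
   AH, CH, DH and BH rather than the low bytes of ESP..EDI. Hence the
   reg_map[...] >= 4 checks and the XCHG shuffling that temporarily moves a
   value into a byte-addressable register. On x86-64 a REX prefix makes
   every low byte directly reachable, so the EX86_REX flag is enough. */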

static sljit_si emit_mov_half(struct sljit_compiler *compiler, sljit_si sign,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw)
{
	sljit_ub* inst;
	sljit_si dst_r;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 0;
#endif

	if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
		return SLJIT_SUCCESS; /* Empty instruction. */

	if (src & SLJIT_IMM) {
		if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
			FAIL_IF(!inst);
			*inst = MOV_rm_i32;
			return SLJIT_SUCCESS;
#endif
		}
		inst = emit_x86_instruction(compiler, 1 | EX86_HALF_ARG | EX86_NO_REXW | EX86_PREF_66, SLJIT_IMM, srcw, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm_i32;
		return SLJIT_SUCCESS;
	}

	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

	if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
		dst_r = src;
	else {
		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
		FAIL_IF(!inst);
		*inst++ = GROUP_0F;
		*inst = sign ? MOVSX_r_rm16 : MOVZX_r_rm16;
	}

	if (dst & SLJIT_MEM) {
		inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm_r;
	}

	return SLJIT_SUCCESS;
}

static sljit_si emit_unary(struct sljit_compiler *compiler, sljit_ub opcode,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw)
{
	sljit_ub* inst;

	if (dst == SLJIT_UNUSED) {
		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst++ = GROUP_F7;
		*inst |= opcode;
		return SLJIT_SUCCESS;
	}
	if (dst == src && dstw == srcw) {
		/* Same input and output */
		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst++ = GROUP_F7;
		*inst |= opcode;
		return SLJIT_SUCCESS;
	}
	if (FAST_IS_REG(dst)) {
		EMIT_MOV(compiler, dst, 0, src, srcw);
		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst++ = GROUP_F7;
		*inst |= opcode;
		return SLJIT_SUCCESS;
	}
	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
	FAIL_IF(!inst);
	*inst++ = GROUP_F7;
	*inst |= opcode;
	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
	return SLJIT_SUCCESS;
}

static sljit_si emit_not_with_flags(struct sljit_compiler *compiler,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw)
{
	sljit_ub* inst;

	if (dst == SLJIT_UNUSED) {
		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst++ = GROUP_F7;
		*inst |= NOT_rm;
		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst = OR_r_rm;
		return SLJIT_SUCCESS;
	}
	if (FAST_IS_REG(dst)) {
		EMIT_MOV(compiler, dst, 0, src, srcw);
		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst++ = GROUP_F7;
		*inst |= NOT_rm;
		inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
		FAIL_IF(!inst);
		*inst = OR_r_rm;
		return SLJIT_SUCCESS;
	}
	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
	FAIL_IF(!inst);
	*inst++ = GROUP_F7;
	*inst |= NOT_rm;
	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
	FAIL_IF(!inst);
	*inst = OR_r_rm;
	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
	return SLJIT_SUCCESS;
}

static sljit_si emit_clz(struct sljit_compiler *compiler, sljit_si op_flags,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw)
{
	sljit_ub* inst;
	sljit_si dst_r;

	SLJIT_UNUSED_ARG(op_flags);
	if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED)) {
		/* Just set the zero flag. */
		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst++ = GROUP_F7;
		*inst |= NOT_rm;
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 31, TMP_REG1, 0);
#else
		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 63 : 31, TMP_REG1, 0);
#endif
		FAIL_IF(!inst);
		*inst |= SHR;
		return SLJIT_SUCCESS;
	}

	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
		src = TMP_REG1;
		srcw = 0;
	}

	inst = emit_x86_instruction(compiler, 2, TMP_REG1, 0, src, srcw);
	FAIL_IF(!inst);
	*inst++ = GROUP_0F;
	*inst = BSR_r_rm;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	if (FAST_IS_REG(dst))
		dst_r = dst;
	else {
		/* Find an unused temporary register. */
		if ((dst & REG_MASK) != SLJIT_R0 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
			dst_r = SLJIT_R0;
		else if ((dst & REG_MASK) != SLJIT_R1 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R1))
			dst_r = SLJIT_R1;
		else
			dst_r = SLJIT_R2;
		EMIT_MOV(compiler, dst, dstw, dst_r, 0);
	}
	EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, 32 + 31);
#else
	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
	compiler->mode32 = 0;
	EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 64 + 63 : 32 + 31);
	compiler->mode32 = op_flags & SLJIT_INT_OP;
#endif

	if (cpu_has_cmov == -1)
		get_cpu_features();

	if (cpu_has_cmov) {
		inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst++ = GROUP_0F;
		*inst = CMOVNE_r_rm;
	} else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
		FAIL_IF(!inst);
		INC_SIZE(4);

		*inst++ = JE_i8;
		*inst++ = 2;
		*inst++ = MOV_r_rm;
		*inst++ = MOD_REG | (reg_map[dst_r] << 3) | reg_map[TMP_REG1];
#else
		inst = (sljit_ub*)ensure_buf(compiler, 1 + 5);
		FAIL_IF(!inst);
		INC_SIZE(5);

		*inst++ = JE_i8;
		*inst++ = 3;
		*inst++ = REX_W | (reg_map[dst_r] >= 8 ? REX_R : 0) | (reg_map[TMP_REG1] >= 8 ? REX_B : 0);
		*inst++ = MOV_r_rm;
		*inst++ = MOD_REG | (reg_lmap[dst_r] << 3) | reg_lmap[TMP_REG1];
#endif
	}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
#else
	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_INT_OP) ? 63 : 31, dst_r, 0);
#endif
	FAIL_IF(!inst);
	*(inst + 1) |= XOR;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	if (dst & SLJIT_MEM) {
		inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst = XCHG_r_rm;
	}
#else
	if (dst & SLJIT_MEM)
		EMIT_MOV(compiler, dst, dstw, TMP_REG2, 0);
#endif
	return SLJIT_SUCCESS;
}
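
/* Worked example of the CLZ emulation above (32 bit case): for src == 0x10,
   BSR stores 4 (the index of the highest set bit) and 4 XOR 31 == 27, the
   number of leading zeros. For src == 0, BSR sets ZF and the destination is
   left unchanged, so the preloaded constant 32 + 31 == 63 survives the
   CMOVNE (or the skipped MOV) and 63 XOR 31 == 32, the result defined for a
   zero input. */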

SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op1(struct sljit_compiler *compiler, sljit_si op,
	sljit_si dst, sljit_sw dstw,
	sljit_si src, sljit_sw srcw)
{
	sljit_ub* inst;
	sljit_si update = 0;
	sljit_si op_flags = GET_ALL_FLAGS(op);
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	sljit_si dst_is_ereg = 0;
	sljit_si src_is_ereg = 0;
#else
#	define src_is_ereg 0
#endif

	CHECK_ERROR();
	check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw);
	ADJUST_LOCAL_OFFSET(dst, dstw);
	ADJUST_LOCAL_OFFSET(src, srcw);

	CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
	CHECK_EXTRA_REGS(src, srcw, src_is_ereg = 1);
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = op_flags & SLJIT_INT_OP;
#endif

	op = GET_OPCODE(op);
	if (op >= SLJIT_MOV && op <= SLJIT_MOVU_P) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		compiler->mode32 = 0;
#endif

		if (op_flags & SLJIT_INT_OP) {
			if (FAST_IS_REG(src) && src == dst) {
				if (!TYPE_CAST_NEEDED(op))
					return SLJIT_SUCCESS;
			}
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			if (op == SLJIT_MOV_SI && (src & SLJIT_MEM))
				op = SLJIT_MOV_UI;
			if (op == SLJIT_MOVU_SI && (src & SLJIT_MEM))
				op = SLJIT_MOVU_UI;
			if (op == SLJIT_MOV_UI && (src & SLJIT_IMM))
				op = SLJIT_MOV_SI;
			if (op == SLJIT_MOVU_UI && (src & SLJIT_IMM))
				op = SLJIT_MOVU_SI;
#endif
		}

		SLJIT_COMPILE_ASSERT(SLJIT_MOV + 8 == SLJIT_MOVU, movu_offset);
		if (op >= SLJIT_MOVU) {
			update = 1;
			op -= 8;
		}

		if (src & SLJIT_IMM) {
			switch (op) {
			case SLJIT_MOV_UB:
				srcw = (sljit_ub)srcw;
				break;
			case SLJIT_MOV_SB:
				srcw = (sljit_sb)srcw;
				break;
			case SLJIT_MOV_UH:
				srcw = (sljit_uh)srcw;
				break;
			case SLJIT_MOV_SH:
				srcw = (sljit_sh)srcw;
				break;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			case SLJIT_MOV_UI:
				srcw = (sljit_ui)srcw;
				break;
			case SLJIT_MOV_SI:
				srcw = (sljit_si)srcw;
				break;
#endif
			}
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			if (SLJIT_UNLIKELY(dst_is_ereg))
				return emit_mov(compiler, dst, dstw, src, srcw);
#endif
		}

		if (SLJIT_UNLIKELY(update) && (src & SLJIT_MEM) && !src_is_ereg && (src & REG_MASK) && (srcw != 0 || (src & OFFS_REG_MASK) != 0)) {
			inst = emit_x86_instruction(compiler, 1, src & REG_MASK, 0, src, srcw);
			FAIL_IF(!inst);
			*inst = LEA_r_m;
			src &= SLJIT_MEM | 0xf;
			srcw = 0;
		}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		if (SLJIT_UNLIKELY(dst_is_ereg) && (!(op == SLJIT_MOV || op == SLJIT_MOV_UI || op == SLJIT_MOV_SI || op == SLJIT_MOV_P) || (src & SLJIT_MEM))) {
			SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_SP));
			dst = TMP_REG1;
		}
#endif

		switch (op) {
		case SLJIT_MOV:
		case SLJIT_MOV_P:
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		case SLJIT_MOV_UI:
		case SLJIT_MOV_SI:
#endif
			FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_UB:
			FAIL_IF(emit_mov_byte(compiler, 0, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_SB:
			FAIL_IF(emit_mov_byte(compiler, 1, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_UH:
			FAIL_IF(emit_mov_half(compiler, 0, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_SH:
			FAIL_IF(emit_mov_half(compiler, 1, dst, dstw, src, srcw));
			break;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		case SLJIT_MOV_UI:
			FAIL_IF(emit_mov_int(compiler, 0, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_SI:
			FAIL_IF(emit_mov_int(compiler, 1, dst, dstw, src, srcw));
			break;
#endif
		}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
			return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), dstw, TMP_REG1, 0);
#endif

		if (SLJIT_UNLIKELY(update) && (dst & SLJIT_MEM) && (dst & REG_MASK) && (dstw != 0 || (dst & OFFS_REG_MASK) != 0)) {
			inst = emit_x86_instruction(compiler, 1, dst & REG_MASK, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = LEA_r_m;
		}
		return SLJIT_SUCCESS;
	}

	if (SLJIT_UNLIKELY(GET_FLAGS(op_flags)))
		compiler->flags_saved = 0;

	switch (op) {
	case SLJIT_NOT:
		if (SLJIT_UNLIKELY(op_flags & SLJIT_SET_E))
			return emit_not_with_flags(compiler, dst, dstw, src, srcw);
		return emit_unary(compiler, NOT_rm, dst, dstw, src, srcw);

	case SLJIT_NEG:
		if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
			FAIL_IF(emit_save_flags(compiler));
		return emit_unary(compiler, NEG_rm, dst, dstw, src, srcw);

	case SLJIT_CLZ:
		if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
			FAIL_IF(emit_save_flags(compiler));
		return emit_clz(compiler, op_flags, dst, dstw, src, srcw);
	}

	return SLJIT_SUCCESS;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
#	undef src_is_ereg
#endif
}

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)

#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
	if (IS_HALFWORD(immw) || compiler->mode32) { \
		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
		FAIL_IF(!inst); \
		*(inst + 1) |= (op_imm); \
	} \
	else { \
		FAIL_IF(emit_load_imm64(compiler, TMP_REG2, immw)); \
		inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, arg, argw); \
		FAIL_IF(!inst); \
		*inst = (op_mr); \
	}

#define BINARY_EAX_IMM(op_eax_imm, immw) \
	FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))

#else

#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
	FAIL_IF(!inst); \
	*(inst + 1) |= (op_imm);

#define BINARY_EAX_IMM(op_eax_imm, immw) \
	FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))

#endif
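
/* Illustrative expansion: on x86-32, BINARY_IMM(ADD, ADD_rm_r, 1000,
   SLJIT_R2, 0) goes through emit_x86_instruction() with EX86_BIN_INS, which
   selects the "81 /digit imm32" group form (or "83 /digit imm8" for small
   immediates), and the macro then ors the /digit - here ADD == 0 - into the
   ModRM byte via "*(inst + 1) |= (op_imm)". On x86-64, immediates that do
   not fit in a sign-extended 32 bit field are first loaded into TMP_REG2
   and the register form (op_mr) is used instead. */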

static sljit_si emit_cum_binary(struct sljit_compiler *compiler,
	sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
	sljit_si dst, sljit_sw dstw,
	sljit_si src1, sljit_sw src1w,
	sljit_si src2, sljit_sw src2w)
{
	sljit_ub* inst;

	if (dst == SLJIT_UNUSED) {
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		return SLJIT_SUCCESS;
	}

	if (dst == src1 && dstw == src1w) {
		if (src2 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
#else
			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
#endif
				BINARY_EAX_IMM(op_eax_imm, src2w);
			}
			else {
				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
			}
		}
		else if (FAST_IS_REG(dst)) {
			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		else if (FAST_IS_REG(src2)) {
			/* Special exception for sljit_emit_op_flags. */
			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		else {
			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		return SLJIT_SUCCESS;
	}

	/* Only for cumulative operations. */
	if (dst == src2 && dstw == src2w) {
		if (src1 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
#else
			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128)) {
#endif
				BINARY_EAX_IMM(op_eax_imm, src1w);
			}
			else {
				BINARY_IMM(op_imm, op_mr, src1w, dst, dstw);
			}
		}
		else if (FAST_IS_REG(dst)) {
			inst = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		else if (FAST_IS_REG(src1)) {
			inst = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		else {
			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		return SLJIT_SUCCESS;
	}

	/* General version. */
	if (FAST_IS_REG(dst)) {
		EMIT_MOV(compiler, dst, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
	}
	else {
		/* This version requires fewer memory writes. */
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
	}

	return SLJIT_SUCCESS;
}

static sljit_si emit_non_cum_binary(struct sljit_compiler *compiler,
	sljit_ub op_rm, sljit_ub op_mr, sljit_ub op_imm, sljit_ub op_eax_imm,
	sljit_si dst, sljit_sw dstw,
	sljit_si src1, sljit_sw src1w,
	sljit_si src2, sljit_sw src2w)
{
	sljit_ub* inst;

	if (dst == SLJIT_UNUSED) {
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		return SLJIT_SUCCESS;
	}

	if (dst == src1 && dstw == src1w) {
		if (src2 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
#else
			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
#endif
				BINARY_EAX_IMM(op_eax_imm, src2w);
			}
			else {
				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
			}
		}
		else if (FAST_IS_REG(dst)) {
			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		else if (FAST_IS_REG(src2)) {
			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		else {
			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		return SLJIT_SUCCESS;
	}

	/* General version. */
	if (FAST_IS_REG(dst) && dst != src2) {
		EMIT_MOV(compiler, dst, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
	}
	else {
		/* This version requires fewer memory writes. */
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
	}

	return SLJIT_SUCCESS;
}

static sljit_si emit_mul(struct sljit_compiler *compiler,
	sljit_si dst, sljit_sw dstw,
	sljit_si src1, sljit_sw src1w,
	sljit_si src2, sljit_sw src2w)
{
	sljit_ub* inst;
	sljit_si dst_r;

	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

	/* Register destination. */
	if (dst_r == src1 && !(src2 & SLJIT_IMM)) {
		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
		FAIL_IF(!inst);
		*inst++ = GROUP_0F;
		*inst = IMUL_r_rm;
	}
	else if (dst_r == src2 && !(src1 & SLJIT_IMM)) {
		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src1, src1w);
		FAIL_IF(!inst);
		*inst++ = GROUP_0F;
		*inst = IMUL_r_rm;
	}
	else if (src1 & SLJIT_IMM) {
		if (src2 & SLJIT_IMM) {
			EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w);
			src2 = dst_r;
			src2w = 0;
		}

		if (src1w <= 127 && src1w >= -128) {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = IMUL_r_rm_i8;
			inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
			FAIL_IF(!inst);
			INC_SIZE(1);
			*inst = (sljit_sb)src1w;
		}
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		else {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = IMUL_r_rm_i32;
			inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
			FAIL_IF(!inst);
			INC_SIZE(4);
			*(sljit_sw*)inst = src1w;
		}
#else
		else if (IS_HALFWORD(src1w)) {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = IMUL_r_rm_i32;
			inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
			FAIL_IF(!inst);
			INC_SIZE(4);
			*(sljit_si*)inst = (sljit_si)src1w;
		}
		else {
			EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
			if (dst_r != src2)
				EMIT_MOV(compiler, dst_r, 0, src2, src2w);
			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
			FAIL_IF(!inst);
			*inst++ = GROUP_0F;
			*inst = IMUL_r_rm;
		}
#endif
	}
	else if (src2 & SLJIT_IMM) {
		/* Note: src1 is NOT immediate. */

		if (src2w <= 127 && src2w >= -128) {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
			FAIL_IF(!inst);
			*inst = IMUL_r_rm_i8;
			inst = (sljit_ub*)ensure_buf(compiler, 1 + 1);
			FAIL_IF(!inst);
			INC_SIZE(1);
			*inst = (sljit_sb)src2w;
		}
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		else {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
			FAIL_IF(!inst);
			*inst = IMUL_r_rm_i32;
			inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
			FAIL_IF(!inst);
			INC_SIZE(4);
			*(sljit_sw*)inst = src2w;
		}
#else
		else if (IS_HALFWORD(src2w)) {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
			FAIL_IF(!inst);
			*inst = IMUL_r_rm_i32;
			inst = (sljit_ub*)ensure_buf(compiler, 1 + 4);
			FAIL_IF(!inst);
			INC_SIZE(4);
			*(sljit_si*)inst = (sljit_si)src2w;
		}
		else {
			EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src2w);
			if (dst_r != src1)
				EMIT_MOV(compiler, dst_r, 0, src1, src1w);
			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
			FAIL_IF(!inst);
			*inst++ = GROUP_0F;
			*inst = IMUL_r_rm;
		}
#endif
	}
	else {
		/* Neither argument is immediate. */
		if (ADDRESSING_DEPENDS_ON(src2, dst_r))
			dst_r = TMP_REG1;
		EMIT_MOV(compiler, dst_r, 0, src1, src1w);
		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
		FAIL_IF(!inst);
		*inst++ = GROUP_0F;
		*inst = IMUL_r_rm;
	}

	if (dst_r == TMP_REG1)
		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);

	return SLJIT_SUCCESS;
}

static sljit_si emit_lea_binary(struct sljit_compiler *compiler, sljit_si keep_flags,
	sljit_si dst, sljit_sw dstw,
	sljit_si src1, sljit_sw src1w,
	sljit_si src2, sljit_sw src2w)
{
	sljit_ub* inst;
	sljit_si dst_r, done = 0;

	/* These cases are better handled by the normal code path. */
	if (!keep_flags) {
		if (dst == src1 && dstw == src1w)
			return SLJIT_ERR_UNSUPPORTED;
		if (dst == src2 && dstw == src2w)
			return SLJIT_ERR_UNSUPPORTED;
	}

	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

	if (FAST_IS_REG(src1)) {
		if (FAST_IS_REG(src2)) {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
			FAIL_IF(!inst);
			*inst = LEA_r_m;
			done = 1;
		}
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		if ((src2 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src2w))) {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_si)src2w);
#else
		if (src2 & SLJIT_IMM) {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
#endif
			FAIL_IF(!inst);
			*inst = LEA_r_m;
			done = 1;
		}
	}
	else if (FAST_IS_REG(src2)) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		if ((src1 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src1w))) {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_si)src1w);
#else
		if (src1 & SLJIT_IMM) {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
#endif
			FAIL_IF(!inst);
			*inst = LEA_r_m;
			done = 1;
		}
	}

	if (done) {
		if (dst_r == TMP_REG1)
			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
		return SLJIT_SUCCESS;
	}
	return SLJIT_ERR_UNSUPPORTED;
}
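
/* What emit_lea_binary() produces: for register operands it emits
   "lea dst, [src1 + src2]", and for a register plus an encodable immediate
   "lea dst, [src1 + imm]". LEA performs the addition in the address unit
   and leaves the flags untouched, which is why this path is only usable
   when the caller does not need the flag results of the addition (or must
   preserve the previous flags, see keep_flags). */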
1831
1832static sljit_si emit_cmp_binary(struct sljit_compiler *compiler,
1833	sljit_si src1, sljit_sw src1w,
1834	sljit_si src2, sljit_sw src2w)
1835{
1836	sljit_ub* inst;
1837
1838#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1839	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1840#else
1841	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
1842#endif
1843		BINARY_EAX_IMM(CMP_EAX_i32, src2w);
1844		return SLJIT_SUCCESS;
1845	}
1846
1847	if (FAST_IS_REG(src1)) {
1848		if (src2 & SLJIT_IMM) {
1849			BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
1850		}
1851		else {
1852			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
1853			FAIL_IF(!inst);
1854			*inst = CMP_r_rm;
1855		}
1856		return SLJIT_SUCCESS;
1857	}
1858
1859	if (FAST_IS_REG(src2) && !(src1 & SLJIT_IMM)) {
1860		inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
1861		FAIL_IF(!inst);
1862		*inst = CMP_rm_r;
1863		return SLJIT_SUCCESS;
1864	}
1865
1866	if (src2 & SLJIT_IMM) {
1867		if (src1 & SLJIT_IMM) {
1868			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1869			src1 = TMP_REG1;
1870			src1w = 0;
1871		}
1872		BINARY_IMM(CMP, CMP_rm_r, src2w, src1, src1w);
1873	}
1874	else {
1875		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1876		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1877		FAIL_IF(!inst);
1878		*inst = CMP_r_rm;
1879	}
1880	return SLJIT_SUCCESS;
1881}
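/* Note on the SLJIT_R0 fast path above: CMP EAX, imm32 (opcode 3D) has
   no ModRM byte, so it is the shortest encoding for large immediates.
   Immediates in [-128, 127] are excluded because the sign-extended
   imm8 form (83 /7 ib) is shorter still, and is presumably produced by
   the BINARY_IMM path. */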
1882
1883static sljit_si emit_test_binary(struct sljit_compiler *compiler,
1884	sljit_si src1, sljit_sw src1w,
1885	sljit_si src2, sljit_sw src2w)
1886{
1887	sljit_ub* inst;
1888
1889#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1890	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1891#else
1892	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
1893#endif
1894		BINARY_EAX_IMM(TEST_EAX_i32, src2w);
1895		return SLJIT_SUCCESS;
1896	}
1897
1898#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1899	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1900#else
1901	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128)) {
1902#endif
1903		BINARY_EAX_IMM(TEST_EAX_i32, src1w);
1904		return SLJIT_SUCCESS;
1905	}
1906
1907	if (FAST_IS_REG(src1)) {
1908		if (src2 & SLJIT_IMM) {
1909#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1910			if (IS_HALFWORD(src2w) || compiler->mode32) {
1911				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, 0);
1912				FAIL_IF(!inst);
1913				*inst = GROUP_F7;
1914			}
1915			else {
1916				FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
1917				inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, 0);
1918				FAIL_IF(!inst);
1919				*inst = TEST_rm_r;
1920			}
1921#else
1922			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, 0);
1923			FAIL_IF(!inst);
1924			*inst = GROUP_F7;
1925#endif
1926		}
1927		else {
1928			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
1929			FAIL_IF(!inst);
1930			*inst = TEST_rm_r;
1931		}
1932		return SLJIT_SUCCESS;
1933	}
1934
1935	if (FAST_IS_REG(src2)) {
1936		if (src1 & SLJIT_IMM) {
1937#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1938			if (IS_HALFWORD(src1w) || compiler->mode32) {
1939				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, 0);
1940				FAIL_IF(!inst);
1941				*inst = GROUP_F7;
1942			}
1943			else {
1944				FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
1945				inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, 0);
1946				FAIL_IF(!inst);
1947				*inst = TEST_rm_r;
1948			}
1949#else
1950			inst = emit_x86_instruction(compiler, 1, src1, src1w, src2, 0);
1951			FAIL_IF(!inst);
1952			*inst = GROUP_F7;
1953#endif
1954		}
1955		else {
1956			inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
1957			FAIL_IF(!inst);
1958			*inst = TEST_rm_r;
1959		}
1960		return SLJIT_SUCCESS;
1961	}
1962
1963	EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
1964	if (src2 & SLJIT_IMM) {
1965#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1966		if (IS_HALFWORD(src2w) || compiler->mode32) {
1967			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
1968			FAIL_IF(!inst);
1969			*inst = GROUP_F7;
1970		}
1971		else {
1972			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
1973			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REG1, 0);
1974			FAIL_IF(!inst);
1975			*inst = TEST_rm_r;
1976		}
1977#else
1978		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
1979		FAIL_IF(!inst);
1980		*inst = GROUP_F7;
1981#endif
1982	}
1983	else {
1984		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
1985		FAIL_IF(!inst);
1986		*inst = TEST_rm_r;
1987	}
1988	return SLJIT_SUCCESS;
1989}
1990
1991static sljit_si emit_shift(struct sljit_compiler *compiler,
1992	sljit_ub mode,
1993	sljit_si dst, sljit_sw dstw,
1994	sljit_si src1, sljit_sw src1w,
1995	sljit_si src2, sljit_sw src2w)
1996{
1997	sljit_ub* inst;
1998
1999	if ((src2 & SLJIT_IMM) || (src2 == SLJIT_PREF_SHIFT_REG)) {
2000		if (dst == src1 && dstw == src1w) {
2001			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
2002			FAIL_IF(!inst);
2003			*inst |= mode;
2004			return SLJIT_SUCCESS;
2005		}
2006		if (dst == SLJIT_UNUSED) {
2007			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2008			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2009			FAIL_IF(!inst);
2010			*inst |= mode;
2011			return SLJIT_SUCCESS;
2012		}
2013		if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
2014			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2015			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2016			FAIL_IF(!inst);
2017			*inst |= mode;
2018			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2019			return SLJIT_SUCCESS;
2020		}
2021		if (FAST_IS_REG(dst)) {
2022			EMIT_MOV(compiler, dst, 0, src1, src1w);
2023			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
2024			FAIL_IF(!inst);
2025			*inst |= mode;
2026			return SLJIT_SUCCESS;
2027		}
2028
2029		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2030		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
2031		FAIL_IF(!inst);
2032		*inst |= mode;
2033		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2034		return SLJIT_SUCCESS;
2035	}
2036
2037	if (dst == SLJIT_PREF_SHIFT_REG) {
2038		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2039		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2040		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2041		FAIL_IF(!inst);
2042		*inst |= mode;
2043		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2044	}
2045	else if (FAST_IS_REG(dst) && dst != src2 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
2046		if (src1 != dst)
2047			EMIT_MOV(compiler, dst, 0, src1, src1w);
2048		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
2049		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2050		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
2051		FAIL_IF(!inst);
2052		*inst |= mode;
2053		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2054	}
2055	else {
2056		/* This case is really difficult, since ecx itself may be used
2057		   for addressing, and we must ensure this works even in that case. */
2058		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
2059#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2060		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
2061#else
2062		/* [esp+0] contains the flags. */
2063		EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw), SLJIT_PREF_SHIFT_REG, 0);
2064#endif
2065		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
2066		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
2067		FAIL_IF(!inst);
2068		*inst |= mode;
2069#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2070		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
2071#else
2072		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw));
2073#endif
2074		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
2075	}
2076
2077	return SLJIT_SUCCESS;
2078}
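/* The fallback branch above has to free up ECX because variable-count
   shifts can only take their count from CL. Roughly, the emitted
   sequence corresponds to (32 bit case, Intel syntax):
     mov tmp, <src1>
     mov [esp + 4], ecx     ; save ecx ([esp + 0] holds the saved flags)
     mov ecx, <src2>        ; the count must be in cl
     shl tmp, cl
     mov ecx, [esp + 4]     ; restore ecx
     mov <dst>, tmp
   On 64 bit targets TMP_REG2 is used instead of the stack slot. */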
2079
2080static sljit_si emit_shift_with_flags(struct sljit_compiler *compiler,
2081	sljit_ub mode, sljit_si set_flags,
2082	sljit_si dst, sljit_sw dstw,
2083	sljit_si src1, sljit_sw src1w,
2084	sljit_si src2, sljit_sw src2w)
2085{
2086	/* The CPU does not set flags if the shift count is 0. */
2087	if (src2 & SLJIT_IMM) {
2088#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2089		if ((src2w & 0x3f) != 0 || (compiler->mode32 && (src2w & 0x1f) != 0))
2090			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2091#else
2092		if ((src2w & 0x1f) != 0)
2093			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2094#endif
2095		if (!set_flags)
2096			return emit_mov(compiler, dst, dstw, src1, src1w);
2097		/* OR dst, src, 0 */
2098		return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
2099			dst, dstw, src1, src1w, SLJIT_IMM, 0);
2100	}
2101
2102	if (!set_flags)
2103		return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
2104
2105	if (!FAST_IS_REG(dst))
2106		FAIL_IF(emit_cmp_binary(compiler, src1, src1w, SLJIT_IMM, 0));
2107
2108	FAIL_IF(emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w));
2109
2110	if (FAST_IS_REG(dst))
2111		return emit_cmp_binary(compiler, dst, dstw, SLJIT_IMM, 0);
2112	return SLJIT_SUCCESS;
2113}
2114
2115SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op2(struct sljit_compiler *compiler, sljit_si op,
2116	sljit_si dst, sljit_sw dstw,
2117	sljit_si src1, sljit_sw src1w,
2118	sljit_si src2, sljit_sw src2w)
2119{
2120	CHECK_ERROR();
2121	check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w);
2122	ADJUST_LOCAL_OFFSET(dst, dstw);
2123	ADJUST_LOCAL_OFFSET(src1, src1w);
2124	ADJUST_LOCAL_OFFSET(src2, src2w);
2125
2126	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2127	CHECK_EXTRA_REGS(src1, src1w, (void)0);
2128	CHECK_EXTRA_REGS(src2, src2w, (void)0);
2129#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2130	compiler->mode32 = op & SLJIT_INT_OP;
2131#endif
2132
2133	if (GET_OPCODE(op) >= SLJIT_MUL) {
2134		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
2135			compiler->flags_saved = 0;
2136		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
2137			FAIL_IF(emit_save_flags(compiler));
2138	}
2139
2140	switch (GET_OPCODE(op)) {
2141	case SLJIT_ADD:
2142		if (!GET_FLAGS(op)) {
2143			if (emit_lea_binary(compiler, op & SLJIT_KEEP_FLAGS, dst, dstw, src1, src1w, src2, src2w) != SLJIT_ERR_UNSUPPORTED)
2144				return compiler->error;
2145		}
2146		else
2147			compiler->flags_saved = 0;
2148		if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
2149			FAIL_IF(emit_save_flags(compiler));
2150		return emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
2151			dst, dstw, src1, src1w, src2, src2w);
2152	case SLJIT_ADDC:
2153		if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
2154			FAIL_IF(emit_restore_flags(compiler, 1));
2155		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS))
2156			FAIL_IF(emit_save_flags(compiler));
2157		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
2158			compiler->flags_saved = 0;
2159		return emit_cum_binary(compiler, ADC_r_rm, ADC_rm_r, ADC, ADC_EAX_i32,
2160			dst, dstw, src1, src1w, src2, src2w);
2161	case SLJIT_SUB:
2162		if (!GET_FLAGS(op)) {
2163			if ((src2 & SLJIT_IMM) && emit_lea_binary(compiler, op & SLJIT_KEEP_FLAGS, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED)
2164				return compiler->error;
2165		}
2166		else
2167			compiler->flags_saved = 0;
2168		if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
2169			FAIL_IF(emit_save_flags(compiler));
2170		if (dst == SLJIT_UNUSED)
2171			return emit_cmp_binary(compiler, src1, src1w, src2, src2w);
2172		return emit_non_cum_binary(compiler, SUB_r_rm, SUB_rm_r, SUB, SUB_EAX_i32,
2173			dst, dstw, src1, src1w, src2, src2w);
2174	case SLJIT_SUBC:
2175		if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
2176			FAIL_IF(emit_restore_flags(compiler, 1));
2177		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS))
2178			FAIL_IF(emit_save_flags(compiler));
2179		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
2180			compiler->flags_saved = 0;
2181		return emit_non_cum_binary(compiler, SBB_r_rm, SBB_rm_r, SBB, SBB_EAX_i32,
2182			dst, dstw, src1, src1w, src2, src2w);
2183	case SLJIT_MUL:
2184		return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w);
2185	case SLJIT_AND:
2186		if (dst == SLJIT_UNUSED)
2187			return emit_test_binary(compiler, src1, src1w, src2, src2w);
2188		return emit_cum_binary(compiler, AND_r_rm, AND_rm_r, AND, AND_EAX_i32,
2189			dst, dstw, src1, src1w, src2, src2w);
2190	case SLJIT_OR:
2191		return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
2192			dst, dstw, src1, src1w, src2, src2w);
2193	case SLJIT_XOR:
2194		return emit_cum_binary(compiler, XOR_r_rm, XOR_rm_r, XOR, XOR_EAX_i32,
2195			dst, dstw, src1, src1w, src2, src2w);
2196	case SLJIT_SHL:
2197		return emit_shift_with_flags(compiler, SHL, GET_FLAGS(op),
2198			dst, dstw, src1, src1w, src2, src2w);
2199	case SLJIT_LSHR:
2200		return emit_shift_with_flags(compiler, SHR, GET_FLAGS(op),
2201			dst, dstw, src1, src1w, src2, src2w);
2202	case SLJIT_ASHR:
2203		return emit_shift_with_flags(compiler, SAR, GET_FLAGS(op),
2204			dst, dstw, src1, src1w, src2, src2w);
2205	}
2206
2207	return SLJIT_SUCCESS;
2208}
2209
2210SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_register_index(sljit_si reg)
2211{
2212	check_sljit_get_register_index(reg);
2213#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2214	if (reg >= SLJIT_R3 && reg <= SLJIT_R6)
2215		return -1;
2216#endif
2217	return reg_map[reg];
2218}
2219
2220SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_float_register_index(sljit_si reg)
2221{
2222	check_sljit_get_float_register_index(reg);
2223	return reg;
2224}
2225
2226SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_custom(struct sljit_compiler *compiler,
2227	void *instruction, sljit_si size)
2228{
2229	sljit_ub *inst;
2230
2231	CHECK_ERROR();
2232	check_sljit_emit_op_custom(compiler, instruction, size);
2233	SLJIT_ASSERT(size > 0 && size < 16);
2234
2235	inst = (sljit_ub*)ensure_buf(compiler, 1 + size);
2236	FAIL_IF(!inst);
2237	INC_SIZE(size);
2238	SLJIT_MEMMOVE(inst, instruction, size);
2239	return SLJIT_SUCCESS;
2240}
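/* Usage sketch (hypothetical example, not part of this file): emitting
   a raw CPUID instruction through the custom op interface:

     sljit_ub cpuid_insn[2] = { 0x0f, 0xa2 };
     sljit_emit_op_custom(compiler, cpuid_insn, 2);

   The bytes are copied verbatim into the instruction stream, so the
   caller is responsible for emitting a valid instruction. */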
2241
2242/* --------------------------------------------------------------------- */
2243/*  Floating point operators                                             */
2244/* --------------------------------------------------------------------- */
2245
2246/* Up to 12 bytes of alignment slack + 4 * 16 bytes of constants. */
2247static sljit_si sse2_data[3 + (4 + 4) * 2];
2248static sljit_si *sse2_buffer;
2249
2250static void init_compiler(void)
2251{
2252	sse2_buffer = (sljit_si*)(((sljit_uw)sse2_data + 15) & ~0xf);
2253	/* Single precision constants. */
2254	sse2_buffer[0] = 0x80000000;
2255	sse2_buffer[4] = 0x7fffffff;
2256	/* Double precision constants. */
2257	sse2_buffer[8] = 0;
2258	sse2_buffer[9] = 0x80000000;
2259	sse2_buffer[12] = 0xffffffff;
2260	sse2_buffer[13] = 0x7fffffff;
2261}
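/* The buffer holds sign and absolute-value masks for the SSE2 logic
   ops in sljit_emit_fop1 below: XORPD with the sign-bit mask negates,
   and ANDPD with its complement clears the sign bit, e.g.
     xorpd xmm0, [sse2_buffer]        ; -x (single; mask at byte offset 0)
     andpd xmm0, [sse2_buffer + 48]   ; fabs(x) (double; byte offset 48) */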
2262
2263SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_is_fpu_available(void)
2264{
2265#ifdef SLJIT_IS_FPU_AVAILABLE
2266	return SLJIT_IS_FPU_AVAILABLE;
2267#elif (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
2268	if (cpu_has_sse2 == -1)
2269		get_cpu_features();
2270	return cpu_has_sse2;
2271#else /* SLJIT_DETECT_SSE2 */
2272	return 1;
2273#endif /* SLJIT_DETECT_SSE2 */
2274}
2275
2276static sljit_si emit_sse2(struct sljit_compiler *compiler, sljit_ub opcode,
2277	sljit_si single, sljit_si xmm1, sljit_si xmm2, sljit_sw xmm2w)
2278{
2279	sljit_ub *inst;
2280
2281	inst = emit_x86_instruction(compiler, 2 | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2282	FAIL_IF(!inst);
2283	*inst++ = GROUP_0F;
2284	*inst = opcode;
2285	return SLJIT_SUCCESS;
2286}
2287
2288static sljit_si emit_sse2_logic(struct sljit_compiler *compiler, sljit_ub opcode,
2289	sljit_si pref66, sljit_si xmm1, sljit_si xmm2, sljit_sw xmm2w)
2290{
2291	sljit_ub *inst;
2292
2293	inst = emit_x86_instruction(compiler, 2 | (pref66 ? EX86_PREF_66 : 0) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
2294	FAIL_IF(!inst);
2295	*inst++ = GROUP_0F;
2296	*inst = opcode;
2297	return SLJIT_SUCCESS;
2298}
2299
2300static SLJIT_INLINE sljit_si emit_sse2_load(struct sljit_compiler *compiler,
2301	sljit_si single, sljit_si dst, sljit_si src, sljit_sw srcw)
2302{
2303	return emit_sse2(compiler, MOVSD_x_xm, single, dst, src, srcw);
2304}
2305
2306static SLJIT_INLINE sljit_si emit_sse2_store(struct sljit_compiler *compiler,
2307	sljit_si single, sljit_si dst, sljit_sw dstw, sljit_si src)
2308{
2309	return emit_sse2(compiler, MOVSD_xm_x, single, src, dst, dstw);
2310}
2311
2312static SLJIT_INLINE sljit_si sljit_emit_fop1_convw_fromd(struct sljit_compiler *compiler, sljit_si op,
2313	sljit_si dst, sljit_sw dstw,
2314	sljit_si src, sljit_sw srcw)
2315{
2316	sljit_si dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
2317	sljit_ub *inst;
2318
2319#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2320	if (GET_OPCODE(op) == SLJIT_CONVW_FROMD)
2321		compiler->mode32 = 0;
2322#endif
2323
2324	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_SINGLE_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP2, dst_r, 0, src, srcw);
2325	FAIL_IF(!inst);
2326	*inst++ = GROUP_0F;
2327	*inst = CVTTSD2SI_r_xm;
2328
2329	if (dst_r == TMP_REG1 && dst != SLJIT_UNUSED)
2330		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2331	return SLJIT_SUCCESS;
2332}
2333
2334static SLJIT_INLINE sljit_si sljit_emit_fop1_convd_fromw(struct sljit_compiler *compiler, sljit_si op,
2335	sljit_si dst, sljit_sw dstw,
2336	sljit_si src, sljit_sw srcw)
2337{
2338	sljit_si dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
2339	sljit_ub *inst;
2340
2341#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2342	if (GET_OPCODE(op) == SLJIT_CONVD_FROMW)
2343		compiler->mode32 = 0;
2344#endif
2345
2346	if (src & SLJIT_IMM) {
2347#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2348		if (GET_OPCODE(op) == SLJIT_CONVD_FROMI)
2349			srcw = (sljit_si)srcw;
2350#endif
2351		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
2352		src = TMP_REG1;
2353		srcw = 0;
2354	}
2355
2356	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_SINGLE_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, src, srcw);
2357	FAIL_IF(!inst);
2358	*inst++ = GROUP_0F;
2359	*inst = CVTSI2SD_x_rm;
2360
2361#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2362	compiler->mode32 = 1;
2363#endif
2364	if (dst_r == TMP_FREG)
2365		return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
2366	return SLJIT_SUCCESS;
2367}
2368
2369static SLJIT_INLINE sljit_si sljit_emit_fop1_cmp(struct sljit_compiler *compiler, sljit_si op,
2370	sljit_si src1, sljit_sw src1w,
2371	sljit_si src2, sljit_sw src2w)
2372{
2373	compiler->flags_saved = 0;
2374	if (!FAST_IS_REG(src1)) {
2375		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src1, src1w));
2376		src1 = TMP_FREG;
2377	}
2378	return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_SINGLE_OP), src1, src2, src2w);
2379}
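/* UCOMISD/UCOMISS set ZF, PF and CF directly from the comparison (and
   clear OF, SF and AF), so the integer condition codes can be reused
   for floating point branches. */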
2380
2381SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fop1(struct sljit_compiler *compiler, sljit_si op,
2382	sljit_si dst, sljit_sw dstw,
2383	sljit_si src, sljit_sw srcw)
2384{
2385	sljit_si dst_r;
2386
2387#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2388	compiler->mode32 = 1;
2389#endif
2390
2391	CHECK_ERROR();
2392	SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);
2393
2394	if (GET_OPCODE(op) == SLJIT_MOVD) {
2395		if (FAST_IS_REG(dst))
2396			return emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst, src, srcw);
2397		if (FAST_IS_REG(src))
2398			return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, src);
2399		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src, srcw));
2400		return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
2401	}
2402
2403	if (GET_OPCODE(op) == SLJIT_CONVD_FROMS) {
2404		dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
2405		if (FAST_IS_REG(src)) {
2406			/* We overwrite the high bits of the source. From the SLJIT
2407			   point of view, this is not an issue.
2408			   Note: In SSE3, we could also use MOVDDUP and MOVSLDUP. */
2409			FAIL_IF(emit_sse2_logic(compiler, UNPCKLPD_x_xm, op & SLJIT_SINGLE_OP, src, src, 0));
2410		}
2411		else {
2412			FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_SINGLE_OP), TMP_FREG, src, srcw));
2413			src = TMP_FREG;
2414		}
2415
2416		FAIL_IF(emit_sse2_logic(compiler, CVTPD2PS_x_xm, op & SLJIT_SINGLE_OP, dst_r, src, 0));
2417		if (dst_r == TMP_FREG)
2418			return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
2419		return SLJIT_SUCCESS;
2420	}
2421
2422	if (SLOW_IS_REG(dst)) {
2423		dst_r = dst;
2424		if (dst != src)
2425			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, src, srcw));
2426	}
2427	else {
2428		dst_r = TMP_FREG;
2429		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, src, srcw));
2430	}
2431
2432	switch (GET_OPCODE(op)) {
2433	case SLJIT_NEGD:
2434		FAIL_IF(emit_sse2_logic(compiler, XORPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_SINGLE_OP ? sse2_buffer : sse2_buffer + 8)));
2435		break;
2436
2437	case SLJIT_ABSD:
2438		FAIL_IF(emit_sse2_logic(compiler, ANDPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_SINGLE_OP ? sse2_buffer + 4 : sse2_buffer + 12)));
2439		break;
2440	}
2441
2442	if (dst_r == TMP_FREG)
2443		return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
2444	return SLJIT_SUCCESS;
2445}
2446
2447SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_fop2(struct sljit_compiler *compiler, sljit_si op,
2448	sljit_si dst, sljit_sw dstw,
2449	sljit_si src1, sljit_sw src1w,
2450	sljit_si src2, sljit_sw src2w)
2451{
2452	sljit_si dst_r;
2453
2454	CHECK_ERROR();
2455	check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w);
2456	ADJUST_LOCAL_OFFSET(dst, dstw);
2457	ADJUST_LOCAL_OFFSET(src1, src1w);
2458	ADJUST_LOCAL_OFFSET(src2, src2w);
2459
2460#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2461	compiler->mode32 = 1;
2462#endif
2463
2464	if (FAST_IS_REG(dst)) {
2465		dst_r = dst;
2466		if (dst == src1)
2467			; /* Do nothing here. */
2468		else if (dst == src2 && (op == SLJIT_ADDD || op == SLJIT_MULD)) {
2469			/* Swap arguments. */
2470			src2 = src1;
2471			src2w = src1w;
2472		}
2473		else if (dst != src2)
2474			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, dst_r, src1, src1w));
2475		else {
2476			dst_r = TMP_FREG;
2477			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src1, src1w));
2478		}
2479	}
2480	else {
2481		dst_r = TMP_FREG;
2482		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_SINGLE_OP, TMP_FREG, src1, src1w));
2483	}
2484
2485	switch (GET_OPCODE(op)) {
2486	case SLJIT_ADDD:
2487		FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
2488		break;
2489
2490	case SLJIT_SUBD:
2491		FAIL_IF(emit_sse2(compiler, SUBSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
2492		break;
2493
2494	case SLJIT_MULD:
2495		FAIL_IF(emit_sse2(compiler, MULSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
2496		break;
2497
2498	case SLJIT_DIVD:
2499		FAIL_IF(emit_sse2(compiler, DIVSD_x_xm, op & SLJIT_SINGLE_OP, dst_r, src2, src2w));
2500		break;
2501	}
2502
2503	if (dst_r == TMP_FREG)
2504		return emit_sse2_store(compiler, op & SLJIT_SINGLE_OP, dst, dstw, TMP_FREG);
2505	return SLJIT_SUCCESS;
2506}
2507
2508/* --------------------------------------------------------------------- */
2509/*  Conditional instructions                                             */
2510/* --------------------------------------------------------------------- */
2511
2512SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
2513{
2514	sljit_ub *inst;
2515	struct sljit_label *label;
2516
2517	CHECK_ERROR_PTR();
2518	check_sljit_emit_label(compiler);
2519
2520	/* We should restore the flags before the label,
2521	   since other taken jumps have their own flags as well. */
2522	if (SLJIT_UNLIKELY(compiler->flags_saved))
2523		PTR_FAIL_IF(emit_restore_flags(compiler, 0));
2524
2525	if (compiler->last_label && compiler->last_label->size == compiler->size)
2526		return compiler->last_label;
2527
2528	label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
2529	PTR_FAIL_IF(!label);
2530	set_label(label, compiler);
2531
2532	inst = (sljit_ub*)ensure_buf(compiler, 2);
2533	PTR_FAIL_IF(!inst);
2534
2535	*inst++ = 0;
2536	*inst++ = 0;
2537
2538	return label;
2539}
2540
2541SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_si type)
2542{
2543	sljit_ub *inst;
2544	struct sljit_jump *jump;
2545
2546	CHECK_ERROR_PTR();
2547	check_sljit_emit_jump(compiler, type);
2548
2549	if (SLJIT_UNLIKELY(compiler->flags_saved)) {
2550		if ((type & 0xff) <= SLJIT_JUMP)
2551			PTR_FAIL_IF(emit_restore_flags(compiler, 0));
2552		compiler->flags_saved = 0;
2553	}
2554
2555	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
2556	PTR_FAIL_IF_NULL(jump);
2557	set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
2558	type &= 0xff;
2559
2560	if (type >= SLJIT_CALL1)
2561		PTR_FAIL_IF(call_with_args(compiler, type));
2562
2563	/* Worst case size. */
2564#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2565	compiler->size += (type >= SLJIT_JUMP) ? 5 : 6;
2566#else
2567	compiler->size += (type >= SLJIT_JUMP) ? (10 + 3) : (2 + 10 + 3);
2568#endif
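	/* Presumably: 5/6 bytes on x86-32 is JMP rel32 or Jcc rel32
	   (0F 8x); on x86-64, 10 + 3 is MOV r64, imm64 plus an indirect
	   jump, and the extra 2 bytes are a short inverted Jcc that
	   skips it when the jump is conditional. */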
2569
2570	inst = (sljit_ub*)ensure_buf(compiler, 2);
2571	PTR_FAIL_IF_NULL(inst);
2572
2573	*inst++ = 0;
2574	*inst++ = type + 4;
2575	return jump;
2576}
2577
2578SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_ijump(struct sljit_compiler *compiler, sljit_si type, sljit_si src, sljit_sw srcw)
2579{
2580	sljit_ub *inst;
2581	struct sljit_jump *jump;
2582
2583	CHECK_ERROR();
2584	check_sljit_emit_ijump(compiler, type, src, srcw);
2585	ADJUST_LOCAL_OFFSET(src, srcw);
2586
2587	CHECK_EXTRA_REGS(src, srcw, (void)0);
2588
2589	if (SLJIT_UNLIKELY(compiler->flags_saved)) {
2590		if (type <= SLJIT_JUMP)
2591			FAIL_IF(emit_restore_flags(compiler, 0));
2592		compiler->flags_saved = 0;
2593	}
2594
2595	if (type >= SLJIT_CALL1) {
2596#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2597#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
2598		if (src == SLJIT_R2) {
2599			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
2600			src = TMP_REG1;
2601		}
2602		if (src == SLJIT_MEM1(SLJIT_SP) && type >= SLJIT_CALL3)
2603			srcw += sizeof(sljit_sw);
2604#endif
2605#endif
2606#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && defined(_WIN64)
2607		if (src == SLJIT_R2) {
2608			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
2609			src = TMP_REG1;
2610		}
2611#endif
2612		FAIL_IF(call_with_args(compiler, type));
2613	}
2614
2615	if (src == SLJIT_IMM) {
2616		jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
2617		FAIL_IF_NULL(jump);
2618		set_jump(jump, compiler, JUMP_ADDR);
2619		jump->u.target = srcw;
2620
2621		/* Worst case size. */
2622#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2623		compiler->size += 5;
2624#else
2625		compiler->size += 10 + 3;
2626#endif
2627
2628		inst = (sljit_ub*)ensure_buf(compiler, 2);
2629		FAIL_IF_NULL(inst);
2630
2631		*inst++ = 0;
2632		*inst++ = type + 4;
2633	}
2634	else {
2635#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2636		/* REX_W is not necessary (src is not immediate). */
2637		compiler->mode32 = 1;
2638#endif
2639		inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
2640		FAIL_IF(!inst);
2641		*inst++ = GROUP_FF;
2642		*inst |= (type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm;
2643	}
2644	return SLJIT_SUCCESS;
2645}
2646
2647SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_si op,
2648	sljit_si dst, sljit_sw dstw,
2649	sljit_si src, sljit_sw srcw,
2650	sljit_si type)
2651{
2652	sljit_ub *inst;
2653	sljit_ub cond_set = 0;
2654#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2655	sljit_si reg;
2656#else
2657	/* CHECK_EXTRA_REGS might overwrite these values. */
2658	sljit_si dst_save = dst;
2659	sljit_sw dstw_save = dstw;
2660#endif
2661
2662	CHECK_ERROR();
2663	check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type);
2664
2665	if (dst == SLJIT_UNUSED)
2666		return SLJIT_SUCCESS;
2667
2668	ADJUST_LOCAL_OFFSET(dst, dstw);
2669	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2670	if (SLJIT_UNLIKELY(compiler->flags_saved))
2671		FAIL_IF(emit_restore_flags(compiler, op & SLJIT_KEEP_FLAGS));
2672
2673	/* setcc = jcc + 0x10. */
2674	cond_set = get_jump_code(type) + 0x10;
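	/* For example, near JNE is 0F 85 while SETNE is 0F 95, so adding
	   0x10 to the second opcode byte maps any jcc returned by
	   get_jump_code() to the matching setcc. */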
2675
2676#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2677	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src) {
2678		inst = (sljit_ub*)ensure_buf(compiler, 1 + 4 + 3);
2679		FAIL_IF(!inst);
2680		INC_SIZE(4 + 3);
2681		/* Set low register to conditional flag. */
2682		*inst++ = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
2683		*inst++ = GROUP_0F;
2684		*inst++ = cond_set;
2685		*inst++ = MOD_REG | reg_lmap[TMP_REG1];
2686		*inst++ = REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B);
2687		*inst++ = OR_rm8_r8;
2688		*inst++ = MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst];
2689		return SLJIT_SUCCESS;
2690	}
2691
2692	reg = (op == SLJIT_MOV && FAST_IS_REG(dst)) ? dst : TMP_REG1;
2693
2694	inst = (sljit_ub*)ensure_buf(compiler, 1 + 4 + 4);
2695	FAIL_IF(!inst);
2696	INC_SIZE(4 + 4);
2697	/* Set low register to conditional flag. */
2698	*inst++ = (reg_map[reg] <= 7) ? REX : REX_B;
2699	*inst++ = GROUP_0F;
2700	*inst++ = cond_set;
2701	*inst++ = MOD_REG | reg_lmap[reg];
2702	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
2703	*inst++ = GROUP_0F;
2704	*inst++ = MOVZX_r_rm8;
2705	*inst = MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg];
2706
2707	if (reg != TMP_REG1)
2708		return SLJIT_SUCCESS;
2709
2710	if (GET_OPCODE(op) < SLJIT_ADD) {
2711		compiler->mode32 = GET_OPCODE(op) != SLJIT_MOV;
2712		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2713	}
2714#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) || (defined SLJIT_DEBUG && SLJIT_DEBUG)
2715	compiler->skip_checks = 1;
2716#endif
2717	return sljit_emit_op2(compiler, op, dst, dstw, dst, dstw, TMP_REG1, 0);
2718#else /* SLJIT_CONFIG_X86_64 */
2719	if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) {
2720		if (reg_map[dst] <= 4) {
2721			/* Low byte is accessible. */
2722			inst = (sljit_ub*)ensure_buf(compiler, 1 + 3 + 3);
2723			FAIL_IF(!inst);
2724			INC_SIZE(3 + 3);
2725			/* Set low byte to conditional flag. */
2726			*inst++ = GROUP_0F;
2727			*inst++ = cond_set;
2728			*inst++ = MOD_REG | reg_map[dst];
2729
2730			*inst++ = GROUP_0F;
2731			*inst++ = MOVZX_r_rm8;
2732			*inst = MOD_REG | (reg_map[dst] << 3) | reg_map[dst];
2733			return SLJIT_SUCCESS;
2734		}
2735
2736		/* Low byte is not accessible. */
2737		if (cpu_has_cmov == -1)
2738			get_cpu_features();
2739
2740		if (cpu_has_cmov) {
2741			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
2742			/* An "xor reg, reg" operation would overwrite the flags. */
2743			EMIT_MOV(compiler, dst, 0, SLJIT_IMM, 0);
2744
2745			inst = (sljit_ub*)ensure_buf(compiler, 1 + 3);
2746			FAIL_IF(!inst);
2747			INC_SIZE(3);
2748
2749			*inst++ = GROUP_0F;
2750			/* cmovcc = setcc - 0x50. */
2751			*inst++ = cond_set - 0x50;
2752			*inst++ = MOD_REG | (reg_map[dst] << 3) | reg_map[TMP_REG1];
2753			return SLJIT_SUCCESS;
2754		}
2755
2756		inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
2757		FAIL_IF(!inst);
2758		INC_SIZE(1 + 3 + 3 + 1);
2759		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2760		/* Set al to conditional flag. */
2761		*inst++ = GROUP_0F;
2762		*inst++ = cond_set;
2763		*inst++ = MOD_REG | 0 /* eax */;
2764
2765		*inst++ = GROUP_0F;
2766		*inst++ = MOVZX_r_rm8;
2767		*inst++ = MOD_REG | (reg_map[dst] << 3) | 0 /* eax */;
2768		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2769		return SLJIT_SUCCESS;
2770	}
2771
2772	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src && reg_map[dst] <= 4) {
2773		SLJIT_COMPILE_ASSERT(reg_map[SLJIT_R0] == 0, scratch_reg1_must_be_eax);
2774		if (dst != SLJIT_R0) {
2775			inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 2 + 1);
2776			FAIL_IF(!inst);
2777			INC_SIZE(1 + 3 + 2 + 1);
2778			/* Set low register to conditional flag. */
2779			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2780			*inst++ = GROUP_0F;
2781			*inst++ = cond_set;
2782			*inst++ = MOD_REG | 0 /* eax */;
2783			*inst++ = OR_rm8_r8;
2784			*inst++ = MOD_REG | (0 /* eax */ << 3) | reg_map[dst];
2785			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2786		}
2787		else {
2788			inst = (sljit_ub*)ensure_buf(compiler, 1 + 2 + 3 + 2 + 2);
2789			FAIL_IF(!inst);
2790			INC_SIZE(2 + 3 + 2 + 2);
2791			/* Set low register to conditional flag. */
2792			*inst++ = XCHG_r_rm;
2793			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
2794			*inst++ = GROUP_0F;
2795			*inst++ = cond_set;
2796			*inst++ = MOD_REG | 1 /* ecx */;
2797			*inst++ = OR_rm8_r8;
2798			*inst++ = MOD_REG | (1 /* ecx */ << 3) | 0 /* eax */;
2799			*inst++ = XCHG_r_rm;
2800			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
2801		}
2802		return SLJIT_SUCCESS;
2803	}
2804
2805	/* Materialize the condition flag as 0 or 1 in TMP_REG1. */
2806	inst = (sljit_ub*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
2807	FAIL_IF(!inst);
2808	INC_SIZE(1 + 3 + 3 + 1);
2809	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2810	/* Set al to conditional flag. */
2811	*inst++ = GROUP_0F;
2812	*inst++ = cond_set;
2813	*inst++ = MOD_REG | 0 /* eax */;
2814
2815	*inst++ = GROUP_0F;
2816	*inst++ = MOVZX_r_rm8;
2817	*inst++ = MOD_REG | (0 << 3) /* eax */ | 0 /* eax */;
2818
2819	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
2820
2821	if (GET_OPCODE(op) < SLJIT_ADD)
2822		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
2823
2824#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) || (defined SLJIT_DEBUG && SLJIT_DEBUG)
2825	compiler->skip_checks = 1;
2826#endif
2827	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
2828#endif /* SLJIT_CONFIG_X86_64 */
2829}
2830
2831SLJIT_API_FUNC_ATTRIBUTE sljit_si sljit_get_local_base(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw offset)
2832{
2833	CHECK_ERROR();
2834	check_sljit_get_local_base(compiler, dst, dstw, offset);
2835	ADJUST_LOCAL_OFFSET(dst, dstw);
2836
2837	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2838
2839#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2840	compiler->mode32 = 0;
2841#endif
2842
2843	ADJUST_LOCAL_OFFSET(SLJIT_MEM1(SLJIT_SP), offset);
2844
2845#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2846	if (NOT_HALFWORD(offset)) {
2847		FAIL_IF(emit_load_imm64(compiler, TMP_REG1, offset));
2848#if (defined SLJIT_DEBUG && SLJIT_DEBUG)
2849		SLJIT_ASSERT(emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0) != SLJIT_ERR_UNSUPPORTED);
2850		return compiler->error;
2851#else
2852		return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0);
2853#endif
2854	}
2855#endif
2856
2857	if (offset != 0)
2858		return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, SLJIT_IMM, offset);
2859	return emit_mov(compiler, dst, dstw, SLJIT_SP, 0);
2860}
2861
2862SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_si dst, sljit_sw dstw, sljit_sw init_value)
2863{
2864	sljit_ub *inst;
2865	struct sljit_const *const_;
2866#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2867	sljit_si reg;
2868#endif
2869
2870	CHECK_ERROR_PTR();
2871	check_sljit_emit_const(compiler, dst, dstw, init_value);
2872	ADJUST_LOCAL_OFFSET(dst, dstw);
2873
2874	CHECK_EXTRA_REGS(dst, dstw, (void)0);
2875
2876	const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
2877	PTR_FAIL_IF(!const_);
2878	set_const(const_, compiler);
2879
2880#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2881	compiler->mode32 = 0;
2882	reg = SLOW_IS_REG(dst) ? dst : TMP_REG1;
2883
2884	if (emit_load_imm64(compiler, reg, init_value))
2885		return NULL;
2886#else
2887	if (dst == SLJIT_UNUSED)
2888		dst = TMP_REG1;
2889
2890	if (emit_mov(compiler, dst, dstw, SLJIT_IMM, init_value))
2891		return NULL;
2892#endif
2893
2894	inst = (sljit_ub*)ensure_buf(compiler, 2);
2895	PTR_FAIL_IF(!inst);
2896
2897	*inst++ = 0;
2898	*inst++ = 1;
2899
2900#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
2901	if (dst & SLJIT_MEM)
2902		if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
2903			return NULL;
2904#endif
2905
2906	return const_;
2907}
2908
2909SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_addr)
2910{
2911#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
2912	*(sljit_sw*)addr = new_addr - (addr + 4);
2913#else
2914	*(sljit_uw*)addr = new_addr;
2915#endif
2916}
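/* Example: on x86-32 the patched word is the rel32 operand of a direct
   JMP/Jcc, so the stored value is relative to the end of that 4 byte
   field (hence "addr + 4"). On x86-64 it is presumably the 64 bit
   absolute immediate of the MOV feeding an indirect jump, so the
   target is stored as-is. */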
2917
2918SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant)
2919{
2920	*(sljit_sw*)addr = new_constant;
2921}
2922