/*
 *    Stack-less Just-In-Time compiler
 *
 *    Copyright 2009-2012 Zoltan Herczeg (hzmester@freemail.hu). All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are
 * permitted provided that the following conditions are met:
 *
 *   1. Redistributions of source code must retain the above copyright notice, this list of
 *      conditions and the following disclaimer.
 *
 *   2. Redistributions in binary form must reproduce the above copyright notice, this list
 *      of conditions and the following disclaimer in the documentation and/or other materials
 *      provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER(S) AND CONTRIBUTORS ``AS IS'' AND ANY
 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
 * SHALL THE COPYRIGHT HOLDER(S) OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
 * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
 * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

SLJIT_API_FUNC_ATTRIBUTE const char* sljit_get_platform_name(void)
{
	return "x86" SLJIT_CPUINFO;
}

/*
   32b register indexes:
     0 - EAX
     1 - ECX
     2 - EDX
     3 - EBX
     4 - none
     5 - EBP
     6 - ESI
     7 - EDI
*/

/*
   64b register indexes:
     0 - RAX
     1 - RCX
     2 - RDX
     3 - RBX
     4 - none
     5 - RBP
     6 - RSI
     7 - RDI
     8 - R8   - From now on REX prefix is required
     9 - R9
    10 - R10
    11 - R11
    12 - R12
    13 - R13
    14 - R14
    15 - R15
*/
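
/* Illustrative sketch only (not used by the compiler): registers 8-15 are
   selected with the REX.B/R/X extension bits defined below. Assuming the
   standard x86-64 encoding rules, "add rax, r8" is encoded as

       4c 01 c0    (REX.W|REX.R, ADD_rm_r, ModRM: mod=11, reg=r8, rm=rax)

   where REX.W selects the 64-bit operand size and REX.R widens the ModRM
   reg field from 3 to 4 bits so that r8 becomes reachable. */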

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)

/* Last register + 1. */
#define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)

static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 3] = {
	0, 0, 2, 1, 0, 0, 0, 0, 7, 6, 3, 4, 5
};

#define CHECK_EXTRA_REGS(p, w, do) \
	if (p >= SLJIT_R3 && p <= SLJIT_R6) { \
		w = SLJIT_LOCALS_OFFSET + ((p) - (SLJIT_R3 + 4)) * sizeof(sljit_sw); \
		p = SLJIT_MEM1(SLJIT_SP); \
		do; \
	}
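
/* Descriptive note (assumed background, for illustration): on x86-32 only
   SLJIT_R0-SLJIT_R2 map to real machine registers (see the zero entries in
   reg_map above), so SLJIT_R3-SLJIT_R6 live in the stack frame. The macro
   above rewrites such a "register" operand into the equivalent
   SLJIT_MEM1(SLJIT_SP) access, e.g. in a hypothetical call:

       CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);

   after which dst/dstw denote a memory operand instead of a register. */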

#else /* SLJIT_CONFIG_X86_32 */

/* Last register + 1. */
#define TMP_REG1	(SLJIT_NUMBER_OF_REGISTERS + 2)
#define TMP_REG2	(SLJIT_NUMBER_OF_REGISTERS + 3)
#define TMP_REG3	(SLJIT_NUMBER_OF_REGISTERS + 4)
/* Note: r12 & 0x7 == 0b100, which is decoded as "SIB byte present".
   Note: avoid using r12 and r13 for memory addressing;
   therefore r12 is better suited for SAVED_EREG than SAVED_REG. */
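/* Illustration of the note above (assuming standard x86-64 encoding rules):
   because reg_lmap[r12] == 0b100, "mov rax, [r12]" needs an extra SIB byte,
   49 8b 04 24; and because reg_lmap[r13] == 0b101 means RIP-relative when
   mod == 0, "mov rax, [r13]" needs a zero disp8 instead: 49 8b 45 00. */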
#ifndef _WIN64
/* 1st passed in rdi, 2nd argument passed in rsi, 3rd in rdx. */
static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
	0, 0, 6, 1, 8, 11, 10, 12, 5, 13, 14, 15, 3, 4, 2, 7, 9
};
/* low-map. reg_map & 0x7. */
static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
	0, 0, 6, 1, 0, 3,  2,  4,  5,  5,  6,  7, 3, 4, 2, 7, 1
};
#else
/* 1st passed in rcx, 2nd argument passed in rdx, 3rd in r8. */
static const sljit_u8 reg_map[SLJIT_NUMBER_OF_REGISTERS + 5] = {
	0, 0, 2, 1, 11, 12, 5, 13, 14, 15, 7, 6, 3, 4, 10, 8, 9
};
/* low-map. reg_map & 0x7. */
static const sljit_u8 reg_lmap[SLJIT_NUMBER_OF_REGISTERS + 5] = {
	0, 0, 2, 1, 3,  4,  5,  5, 6,  7,  7, 6, 3, 4, 2,  0, 1
};
#endif

#define REX_W		0x48
#define REX_R		0x44
#define REX_X		0x42
#define REX_B		0x41
#define REX		0x40

#ifndef _WIN64
#define HALFWORD_MAX 0x7fffffffl
#define HALFWORD_MIN -0x80000000l
#else
#define HALFWORD_MAX 0x7fffffffll
#define HALFWORD_MIN -0x80000000ll
#endif

#define IS_HALFWORD(x)		((x) <= HALFWORD_MAX && (x) >= HALFWORD_MIN)
#define NOT_HALFWORD(x)		((x) > HALFWORD_MAX || (x) < HALFWORD_MIN)

#define CHECK_EXTRA_REGS(p, w, do)

#endif /* SLJIT_CONFIG_X86_32 */

#define TMP_FREG	(0)

/* Size flags for emit_x86_instruction: */
#define EX86_BIN_INS		0x0010
#define EX86_SHIFT_INS		0x0020
#define EX86_REX		0x0040
#define EX86_NO_REXW		0x0080
#define EX86_BYTE_ARG		0x0100
#define EX86_HALF_ARG		0x0200
#define EX86_PREF_66		0x0400
#define EX86_PREF_F2		0x0800
#define EX86_PREF_F3		0x1000
#define EX86_SSE2_OP1		0x2000
#define EX86_SSE2_OP2		0x4000
#define EX86_SSE2		(EX86_SSE2_OP1 | EX86_SSE2_OP2)

/* --------------------------------------------------------------------- */
/*  Instruction forms                                                    */
/* --------------------------------------------------------------------- */

#define ADD		(/* BINARY */ 0 << 3)
#define ADD_EAX_i32	0x05
#define ADD_r_rm	0x03
#define ADD_rm_r	0x01
#define ADDSD_x_xm	0x58
#define ADC		(/* BINARY */ 2 << 3)
#define ADC_EAX_i32	0x15
#define ADC_r_rm	0x13
#define ADC_rm_r	0x11
#define AND		(/* BINARY */ 4 << 3)
#define AND_EAX_i32	0x25
#define AND_r_rm	0x23
#define AND_rm_r	0x21
#define ANDPD_x_xm	0x54
#define BSR_r_rm	(/* GROUP_0F */ 0xbd)
#define CALL_i32	0xe8
#define CALL_rm		(/* GROUP_FF */ 2 << 3)
#define CDQ		0x99
#define CMOVNE_r_rm	(/* GROUP_0F */ 0x45)
#define CMP		(/* BINARY */ 7 << 3)
#define CMP_EAX_i32	0x3d
#define CMP_r_rm	0x3b
#define CMP_rm_r	0x39
#define CVTPD2PS_x_xm	0x5a
#define CVTSI2SD_x_rm	0x2a
#define CVTTSD2SI_r_xm	0x2c
#define DIV		(/* GROUP_F7 */ 6 << 3)
#define DIVSD_x_xm	0x5e
#define INT3		0xcc
#define IDIV		(/* GROUP_F7 */ 7 << 3)
#define IMUL		(/* GROUP_F7 */ 5 << 3)
#define IMUL_r_rm	(/* GROUP_0F */ 0xaf)
#define IMUL_r_rm_i8	0x6b
#define IMUL_r_rm_i32	0x69
#define JE_i8		0x74
#define JNE_i8		0x75
#define JMP_i8		0xeb
#define JMP_i32		0xe9
#define JMP_rm		(/* GROUP_FF */ 4 << 3)
#define LEA_r_m		0x8d
#define MOV_r_rm	0x8b
#define MOV_r_i32	0xb8
#define MOV_rm_r	0x89
#define MOV_rm_i32	0xc7
#define MOV_rm8_i8	0xc6
#define MOV_rm8_r8	0x88
#define MOVSD_x_xm	0x10
#define MOVSD_xm_x	0x11
#define MOVSXD_r_rm	0x63
#define MOVSX_r_rm8	(/* GROUP_0F */ 0xbe)
#define MOVSX_r_rm16	(/* GROUP_0F */ 0xbf)
#define MOVZX_r_rm8	(/* GROUP_0F */ 0xb6)
#define MOVZX_r_rm16	(/* GROUP_0F */ 0xb7)
#define MUL		(/* GROUP_F7 */ 4 << 3)
#define MULSD_x_xm	0x59
#define NEG_rm		(/* GROUP_F7 */ 3 << 3)
#define NOP		0x90
#define NOT_rm		(/* GROUP_F7 */ 2 << 3)
#define OR		(/* BINARY */ 1 << 3)
#define OR_r_rm		0x0b
#define OR_EAX_i32	0x0d
#define OR_rm_r		0x09
#define OR_rm8_r8	0x08
#define POP_r		0x58
#define POP_rm		0x8f
#define POPF		0x9d
#define PUSH_i32	0x68
#define PUSH_r		0x50
#define PUSH_rm		(/* GROUP_FF */ 6 << 3)
#define PUSHF		0x9c
#define RET_near	0xc3
#define RET_i16		0xc2
#define SBB		(/* BINARY */ 3 << 3)
#define SBB_EAX_i32	0x1d
#define SBB_r_rm	0x1b
#define SBB_rm_r	0x19
#define SAR		(/* SHIFT */ 7 << 3)
#define SHL		(/* SHIFT */ 4 << 3)
#define SHR		(/* SHIFT */ 5 << 3)
#define SUB		(/* BINARY */ 5 << 3)
#define SUB_EAX_i32	0x2d
#define SUB_r_rm	0x2b
#define SUB_rm_r	0x29
#define SUBSD_x_xm	0x5c
#define TEST_EAX_i32	0xa9
#define TEST_rm_r	0x85
#define UCOMISD_x_xm	0x2e
#define UNPCKLPD_x_xm	0x14
#define XCHG_EAX_r	0x90
#define XCHG_r_rm	0x87
#define XOR		(/* BINARY */ 6 << 3)
#define XOR_EAX_i32	0x35
#define XOR_r_rm	0x33
#define XOR_rm_r	0x31
#define XORPD_x_xm	0x57

#define GROUP_0F	0x0f
#define GROUP_F7	0xf7
#define GROUP_FF	0xff
#define GROUP_BINARY_81	0x81
#define GROUP_BINARY_83	0x83
#define GROUP_SHIFT_1	0xd1
#define GROUP_SHIFT_N	0xc1
#define GROUP_SHIFT_CL	0xd3

#define MOD_REG		0xc0
#define MOD_DISP8	0x40

#define INC_SIZE(s)			(*inst++ = (s), compiler->size += (s))

#define PUSH_REG(r)			(*inst++ = (PUSH_r + (r)))
#define POP_REG(r)			(*inst++ = (POP_r + (r)))
#define RET()				(*inst++ = (RET_near))
#define RET_I16(n)			(*inst++ = (RET_i16), *inst++ = n, *inst++ = 0)
/* r32, r/m32 */
#define MOV_RM(mod, reg, rm)		(*inst++ = (MOV_r_rm), *inst++ = (mod) << 6 | (reg) << 3 | (rm))

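/* A small usage sketch for the macro above (illustration only): with
   mod == 3 (register-direct), MOV_RM(0x3, 0, 1) emits the bytes 8b c1,
   i.e. "mov eax, ecx"; the ModRM byte packs mod into bits 7-6, the
   destination register into bits 5-3 and the source r/m into bits 2-0. */
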
/* Multithreading is not a problem for these static variables, since they only
   cache built-in CPU features. Even if several threads detect the features at
   the same time, they all store the same values, so concurrent overwrites are
   harmless. */
#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
static sljit_s32 cpu_has_sse2 = -1;
#endif
static sljit_s32 cpu_has_cmov = -1;

#ifdef _WIN32_WCE
#include <cmnintrin.h>
#elif defined(_MSC_VER) && _MSC_VER >= 1400
#include <intrin.h>
#endif

/******************************************************/
/*    Unaligned-store functions                       */
/******************************************************/

static SLJIT_INLINE void sljit_unaligned_store_s16(void *addr, sljit_s16 value)
{
	SLJIT_MEMCPY(addr, &value, sizeof(value));
}

static SLJIT_INLINE void sljit_unaligned_store_s32(void *addr, sljit_s32 value)
{
	SLJIT_MEMCPY(addr, &value, sizeof(value));
}

static SLJIT_INLINE void sljit_unaligned_store_sw(void *addr, sljit_sw value)
{
	SLJIT_MEMCPY(addr, &value, sizeof(value));
}
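
/* Descriptive note on the memcpy above: immediates and displacements inside
   the emitted code stream are rarely aligned, and dereferencing a misaligned
   pointer cast would be undefined behavior. A memcpy of a known constant size
   compiles down to a single unaligned store on x86. A typical (hypothetical)
   use while patching a jump:

       sljit_unaligned_store_s32(code_ptr, (sljit_s32)relative_offset);
*/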

/******************************************************/
/*    Utility functions                               */
/******************************************************/

static void get_cpu_features(void)
{
	sljit_u32 features;

#if defined(_MSC_VER) && _MSC_VER >= 1400

	int CPUInfo[4];
	__cpuid(CPUInfo, 1);
	features = (sljit_u32)CPUInfo[3];

#elif defined(__GNUC__) || defined(__INTEL_COMPILER) || defined(__SUNPRO_C)

	/* AT&T syntax. */
	__asm__ (
		"movl $0x1, %%eax\n"
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		/* On x86-32, there is no red zone, so this
		   should work (no need for a local variable). */
		"push %%ebx\n"
#endif
		"cpuid\n"
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		"pop %%ebx\n"
#endif
		"movl %%edx, %0\n"
		: "=g" (features)
		:
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		: "%eax", "%ecx", "%edx"
#else
		: "%rax", "%rbx", "%rcx", "%rdx"
#endif
	);

#else /* _MSC_VER && _MSC_VER >= 1400 */

	/* Intel syntax. */
	__asm {
		mov eax, 1
		cpuid
		mov features, edx
	}

#endif /* _MSC_VER && _MSC_VER >= 1400 */

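	/* Descriptive note: CPUID leaf 1 returns the feature bits in EDX;
	   by the standard meaning of those bits, bit 26 indicates SSE2 and
	   bit 15 indicates CMOV. */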
#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
	cpu_has_sse2 = (features >> 26) & 0x1;
#endif
	cpu_has_cmov = (features >> 15) & 0x1;
}

static sljit_u8 get_jump_code(sljit_s32 type)
{
	switch (type) {
	case SLJIT_EQUAL:
	case SLJIT_EQUAL_F64:
		return 0x84 /* je */;

	case SLJIT_NOT_EQUAL:
	case SLJIT_NOT_EQUAL_F64:
		return 0x85 /* jne */;

	case SLJIT_LESS:
	case SLJIT_LESS_F64:
		return 0x82 /* jc */;

	case SLJIT_GREATER_EQUAL:
	case SLJIT_GREATER_EQUAL_F64:
		return 0x83 /* jae */;

	case SLJIT_GREATER:
	case SLJIT_GREATER_F64:
		return 0x87 /* jnbe */;

	case SLJIT_LESS_EQUAL:
	case SLJIT_LESS_EQUAL_F64:
		return 0x86 /* jbe */;

	case SLJIT_SIG_LESS:
		return 0x8c /* jl */;

	case SLJIT_SIG_GREATER_EQUAL:
		return 0x8d /* jnl */;

	case SLJIT_SIG_GREATER:
		return 0x8f /* jnle */;

	case SLJIT_SIG_LESS_EQUAL:
		return 0x8e /* jle */;

	case SLJIT_OVERFLOW:
	case SLJIT_MUL_OVERFLOW:
		return 0x80 /* jo */;

	case SLJIT_NOT_OVERFLOW:
	case SLJIT_MUL_NOT_OVERFLOW:
		return 0x81 /* jno */;

	case SLJIT_UNORDERED_F64:
		return 0x8a /* jp */;

	case SLJIT_ORDERED_F64:
		return 0x8b /* jpo */;
	}
	return 0;
}
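
/* Descriptive note (standard x86 opcode layout): the values returned above
   are the second byte of the two-byte 0f 8x near form; the short (rel8) form
   of the same condition is that value minus 0x10, e.g. 0x84 ("je rel32"
   after 0f) becomes 0x74 ("je rel8"). generate_near_jump_code below relies
   on exactly this relationship. */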

static sljit_u8* generate_far_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_s32 type);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
static sljit_u8* generate_fixed_jump(sljit_u8 *code_ptr, sljit_sw addr, sljit_s32 type);
#endif

static sljit_u8* generate_near_jump_code(struct sljit_jump *jump, sljit_u8 *code_ptr, sljit_u8 *code, sljit_s32 type)
{
	sljit_s32 short_jump;
	sljit_uw label_addr;

	if (jump->flags & JUMP_LABEL)
		label_addr = (sljit_uw)(code + jump->u.label->size);
	else
		label_addr = jump->u.target;
	short_jump = (sljit_sw)(label_addr - (jump->addr + 2)) >= -128 && (sljit_sw)(label_addr - (jump->addr + 2)) <= 127;
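	/* Descriptive note: jump->addr + 2 is the end of the 2-byte short form
	   (opcode + imm8), so the subtraction above computes the rel8
	   displacement that a short jump would need. */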

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if ((sljit_sw)(label_addr - (jump->addr + 1)) > HALFWORD_MAX || (sljit_sw)(label_addr - (jump->addr + 1)) < HALFWORD_MIN)
		return generate_far_jump_code(jump, code_ptr, type);
#endif

	if (type == SLJIT_JUMP) {
		if (short_jump)
			*code_ptr++ = JMP_i8;
		else
			*code_ptr++ = JMP_i32;
		jump->addr++;
	}
	else if (type >= SLJIT_FAST_CALL) {
		short_jump = 0;
		*code_ptr++ = CALL_i32;
		jump->addr++;
	}
	else if (short_jump) {
		*code_ptr++ = get_jump_code(type) - 0x10;
		jump->addr++;
	}
	else {
		*code_ptr++ = GROUP_0F;
		*code_ptr++ = get_jump_code(type);
		jump->addr += 2;
	}

	if (short_jump) {
		jump->flags |= PATCH_MB;
		code_ptr += sizeof(sljit_s8);
	} else {
		jump->flags |= PATCH_MW;
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		code_ptr += sizeof(sljit_sw);
#else
		code_ptr += sizeof(sljit_s32);
#endif
	}

	return code_ptr;
}

SLJIT_API_FUNC_ATTRIBUTE void* sljit_generate_code(struct sljit_compiler *compiler)
{
	struct sljit_memory_fragment *buf;
	sljit_u8 *code;
	sljit_u8 *code_ptr;
	sljit_u8 *buf_ptr;
	sljit_u8 *buf_end;
	sljit_u8 len;

	struct sljit_label *label;
	struct sljit_jump *jump;
	struct sljit_const *const_;

	CHECK_ERROR_PTR();
	CHECK_PTR(check_sljit_generate_code(compiler));
	reverse_buf(compiler);

	/* Second code generation pass. */
	code = (sljit_u8*)SLJIT_MALLOC_EXEC(compiler->size);
	PTR_FAIL_WITH_EXEC_IF(code);
	buf = compiler->buf;

	code_ptr = code;
	label = compiler->labels;
	jump = compiler->jumps;
	const_ = compiler->consts;
	do {
		buf_ptr = buf->memory;
		buf_end = buf_ptr + buf->used_size;
		do {
			len = *buf_ptr++;
			if (len > 0) {
				/* The code is already generated. */
				SLJIT_MEMCPY(code_ptr, buf_ptr, len);
				code_ptr += len;
				buf_ptr += len;
			}
			else {
				if (*buf_ptr >= 4) {
					jump->addr = (sljit_uw)code_ptr;
					if (!(jump->flags & SLJIT_REWRITABLE_JUMP))
						code_ptr = generate_near_jump_code(jump, code_ptr, code, *buf_ptr - 4);
					else
						code_ptr = generate_far_jump_code(jump, code_ptr, *buf_ptr - 4);
					jump = jump->next;
				}
				else if (*buf_ptr == 0) {
					label->addr = (sljit_uw)code_ptr;
					label->size = code_ptr - code;
					label = label->next;
				}
				else if (*buf_ptr == 1) {
					const_->addr = ((sljit_uw)code_ptr) - sizeof(sljit_sw);
					const_ = const_->next;
				}
				else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
					*code_ptr++ = (*buf_ptr == 2) ? CALL_i32 : JMP_i32;
					buf_ptr++;
					sljit_unaligned_store_sw(code_ptr, *(sljit_sw*)buf_ptr - ((sljit_sw)code_ptr + sizeof(sljit_sw)));
					code_ptr += sizeof(sljit_sw);
					buf_ptr += sizeof(sljit_sw) - 1;
#else
					code_ptr = generate_fixed_jump(code_ptr, *(sljit_sw*)(buf_ptr + 1), *buf_ptr);
					buf_ptr += sizeof(sljit_sw);
#endif
				}
				buf_ptr++;
			}
		} while (buf_ptr < buf_end);
		SLJIT_ASSERT(buf_ptr == buf_end);
		buf = buf->next;
	} while (buf);

	SLJIT_ASSERT(!label);
	SLJIT_ASSERT(!jump);
	SLJIT_ASSERT(!const_);

	jump = compiler->jumps;
	while (jump) {
		if (jump->flags & PATCH_MB) {
			SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_s8))) >= -128 && (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_s8))) <= 127);
			*(sljit_u8*)jump->addr = (sljit_u8)(jump->u.label->addr - (jump->addr + sizeof(sljit_s8)));
		} else if (jump->flags & PATCH_MW) {
			if (jump->flags & JUMP_LABEL) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_sw))));
#else
				SLJIT_ASSERT((sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.label->addr - (jump->addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)(jump->u.label->addr - (jump->addr + sizeof(sljit_s32))));
#endif
			}
			else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
				sljit_unaligned_store_sw((void*)jump->addr, (sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_sw))));
#else
				SLJIT_ASSERT((sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_s32))) >= HALFWORD_MIN && (sljit_sw)(jump->u.target - (jump->addr + sizeof(sljit_s32))) <= HALFWORD_MAX);
				sljit_unaligned_store_s32((void*)jump->addr, (sljit_s32)(jump->u.target - (jump->addr + sizeof(sljit_s32))));
#endif
			}
		}
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		else if (jump->flags & PATCH_MD)
			sljit_unaligned_store_sw((void*)jump->addr, jump->u.label->addr);
#endif

		jump = jump->next;
	}

	/* Some space may be wasted because short jumps need fewer bytes than the worst-case size estimate. */
	SLJIT_ASSERT(code_ptr <= code + compiler->size);
	compiler->error = SLJIT_ERR_COMPILED;
	compiler->executable_size = code_ptr - code;
	return (void*)code;
}

/* --------------------------------------------------------------------- */
/*  Operators                                                            */
/* --------------------------------------------------------------------- */

static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
	sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src1, sljit_sw src1w,
	sljit_s32 src2, sljit_sw src2w);

static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
	sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src1, sljit_sw src1w,
	sljit_s32 src2, sljit_sw src2w);

static sljit_s32 emit_mov(struct sljit_compiler *compiler,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src, sljit_sw srcw);

static SLJIT_INLINE sljit_s32 emit_save_flags(struct sljit_compiler *compiler)
{
	sljit_u8 *inst;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	inst = (sljit_u8*)ensure_buf(compiler, 1 + 5);
	FAIL_IF(!inst);
	INC_SIZE(5);
#else
	inst = (sljit_u8*)ensure_buf(compiler, 1 + 6);
	FAIL_IF(!inst);
	INC_SIZE(6);
	*inst++ = REX_W;
#endif
	*inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp + sizeof(sljit_sw)] */
	*inst++ = 0x64;
	*inst++ = 0x24;
	*inst++ = (sljit_u8)sizeof(sljit_sw);
	*inst++ = PUSHF;
	compiler->flags_saved = 1;
	return SLJIT_SUCCESS;
}

static SLJIT_INLINE sljit_s32 emit_restore_flags(struct sljit_compiler *compiler, sljit_s32 keep_flags)
{
	sljit_u8 *inst;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	inst = (sljit_u8*)ensure_buf(compiler, 1 + 5);
	FAIL_IF(!inst);
	INC_SIZE(5);
	*inst++ = POPF;
#else
	inst = (sljit_u8*)ensure_buf(compiler, 1 + 6);
	FAIL_IF(!inst);
	INC_SIZE(6);
	*inst++ = POPF;
	*inst++ = REX_W;
#endif
	*inst++ = LEA_r_m; /* lea esp/rsp, [esp/rsp - sizeof(sljit_sw)] */
	*inst++ = 0x64;
	*inst++ = 0x24;
	*inst++ = (sljit_u8)(-(sljit_s8)sizeof(sljit_sw));
	compiler->flags_saved = keep_flags;
	return SLJIT_SUCCESS;
}

#ifdef _WIN32
#include <malloc.h>

static void SLJIT_CALL sljit_grow_stack(sljit_sw local_size)
{
	/* Workaround for calling the internal _chkstk() function on Windows.
	This function touches all 4K pages that belong to the requested stack
	space, whose size is passed in local_size. This is necessary on Windows,
	where the stack can only grow in 4K steps. If the stack is already large
	enough, the call just burns CPU cycles; but since that cannot be known
	in advance, it must always be made. I think this is a bad design in
	general, even if it has some reasons. */
	*(volatile sljit_s32*)alloca(local_size) = 0;
}

#endif

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
#include "sljitNativeX86_32.c"
#else
#include "sljitNativeX86_64.c"
#endif

static sljit_s32 emit_mov(struct sljit_compiler *compiler,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src, sljit_sw srcw)
{
	sljit_u8* inst;

	if (dst == SLJIT_UNUSED) {
		/* No destination: no need to set up flags. */
		if (src & SLJIT_MEM) {
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
			FAIL_IF(!inst);
			*inst = MOV_r_rm;
		}
		return SLJIT_SUCCESS;
	}
	if (FAST_IS_REG(src)) {
		inst = emit_x86_instruction(compiler, 1, src, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm_r;
		return SLJIT_SUCCESS;
	}
	if (src & SLJIT_IMM) {
		if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
			if (!compiler->mode32) {
				if (NOT_HALFWORD(srcw))
					return emit_load_imm64(compiler, dst, srcw);
			}
			else
				return emit_do_imm32(compiler, (reg_map[dst] >= 8) ? REX_B : 0, MOV_r_i32 + reg_lmap[dst], srcw);
#endif
		}
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		if (!compiler->mode32 && NOT_HALFWORD(srcw)) {
			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, srcw));
			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = MOV_rm_r;
			return SLJIT_SUCCESS;
		}
#endif
		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm_i32;
		return SLJIT_SUCCESS;
	}
	if (FAST_IS_REG(dst)) {
		inst = emit_x86_instruction(compiler, 1, dst, 0, src, srcw);
		FAIL_IF(!inst);
		*inst = MOV_r_rm;
		return SLJIT_SUCCESS;
	}

	/* Memory to memory move. Requires two instructions. */
	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src, srcw);
	FAIL_IF(!inst);
	*inst = MOV_r_rm;
	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
	FAIL_IF(!inst);
	*inst = MOV_rm_r;
	return SLJIT_SUCCESS;
}

#define EMIT_MOV(compiler, dst, dstw, src, srcw) \
	FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op0(struct sljit_compiler *compiler, sljit_s32 op)
{
	sljit_u8 *inst;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	sljit_s32 size;
#endif

	CHECK_ERROR();
	CHECK(check_sljit_emit_op0(compiler, op));

	switch (GET_OPCODE(op)) {
	case SLJIT_BREAKPOINT:
		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
		FAIL_IF(!inst);
		INC_SIZE(1);
		*inst = INT3;
		break;
	case SLJIT_NOP:
		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
		FAIL_IF(!inst);
		INC_SIZE(1);
		*inst = NOP;
		break;
	case SLJIT_LMUL_UW:
	case SLJIT_LMUL_SW:
	case SLJIT_DIVMOD_UW:
	case SLJIT_DIVMOD_SW:
	case SLJIT_DIV_UW:
	case SLJIT_DIV_SW:
		compiler->flags_saved = 0;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
#ifdef _WIN64
		SLJIT_COMPILE_ASSERT(
			reg_map[SLJIT_R0] == 0
			&& reg_map[SLJIT_R1] == 2
			&& reg_map[TMP_REG1] > 7,
			invalid_register_assignment_for_div_mul);
#else
		SLJIT_COMPILE_ASSERT(
			reg_map[SLJIT_R0] == 0
			&& reg_map[SLJIT_R1] < 7
			&& reg_map[TMP_REG1] == 2,
			invalid_register_assignment_for_div_mul);
#endif
		compiler->mode32 = op & SLJIT_I32_OP;
#endif
		SLJIT_COMPILE_ASSERT((SLJIT_DIVMOD_UW & 0x2) == 0 && SLJIT_DIV_UW - 0x2 == SLJIT_DIVMOD_UW, bad_div_opcode_assignments);

		op = GET_OPCODE(op);
		if ((op | 0x2) == SLJIT_DIV_UW) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
			inst = emit_x86_instruction(compiler, 1, SLJIT_R1, 0, SLJIT_R1, 0);
#else
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
#endif
			FAIL_IF(!inst);
			*inst = XOR_r_rm;
		}

		if ((op | 0x2) == SLJIT_DIV_SW) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32) || defined(_WIN64)
			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_R1, 0);
#endif

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
			FAIL_IF(!inst);
			INC_SIZE(1);
			*inst = CDQ;
#else
			if (compiler->mode32) {
				inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
				FAIL_IF(!inst);
				INC_SIZE(1);
				*inst = CDQ;
			} else {
				inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
				FAIL_IF(!inst);
				INC_SIZE(2);
				*inst++ = REX_W;
				*inst = CDQ;
			}
#endif
		}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		inst = (sljit_u8*)ensure_buf(compiler, 1 + 2);
		FAIL_IF(!inst);
		INC_SIZE(2);
		*inst++ = GROUP_F7;
		*inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_map[TMP_REG1] : reg_map[SLJIT_R1]);
#else
#ifdef _WIN64
		size = (!compiler->mode32 || op >= SLJIT_DIVMOD_UW) ? 3 : 2;
#else
		size = (!compiler->mode32) ? 3 : 2;
#endif
		inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
		FAIL_IF(!inst);
		INC_SIZE(size);
#ifdef _WIN64
		if (!compiler->mode32)
			*inst++ = REX_W | ((op >= SLJIT_DIVMOD_UW) ? REX_B : 0);
		else if (op >= SLJIT_DIVMOD_UW)
			*inst++ = REX_B;
		*inst++ = GROUP_F7;
		*inst = MOD_REG | ((op >= SLJIT_DIVMOD_UW) ? reg_lmap[TMP_REG1] : reg_lmap[SLJIT_R1]);
#else
		if (!compiler->mode32)
			*inst++ = REX_W;
		*inst++ = GROUP_F7;
		*inst = MOD_REG | reg_map[SLJIT_R1];
#endif
#endif
		switch (op) {
		case SLJIT_LMUL_UW:
			*inst |= MUL;
			break;
		case SLJIT_LMUL_SW:
			*inst |= IMUL;
			break;
		case SLJIT_DIVMOD_UW:
		case SLJIT_DIV_UW:
			*inst |= DIV;
			break;
		case SLJIT_DIVMOD_SW:
		case SLJIT_DIV_SW:
			*inst |= IDIV;
			break;
		}
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && !defined(_WIN64)
		if (op <= SLJIT_DIVMOD_SW)
			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
#else
		if (op >= SLJIT_DIV_UW)
			EMIT_MOV(compiler, SLJIT_R1, 0, TMP_REG1, 0);
#endif
		break;
	}

	return SLJIT_SUCCESS;
}

#define ENCODE_PREFIX(prefix) \
	do { \
		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1); \
		FAIL_IF(!inst); \
		INC_SIZE(1); \
		*inst = (prefix); \
	} while (0)

static sljit_s32 emit_mov_byte(struct sljit_compiler *compiler, sljit_s32 sign,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src, sljit_sw srcw)
{
	sljit_u8* inst;
	sljit_s32 dst_r;
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	sljit_s32 work_r;
#endif

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 0;
#endif

	if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
		return SLJIT_SUCCESS; /* Empty instruction. */

	if (src & SLJIT_IMM) {
		if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
			FAIL_IF(!inst);
			*inst = MOV_rm_i32;
			return SLJIT_SUCCESS;
#endif
		}
		inst = emit_x86_instruction(compiler, 1 | EX86_BYTE_ARG | EX86_NO_REXW, SLJIT_IMM, srcw, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm8_i8;
		return SLJIT_SUCCESS;
	}

	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

	if ((dst & SLJIT_MEM) && FAST_IS_REG(src)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		if (reg_map[src] >= 4) {
			SLJIT_ASSERT(dst_r == TMP_REG1);
			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
		} else
			dst_r = src;
#else
		dst_r = src;
#endif
	}
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	else if (FAST_IS_REG(src) && reg_map[src] >= 4) {
		/* src, dst are registers. */
		SLJIT_ASSERT(SLOW_IS_REG(dst));
		if (reg_map[dst] < 4) {
			if (dst != src)
				EMIT_MOV(compiler, dst, 0, src, 0);
			inst = emit_x86_instruction(compiler, 2, dst, 0, dst, 0);
			FAIL_IF(!inst);
			*inst++ = GROUP_0F;
			*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
		}
		else {
			if (dst != src)
				EMIT_MOV(compiler, dst, 0, src, 0);
			if (sign) {
				/* shl reg, 24 */
				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
				FAIL_IF(!inst);
				*inst |= SHL;
				/* sar reg, 24 */
				inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 24, dst, 0);
				FAIL_IF(!inst);
				*inst |= SAR;
			}
			else {
				inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 0xff, dst, 0);
				FAIL_IF(!inst);
				*(inst + 1) |= AND;
			}
		}
		return SLJIT_SUCCESS;
	}
#endif
	else {
		/* src is either a memory operand or (on x86-32) a register with reg_map[src] < 4. */
		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
		FAIL_IF(!inst);
		*inst++ = GROUP_0F;
		*inst = sign ? MOVSX_r_rm8 : MOVZX_r_rm8;
	}

	if (dst & SLJIT_MEM) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		if (dst_r == TMP_REG1) {
			/* Find an unused register whose reg_map value is < 4. */
			if ((dst & REG_MASK) == SLJIT_R0) {
				if ((dst & OFFS_REG_MASK) == TO_OFFS_REG(SLJIT_R1))
					work_r = SLJIT_R2;
				else
					work_r = SLJIT_R1;
			}
			else {
				if ((dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
					work_r = SLJIT_R0;
				else if ((dst & REG_MASK) == SLJIT_R1)
					work_r = SLJIT_R2;
				else
					work_r = SLJIT_R1;
			}

			if (work_r == SLJIT_R0) {
				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
			}
			else {
				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
				FAIL_IF(!inst);
				*inst = XCHG_r_rm;
			}

			inst = emit_x86_instruction(compiler, 1, work_r, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = MOV_rm8_r8;

			if (work_r == SLJIT_R0) {
				ENCODE_PREFIX(XCHG_EAX_r + reg_map[TMP_REG1]);
			}
			else {
				inst = emit_x86_instruction(compiler, 1, work_r, 0, dst_r, 0);
				FAIL_IF(!inst);
				*inst = XCHG_r_rm;
			}
		}
		else {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = MOV_rm8_r8;
		}
#else
		inst = emit_x86_instruction(compiler, 1 | EX86_REX | EX86_NO_REXW, dst_r, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm8_r8;
#endif
	}

	return SLJIT_SUCCESS;
}

static sljit_s32 emit_mov_half(struct sljit_compiler *compiler, sljit_s32 sign,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src, sljit_sw srcw)
{
	sljit_u8* inst;
	sljit_s32 dst_r;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 0;
#endif

	if (dst == SLJIT_UNUSED && !(src & SLJIT_MEM))
		return SLJIT_SUCCESS; /* Empty instruction. */

	if (src & SLJIT_IMM) {
		if (FAST_IS_REG(dst)) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			return emit_do_imm(compiler, MOV_r_i32 + reg_map[dst], srcw);
#else
			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, srcw, dst, 0);
			FAIL_IF(!inst);
			*inst = MOV_rm_i32;
			return SLJIT_SUCCESS;
#endif
		}
		inst = emit_x86_instruction(compiler, 1 | EX86_HALF_ARG | EX86_NO_REXW | EX86_PREF_66, SLJIT_IMM, srcw, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm_i32;
		return SLJIT_SUCCESS;
	}

	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

	if ((dst & SLJIT_MEM) && FAST_IS_REG(src))
		dst_r = src;
	else {
		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src, srcw);
		FAIL_IF(!inst);
		*inst++ = GROUP_0F;
		*inst = sign ? MOVSX_r_rm16 : MOVZX_r_rm16;
	}

	if (dst & SLJIT_MEM) {
		inst = emit_x86_instruction(compiler, 1 | EX86_NO_REXW | EX86_PREF_66, dst_r, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst = MOV_rm_r;
	}

	return SLJIT_SUCCESS;
}

static sljit_s32 emit_unary(struct sljit_compiler *compiler, sljit_u8 opcode,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src, sljit_sw srcw)
{
	sljit_u8* inst;

	if (dst == SLJIT_UNUSED) {
		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst++ = GROUP_F7;
		*inst |= opcode;
		return SLJIT_SUCCESS;
	}
	if (dst == src && dstw == srcw) {
		/* Same input and output */
		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst++ = GROUP_F7;
		*inst |= opcode;
		return SLJIT_SUCCESS;
	}
	if (FAST_IS_REG(dst)) {
		EMIT_MOV(compiler, dst, 0, src, srcw);
		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst++ = GROUP_F7;
		*inst |= opcode;
		return SLJIT_SUCCESS;
	}
	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
	FAIL_IF(!inst);
	*inst++ = GROUP_F7;
	*inst |= opcode;
	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
	return SLJIT_SUCCESS;
}

static sljit_s32 emit_not_with_flags(struct sljit_compiler *compiler,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src, sljit_sw srcw)
{
	sljit_u8* inst;

	if (dst == SLJIT_UNUSED) {
		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst++ = GROUP_F7;
		*inst |= NOT_rm;
		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst = OR_r_rm;
		return SLJIT_SUCCESS;
	}
	if (FAST_IS_REG(dst)) {
		EMIT_MOV(compiler, dst, 0, src, srcw);
		inst = emit_x86_instruction(compiler, 1, 0, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst++ = GROUP_F7;
		*inst |= NOT_rm;
		inst = emit_x86_instruction(compiler, 1, dst, 0, dst, 0);
		FAIL_IF(!inst);
		*inst = OR_r_rm;
		return SLJIT_SUCCESS;
	}
	EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
	inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
	FAIL_IF(!inst);
	*inst++ = GROUP_F7;
	*inst |= NOT_rm;
	inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, TMP_REG1, 0);
	FAIL_IF(!inst);
	*inst = OR_r_rm;
	EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
	return SLJIT_SUCCESS;
}

static sljit_s32 emit_clz(struct sljit_compiler *compiler, sljit_s32 op_flags,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src, sljit_sw srcw)
{
	sljit_u8* inst;
	sljit_s32 dst_r;

	SLJIT_UNUSED_ARG(op_flags);
	if (SLJIT_UNLIKELY(dst == SLJIT_UNUSED)) {
		/* Just set the zero flag. */
		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
		inst = emit_x86_instruction(compiler, 1, 0, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst++ = GROUP_F7;
		*inst |= NOT_rm;
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, 31, TMP_REG1, 0);
#else
		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 63 : 31, TMP_REG1, 0);
#endif
		FAIL_IF(!inst);
		*inst |= SHR;
		return SLJIT_SUCCESS;
	}

	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
		src = TMP_REG1;
		srcw = 0;
	}

	inst = emit_x86_instruction(compiler, 2, TMP_REG1, 0, src, srcw);
	FAIL_IF(!inst);
	*inst++ = GROUP_0F;
	*inst = BSR_r_rm;

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	if (FAST_IS_REG(dst))
		dst_r = dst;
	else {
		/* Find an unused temporary register. */
		if ((dst & REG_MASK) != SLJIT_R0 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R0))
			dst_r = SLJIT_R0;
		else if ((dst & REG_MASK) != SLJIT_R1 && (dst & OFFS_REG_MASK) != TO_OFFS_REG(SLJIT_R1))
			dst_r = SLJIT_R1;
		else
			dst_r = SLJIT_R2;
		EMIT_MOV(compiler, dst, dstw, dst_r, 0);
	}
	EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, 32 + 31);
#else
	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG2;
	compiler->mode32 = 0;
	EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 64 + 63 : 32 + 31);
	compiler->mode32 = op_flags & SLJIT_I32_OP;
#endif

	if (cpu_has_cmov == -1)
		get_cpu_features();

	if (cpu_has_cmov) {
		inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst++ = GROUP_0F;
		*inst = CMOVNE_r_rm;
	} else {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
		FAIL_IF(!inst);
		INC_SIZE(4);

		*inst++ = JE_i8;
		*inst++ = 2;
		*inst++ = MOV_r_rm;
		*inst++ = MOD_REG | (reg_map[dst_r] << 3) | reg_map[TMP_REG1];
#else
		inst = (sljit_u8*)ensure_buf(compiler, 1 + 5);
		FAIL_IF(!inst);
		INC_SIZE(5);

		*inst++ = JE_i8;
		*inst++ = 3;
		*inst++ = REX_W | (reg_map[dst_r] >= 8 ? REX_R : 0) | (reg_map[TMP_REG1] >= 8 ? REX_B : 0);
		*inst++ = MOV_r_rm;
		*inst++ = MOD_REG | (reg_lmap[dst_r] << 3) | reg_lmap[TMP_REG1];
#endif
	}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, 31, dst_r, 0);
#else
	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, !(op_flags & SLJIT_I32_OP) ? 63 : 31, dst_r, 0);
#endif
	FAIL_IF(!inst);
	*(inst + 1) |= XOR;
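	/* Descriptive note: for any bit index p in [0, 31], 31 ^ p == 31 - p, so
	   the XOR above turns the BSR result into the leading-zero count; the
	   32 + 31 (or 64 + 63) fallback loaded above makes a zero input yield
	   32 (or 64) after the XOR. */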

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	if (dst & SLJIT_MEM) {
		inst = emit_x86_instruction(compiler, 1, dst_r, 0, dst, dstw);
		FAIL_IF(!inst);
		*inst = XCHG_r_rm;
	}
#else
	if (dst & SLJIT_MEM)
		EMIT_MOV(compiler, dst, dstw, TMP_REG2, 0);
#endif
	return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op1(struct sljit_compiler *compiler, sljit_s32 op,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src, sljit_sw srcw)
{
	sljit_u8* inst;
	sljit_s32 update = 0;
	sljit_s32 op_flags = GET_ALL_FLAGS(op);
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	sljit_s32 dst_is_ereg = 0;
	sljit_s32 src_is_ereg = 0;
#else
#	define src_is_ereg 0
#endif

	CHECK_ERROR();
	CHECK(check_sljit_emit_op1(compiler, op, dst, dstw, src, srcw));
	ADJUST_LOCAL_OFFSET(dst, dstw);
	ADJUST_LOCAL_OFFSET(src, srcw);

	CHECK_EXTRA_REGS(dst, dstw, dst_is_ereg = 1);
	CHECK_EXTRA_REGS(src, srcw, src_is_ereg = 1);
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = op_flags & SLJIT_I32_OP;
#endif

	op = GET_OPCODE(op);
	if (op >= SLJIT_MOV && op <= SLJIT_MOVU_P) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		compiler->mode32 = 0;
#endif

		if (op_flags & SLJIT_I32_OP) {
			if (FAST_IS_REG(src) && src == dst) {
				if (!TYPE_CAST_NEEDED(op))
					return SLJIT_SUCCESS;
			}
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			if (op == SLJIT_MOV_S32 && (src & SLJIT_MEM))
				op = SLJIT_MOV_U32;
			if (op == SLJIT_MOVU_S32 && (src & SLJIT_MEM))
				op = SLJIT_MOVU_U32;
			if (op == SLJIT_MOV_U32 && (src & SLJIT_IMM))
				op = SLJIT_MOV_S32;
			if (op == SLJIT_MOVU_U32 && (src & SLJIT_IMM))
				op = SLJIT_MOVU_S32;
#endif
		}

		SLJIT_COMPILE_ASSERT(SLJIT_MOV + 8 == SLJIT_MOVU, movu_offset);
		if (op >= SLJIT_MOVU) {
			update = 1;
			op -= 8;
		}

		if (src & SLJIT_IMM) {
			switch (op) {
			case SLJIT_MOV_U8:
				srcw = (sljit_u8)srcw;
				break;
			case SLJIT_MOV_S8:
				srcw = (sljit_s8)srcw;
				break;
			case SLJIT_MOV_U16:
				srcw = (sljit_u16)srcw;
				break;
			case SLJIT_MOV_S16:
				srcw = (sljit_s16)srcw;
				break;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			case SLJIT_MOV_U32:
				srcw = (sljit_u32)srcw;
				break;
			case SLJIT_MOV_S32:
				srcw = (sljit_s32)srcw;
				break;
#endif
			}
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
			if (SLJIT_UNLIKELY(dst_is_ereg))
				return emit_mov(compiler, dst, dstw, src, srcw);
#endif
		}

		if (SLJIT_UNLIKELY(update) && (src & SLJIT_MEM) && !src_is_ereg && (src & REG_MASK) && (srcw != 0 || (src & OFFS_REG_MASK) != 0)) {
			inst = emit_x86_instruction(compiler, 1, src & REG_MASK, 0, src, srcw);
			FAIL_IF(!inst);
			*inst = LEA_r_m;
			src &= SLJIT_MEM | 0xf;
			srcw = 0;
		}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		if (SLJIT_UNLIKELY(dst_is_ereg) && (!(op == SLJIT_MOV || op == SLJIT_MOV_U32 || op == SLJIT_MOV_S32 || op == SLJIT_MOV_P) || (src & SLJIT_MEM))) {
			SLJIT_ASSERT(dst == SLJIT_MEM1(SLJIT_SP));
			dst = TMP_REG1;
		}
#endif

		switch (op) {
		case SLJIT_MOV:
		case SLJIT_MOV_P:
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		case SLJIT_MOV_U32:
		case SLJIT_MOV_S32:
#endif
			FAIL_IF(emit_mov(compiler, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_U8:
			FAIL_IF(emit_mov_byte(compiler, 0, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_S8:
			FAIL_IF(emit_mov_byte(compiler, 1, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_U16:
			FAIL_IF(emit_mov_half(compiler, 0, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_S16:
			FAIL_IF(emit_mov_half(compiler, 1, dst, dstw, src, srcw));
			break;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		case SLJIT_MOV_U32:
			FAIL_IF(emit_mov_int(compiler, 0, dst, dstw, src, srcw));
			break;
		case SLJIT_MOV_S32:
			FAIL_IF(emit_mov_int(compiler, 1, dst, dstw, src, srcw));
			break;
#endif
		}

#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		if (SLJIT_UNLIKELY(dst_is_ereg) && dst == TMP_REG1)
			return emit_mov(compiler, SLJIT_MEM1(SLJIT_SP), dstw, TMP_REG1, 0);
#endif

		if (SLJIT_UNLIKELY(update) && (dst & SLJIT_MEM) && (dst & REG_MASK) && (dstw != 0 || (dst & OFFS_REG_MASK) != 0)) {
			inst = emit_x86_instruction(compiler, 1, dst & REG_MASK, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = LEA_r_m;
		}
		return SLJIT_SUCCESS;
	}

	if (SLJIT_UNLIKELY(GET_FLAGS(op_flags)))
		compiler->flags_saved = 0;

	switch (op) {
	case SLJIT_NOT:
		if (SLJIT_UNLIKELY(op_flags & SLJIT_SET_E))
			return emit_not_with_flags(compiler, dst, dstw, src, srcw);
		return emit_unary(compiler, NOT_rm, dst, dstw, src, srcw);

	case SLJIT_NEG:
		if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
			FAIL_IF(emit_save_flags(compiler));
		return emit_unary(compiler, NEG_rm, dst, dstw, src, srcw);

	case SLJIT_CLZ:
		if (SLJIT_UNLIKELY(op_flags & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
			FAIL_IF(emit_save_flags(compiler));
		return emit_clz(compiler, op_flags, dst, dstw, src, srcw);
	}

	return SLJIT_SUCCESS;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
#	undef src_is_ereg
#endif
}

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)

#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
	if (IS_HALFWORD(immw) || compiler->mode32) { \
		inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
		FAIL_IF(!inst); \
		*(inst + 1) |= (op_imm); \
	} \
	else { \
		FAIL_IF(emit_load_imm64(compiler, TMP_REG2, immw)); \
		inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, arg, argw); \
		FAIL_IF(!inst); \
		*inst = (op_mr); \
	}

#define BINARY_EAX_IMM(op_eax_imm, immw) \
	FAIL_IF(emit_do_imm32(compiler, (!compiler->mode32) ? REX_W : 0, (op_eax_imm), immw))

#else

#define BINARY_IMM(op_imm, op_mr, immw, arg, argw) \
	inst = emit_x86_instruction(compiler, 1 | EX86_BIN_INS, SLJIT_IMM, immw, arg, argw); \
	FAIL_IF(!inst); \
	*(inst + 1) |= (op_imm);

#define BINARY_EAX_IMM(op_eax_imm, immw) \
	FAIL_IF(emit_do_imm(compiler, (op_eax_imm), immw))

#endif
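
/* Usage sketch for the macros above (illustration only): a call such as

       BINARY_IMM(OR, OR_rm_r, src2w, dst, dstw);

   lets emit_x86_instruction pick the GROUP_BINARY_83 (imm8) or
   GROUP_BINARY_81 (imm32) form and then ORs the /digit opcode extension
   into the ModRM byte; on x86-64, an immediate that does not fit in 32 bits
   is first loaded into TMP_REG2 and the register form (op_mr) is used. */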

static sljit_s32 emit_cum_binary(struct sljit_compiler *compiler,
	sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src1, sljit_sw src1w,
	sljit_s32 src2, sljit_sw src2w)
{
	sljit_u8* inst;

	if (dst == SLJIT_UNUSED) {
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		return SLJIT_SUCCESS;
	}

	if (dst == src1 && dstw == src1w) {
		if (src2 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
#else
			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
#endif
				BINARY_EAX_IMM(op_eax_imm, src2w);
			}
			else {
				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
			}
		}
		else if (FAST_IS_REG(dst)) {
			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		else if (FAST_IS_REG(src2)) {
			/* Special exception for sljit_emit_op_flags. */
			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		else {
			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		return SLJIT_SUCCESS;
	}

	/* Only for cumulative operations. */
	if (dst == src2 && dstw == src2w) {
		if (src1 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
#else
			if ((dst == SLJIT_R0) && (src1w > 127 || src1w < -128)) {
#endif
				BINARY_EAX_IMM(op_eax_imm, src1w);
			}
			else {
				BINARY_IMM(op_imm, op_mr, src1w, dst, dstw);
			}
		}
		else if (FAST_IS_REG(dst)) {
			inst = emit_x86_instruction(compiler, 1, dst, dstw, src1, src1w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		else if (FAST_IS_REG(src1)) {
			inst = emit_x86_instruction(compiler, 1, src1, src1w, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		else {
			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		return SLJIT_SUCCESS;
	}

	/* General version. */
	if (FAST_IS_REG(dst)) {
		EMIT_MOV(compiler, dst, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
	}
	else {
		/* This version requires fewer memory writes. */
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
	}

	return SLJIT_SUCCESS;
}

static sljit_s32 emit_non_cum_binary(struct sljit_compiler *compiler,
	sljit_u8 op_rm, sljit_u8 op_mr, sljit_u8 op_imm, sljit_u8 op_eax_imm,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src1, sljit_sw src1w,
	sljit_s32 src2, sljit_sw src2w)
{
	sljit_u8* inst;

	if (dst == SLJIT_UNUSED) {
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		return SLJIT_SUCCESS;
	}

	if (dst == src1 && dstw == src1w) {
		if (src2 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
#else
			if ((dst == SLJIT_R0) && (src2w > 127 || src2w < -128)) {
#endif
				BINARY_EAX_IMM(op_eax_imm, src2w);
			}
			else {
				BINARY_IMM(op_imm, op_mr, src2w, dst, dstw);
			}
		}
		else if (FAST_IS_REG(dst)) {
			inst = emit_x86_instruction(compiler, 1, dst, dstw, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		else if (FAST_IS_REG(src2)) {
			inst = emit_x86_instruction(compiler, 1, src2, src2w, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		else {
			EMIT_MOV(compiler, TMP_REG1, 0, src2, src2w);
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, dst, dstw);
			FAIL_IF(!inst);
			*inst = op_mr;
		}
		return SLJIT_SUCCESS;
	}

	/* General version. */
	if (FAST_IS_REG(dst) && dst != src2) {
		EMIT_MOV(compiler, dst, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, dst, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, dst, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
	}
	else {
		/* This version requires fewer memory writes. */
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(op_imm, op_mr, src2w, TMP_REG1, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = op_rm;
		}
		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
	}

	return SLJIT_SUCCESS;
}

static sljit_s32 emit_mul(struct sljit_compiler *compiler,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src1, sljit_sw src1w,
	sljit_s32 src2, sljit_sw src2w)
{
	sljit_u8* inst;
	sljit_s32 dst_r;

	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;

	/* Register destination. */
	if (dst_r == src1 && !(src2 & SLJIT_IMM)) {
		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
		FAIL_IF(!inst);
		*inst++ = GROUP_0F;
		*inst = IMUL_r_rm;
	}
	else if (dst_r == src2 && !(src1 & SLJIT_IMM)) {
		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src1, src1w);
		FAIL_IF(!inst);
		*inst++ = GROUP_0F;
		*inst = IMUL_r_rm;
	}
	else if (src1 & SLJIT_IMM) {
		if (src2 & SLJIT_IMM) {
			EMIT_MOV(compiler, dst_r, 0, SLJIT_IMM, src2w);
			src2 = dst_r;
			src2w = 0;
		}

		if (src1w <= 127 && src1w >= -128) {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = IMUL_r_rm_i8;
			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
			FAIL_IF(!inst);
			INC_SIZE(1);
			*inst = (sljit_s8)src1w;
		}
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		else {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = IMUL_r_rm_i32;
			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
			FAIL_IF(!inst);
			INC_SIZE(4);
			sljit_unaligned_store_sw(inst, src1w);
		}
#else
		else if (IS_HALFWORD(src1w)) {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = IMUL_r_rm_i32;
			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
			FAIL_IF(!inst);
			INC_SIZE(4);
			sljit_unaligned_store_s32(inst, (sljit_s32)src1w);
		}
		else {
			EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src1w);
			if (dst_r != src2)
				EMIT_MOV(compiler, dst_r, 0, src2, src2w);
			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
			FAIL_IF(!inst);
			*inst++ = GROUP_0F;
			*inst = IMUL_r_rm;
		}
#endif
	}
	else if (src2 & SLJIT_IMM) {
		/* Note: src1 is NOT immediate. */

		if (src2w <= 127 && src2w >= -128) {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
			FAIL_IF(!inst);
			*inst = IMUL_r_rm_i8;
			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1);
			FAIL_IF(!inst);
			INC_SIZE(1);
			*inst = (sljit_s8)src2w;
		}
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		else {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
			FAIL_IF(!inst);
			*inst = IMUL_r_rm_i32;
			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
			FAIL_IF(!inst);
			INC_SIZE(4);
			sljit_unaligned_store_sw(inst, src2w);
		}
#else
		else if (IS_HALFWORD(src2w)) {
			inst = emit_x86_instruction(compiler, 1, dst_r, 0, src1, src1w);
			FAIL_IF(!inst);
			*inst = IMUL_r_rm_i32;
			inst = (sljit_u8*)ensure_buf(compiler, 1 + 4);
			FAIL_IF(!inst);
			INC_SIZE(4);
			sljit_unaligned_store_s32(inst, (sljit_s32)src2w);
		}
		else {
			EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_IMM, src2w);
			if (dst_r != src1)
				EMIT_MOV(compiler, dst_r, 0, src1, src1w);
			inst = emit_x86_instruction(compiler, 2, dst_r, 0, TMP_REG2, 0);
			FAIL_IF(!inst);
			*inst++ = GROUP_0F;
			*inst = IMUL_r_rm;
		}
#endif
	}
	else {
		/* Neither argument is immediate. */
		if (ADDRESSING_DEPENDS_ON(src2, dst_r))
			dst_r = TMP_REG1;
		EMIT_MOV(compiler, dst_r, 0, src1, src1w);
		inst = emit_x86_instruction(compiler, 2, dst_r, 0, src2, src2w);
		FAIL_IF(!inst);
		*inst++ = GROUP_0F;
		*inst = IMUL_r_rm;
	}

	if (dst_r == TMP_REG1)
		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);

	return SLJIT_SUCCESS;
}

static sljit_s32 emit_lea_binary(struct sljit_compiler *compiler, sljit_s32 keep_flags,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src1, sljit_sw src1w,
	sljit_s32 src2, sljit_sw src2w)
{
	sljit_u8* inst;
	sljit_s32 dst_r, done = 0;

	/* These cases are better handled by the normal code path. */
	if (!keep_flags) {
		if (dst == src1 && dstw == src1w)
			return SLJIT_ERR_UNSUPPORTED;
		if (dst == src2 && dstw == src2w)
			return SLJIT_ERR_UNSUPPORTED;
	}
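	/* Descriptive note: LEA computes src1 + src2 without modifying EFLAGS
	   (e.g. "lea eax, [ebx + ecx]" instead of "add eax, ecx"), which is
	   what makes it usable when keep_flags is set. */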
1823
1824	dst_r = FAST_IS_REG(dst) ? dst : TMP_REG1;
1825
1826	if (FAST_IS_REG(src1)) {
1827		if (FAST_IS_REG(src2)) {
1828			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM2(src1, src2), 0);
1829			FAIL_IF(!inst);
1830			*inst = LEA_r_m;
1831			done = 1;
1832		}
1833#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1834		if ((src2 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src2w))) {
1835			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), (sljit_s32)src2w);
1836#else
1837		if (src2 & SLJIT_IMM) {
1838			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src1), src2w);
1839#endif
1840			FAIL_IF(!inst);
1841			*inst = LEA_r_m;
1842			done = 1;
1843		}
1844	}
1845	else if (FAST_IS_REG(src2)) {
1846#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
1847		if ((src1 & SLJIT_IMM) && (compiler->mode32 || IS_HALFWORD(src1w))) {
1848			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), (sljit_s32)src1w);
1849#else
1850		if (src1 & SLJIT_IMM) {
1851			inst = emit_x86_instruction(compiler, 1, dst_r, 0, SLJIT_MEM1(src2), src1w);
1852#endif
1853			FAIL_IF(!inst);
1854			*inst = LEA_r_m;
1855			done = 1;
1856		}
1857	}
1858
1859	if (done) {
1860		if (dst_r == TMP_REG1)
1861			return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
1862		return SLJIT_SUCCESS;
1863	}
1864	return SLJIT_ERR_UNSUPPORTED;
1865}
1866
1867static sljit_s32 emit_cmp_binary(struct sljit_compiler *compiler,
1868	sljit_s32 src1, sljit_sw src1w,
1869	sljit_s32 src2, sljit_sw src2w)
1870{
1871	sljit_u8* inst;
1872
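	/* CMP EAX, imm32 (opcode 3D) is one byte shorter than the generic
	   CMP r/m32, imm32 form (81 /7); it only pays off when the immediate
	   does not fit in a sign extended byte, since 83 /7 would be shorter
	   otherwise. */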
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
#else
	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
#endif
		BINARY_EAX_IMM(CMP_EAX_i32, src2w);
		return SLJIT_SUCCESS;
	}

	if (FAST_IS_REG(src1)) {
		if (src2 & SLJIT_IMM) {
			BINARY_IMM(CMP, CMP_rm_r, src2w, src1, 0);
		}
		else {
			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = CMP_r_rm;
		}
		return SLJIT_SUCCESS;
	}

	if (FAST_IS_REG(src2) && !(src1 & SLJIT_IMM)) {
		inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
		FAIL_IF(!inst);
		*inst = CMP_rm_r;
		return SLJIT_SUCCESS;
	}

	if (src2 & SLJIT_IMM) {
		if (src1 & SLJIT_IMM) {
			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
			src1 = TMP_REG1;
			src1w = 0;
		}
		BINARY_IMM(CMP, CMP_rm_r, src2w, src1, src1w);
	}
	else {
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
		FAIL_IF(!inst);
		*inst = CMP_r_rm;
	}
	return SLJIT_SUCCESS;
}

static sljit_s32 emit_test_binary(struct sljit_compiler *compiler,
	sljit_s32 src1, sljit_sw src1w,
	sljit_s32 src2, sljit_sw src2w)
{
	sljit_u8* inst;

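	/* TEST EAX, imm32 (opcode A9) is one byte shorter than the generic
	   TEST r/m32, imm32 form (F7 /0); TEST is commutative, so the short
	   form applies when either operand is R0 and the other is an
	   immediate. */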
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128) && (compiler->mode32 || IS_HALFWORD(src2w))) {
#else
	if (src1 == SLJIT_R0 && (src2 & SLJIT_IMM) && (src2w > 127 || src2w < -128)) {
#endif
		BINARY_EAX_IMM(TEST_EAX_i32, src2w);
		return SLJIT_SUCCESS;
	}

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128) && (compiler->mode32 || IS_HALFWORD(src1w))) {
#else
	if (src2 == SLJIT_R0 && (src1 & SLJIT_IMM) && (src1w > 127 || src1w < -128)) {
#endif
		BINARY_EAX_IMM(TEST_EAX_i32, src1w);
		return SLJIT_SUCCESS;
	}

	if (!(src1 & SLJIT_IMM)) {
		if (src2 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			if (IS_HALFWORD(src2w) || compiler->mode32) {
				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
				FAIL_IF(!inst);
				*inst = GROUP_F7;
			}
			else {
				FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
				inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src1, src1w);
				FAIL_IF(!inst);
				*inst = TEST_rm_r;
			}
#else
			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, src1, src1w);
			FAIL_IF(!inst);
			*inst = GROUP_F7;
#endif
			return SLJIT_SUCCESS;
		}
		else if (FAST_IS_REG(src1)) {
			inst = emit_x86_instruction(compiler, 1, src1, 0, src2, src2w);
			FAIL_IF(!inst);
			*inst = TEST_rm_r;
			return SLJIT_SUCCESS;
		}
	}

	if (!(src2 & SLJIT_IMM)) {
		if (src1 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
			if (IS_HALFWORD(src1w) || compiler->mode32) {
				inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w);
				FAIL_IF(!inst);
				*inst = GROUP_F7;
			}
			else {
				FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src1w));
				inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, src2, src2w);
				FAIL_IF(!inst);
				*inst = TEST_rm_r;
			}
#else
			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src1w, src2, src2w);
			FAIL_IF(!inst);
			*inst = GROUP_F7;
#endif
			return SLJIT_SUCCESS;
		}
		else if (FAST_IS_REG(src2)) {
			inst = emit_x86_instruction(compiler, 1, src2, 0, src1, src1w);
			FAIL_IF(!inst);
			*inst = TEST_rm_r;
			return SLJIT_SUCCESS;
		}
	}

	EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
	if (src2 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		if (IS_HALFWORD(src2w) || compiler->mode32) {
			inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
			FAIL_IF(!inst);
			*inst = GROUP_F7;
		}
		else {
			FAIL_IF(emit_load_imm64(compiler, TMP_REG2, src2w));
			inst = emit_x86_instruction(compiler, 1, TMP_REG2, 0, TMP_REG1, 0);
			FAIL_IF(!inst);
			*inst = TEST_rm_r;
		}
#else
		inst = emit_x86_instruction(compiler, 1, SLJIT_IMM, src2w, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst = GROUP_F7;
#endif
	}
	else {
		inst = emit_x86_instruction(compiler, 1, TMP_REG1, 0, src2, src2w);
		FAIL_IF(!inst);
		*inst = TEST_rm_r;
	}
	return SLJIT_SUCCESS;
}

static sljit_s32 emit_shift(struct sljit_compiler *compiler,
	sljit_u8 mode,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src1, sljit_sw src1w,
	sljit_s32 src2, sljit_sw src2w)
{
	sljit_u8* inst;

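	/* Variable shift counts must live in CL on x86, so the non-immediate
	   paths below either find the count in SLJIT_PREF_SHIFT_REG (ecx)
	   already or move it there, saving and restoring the previous ecx
	   value around the shift. */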
	if ((src2 & SLJIT_IMM) || (src2 == SLJIT_PREF_SHIFT_REG)) {
		if (dst == src1 && dstw == src1w) {
			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, dstw);
			FAIL_IF(!inst);
			*inst |= mode;
			return SLJIT_SUCCESS;
		}
		if (dst == SLJIT_UNUSED) {
			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
			FAIL_IF(!inst);
			*inst |= mode;
			return SLJIT_SUCCESS;
		}
		if (dst == SLJIT_PREF_SHIFT_REG && src2 == SLJIT_PREF_SHIFT_REG) {
			EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
			FAIL_IF(!inst);
			*inst |= mode;
			EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
			return SLJIT_SUCCESS;
		}
		if (FAST_IS_REG(dst)) {
			EMIT_MOV(compiler, dst, 0, src1, src1w);
			inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, dst, 0);
			FAIL_IF(!inst);
			*inst |= mode;
			return SLJIT_SUCCESS;
		}

		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, src2, src2w, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst |= mode;
		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
		return SLJIT_SUCCESS;
	}

	if (dst == SLJIT_PREF_SHIFT_REG) {
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst |= mode;
		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
	}
	else if (FAST_IS_REG(dst) && dst != src2 && !ADDRESSING_DEPENDS_ON(src2, dst)) {
		if (src1 != dst)
			EMIT_MOV(compiler, dst, 0, src1, src1w);
		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_PREF_SHIFT_REG, 0);
		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, dst, 0);
		FAIL_IF(!inst);
		*inst |= mode;
		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
	}
	else {
		/* This case is really difficult, since ecx itself may be used for
		   addressing, and we must ensure this works even in that case. */
		EMIT_MOV(compiler, TMP_REG1, 0, src1, src1w);
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		EMIT_MOV(compiler, TMP_REG2, 0, SLJIT_PREF_SHIFT_REG, 0);
#else
		/* [esp+0] contains the flags. */
		EMIT_MOV(compiler, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw), SLJIT_PREF_SHIFT_REG, 0);
#endif
		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, src2, src2w);
		inst = emit_x86_instruction(compiler, 1 | EX86_SHIFT_INS, SLJIT_PREF_SHIFT_REG, 0, TMP_REG1, 0);
		FAIL_IF(!inst);
		*inst |= mode;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, TMP_REG2, 0);
#else
		EMIT_MOV(compiler, SLJIT_PREF_SHIFT_REG, 0, SLJIT_MEM1(SLJIT_SP), sizeof(sljit_sw));
#endif
		EMIT_MOV(compiler, dst, dstw, TMP_REG1, 0);
	}

	return SLJIT_SUCCESS;
}

static sljit_s32 emit_shift_with_flags(struct sljit_compiler *compiler,
	sljit_u8 mode, sljit_s32 set_flags,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src1, sljit_sw src1w,
	sljit_s32 src2, sljit_sw src2w)
{
	/* The CPU does not set the flags if the shift count is 0; the hardware
	   masks the count to 5 bits (6 bits for 64 bit shifts), so only the
	   masked value matters. */
	if (src2 & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		if ((src2w & 0x3f) != 0 || (compiler->mode32 && (src2w & 0x1f) != 0))
			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
#else
		if ((src2w & 0x1f) != 0)
			return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);
#endif
		if (!set_flags)
			return emit_mov(compiler, dst, dstw, src1, src1w);
		/* OR dst, src, 0 */
		return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
			dst, dstw, src1, src1w, SLJIT_IMM, 0);
	}

	if (!set_flags)
		return emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w);

	if (!FAST_IS_REG(dst))
		FAIL_IF(emit_cmp_binary(compiler, src1, src1w, SLJIT_IMM, 0));

	FAIL_IF(emit_shift(compiler, mode, dst, dstw, src1, src1w, src2, src2w));

	if (FAST_IS_REG(dst))
		return emit_cmp_binary(compiler, dst, dstw, SLJIT_IMM, 0);
	return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op2(struct sljit_compiler *compiler, sljit_s32 op,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src1, sljit_sw src1w,
	sljit_s32 src2, sljit_sw src2w)
{
	CHECK_ERROR();
	CHECK(check_sljit_emit_op2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
	ADJUST_LOCAL_OFFSET(dst, dstw);
	ADJUST_LOCAL_OFFSET(src1, src1w);
	ADJUST_LOCAL_OFFSET(src2, src2w);

	CHECK_EXTRA_REGS(dst, dstw, (void)0);
	CHECK_EXTRA_REGS(src1, src1w, (void)0);
	CHECK_EXTRA_REGS(src2, src2w, (void)0);
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = op & SLJIT_I32_OP;
#endif

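	/* SLJIT_KEEP_FLAGS asks the emitted code not to clobber the status
	   flags; when the chosen instruction sequence cannot guarantee that,
	   the flags are spilled with emit_save_flags and restored later. */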
	if (GET_OPCODE(op) >= SLJIT_MUL) {
		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
			compiler->flags_saved = 0;
		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
			FAIL_IF(emit_save_flags(compiler));
	}

	switch (GET_OPCODE(op)) {
	case SLJIT_ADD:
		if (!GET_FLAGS(op)) {
			if (emit_lea_binary(compiler, op & SLJIT_KEEP_FLAGS, dst, dstw, src1, src1w, src2, src2w) != SLJIT_ERR_UNSUPPORTED)
				return compiler->error;
		}
		else
			compiler->flags_saved = 0;
		if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
			FAIL_IF(emit_save_flags(compiler));
		return emit_cum_binary(compiler, ADD_r_rm, ADD_rm_r, ADD, ADD_EAX_i32,
			dst, dstw, src1, src1w, src2, src2w);
	case SLJIT_ADDC:
		if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
			FAIL_IF(emit_restore_flags(compiler, 1));
		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS))
			FAIL_IF(emit_save_flags(compiler));
		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
			compiler->flags_saved = 0;
		return emit_cum_binary(compiler, ADC_r_rm, ADC_rm_r, ADC, ADC_EAX_i32,
			dst, dstw, src1, src1w, src2, src2w);
	case SLJIT_SUB:
		if (!GET_FLAGS(op)) {
			if ((src2 & SLJIT_IMM) && emit_lea_binary(compiler, op & SLJIT_KEEP_FLAGS, dst, dstw, src1, src1w, SLJIT_IMM, -src2w) != SLJIT_ERR_UNSUPPORTED)
				return compiler->error;
		}
		else
			compiler->flags_saved = 0;
		if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS) && !compiler->flags_saved)
			FAIL_IF(emit_save_flags(compiler));
		if (dst == SLJIT_UNUSED)
			return emit_cmp_binary(compiler, src1, src1w, src2, src2w);
		return emit_non_cum_binary(compiler, SUB_r_rm, SUB_rm_r, SUB, SUB_EAX_i32,
			dst, dstw, src1, src1w, src2, src2w);
	case SLJIT_SUBC:
		if (SLJIT_UNLIKELY(compiler->flags_saved)) /* C flag must be restored. */
			FAIL_IF(emit_restore_flags(compiler, 1));
		else if (SLJIT_UNLIKELY(op & SLJIT_KEEP_FLAGS))
			FAIL_IF(emit_save_flags(compiler));
		if (SLJIT_UNLIKELY(GET_FLAGS(op)))
			compiler->flags_saved = 0;
		return emit_non_cum_binary(compiler, SBB_r_rm, SBB_rm_r, SBB, SBB_EAX_i32,
			dst, dstw, src1, src1w, src2, src2w);
	case SLJIT_MUL:
		return emit_mul(compiler, dst, dstw, src1, src1w, src2, src2w);
	case SLJIT_AND:
		if (dst == SLJIT_UNUSED)
			return emit_test_binary(compiler, src1, src1w, src2, src2w);
		return emit_cum_binary(compiler, AND_r_rm, AND_rm_r, AND, AND_EAX_i32,
			dst, dstw, src1, src1w, src2, src2w);
	case SLJIT_OR:
		return emit_cum_binary(compiler, OR_r_rm, OR_rm_r, OR, OR_EAX_i32,
			dst, dstw, src1, src1w, src2, src2w);
	case SLJIT_XOR:
		return emit_cum_binary(compiler, XOR_r_rm, XOR_rm_r, XOR, XOR_EAX_i32,
			dst, dstw, src1, src1w, src2, src2w);
	case SLJIT_SHL:
		return emit_shift_with_flags(compiler, SHL, GET_FLAGS(op),
			dst, dstw, src1, src1w, src2, src2w);
	case SLJIT_LSHR:
		return emit_shift_with_flags(compiler, SHR, GET_FLAGS(op),
			dst, dstw, src1, src1w, src2, src2w);
	case SLJIT_ASHR:
		return emit_shift_with_flags(compiler, SAR, GET_FLAGS(op),
			dst, dstw, src1, src1w, src2, src2w);
	}

	return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 reg)
{
	CHECK_REG_INDEX(check_sljit_get_register_index(reg));
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	if (reg >= SLJIT_R3 && reg <= SLJIT_R6)
		return -1;
#endif
	return reg_map[reg];
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_float_register_index(sljit_s32 reg)
{
	CHECK_REG_INDEX(check_sljit_get_float_register_index(reg));
	return reg;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler,
	void *instruction, sljit_s32 size)
{
	sljit_u8 *inst;

	CHECK_ERROR();
	CHECK(check_sljit_emit_op_custom(compiler, instruction, size));

	inst = (sljit_u8*)ensure_buf(compiler, 1 + size);
	FAIL_IF(!inst);
	INC_SIZE(size);
	SLJIT_MEMCPY(inst, instruction, size);
	return SLJIT_SUCCESS;
}

/* --------------------------------------------------------------------- */
/*  Floating point operators                                             */
/* --------------------------------------------------------------------- */

/* Up to 12 bytes of alignment padding + four 16 byte constants. */
static sljit_s32 sse2_data[3 + (4 + 4) * 2];
static sljit_s32 *sse2_buffer;

static void init_compiler(void)
{
	sse2_buffer = (sljit_s32*)(((sljit_uw)sse2_data + 15) & ~0xf);
	/* Single precision constants: sign bit mask (for NEG) and
	   everything-but-sign mask (for ABS). */
	sse2_buffer[0] = 0x80000000;
	sse2_buffer[4] = 0x7fffffff;
	/* Double precision constants: the same masks; the sign bit lives in
	   the upper 32 bits of the 64 bit value. */
	sse2_buffer[8] = 0;
	sse2_buffer[9] = 0x80000000;
	sse2_buffer[12] = 0xffffffff;
	sse2_buffer[13] = 0x7fffffff;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_is_fpu_available(void)
{
#ifdef SLJIT_IS_FPU_AVAILABLE
	return SLJIT_IS_FPU_AVAILABLE;
#elif (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
	if (cpu_has_sse2 == -1)
		get_cpu_features();
	return cpu_has_sse2;
#else /* SLJIT_DETECT_SSE2 */
	return 1;
#endif /* SLJIT_DETECT_SSE2 */
}

static sljit_s32 emit_sse2(struct sljit_compiler *compiler, sljit_u8 opcode,
	sljit_s32 single, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w)
{
	sljit_u8 *inst;

	inst = emit_x86_instruction(compiler, 2 | (single ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
	FAIL_IF(!inst);
	*inst++ = GROUP_0F;
	*inst = opcode;
	return SLJIT_SUCCESS;
}

static sljit_s32 emit_sse2_logic(struct sljit_compiler *compiler, sljit_u8 opcode,
	sljit_s32 pref66, sljit_s32 xmm1, sljit_s32 xmm2, sljit_sw xmm2w)
{
	sljit_u8 *inst;

	inst = emit_x86_instruction(compiler, 2 | (pref66 ? EX86_PREF_66 : 0) | EX86_SSE2, xmm1, 0, xmm2, xmm2w);
	FAIL_IF(!inst);
	*inst++ = GROUP_0F;
	*inst = opcode;
	return SLJIT_SUCCESS;
}

static SLJIT_INLINE sljit_s32 emit_sse2_load(struct sljit_compiler *compiler,
	sljit_s32 single, sljit_s32 dst, sljit_s32 src, sljit_sw srcw)
{
	return emit_sse2(compiler, MOVSD_x_xm, single, dst, src, srcw);
}

static SLJIT_INLINE sljit_s32 emit_sse2_store(struct sljit_compiler *compiler,
	sljit_s32 single, sljit_s32 dst, sljit_sw dstw, sljit_s32 src)
{
	return emit_sse2(compiler, MOVSD_xm_x, single, src, dst, dstw);
}

static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_sw_from_f64(struct sljit_compiler *compiler, sljit_s32 op,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src, sljit_sw srcw)
{
	sljit_s32 dst_r = SLOW_IS_REG(dst) ? dst : TMP_REG1;
	sljit_u8 *inst;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if (GET_OPCODE(op) == SLJIT_CONV_SW_FROM_F64)
		compiler->mode32 = 0;
#endif

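	/* CVTTSD2SI (F2 0F 2C) / CVTTSS2SI (F3 0F 2C) convert with truncation
	   toward zero; REX.W selects the 64 bit destination. */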
	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_F32_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP2, dst_r, 0, src, srcw);
	FAIL_IF(!inst);
	*inst++ = GROUP_0F;
	*inst = CVTTSD2SI_r_xm;

	if (dst_r == TMP_REG1 && dst != SLJIT_UNUSED)
		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
	return SLJIT_SUCCESS;
}

static SLJIT_INLINE sljit_s32 sljit_emit_fop1_conv_f64_from_sw(struct sljit_compiler *compiler, sljit_s32 op,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src, sljit_sw srcw)
{
	sljit_s32 dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
	sljit_u8 *inst;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_SW)
		compiler->mode32 = 0;
#endif

	if (src & SLJIT_IMM) {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_S32)
			srcw = (sljit_s32)srcw;
#endif
		EMIT_MOV(compiler, TMP_REG1, 0, src, srcw);
		src = TMP_REG1;
		srcw = 0;
	}

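	/* CVTSI2SD (F2 0F 2A) / CVTSI2SS (F3 0F 2A) convert a 32 or 64 bit
	   integer (selected by REX.W) from a register or memory operand. */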
	inst = emit_x86_instruction(compiler, 2 | ((op & SLJIT_F32_OP) ? EX86_PREF_F3 : EX86_PREF_F2) | EX86_SSE2_OP1, dst_r, 0, src, srcw);
	FAIL_IF(!inst);
	*inst++ = GROUP_0F;
	*inst = CVTSI2SD_x_rm;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 1;
#endif
	if (dst_r == TMP_FREG)
		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
	return SLJIT_SUCCESS;
}

static SLJIT_INLINE sljit_s32 sljit_emit_fop1_cmp(struct sljit_compiler *compiler, sljit_s32 op,
	sljit_s32 src1, sljit_sw src1w,
	sljit_s32 src2, sljit_sw src2w)
{
	compiler->flags_saved = 0;
	if (!FAST_IS_REG(src1)) {
		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
		src1 = TMP_FREG;
	}
	return emit_sse2_logic(compiler, UCOMISD_x_xm, !(op & SLJIT_F32_OP), src1, src2, src2w);
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop1(struct sljit_compiler *compiler, sljit_s32 op,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src, sljit_sw srcw)
{
	sljit_s32 dst_r;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 1;
#endif

	CHECK_ERROR();
	SELECT_FOP1_OPERATION_WITH_CHECKS(compiler, op, dst, dstw, src, srcw);

	if (GET_OPCODE(op) == SLJIT_MOV_F64) {
		if (FAST_IS_REG(dst))
			return emit_sse2_load(compiler, op & SLJIT_F32_OP, dst, src, srcw);
		if (FAST_IS_REG(src))
			return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, src);
		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src, srcw));
		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
	}

	if (GET_OPCODE(op) == SLJIT_CONV_F64_FROM_F32) {
		dst_r = FAST_IS_REG(dst) ? dst : TMP_FREG;
		if (FAST_IS_REG(src)) {
			/* We overwrite the high bits of the source register. From the
			   SLJIT point of view this is not an issue.
			   Note: In SSE3, MOVDDUP and MOVSLDUP could also be used. */
			FAIL_IF(emit_sse2_logic(compiler, UNPCKLPD_x_xm, op & SLJIT_F32_OP, src, src, 0));
		}
		else {
			FAIL_IF(emit_sse2_load(compiler, !(op & SLJIT_F32_OP), TMP_FREG, src, srcw));
			src = TMP_FREG;
		}

		FAIL_IF(emit_sse2_logic(compiler, CVTPD2PS_x_xm, op & SLJIT_F32_OP, dst_r, src, 0));
		if (dst_r == TMP_FREG)
			return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
		return SLJIT_SUCCESS;
	}

	if (SLOW_IS_REG(dst)) {
		dst_r = dst;
		if (dst != src)
			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw));
	}
	else {
		dst_r = TMP_FREG;
		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src, srcw));
	}

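	/* NEG flips the sign bit with XORPD and ABS clears it with ANDPD,
	   using the 16 byte masks prepared in init_compiler: sse2_buffer and
	   sse2_buffer + 4 hold the single precision masks, sse2_buffer + 8
	   and sse2_buffer + 12 the double precision ones. */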
	switch (GET_OPCODE(op)) {
	case SLJIT_NEG_F64:
		FAIL_IF(emit_sse2_logic(compiler, XORPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_F32_OP ? sse2_buffer : sse2_buffer + 8)));
		break;

	case SLJIT_ABS_F64:
		FAIL_IF(emit_sse2_logic(compiler, ANDPD_x_xm, 1, dst_r, SLJIT_MEM0(), (sljit_sw)(op & SLJIT_F32_OP ? sse2_buffer + 4 : sse2_buffer + 12)));
		break;
	}

	if (dst_r == TMP_FREG)
		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
	return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_fop2(struct sljit_compiler *compiler, sljit_s32 op,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src1, sljit_sw src1w,
	sljit_s32 src2, sljit_sw src2w)
{
	sljit_s32 dst_r;

	CHECK_ERROR();
	CHECK(check_sljit_emit_fop2(compiler, op, dst, dstw, src1, src1w, src2, src2w));
	ADJUST_LOCAL_OFFSET(dst, dstw);
	ADJUST_LOCAL_OFFSET(src1, src1w);
	ADJUST_LOCAL_OFFSET(src2, src2w);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 1;
#endif

	if (FAST_IS_REG(dst)) {
		dst_r = dst;
		if (dst == src1)
			; /* Do nothing here. */
		else if (dst == src2 && (op == SLJIT_ADD_F64 || op == SLJIT_MUL_F64)) {
			/* Swap arguments. */
			src2 = src1;
			src2w = src1w;
		}
		else if (dst != src2)
			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, dst_r, src1, src1w));
		else {
			dst_r = TMP_FREG;
			FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
		}
	}
	else {
		dst_r = TMP_FREG;
		FAIL_IF(emit_sse2_load(compiler, op & SLJIT_F32_OP, TMP_FREG, src1, src1w));
	}

	switch (GET_OPCODE(op)) {
	case SLJIT_ADD_F64:
		FAIL_IF(emit_sse2(compiler, ADDSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
		break;

	case SLJIT_SUB_F64:
		FAIL_IF(emit_sse2(compiler, SUBSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
		break;

	case SLJIT_MUL_F64:
		FAIL_IF(emit_sse2(compiler, MULSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
		break;

	case SLJIT_DIV_F64:
		FAIL_IF(emit_sse2(compiler, DIVSD_x_xm, op & SLJIT_F32_OP, dst_r, src2, src2w));
		break;
	}

	if (dst_r == TMP_FREG)
		return emit_sse2_store(compiler, op & SLJIT_F32_OP, dst, dstw, TMP_FREG);
	return SLJIT_SUCCESS;
}

/* --------------------------------------------------------------------- */
/*  Conditional instructions                                             */
/* --------------------------------------------------------------------- */

SLJIT_API_FUNC_ATTRIBUTE struct sljit_label* sljit_emit_label(struct sljit_compiler *compiler)
{
	sljit_u8 *inst;
	struct sljit_label *label;

	CHECK_ERROR_PTR();
	CHECK_PTR(check_sljit_emit_label(compiler));

	/* We should restore the flags before the label,
	   since jumps arriving here carry their own flag state. */
	if (SLJIT_UNLIKELY(compiler->flags_saved))
		PTR_FAIL_IF(emit_restore_flags(compiler, 0));

	if (compiler->last_label && compiler->last_label->size == compiler->size)
		return compiler->last_label;

	label = (struct sljit_label*)ensure_abuf(compiler, sizeof(struct sljit_label));
	PTR_FAIL_IF(!label);
	set_label(label, compiler);

	inst = (sljit_u8*)ensure_buf(compiler, 2);
	PTR_FAIL_IF(!inst);

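	/* A zero byte in the instruction buffer marks a non-instruction
	   record; the second byte selects the record type (0 for labels,
	   1 for constants, jump type + 4 for jumps), resolved later during
	   code generation. */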
	*inst++ = 0;
	*inst++ = 0;

	return label;
}

SLJIT_API_FUNC_ATTRIBUTE struct sljit_jump* sljit_emit_jump(struct sljit_compiler *compiler, sljit_s32 type)
{
	sljit_u8 *inst;
	struct sljit_jump *jump;

	CHECK_ERROR_PTR();
	CHECK_PTR(check_sljit_emit_jump(compiler, type));

	if (SLJIT_UNLIKELY(compiler->flags_saved)) {
		if ((type & 0xff) <= SLJIT_JUMP)
			PTR_FAIL_IF(emit_restore_flags(compiler, 0));
		compiler->flags_saved = 0;
	}

	jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
	PTR_FAIL_IF_NULL(jump);
	set_jump(jump, compiler, type & SLJIT_REWRITABLE_JUMP);
	type &= 0xff;

	if (type >= SLJIT_CALL1)
		PTR_FAIL_IF(call_with_args(compiler, type));

	/* Worst case size: jmp/jcc rel32 is 5 or 6 bytes on x86-32; on x86-64
	   it is a mov r64, imm64 (10 bytes) plus an indirect jump (3 bytes),
	   preceded by a 2 byte short jump when the jump is conditional. */
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	compiler->size += (type >= SLJIT_JUMP) ? 5 : 6;
#else
	compiler->size += (type >= SLJIT_JUMP) ? (10 + 3) : (2 + 10 + 3);
#endif

	inst = (sljit_u8*)ensure_buf(compiler, 2);
	PTR_FAIL_IF_NULL(inst);

	*inst++ = 0;
	*inst++ = type + 4;
	return jump;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_ijump(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 src, sljit_sw srcw)
{
	sljit_u8 *inst;
	struct sljit_jump *jump;

	CHECK_ERROR();
	CHECK(check_sljit_emit_ijump(compiler, type, src, srcw));
	ADJUST_LOCAL_OFFSET(src, srcw);

	CHECK_EXTRA_REGS(src, srcw, (void)0);

	if (SLJIT_UNLIKELY(compiler->flags_saved)) {
		if (type <= SLJIT_JUMP)
			FAIL_IF(emit_restore_flags(compiler, 0));
		compiler->flags_saved = 0;
	}

	if (type >= SLJIT_CALL1) {
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
#if (defined SLJIT_X86_32_FASTCALL && SLJIT_X86_32_FASTCALL)
		if (src == SLJIT_R2) {
			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
			src = TMP_REG1;
		}
		if (src == SLJIT_MEM1(SLJIT_SP) && type >= SLJIT_CALL3)
			srcw += sizeof(sljit_sw);
#endif
#endif
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64) && defined(_WIN64)
		if (src == SLJIT_R2) {
			EMIT_MOV(compiler, TMP_REG1, 0, src, 0);
			src = TMP_REG1;
		}
#endif
		FAIL_IF(call_with_args(compiler, type));
	}

	if (src == SLJIT_IMM) {
		jump = (struct sljit_jump*)ensure_abuf(compiler, sizeof(struct sljit_jump));
		FAIL_IF_NULL(jump);
		set_jump(jump, compiler, JUMP_ADDR);
		jump->u.target = srcw;

		/* Worst case size. */
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
		compiler->size += 5;
#else
		compiler->size += 10 + 3;
#endif

		inst = (sljit_u8*)ensure_buf(compiler, 2);
		FAIL_IF_NULL(inst);

		*inst++ = 0;
		*inst++ = type + 4;
	}
	else {
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
		/* REX_W is not necessary (src is not immediate). */
		compiler->mode32 = 1;
#endif
		inst = emit_x86_instruction(compiler, 1, 0, 0, src, srcw);
		FAIL_IF(!inst);
		*inst++ = GROUP_FF;
		*inst |= (type >= SLJIT_FAST_CALL) ? CALL_rm : JMP_rm;
	}
	return SLJIT_SUCCESS;
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_flags(struct sljit_compiler *compiler, sljit_s32 op,
	sljit_s32 dst, sljit_sw dstw,
	sljit_s32 src, sljit_sw srcw,
	sljit_s32 type)
{
	sljit_u8 *inst;
	sljit_u8 cond_set = 0;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	sljit_s32 reg;
#else
	/* CHECK_EXTRA_REGS might overwrite these values. */
	sljit_s32 dst_save = dst;
	sljit_sw dstw_save = dstw;
#endif

	CHECK_ERROR();
	CHECK(check_sljit_emit_op_flags(compiler, op, dst, dstw, src, srcw, type));
	SLJIT_UNUSED_ARG(srcw);

	if (dst == SLJIT_UNUSED)
		return SLJIT_SUCCESS;

	ADJUST_LOCAL_OFFSET(dst, dstw);
	CHECK_EXTRA_REGS(dst, dstw, (void)0);
	if (SLJIT_UNLIKELY(compiler->flags_saved))
		FAIL_IF(emit_restore_flags(compiler, op & SLJIT_KEEP_FLAGS));

	type &= 0xff;
	/* setcc = jcc + 0x10. */
	cond_set = get_jump_code(type) + 0x10;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src) {
		inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 3);
		FAIL_IF(!inst);
		INC_SIZE(4 + 3);
		/* Set low register to conditional flag. */
		*inst++ = (reg_map[TMP_REG1] <= 7) ? REX : REX_B;
		*inst++ = GROUP_0F;
		*inst++ = cond_set;
		*inst++ = MOD_REG | reg_lmap[TMP_REG1];
		*inst++ = REX | (reg_map[TMP_REG1] <= 7 ? 0 : REX_R) | (reg_map[dst] <= 7 ? 0 : REX_B);
		*inst++ = OR_rm8_r8;
		*inst++ = MOD_REG | (reg_lmap[TMP_REG1] << 3) | reg_lmap[dst];
		return SLJIT_SUCCESS;
	}

	reg = (op == SLJIT_MOV && FAST_IS_REG(dst)) ? dst : TMP_REG1;

	inst = (sljit_u8*)ensure_buf(compiler, 1 + 4 + 4);
	FAIL_IF(!inst);
	INC_SIZE(4 + 4);
	/* Set low register to conditional flag. */
	*inst++ = (reg_map[reg] <= 7) ? REX : REX_B;
	*inst++ = GROUP_0F;
	*inst++ = cond_set;
	*inst++ = MOD_REG | reg_lmap[reg];
	*inst++ = REX_W | (reg_map[reg] <= 7 ? 0 : (REX_B | REX_R));
	*inst++ = GROUP_0F;
	*inst++ = MOVZX_r_rm8;
	*inst = MOD_REG | (reg_lmap[reg] << 3) | reg_lmap[reg];

	if (reg != TMP_REG1)
		return SLJIT_SUCCESS;

	if (GET_OPCODE(op) < SLJIT_ADD) {
		compiler->mode32 = GET_OPCODE(op) != SLJIT_MOV;
		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);
	}
#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
	compiler->skip_checks = 1;
#endif
	return sljit_emit_op2(compiler, op, dst, dstw, dst, dstw, TMP_REG1, 0);
#else /* SLJIT_CONFIG_X86_64 */
	if (GET_OPCODE(op) < SLJIT_ADD && FAST_IS_REG(dst)) {
		if (reg_map[dst] <= 4) {
			/* Low byte is accessible. */
			inst = (sljit_u8*)ensure_buf(compiler, 1 + 3 + 3);
			FAIL_IF(!inst);
			INC_SIZE(3 + 3);
			/* Set low byte to conditional flag. */
			*inst++ = GROUP_0F;
			*inst++ = cond_set;
			*inst++ = MOD_REG | reg_map[dst];

			*inst++ = GROUP_0F;
			*inst++ = MOVZX_r_rm8;
			*inst = MOD_REG | (reg_map[dst] << 3) | reg_map[dst];
			return SLJIT_SUCCESS;
		}

		/* Low byte is not accessible. */
		if (cpu_has_cmov == -1)
			get_cpu_features();

		if (cpu_has_cmov) {
			EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, 1);
			/* An xor reg, reg operation would overwrite the flags. */
			EMIT_MOV(compiler, dst, 0, SLJIT_IMM, 0);

			inst = (sljit_u8*)ensure_buf(compiler, 1 + 3);
			FAIL_IF(!inst);
			INC_SIZE(3);

			*inst++ = GROUP_0F;
			/* cmovcc = setcc - 0x50. */
			*inst++ = cond_set - 0x50;
			*inst++ = MOD_REG | (reg_map[dst] << 3) | reg_map[TMP_REG1];
			return SLJIT_SUCCESS;
		}

		inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
		FAIL_IF(!inst);
		INC_SIZE(1 + 3 + 3 + 1);
		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
		/* Set al to conditional flag. */
		*inst++ = GROUP_0F;
		*inst++ = cond_set;
		*inst++ = MOD_REG | 0 /* eax */;

		*inst++ = GROUP_0F;
		*inst++ = MOVZX_r_rm8;
		*inst++ = MOD_REG | (reg_map[dst] << 3) | 0 /* eax */;
		*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
		return SLJIT_SUCCESS;
	}

	if (GET_OPCODE(op) == SLJIT_OR && !GET_ALL_FLAGS(op) && FAST_IS_REG(dst) && dst == src && reg_map[dst] <= 4) {
		SLJIT_COMPILE_ASSERT(reg_map[SLJIT_R0] == 0, scratch_reg1_must_be_eax);
		if (dst != SLJIT_R0) {
			inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 2 + 1);
			FAIL_IF(!inst);
			INC_SIZE(1 + 3 + 2 + 1);
			/* Set low register to conditional flag. */
			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
			*inst++ = GROUP_0F;
			*inst++ = cond_set;
			*inst++ = MOD_REG | 0 /* eax */;
			*inst++ = OR_rm8_r8;
			*inst++ = MOD_REG | (0 /* eax */ << 3) | reg_map[dst];
			*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
		}
		else {
			inst = (sljit_u8*)ensure_buf(compiler, 1 + 2 + 3 + 2 + 2);
			FAIL_IF(!inst);
			INC_SIZE(2 + 3 + 2 + 2);
			/* Set low register to conditional flag. */
			*inst++ = XCHG_r_rm;
			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
			*inst++ = GROUP_0F;
			*inst++ = cond_set;
			*inst++ = MOD_REG | 1 /* ecx */;
			*inst++ = OR_rm8_r8;
			*inst++ = MOD_REG | (1 /* ecx */ << 3) | 0 /* eax */;
			*inst++ = XCHG_r_rm;
			*inst++ = MOD_REG | (1 /* ecx */ << 3) | reg_map[TMP_REG1];
		}
		return SLJIT_SUCCESS;
	}

	/* Set TMP_REG1 to the conditional bit (0 or 1). */
	inst = (sljit_u8*)ensure_buf(compiler, 1 + 1 + 3 + 3 + 1);
	FAIL_IF(!inst);
	INC_SIZE(1 + 3 + 3 + 1);
	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];
	/* Set al to conditional flag. */
	*inst++ = GROUP_0F;
	*inst++ = cond_set;
	*inst++ = MOD_REG | 0 /* eax */;

	*inst++ = GROUP_0F;
	*inst++ = MOVZX_r_rm8;
	*inst++ = MOD_REG | (0 << 3) /* eax */ | 0 /* eax */;

	*inst++ = XCHG_EAX_r + reg_map[TMP_REG1];

	if (GET_OPCODE(op) < SLJIT_ADD)
		return emit_mov(compiler, dst, dstw, TMP_REG1, 0);

#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE) \
		|| (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
	compiler->skip_checks = 1;
#endif
	return sljit_emit_op2(compiler, op, dst_save, dstw_save, dst_save, dstw_save, TMP_REG1, 0);
#endif /* SLJIT_CONFIG_X86_64 */
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_local_base(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw offset)
{
	CHECK_ERROR();
	CHECK(check_sljit_get_local_base(compiler, dst, dstw, offset));
	ADJUST_LOCAL_OFFSET(dst, dstw);

	CHECK_EXTRA_REGS(dst, dstw, (void)0);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 0;
#endif

	ADJUST_LOCAL_OFFSET(SLJIT_MEM1(SLJIT_SP), offset);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if (NOT_HALFWORD(offset)) {
		FAIL_IF(emit_load_imm64(compiler, TMP_REG1, offset));
#if (defined SLJIT_DEBUG && SLJIT_DEBUG)
		SLJIT_ASSERT(emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0) != SLJIT_ERR_UNSUPPORTED);
		return compiler->error;
#else
		return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, TMP_REG1, 0);
#endif
	}
#endif

	if (offset != 0)
		return emit_lea_binary(compiler, SLJIT_KEEP_FLAGS, dst, dstw, SLJIT_SP, 0, SLJIT_IMM, offset);
	return emit_mov(compiler, dst, dstw, SLJIT_SP, 0);
}

SLJIT_API_FUNC_ATTRIBUTE struct sljit_const* sljit_emit_const(struct sljit_compiler *compiler, sljit_s32 dst, sljit_sw dstw, sljit_sw init_value)
{
	sljit_u8 *inst;
	struct sljit_const *const_;
#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	sljit_s32 reg;
#endif

	CHECK_ERROR_PTR();
	CHECK_PTR(check_sljit_emit_const(compiler, dst, dstw, init_value));
	ADJUST_LOCAL_OFFSET(dst, dstw);

	CHECK_EXTRA_REGS(dst, dstw, (void)0);

	const_ = (struct sljit_const*)ensure_abuf(compiler, sizeof(struct sljit_const));
	PTR_FAIL_IF(!const_);
	set_const(const_, compiler);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = 0;
	reg = SLOW_IS_REG(dst) ? dst : TMP_REG1;

	if (emit_load_imm64(compiler, reg, init_value))
		return NULL;
#else
	if (dst == SLJIT_UNUSED)
		dst = TMP_REG1;

	if (emit_mov(compiler, dst, dstw, SLJIT_IMM, init_value))
		return NULL;
#endif

	inst = (sljit_u8*)ensure_buf(compiler, 2);
	PTR_FAIL_IF(!inst);

	*inst++ = 0;
	*inst++ = 1;

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	if (dst & SLJIT_MEM)
		if (emit_mov(compiler, dst, dstw, TMP_REG1, 0))
			return NULL;
#endif

	return const_;
}

SLJIT_API_FUNC_ATTRIBUTE void sljit_set_jump_addr(sljit_uw addr, sljit_uw new_addr)
{
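	/* On x86-32 the patched field holds a rel32 displacement measured from
	   the end of the 4 byte field; on x86-64 it is the absolute address
	   inside a mov r64, imm64, so the value is stored directly. */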
#if (defined SLJIT_CONFIG_X86_32 && SLJIT_CONFIG_X86_32)
	sljit_unaligned_store_sw((void*)addr, new_addr - (addr + 4));
#else
	sljit_unaligned_store_sw((void*)addr, (sljit_sw) new_addr);
#endif
}

SLJIT_API_FUNC_ATTRIBUTE void sljit_set_const(sljit_uw addr, sljit_sw new_constant)
{
	sljit_unaligned_store_sw((void*)addr, new_constant);
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_x86_is_sse2_available(void)
{
#if (defined SLJIT_DETECT_SSE2 && SLJIT_DETECT_SSE2)
	if (cpu_has_sse2 == -1)
		get_cpu_features();
	return cpu_has_sse2;
#else
	return 1;
#endif
}

SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_x86_is_cmov_available(void)
{
	if (cpu_has_cmov == -1)
		get_cpu_features();
	return cpu_has_cmov;
}

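/* A minimal usage sketch (illustrative only; the condition and the
   registers are the caller's choice):

     if (sljit_x86_is_cmov_available())
         sljit_x86_emit_cmov(compiler, SLJIT_EQUAL, SLJIT_R0, SLJIT_R1, 0);

   moves R1 into R0 when the EQUAL flag is set, without emitting a branch. */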
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_x86_emit_cmov(struct sljit_compiler *compiler,
	sljit_s32 type,
	sljit_s32 dst_reg,
	sljit_s32 src, sljit_sw srcw)
{
	sljit_u8* inst;

	CHECK_ERROR();
#if (defined SLJIT_ARGUMENT_CHECKS && SLJIT_ARGUMENT_CHECKS)
	CHECK_ARGUMENT(sljit_x86_is_cmov_available());
	CHECK_ARGUMENT(!(type & ~(0xff | SLJIT_I32_OP)));
	CHECK_ARGUMENT((type & 0xff) >= SLJIT_EQUAL && (type & 0xff) <= SLJIT_ORDERED_F64);
	CHECK_ARGUMENT(FUNCTION_CHECK_IS_REG(dst_reg & ~SLJIT_I32_OP));
	FUNCTION_CHECK_SRC(src, srcw);
#endif
#if (defined SLJIT_VERBOSE && SLJIT_VERBOSE)
	if (SLJIT_UNLIKELY(!!compiler->verbose)) {
		fprintf(compiler->verbose, "  x86_cmov%s %s%s, ",
			!(dst_reg & SLJIT_I32_OP) ? "" : ".i",
			jump_names[type & 0xff], JUMP_POSTFIX(type));
		sljit_verbose_reg(compiler, dst_reg & ~SLJIT_I32_OP);
		fprintf(compiler->verbose, ", ");
		sljit_verbose_param(compiler, src, srcw);
		fprintf(compiler->verbose, "\n");
	}
#endif

	ADJUST_LOCAL_OFFSET(src, srcw);
	CHECK_EXTRA_REGS(src, srcw, (void)0);

#if (defined SLJIT_CONFIG_X86_64 && SLJIT_CONFIG_X86_64)
	compiler->mode32 = dst_reg & SLJIT_I32_OP;
#endif
	dst_reg &= ~SLJIT_I32_OP;

	if (SLJIT_UNLIKELY(src & SLJIT_IMM)) {
		EMIT_MOV(compiler, TMP_REG1, 0, SLJIT_IMM, srcw);
		src = TMP_REG1;
		srcw = 0;
	}

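	/* CMOVcc is encoded as 0F 40+cc while Jcc rel32 is 0F 80+cc, hence
	   the -0x40 adjustment of the jump code below. */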
	inst = emit_x86_instruction(compiler, 2, dst_reg, 0, src, srcw);
	FAIL_IF(!inst);
	*inst++ = GROUP_0F;
	*inst = get_jump_code(type & 0xff) - 0x40;
	return SLJIT_SUCCESS;
}
