LLVMReactor.cpp revision b98fe5cd1eaa821083d816cf86a20eefe22f57c7
1// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//    http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include "Nucleus.hpp"
16
17#include "llvm/Support/IRBuilder.h"
18#include "llvm/Function.h"
19#include "llvm/GlobalVariable.h"
20#include "llvm/Module.h"
21#include "llvm/LLVMContext.h"
22#include "llvm/Constants.h"
23#include "llvm/Intrinsics.h"
24#include "llvm/PassManager.h"
25#include "llvm/Analysis/LoopPass.h"
26#include "llvm/Transforms/Scalar.h"
27#include "llvm/Target/TargetData.h"
28#include "llvm/Target/TargetOptions.h"
29#include "llvm/Support/TargetSelect.h"
30#include "../lib/ExecutionEngine/JIT/JIT.h"
31
32#include "LLVMRoutine.hpp"
33#include "LLVMRoutineManager.hpp"
34#include "x86.hpp"
35#include "CPUID.hpp"
36#include "Thread.hpp"
37#include "Memory.hpp"
38#include "MutexLock.hpp"
39
40#include <xmmintrin.h>
41#include <fstream>
42
#if defined(__x86_64__) && defined(_WIN32)
// Stub with C linkage, compiled only when both __x86_64__ and _WIN32 are defined.
// It is not implemented: reaching it trips the assertion below.
extern "C" void X86CompilationCallback()
{
	assert(false);   // UNIMPLEMENTED
}
#endif
49
// Optional profiler hooks with C linkage. All three start out null; on _WIN32
// the Nucleus constructor fills them in from CAJitNtfyLib.dll when it loads.
extern "C"
{
	bool (*CodeAnalystInitialize)() = 0;
	void (*CodeAnalystCompleteJITLog)() = 0;
	bool (*CodeAnalystLogJITCode)(const void *jitCodeStartAddr, unsigned int jitCodeSize, const wchar_t *functionName) = 0;
}
56
// Declaration of an LLVM flag defined outside this file; the Nucleus
// constructor sets it to false.
namespace llvm
{
	extern bool JITEmitDebugInfo;
}
61
// File-local code generation state shared by all Nucleus member functions.
namespace
{
	sw::LLVMRoutineManager *routineManager = nullptr;   // Created per Nucleus instance
	llvm::ExecutionEngine *executionEngine = nullptr;   // Created per Nucleus instance, deleted in ~Nucleus
	llvm::IRBuilder<> *builder = nullptr;               // Created once, on first Nucleus construction
	llvm::LLVMContext *context = nullptr;               // Created once, on first Nucleus construction
	llvm::Module *module = nullptr;                     // Created per Nucleus instance
	llvm::Function *function = nullptr;                 // Set by Nucleus::createFunction()

	// Locked in the Nucleus constructor and unlocked in its destructor.
	sw::BackoffLock codegenMutex;

	// NOTE(review): not referenced in the visible part of this file - confirm usage.
	sw::BasicBlock *falseBB = nullptr;
}
75
76namespace sw
77{
78	using namespace llvm;
79
	// Ordered list of optimization passes; Nucleus::optimize() walks it until the
	// first Disabled entry or 10 entries, whichever comes first. Elements past the
	// two listed here are zero-initialized - presumably equal to Disabled; confirm.
	Optimization optimization[10] = {InstructionCombining, Disabled};
81
	// Empty subclasses of the corresponding LLVM classes. The T(), V() and B()
	// helpers and createSwitch() obtain pointers to them with reinterpret_cast.
	class Type : public llvm::Type {};
	class Value : public llvm::Value {};
	class SwitchCases : public llvm::SwitchInst {};
	class BasicBlock : public llvm::BasicBlock {};
86
87	inline Type *T(llvm::Type *t)
88	{
89		return reinterpret_cast<Type*>(t);
90	}
91
92	inline Value *V(llvm::Value *t)
93	{
94		return reinterpret_cast<Value*>(t);
95	}
96
97	inline std::vector<llvm::Type*> &T(std::vector<Type*> &t)
98	{
99		return reinterpret_cast<std::vector<llvm::Type*>&>(t);
100	}
101
102	inline BasicBlock *B(llvm::BasicBlock *t)
103	{
104		return reinterpret_cast<BasicBlock*>(t);
105	}
106
107	Nucleus::Nucleus()
108	{
109		::codegenMutex.lock();   // Reactor and LLVM are currently not thread safe
110
111		InitializeNativeTarget();
112		JITEmitDebugInfo = false;
113
114		if(!::context)
115		{
116			::context = new LLVMContext();
117		}
118
119		::module = new Module("", *::context);
120		::routineManager = new LLVMRoutineManager();
121
122		#if defined(__x86_64__)
123			const char *architecture = "x86-64";
124		#else
125			const char *architecture = "x86";
126		#endif
127
128		SmallVector<std::string, 1> MAttrs;
129		MAttrs.push_back(CPUID::supportsMMX()    ? "+mmx"   : "-mmx");
130		MAttrs.push_back(CPUID::supportsCMOV()   ? "+cmov"  : "-cmov");
131		MAttrs.push_back(CPUID::supportsSSE()    ? "+sse"   : "-sse");
132		MAttrs.push_back(CPUID::supportsSSE2()   ? "+sse2"  : "-sse2");
133		MAttrs.push_back(CPUID::supportsSSE3()   ? "+sse3"  : "-sse3");
134		MAttrs.push_back(CPUID::supportsSSSE3()  ? "+ssse3" : "-ssse3");
135		MAttrs.push_back(CPUID::supportsSSE4_1() ? "+sse41" : "-sse41");
136
137		std::string error;
138		TargetMachine *targetMachine = EngineBuilder::selectTarget(::module, architecture, "", MAttrs, Reloc::Default, CodeModel::JITDefault, &error);
139		::executionEngine = JIT::createJIT(::module, 0, ::routineManager, CodeGenOpt::Aggressive, true, targetMachine);
140
141		if(!::builder)
142		{
143			::builder = new IRBuilder<>(*::context);
144
145			#if defined(_WIN32)
146				HMODULE CodeAnalyst = LoadLibrary("CAJitNtfyLib.dll");
147				if(CodeAnalyst)
148				{
149					CodeAnalystInitialize = (bool(*)())GetProcAddress(CodeAnalyst, "CAJIT_Initialize");
150					CodeAnalystCompleteJITLog = (void(*)())GetProcAddress(CodeAnalyst, "CAJIT_CompleteJITLog");
151					CodeAnalystLogJITCode = (bool(*)(const void*, unsigned int, const wchar_t*))GetProcAddress(CodeAnalyst, "CAJIT_LogJITCode");
152
153					CodeAnalystInitialize();
154				}
155			#endif
156		}
157	}
158
159	Nucleus::~Nucleus()
160	{
161		delete ::executionEngine;
162		::executionEngine = nullptr;
163
164		::routineManager = nullptr;
165		::function = nullptr;
166		::module = nullptr;
167
168		::codegenMutex.unlock();
169	}
170
171	Routine *Nucleus::acquireRoutine(const wchar_t *name, bool runOptimizations)
172	{
173		if(::builder->GetInsertBlock()->empty() || !::builder->GetInsertBlock()->back().isTerminator())
174		{
175			llvm::Type *type = ::function->getReturnType();
176
177			if(type->isVoidTy())
178			{
179				createRetVoid();
180			}
181			else
182			{
183				createRet(V(UndefValue::get(type)));
184			}
185		}
186
187		if(false)
188		{
189			std::string error;
190			raw_fd_ostream file("llvm-dump-unopt.txt", error);
191			::module->print(file, 0);
192		}
193
194		if(runOptimizations)
195		{
196			optimize();
197		}
198
199		if(false)
200		{
201			std::string error;
202			raw_fd_ostream file("llvm-dump-opt.txt", error);
203			::module->print(file, 0);
204		}
205
206		void *entry = ::executionEngine->getPointerToFunction(::function);
207		LLVMRoutine *routine = ::routineManager->acquireRoutine(entry);
208
209		if(CodeAnalystLogJITCode)
210		{
211			CodeAnalystLogJITCode(routine->getEntry(), routine->getCodeSize(), name);
212		}
213
214		return routine;
215	}
216
217	void Nucleus::optimize()
218	{
219		static PassManager *passManager = nullptr;
220
221		if(!passManager)
222		{
223			passManager = new PassManager();
224
225			UnsafeFPMath = true;
226		//	NoInfsFPMath = true;
227		//	NoNaNsFPMath = true;
228
229			passManager->add(new TargetData(*::executionEngine->getTargetData()));
230			passManager->add(createScalarReplAggregatesPass());
231
232			for(int pass = 0; pass < 10 && optimization[pass] != Disabled; pass++)
233			{
234				switch(optimization[pass])
235				{
236				case Disabled:                                                                 break;
237				case CFGSimplification:    passManager->add(createCFGSimplificationPass());    break;
238				case LICM:                 passManager->add(createLICMPass());                 break;
239				case AggressiveDCE:        passManager->add(createAggressiveDCEPass());        break;
240				case GVN:                  passManager->add(createGVNPass());                  break;
241				case InstructionCombining: passManager->add(createInstructionCombiningPass()); break;
242				case Reassociate:          passManager->add(createReassociatePass());          break;
243				case DeadStoreElimination: passManager->add(createDeadStoreEliminationPass()); break;
244				case SCCP:                 passManager->add(createSCCPPass());                 break;
245				case ScalarReplAggregates: passManager->add(createScalarReplAggregatesPass()); break;
246				default:
247					assert(false);
248				}
249			}
250		}
251
252		passManager->run(*::module);
253	}
254
255	Value *Nucleus::allocateStackVariable(Type *type, int arraySize)
256	{
257		// Need to allocate it in the entry block for mem2reg to work
258		llvm::BasicBlock &entryBlock = ::function->getEntryBlock();
259
260		Instruction *declaration;
261
262		if(arraySize)
263		{
264			declaration = new AllocaInst(type, Nucleus::createConstantInt(arraySize));
265		}
266		else
267		{
268			declaration = new AllocaInst(type, (Value*)0);
269		}
270
271		entryBlock.getInstList().push_front(declaration);
272
273		return V(declaration);
274	}
275
276	BasicBlock *Nucleus::createBasicBlock()
277	{
278		return B(BasicBlock::Create(*::context, "", ::function));
279	}
280
281	BasicBlock *Nucleus::getInsertBlock()
282	{
283		return B(::builder->GetInsertBlock());
284	}
285
286	void Nucleus::setInsertBlock(BasicBlock *basicBlock)
287	{
288	//	assert(::builder->GetInsertBlock()->back().isTerminator());
289		return ::builder->SetInsertPoint(basicBlock);
290	}
291
292	void Nucleus::createFunction(Type *ReturnType, std::vector<Type*> &Params)
293	{
294		llvm::FunctionType *functionType = llvm::FunctionType::get(ReturnType, T(Params), false);
295		::function = llvm::Function::Create(functionType, llvm::GlobalValue::InternalLinkage, "", ::module);
296		::function->setCallingConv(llvm::CallingConv::C);
297
298		::builder->SetInsertPoint(BasicBlock::Create(*::context, "", ::function));
299	}
300
301	Value *Nucleus::getArgument(unsigned int index)
302	{
303		llvm::Function::arg_iterator args = ::function->arg_begin();
304
305		while(index)
306		{
307			args++;
308			index--;
309		}
310
311		return V(&*args);
312	}
313
	// Emits a void return, preceded by the code x86::emms() generates.
	void Nucleus::createRetVoid()
	{
		x86::emms();

		::builder->CreateRetVoid();
	}
320
	// Emits a return of 'v', preceded by the code x86::emms() generates.
	void Nucleus::createRet(Value *v)
	{
		x86::emms();

		::builder->CreateRet(v);
	}
327
	// Emits an unconditional branch to 'dest'.
	void Nucleus::createBr(BasicBlock *dest)
	{
		::builder->CreateBr(dest);
	}
332
	// Emits a conditional branch on 'cond' to 'ifTrue' or 'ifFalse'.
	void Nucleus::createCondBr(Value *cond, BasicBlock *ifTrue, BasicBlock *ifFalse)
	{
		::builder->CreateCondBr(cond, ifTrue, ifFalse);
	}
337
338	Value *Nucleus::createAdd(Value *lhs, Value *rhs)
339	{
340		return V(::builder->CreateAdd(lhs, rhs));
341	}
342
343	Value *Nucleus::createSub(Value *lhs, Value *rhs)
344	{
345		return V(::builder->CreateSub(lhs, rhs));
346	}
347
348	Value *Nucleus::createMul(Value *lhs, Value *rhs)
349	{
350		return V(::builder->CreateMul(lhs, rhs));
351	}
352
353	Value *Nucleus::createUDiv(Value *lhs, Value *rhs)
354	{
355		return V(::builder->CreateUDiv(lhs, rhs));
356	}
357
358	Value *Nucleus::createSDiv(Value *lhs, Value *rhs)
359	{
360		return V(::builder->CreateSDiv(lhs, rhs));
361	}
362
363	Value *Nucleus::createFAdd(Value *lhs, Value *rhs)
364	{
365		return V(::builder->CreateFAdd(lhs, rhs));
366	}
367
368	Value *Nucleus::createFSub(Value *lhs, Value *rhs)
369	{
370		return V(::builder->CreateFSub(lhs, rhs));
371	}
372
373	Value *Nucleus::createFMul(Value *lhs, Value *rhs)
374	{
375		return V(::builder->CreateFMul(lhs, rhs));
376	}
377
378	Value *Nucleus::createFDiv(Value *lhs, Value *rhs)
379	{
380		return V(::builder->CreateFDiv(lhs, rhs));
381	}
382
383	Value *Nucleus::createURem(Value *lhs, Value *rhs)
384	{
385		return V(::builder->CreateURem(lhs, rhs));
386	}
387
388	Value *Nucleus::createSRem(Value *lhs, Value *rhs)
389	{
390		return V(::builder->CreateSRem(lhs, rhs));
391	}
392
393	Value *Nucleus::createFRem(Value *lhs, Value *rhs)
394	{
395		return V(::builder->CreateFRem(lhs, rhs));
396	}
397
398	Value *Nucleus::createShl(Value *lhs, Value *rhs)
399	{
400		return V(::builder->CreateShl(lhs, rhs));
401	}
402
403	Value *Nucleus::createLShr(Value *lhs, Value *rhs)
404	{
405		return V(::builder->CreateLShr(lhs, rhs));
406	}
407
408	Value *Nucleus::createAShr(Value *lhs, Value *rhs)
409	{
410		return V(::builder->CreateAShr(lhs, rhs));
411	}
412
413	Value *Nucleus::createAnd(Value *lhs, Value *rhs)
414	{
415		return V(::builder->CreateAnd(lhs, rhs));
416	}
417
418	Value *Nucleus::createOr(Value *lhs, Value *rhs)
419	{
420		return V(::builder->CreateOr(lhs, rhs));
421	}
422
423	Value *Nucleus::createXor(Value *lhs, Value *rhs)
424	{
425		return V(::builder->CreateXor(lhs, rhs));
426	}
427
428	Value *Nucleus::createNeg(Value *v)
429	{
430		return V(::builder->CreateNeg(v));
431	}
432
433	Value *Nucleus::createFNeg(Value *v)
434	{
435		return V(::builder->CreateFNeg(v));
436	}
437
438	Value *Nucleus::createNot(Value *v)
439	{
440		return V(::builder->CreateNot(v));
441	}
442
443	Value *Nucleus::createLoad(Value *ptr, Type *type, bool isVolatile, unsigned int align)
444	{
445		assert(ptr->getType()->getContainedType(0) == type);
446		return V(::builder->Insert(new LoadInst(ptr, "", isVolatile, align)));
447	}
448
449	Value *Nucleus::createStore(Value *value, Value *ptr, Type *type, bool isVolatile, unsigned int align)
450	{
451		assert(ptr->getType()->getContainedType(0) == type);
452		::builder->Insert(new StoreInst(value, ptr, isVolatile, align));
453		return value;
454	}
455
456	Value *Nucleus::createGEP(Value *ptr, Type *type, Value *index)
457	{
458		assert(ptr->getType()->getContainedType(0) == type);
459		return V(::builder->CreateGEP(ptr, index));
460	}
461
462	Value *Nucleus::createAtomicAdd(Value *ptr, Value *value)
463	{
464		return V(::builder->CreateAtomicRMW(AtomicRMWInst::Add, ptr, value, SequentiallyConsistent));
465	}
466
467	Value *Nucleus::createTrunc(Value *v, Type *destType)
468	{
469		return V(::builder->CreateTrunc(v, destType));
470	}
471
472	Value *Nucleus::createZExt(Value *v, Type *destType)
473	{
474		return V(::builder->CreateZExt(v, destType));
475	}
476
477	Value *Nucleus::createSExt(Value *v, Type *destType)
478	{
479		return V(::builder->CreateSExt(v, destType));
480	}
481
482	Value *Nucleus::createFPToSI(Value *v, Type *destType)
483	{
484		return V(::builder->CreateFPToSI(v, destType));
485	}
486
487	Value *Nucleus::createUIToFP(Value *v, Type *destType)
488	{
489		return V(::builder->CreateUIToFP(v, destType));
490	}
491
492	Value *Nucleus::createSIToFP(Value *v, Type *destType)
493	{
494		return V(::builder->CreateSIToFP(v, destType));
495	}
496
497	Value *Nucleus::createFPTrunc(Value *v, Type *destType)
498	{
499		return V(::builder->CreateFPTrunc(v, destType));
500	}
501
502	Value *Nucleus::createFPExt(Value *v, Type *destType)
503	{
504		return V(::builder->CreateFPExt(v, destType));
505	}
506
507	Value *Nucleus::createBitCast(Value *v, Type *destType)
508	{
509		return V(::builder->CreateBitCast(v, destType));
510	}
511
512	Value *Nucleus::createICmpEQ(Value *lhs, Value *rhs)
513	{
514		return V(::builder->CreateICmpEQ(lhs, rhs));
515	}
516
517	Value *Nucleus::createICmpNE(Value *lhs, Value *rhs)
518	{
519		return V(::builder->CreateICmpNE(lhs, rhs));
520	}
521
522	Value *Nucleus::createICmpUGT(Value *lhs, Value *rhs)
523	{
524		return V(::builder->CreateICmpUGT(lhs, rhs));
525	}
526
527	Value *Nucleus::createICmpUGE(Value *lhs, Value *rhs)
528	{
529		return V(::builder->CreateICmpUGE(lhs, rhs));
530	}
531
532	Value *Nucleus::createICmpULT(Value *lhs, Value *rhs)
533	{
534		return V(::builder->CreateICmpULT(lhs, rhs));
535	}
536
537	Value *Nucleus::createICmpULE(Value *lhs, Value *rhs)
538	{
539		return V(::builder->CreateICmpULE(lhs, rhs));
540	}
541
542	Value *Nucleus::createICmpSGT(Value *lhs, Value *rhs)
543	{
544		return V(::builder->CreateICmpSGT(lhs, rhs));
545	}
546
547	Value *Nucleus::createICmpSGE(Value *lhs, Value *rhs)
548	{
549		return V(::builder->CreateICmpSGE(lhs, rhs));
550	}
551
552	Value *Nucleus::createICmpSLT(Value *lhs, Value *rhs)
553	{
554		return V(::builder->CreateICmpSLT(lhs, rhs));
555	}
556
557	Value *Nucleus::createICmpSLE(Value *lhs, Value *rhs)
558	{
559		return V(::builder->CreateICmpSLE(lhs, rhs));
560	}
561
562	Value *Nucleus::createFCmpOEQ(Value *lhs, Value *rhs)
563	{
564		return V(::builder->CreateFCmpOEQ(lhs, rhs));
565	}
566
567	Value *Nucleus::createFCmpOGT(Value *lhs, Value *rhs)
568	{
569		return V(::builder->CreateFCmpOGT(lhs, rhs));
570	}
571
572	Value *Nucleus::createFCmpOGE(Value *lhs, Value *rhs)
573	{
574		return V(::builder->CreateFCmpOGE(lhs, rhs));
575	}
576
577	Value *Nucleus::createFCmpOLT(Value *lhs, Value *rhs)
578	{
579		return V(::builder->CreateFCmpOLT(lhs, rhs));
580	}
581
582	Value *Nucleus::createFCmpOLE(Value *lhs, Value *rhs)
583	{
584		return V(::builder->CreateFCmpOLE(lhs, rhs));
585	}
586
587	Value *Nucleus::createFCmpONE(Value *lhs, Value *rhs)
588	{
589		return V(::builder->CreateFCmpONE(lhs, rhs));
590	}
591
592	Value *Nucleus::createFCmpORD(Value *lhs, Value *rhs)
593	{
594		return V(::builder->CreateFCmpORD(lhs, rhs));
595	}
596
597	Value *Nucleus::createFCmpUNO(Value *lhs, Value *rhs)
598	{
599		return V(::builder->CreateFCmpUNO(lhs, rhs));
600	}
601
602	Value *Nucleus::createFCmpUEQ(Value *lhs, Value *rhs)
603	{
604		return V(::builder->CreateFCmpUEQ(lhs, rhs));
605	}
606
607	Value *Nucleus::createFCmpUGT(Value *lhs, Value *rhs)
608	{
609		return V(::builder->CreateFCmpUGT(lhs, rhs));
610	}
611
612	Value *Nucleus::createFCmpUGE(Value *lhs, Value *rhs)
613	{
614		return V(::builder->CreateFCmpUGE(lhs, rhs));
615	}
616
617	Value *Nucleus::createFCmpULT(Value *lhs, Value *rhs)
618	{
619		return V(::builder->CreateFCmpULT(lhs, rhs));
620	}
621
622	Value *Nucleus::createFCmpULE(Value *lhs, Value *rhs)
623	{
624		return V(::builder->CreateFCmpULE(lhs, rhs));
625	}
626
627	Value *Nucleus::createFCmpUNE(Value *lhs, Value *rhs)
628	{
629		return V(::builder->CreateFCmpULE(lhs, rhs));
630	}
631
632	Value *Nucleus::createExtractElement(Value *vector, Type *type, int index)
633	{
634		assert(vector->getType()->getContainedType(0) == type);
635		return V(::builder->CreateExtractElement(vector, createConstantInt(index)));
636	}
637
638	Value *Nucleus::createInsertElement(Value *vector, Value *element, int index)
639	{
640		return V(::builder->CreateInsertElement(vector, element, createConstantInt(index)));
641	}
642
643	Value *Nucleus::createShuffleVector(Value *V1, Value *V2, const int *select)
644	{
645		int size = llvm::cast<llvm::VectorType>(V1->getType())->getNumElements();
646		const int maxSize = 16;
647		llvm::Constant *swizzle[maxSize];
648		assert(size <= maxSize);
649
650		for(int i = 0; i < size; i++)
651		{
652			swizzle[i] = llvm::ConstantInt::get(Type::getInt32Ty(*::context), select[i]);
653		}
654
655		llvm::Value *shuffle = llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(swizzle, size));
656
657		return V(::builder->CreateShuffleVector(V1, V2, shuffle));
658	}
659
660	Value *Nucleus::createSelect(Value *C, Value *ifTrue, Value *ifFalse)
661	{
662		return V(::builder->CreateSelect(C, ifTrue, ifFalse));
663	}
664
665	SwitchCases *Nucleus::createSwitch(Value *control, BasicBlock *defaultBranch, unsigned numCases)
666	{
667		return reinterpret_cast<SwitchCases*>(::builder->CreateSwitch(control, defaultBranch, numCases));
668	}
669
	// Adds a case to 'switchCases' that jumps to 'branch' when the switch value
	// equals 'label' (encoded as a signed 32-bit constant).
	void Nucleus::addSwitchCase(SwitchCases *switchCases, int label, BasicBlock *branch)
	{
		switchCases->addCase(llvm::ConstantInt::get(Type::getInt32Ty(*::context), label, true), branch);
	}
674
	// Emits an unreachable instruction at the current insert point.
	void Nucleus::createUnreachable()
	{
		::builder->CreateUnreachable();
	}
679
680	static Value *createSwizzle4(Value *val, unsigned char select)
681	{
682		int swizzle[4] =
683		{
684			(select >> 0) & 0x03,
685			(select >> 2) & 0x03,
686			(select >> 4) & 0x03,
687			(select >> 6) & 0x03,
688		};
689
690		return Nucleus::createShuffleVector(val, val, swizzle);
691	}
692
693	static Value *createMask4(Value *lhs, Value *rhs, unsigned char select)
694	{
695		bool mask[4] = {false, false, false, false};
696
697		mask[(select >> 0) & 0x03] = true;
698		mask[(select >> 2) & 0x03] = true;
699		mask[(select >> 4) & 0x03] = true;
700		mask[(select >> 6) & 0x03] = true;
701
702		int swizzle[4] =
703		{
704			mask[0] ? 4 : 0,
705			mask[1] ? 5 : 1,
706			mask[2] ? 6 : 2,
707			mask[3] ? 7 : 3,
708		};
709
710		Value *shuffle = Nucleus::createShuffleVector(lhs, rhs, swizzle);
711
712		return shuffle;
713	}
714
715	Value *Nucleus::createConstantPointer(const void *address, Type *Ty, unsigned int align)
716	{
717		const GlobalValue *existingGlobal = ::executionEngine->getGlobalValueAtAddress(const_cast<void*>(address));   // FIXME: Const
718
719		if(existingGlobal)
720		{
721			return (Value*)existingGlobal;
722		}
723
724		llvm::GlobalValue *global = new llvm::GlobalVariable(*::module, Ty, true, llvm::GlobalValue::ExternalLinkage, 0, "");
725		global->setAlignment(align);
726
727		::executionEngine->addGlobalMapping(global, const_cast<void*>(address));
728
729		return V(global);
730	}
731
732	Type *Nucleus::getPointerType(Type *ElementType)
733	{
734		return T(llvm::PointerType::get(ElementType, 0));
735	}
736
737	Value *Nucleus::createNullValue(Type *Ty)
738	{
739		return V(llvm::Constant::getNullValue(Ty));
740	}
741
742	Value *Nucleus::createConstantLong(int64_t i)
743	{
744		return V(llvm::ConstantInt::get(Type::getInt64Ty(*::context), i, true));
745	}
746
747	Value *Nucleus::createConstantInt(int i)
748	{
749		return V(llvm::ConstantInt::get(Type::getInt32Ty(*::context), i, true));
750	}
751
752	Value *Nucleus::createConstantInt(unsigned int i)
753	{
754		return V(llvm::ConstantInt::get(Type::getInt32Ty(*::context), i, false));
755	}
756
757	Value *Nucleus::createConstantBool(bool b)
758	{
759		return V(llvm::ConstantInt::get(Type::getInt1Ty(*::context), b));
760	}
761
762	Value *Nucleus::createConstantByte(signed char i)
763	{
764		return V(llvm::ConstantInt::get(Type::getInt8Ty(*::context), i, true));
765	}
766
767	Value *Nucleus::createConstantByte(unsigned char i)
768	{
769		return V(llvm::ConstantInt::get(Type::getInt8Ty(*::context), i, false));
770	}
771
772	Value *Nucleus::createConstantShort(short i)
773	{
774		return V(llvm::ConstantInt::get(Type::getInt16Ty(*::context), i, true));
775	}
776
777	Value *Nucleus::createConstantShort(unsigned short i)
778	{
779		return V(llvm::ConstantInt::get(Type::getInt16Ty(*::context), i, false));
780	}
781
782	Value *Nucleus::createConstantFloat(float x)
783	{
784		return V(llvm::ConstantFP::get(Float::getType(), x));
785	}
786
787	Value *Nucleus::createNullPointer(Type *Ty)
788	{
789		return V(llvm::ConstantPointerNull::get(llvm::PointerType::get(Ty, 0)));
790	}
791
792	Value *Nucleus::createConstantVector(const int64_t *constants, Type *type)
793	{
794		assert(llvm::isa<VectorType>(type));
795		const int numConstants = llvm::cast<VectorType>(type)->getNumElements();
796		assert(numConstants <= 16);
797		llvm::Constant *constantVector[16];
798
799		for(int i = 0; i < numConstants; i++)
800		{
801			constantVector[i] = llvm::ConstantInt::get(type->getContainedType(0), constants[i]);
802		}
803
804		return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numConstants)));
805	}
806
807	Value *Nucleus::createConstantVector(const double *constants, Type *type)
808	{
809		assert(llvm::isa<VectorType>(type));
810		const int numConstants = llvm::cast<VectorType>(type)->getNumElements();
811		assert(numConstants <= 8);
812		llvm::Constant *constantVector[8];
813
814		for(int i = 0; i < numConstants; i++)
815		{
816			constantVector[i] = llvm::ConstantFP::get(type->getContainedType(0), constants[i]);
817		}
818
819		return V(llvm::ConstantVector::get(llvm::ArrayRef<llvm::Constant*>(constantVector, numConstants)));
820	}
821
822	Type *Void::getType()
823	{
824		return T(llvm::Type::getVoidTy(*::context));
825	}
826
	// File-local Reactor variable type whose getType() yields the x86 MMX type.
	class MMX : public Variable<MMX>
	{
	public:
		static Type *getType();
	};
832
833	Type *MMX::getType()
834	{
835		return T(llvm::Type::getX86_MMXTy(*::context));
836	}
837
838	Bool::Bool(Argument<Bool> argument)
839	{
840		storeValue(argument.value);
841	}
842
	// Default constructor: stores no initial value.
	Bool::Bool()
	{
	}
846
847	Bool::Bool(bool x)
848	{
849		storeValue(Nucleus::createConstantBool(x));
850	}
851
852	Bool::Bool(RValue<Bool> rhs)
853	{
854		storeValue(rhs.value);
855	}
856
857	Bool::Bool(const Bool &rhs)
858	{
859		Value *value = rhs.loadValue();
860		storeValue(value);
861	}
862
863	Bool::Bool(const Reference<Bool> &rhs)
864	{
865		Value *value = rhs.loadValue();
866		storeValue(value);
867	}
868
869	RValue<Bool> Bool::operator=(RValue<Bool> rhs) const
870	{
871		storeValue(rhs.value);
872
873		return rhs;
874	}
875
876	RValue<Bool> Bool::operator=(const Bool &rhs) const
877	{
878		Value *value = rhs.loadValue();
879		storeValue(value);
880
881		return RValue<Bool>(value);
882	}
883
884	RValue<Bool> Bool::operator=(const Reference<Bool> &rhs) const
885	{
886		Value *value = rhs.loadValue();
887		storeValue(value);
888
889		return RValue<Bool>(value);
890	}
891
892	RValue<Bool> operator!(RValue<Bool> val)
893	{
894		return RValue<Bool>(Nucleus::createNot(val.value));
895	}
896
897	RValue<Bool> operator&&(RValue<Bool> lhs, RValue<Bool> rhs)
898	{
899		return RValue<Bool>(Nucleus::createAnd(lhs.value, rhs.value));
900	}
901
902	RValue<Bool> operator||(RValue<Bool> lhs, RValue<Bool> rhs)
903	{
904		return RValue<Bool>(Nucleus::createOr(lhs.value, rhs.value));
905	}
906
907	Type *Bool::getType()
908	{
909		return T(llvm::Type::getInt1Ty(*::context));
910	}
911
912	Byte::Byte(Argument<Byte> argument)
913	{
914		storeValue(argument.value);
915	}
916
917	Byte::Byte(RValue<Int> cast)
918	{
919		Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
920
921		storeValue(integer);
922	}
923
924	Byte::Byte(RValue<UInt> cast)
925	{
926		Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
927
928		storeValue(integer);
929	}
930
931	Byte::Byte(RValue<UShort> cast)
932	{
933		Value *integer = Nucleus::createTrunc(cast.value, Byte::getType());
934
935		storeValue(integer);
936	}
937
	// Default constructor: stores no initial value.
	Byte::Byte()
	{
	}
941
942	Byte::Byte(int x)
943	{
944		storeValue(Nucleus::createConstantByte((unsigned char)x));
945	}
946
947	Byte::Byte(unsigned char x)
948	{
949		storeValue(Nucleus::createConstantByte(x));
950	}
951
952	Byte::Byte(RValue<Byte> rhs)
953	{
954		storeValue(rhs.value);
955	}
956
957	Byte::Byte(const Byte &rhs)
958	{
959		Value *value = rhs.loadValue();
960		storeValue(value);
961	}
962
963	Byte::Byte(const Reference<Byte> &rhs)
964	{
965		Value *value = rhs.loadValue();
966		storeValue(value);
967	}
968
969	RValue<Byte> Byte::operator=(RValue<Byte> rhs) const
970	{
971		storeValue(rhs.value);
972
973		return rhs;
974	}
975
976	RValue<Byte> Byte::operator=(const Byte &rhs) const
977	{
978		Value *value = rhs.loadValue();
979		storeValue(value);
980
981		return RValue<Byte>(value);
982	}
983
984	RValue<Byte> Byte::operator=(const Reference<Byte> &rhs) const
985	{
986		Value *value = rhs.loadValue();
987		storeValue(value);
988
989		return RValue<Byte>(value);
990	}
991
992	RValue<Byte> operator+(RValue<Byte> lhs, RValue<Byte> rhs)
993	{
994		return RValue<Byte>(Nucleus::createAdd(lhs.value, rhs.value));
995	}
996
997	RValue<Byte> operator-(RValue<Byte> lhs, RValue<Byte> rhs)
998	{
999		return RValue<Byte>(Nucleus::createSub(lhs.value, rhs.value));
1000	}
1001
1002	RValue<Byte> operator*(RValue<Byte> lhs, RValue<Byte> rhs)
1003	{
1004		return RValue<Byte>(Nucleus::createMul(lhs.value, rhs.value));
1005	}
1006
1007	RValue<Byte> operator/(RValue<Byte> lhs, RValue<Byte> rhs)
1008	{
1009		return RValue<Byte>(Nucleus::createUDiv(lhs.value, rhs.value));
1010	}
1011
1012	RValue<Byte> operator%(RValue<Byte> lhs, RValue<Byte> rhs)
1013	{
1014		return RValue<Byte>(Nucleus::createURem(lhs.value, rhs.value));
1015	}
1016
1017	RValue<Byte> operator&(RValue<Byte> lhs, RValue<Byte> rhs)
1018	{
1019		return RValue<Byte>(Nucleus::createAnd(lhs.value, rhs.value));
1020	}
1021
1022	RValue<Byte> operator|(RValue<Byte> lhs, RValue<Byte> rhs)
1023	{
1024		return RValue<Byte>(Nucleus::createOr(lhs.value, rhs.value));
1025	}
1026
1027	RValue<Byte> operator^(RValue<Byte> lhs, RValue<Byte> rhs)
1028	{
1029		return RValue<Byte>(Nucleus::createXor(lhs.value, rhs.value));
1030	}
1031
1032	RValue<Byte> operator<<(RValue<Byte> lhs, RValue<Byte> rhs)
1033	{
1034		return RValue<Byte>(Nucleus::createShl(lhs.value, rhs.value));
1035	}
1036
1037	RValue<Byte> operator>>(RValue<Byte> lhs, RValue<Byte> rhs)
1038	{
1039		return RValue<Byte>(Nucleus::createLShr(lhs.value, rhs.value));
1040	}
1041
1042	RValue<Byte> operator+=(const Byte &lhs, RValue<Byte> rhs)
1043	{
1044		return lhs = lhs + rhs;
1045	}
1046
1047	RValue<Byte> operator-=(const Byte &lhs, RValue<Byte> rhs)
1048	{
1049		return lhs = lhs - rhs;
1050	}
1051
1052	RValue<Byte> operator*=(const Byte &lhs, RValue<Byte> rhs)
1053	{
1054		return lhs = lhs * rhs;
1055	}
1056
1057	RValue<Byte> operator/=(const Byte &lhs, RValue<Byte> rhs)
1058	{
1059		return lhs = lhs / rhs;
1060	}
1061
1062	RValue<Byte> operator%=(const Byte &lhs, RValue<Byte> rhs)
1063	{
1064		return lhs = lhs % rhs;
1065	}
1066
1067	RValue<Byte> operator&=(const Byte &lhs, RValue<Byte> rhs)
1068	{
1069		return lhs = lhs & rhs;
1070	}
1071
1072	RValue<Byte> operator|=(const Byte &lhs, RValue<Byte> rhs)
1073	{
1074		return lhs = lhs | rhs;
1075	}
1076
1077	RValue<Byte> operator^=(const Byte &lhs, RValue<Byte> rhs)
1078	{
1079		return lhs = lhs ^ rhs;
1080	}
1081
1082	RValue<Byte> operator<<=(const Byte &lhs, RValue<Byte> rhs)
1083	{
1084		return lhs = lhs << rhs;
1085	}
1086
1087	RValue<Byte> operator>>=(const Byte &lhs, RValue<Byte> rhs)
1088	{
1089		return lhs = lhs >> rhs;
1090	}
1091
	// Unary plus: returns the operand unchanged and emits no instruction.
	RValue<Byte> operator+(RValue<Byte> val)
	{
		return val;
	}
1096
1097	RValue<Byte> operator-(RValue<Byte> val)
1098	{
1099		return RValue<Byte>(Nucleus::createNeg(val.value));
1100	}
1101
1102	RValue<Byte> operator~(RValue<Byte> val)
1103	{
1104		return RValue<Byte>(Nucleus::createNot(val.value));
1105	}
1106
1107	RValue<Byte> operator++(const Byte &val, int)   // Post-increment
1108	{
1109		RValue<Byte> res = val;
1110
1111		Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantByte((unsigned char)1)));
1112		val.storeValue(inc);
1113
1114		return res;
1115	}
1116
1117	const Byte &operator++(const Byte &val)   // Pre-increment
1118	{
1119		Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantByte((unsigned char)1)));
1120		val.storeValue(inc);
1121
1122		return val;
1123	}
1124
1125	RValue<Byte> operator--(const Byte &val, int)   // Post-decrement
1126	{
1127		RValue<Byte> res = val;
1128
1129		Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantByte((unsigned char)1)));
1130		val.storeValue(inc);
1131
1132		return res;
1133	}
1134
1135	const Byte &operator--(const Byte &val)   // Pre-decrement
1136	{
1137		Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantByte((unsigned char)1)));
1138		val.storeValue(inc);
1139
1140		return val;
1141	}
1142
1143	RValue<Bool> operator<(RValue<Byte> lhs, RValue<Byte> rhs)
1144	{
1145		return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
1146	}
1147
1148	RValue<Bool> operator<=(RValue<Byte> lhs, RValue<Byte> rhs)
1149	{
1150		return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
1151	}
1152
1153	RValue<Bool> operator>(RValue<Byte> lhs, RValue<Byte> rhs)
1154	{
1155		return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
1156	}
1157
1158	RValue<Bool> operator>=(RValue<Byte> lhs, RValue<Byte> rhs)
1159	{
1160		return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
1161	}
1162
1163	RValue<Bool> operator!=(RValue<Byte> lhs, RValue<Byte> rhs)
1164	{
1165		return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
1166	}
1167
1168	RValue<Bool> operator==(RValue<Byte> lhs, RValue<Byte> rhs)
1169	{
1170		return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
1171	}
1172
1173	Type *Byte::getType()
1174	{
1175		return T(llvm::Type::getInt8Ty(*::context));
1176	}
1177
1178	SByte::SByte(Argument<SByte> argument)
1179	{
1180		storeValue(argument.value);
1181	}
1182
1183	SByte::SByte(RValue<Int> cast)
1184	{
1185		Value *integer = Nucleus::createTrunc(cast.value, SByte::getType());
1186
1187		storeValue(integer);
1188	}
1189
1190	SByte::SByte(RValue<Short> cast)
1191	{
1192		Value *integer = Nucleus::createTrunc(cast.value, SByte::getType());
1193
1194		storeValue(integer);
1195	}
1196
	// Default constructor: stores no initial value.
	SByte::SByte()
	{
	}
1200
1201	SByte::SByte(signed char x)
1202	{
1203		storeValue(Nucleus::createConstantByte(x));
1204	}
1205
1206	SByte::SByte(RValue<SByte> rhs)
1207	{
1208		storeValue(rhs.value);
1209	}
1210
1211	SByte::SByte(const SByte &rhs)
1212	{
1213		Value *value = rhs.loadValue();
1214		storeValue(value);
1215	}
1216
1217	SByte::SByte(const Reference<SByte> &rhs)
1218	{
1219		Value *value = rhs.loadValue();
1220		storeValue(value);
1221	}
1222
1223	RValue<SByte> SByte::operator=(RValue<SByte> rhs) const
1224	{
1225		storeValue(rhs.value);
1226
1227		return rhs;
1228	}
1229
1230	RValue<SByte> SByte::operator=(const SByte &rhs) const
1231	{
1232		Value *value = rhs.loadValue();
1233		storeValue(value);
1234
1235		return RValue<SByte>(value);
1236	}
1237
1238	RValue<SByte> SByte::operator=(const Reference<SByte> &rhs) const
1239	{
1240		Value *value = rhs.loadValue();
1241		storeValue(value);
1242
1243		return RValue<SByte>(value);
1244	}
1245
1246	RValue<SByte> operator+(RValue<SByte> lhs, RValue<SByte> rhs)
1247	{
1248		return RValue<SByte>(Nucleus::createAdd(lhs.value, rhs.value));
1249	}
1250
1251	RValue<SByte> operator-(RValue<SByte> lhs, RValue<SByte> rhs)
1252	{
1253		return RValue<SByte>(Nucleus::createSub(lhs.value, rhs.value));
1254	}
1255
1256	RValue<SByte> operator*(RValue<SByte> lhs, RValue<SByte> rhs)
1257	{
1258		return RValue<SByte>(Nucleus::createMul(lhs.value, rhs.value));
1259	}
1260
1261	RValue<SByte> operator/(RValue<SByte> lhs, RValue<SByte> rhs)
1262	{
1263		return RValue<SByte>(Nucleus::createSDiv(lhs.value, rhs.value));
1264	}
1265
1266	RValue<SByte> operator%(RValue<SByte> lhs, RValue<SByte> rhs)
1267	{
1268		return RValue<SByte>(Nucleus::createSRem(lhs.value, rhs.value));
1269	}
1270
1271	RValue<SByte> operator&(RValue<SByte> lhs, RValue<SByte> rhs)
1272	{
1273		return RValue<SByte>(Nucleus::createAnd(lhs.value, rhs.value));
1274	}
1275
1276	RValue<SByte> operator|(RValue<SByte> lhs, RValue<SByte> rhs)
1277	{
1278		return RValue<SByte>(Nucleus::createOr(lhs.value, rhs.value));
1279	}
1280
1281	RValue<SByte> operator^(RValue<SByte> lhs, RValue<SByte> rhs)
1282	{
1283		return RValue<SByte>(Nucleus::createXor(lhs.value, rhs.value));
1284	}
1285
1286	RValue<SByte> operator<<(RValue<SByte> lhs, RValue<SByte> rhs)
1287	{
1288		return RValue<SByte>(Nucleus::createShl(lhs.value, rhs.value));
1289	}
1290
1291	RValue<SByte> operator>>(RValue<SByte> lhs, RValue<SByte> rhs)
1292	{
1293		return RValue<SByte>(Nucleus::createAShr(lhs.value, rhs.value));
1294	}
1295
1296	RValue<SByte> operator+=(const SByte &lhs, RValue<SByte> rhs)
1297	{
1298		return lhs = lhs + rhs;
1299	}
1300
1301	RValue<SByte> operator-=(const SByte &lhs, RValue<SByte> rhs)
1302	{
1303		return lhs = lhs - rhs;
1304	}
1305
1306	RValue<SByte> operator*=(const SByte &lhs, RValue<SByte> rhs)
1307	{
1308		return lhs = lhs * rhs;
1309	}
1310
1311	RValue<SByte> operator/=(const SByte &lhs, RValue<SByte> rhs)
1312	{
1313		return lhs = lhs / rhs;
1314	}
1315
1316	RValue<SByte> operator%=(const SByte &lhs, RValue<SByte> rhs)
1317	{
1318		return lhs = lhs % rhs;
1319	}
1320
1321	RValue<SByte> operator&=(const SByte &lhs, RValue<SByte> rhs)
1322	{
1323		return lhs = lhs & rhs;
1324	}
1325
1326	RValue<SByte> operator|=(const SByte &lhs, RValue<SByte> rhs)
1327	{
1328		return lhs = lhs | rhs;
1329	}
1330
1331	RValue<SByte> operator^=(const SByte &lhs, RValue<SByte> rhs)
1332	{
1333		return lhs = lhs ^ rhs;
1334	}
1335
1336	RValue<SByte> operator<<=(const SByte &lhs, RValue<SByte> rhs)
1337	{
1338		return lhs = lhs << rhs;
1339	}
1340
1341	RValue<SByte> operator>>=(const SByte &lhs, RValue<SByte> rhs)
1342	{
1343		return lhs = lhs >> rhs;
1344	}
1345
	// Unary plus: returns the operand unchanged; no Nucleus call is made here.
	RValue<SByte> operator+(RValue<SByte> val)
	{
		return val;
	}
1350
1351	RValue<SByte> operator-(RValue<SByte> val)
1352	{
1353		return RValue<SByte>(Nucleus::createNeg(val.value));
1354	}
1355
1356	RValue<SByte> operator~(RValue<SByte> val)
1357	{
1358		return RValue<SByte>(Nucleus::createNot(val.value));
1359	}
1360
1361	RValue<SByte> operator++(const SByte &val, int)   // Post-increment
1362	{
1363		RValue<SByte> res = val;
1364
1365		Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantByte((signed char)1)));
1366		val.storeValue(inc);
1367
1368		return res;
1369	}
1370
1371	const SByte &operator++(const SByte &val)   // Pre-increment
1372	{
1373		Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantByte((signed char)1)));
1374		val.storeValue(inc);
1375
1376		return val;
1377	}
1378
1379	RValue<SByte> operator--(const SByte &val, int)   // Post-decrement
1380	{
1381		RValue<SByte> res = val;
1382
1383		Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantByte((signed char)1)));
1384		val.storeValue(inc);
1385
1386		return res;
1387	}
1388
1389	const SByte &operator--(const SByte &val)   // Pre-decrement
1390	{
1391		Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantByte((signed char)1)));
1392		val.storeValue(inc);
1393
1394		return val;
1395	}
1396
1397	RValue<Bool> operator<(RValue<SByte> lhs, RValue<SByte> rhs)
1398	{
1399		return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
1400	}
1401
1402	RValue<Bool> operator<=(RValue<SByte> lhs, RValue<SByte> rhs)
1403	{
1404		return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
1405	}
1406
1407	RValue<Bool> operator>(RValue<SByte> lhs, RValue<SByte> rhs)
1408	{
1409		return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
1410	}
1411
1412	RValue<Bool> operator>=(RValue<SByte> lhs, RValue<SByte> rhs)
1413	{
1414		return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
1415	}
1416
1417	RValue<Bool> operator!=(RValue<SByte> lhs, RValue<SByte> rhs)
1418	{
1419		return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
1420	}
1421
1422	RValue<Bool> operator==(RValue<SByte> lhs, RValue<SByte> rhs)
1423	{
1424		return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
1425	}
1426
1427	Type *SByte::getType()
1428	{
1429		return T(llvm::Type::getInt8Ty(*::context));
1430	}
1431
1432	Short::Short(Argument<Short> argument)
1433	{
1434		storeValue(argument.value);
1435	}
1436
1437	Short::Short(RValue<Int> cast)
1438	{
1439		Value *integer = Nucleus::createTrunc(cast.value, Short::getType());
1440
1441		storeValue(integer);
1442	}
1443
	// Default constructor; unlike the other Short constructors it does not
	// call storeValue().
	Short::Short()
	{
	}
1447
1448	Short::Short(short x)
1449	{
1450		storeValue(Nucleus::createConstantShort(x));
1451	}
1452
1453	Short::Short(RValue<Short> rhs)
1454	{
1455		storeValue(rhs.value);
1456	}
1457
1458	Short::Short(const Short &rhs)
1459	{
1460		Value *value = rhs.loadValue();
1461		storeValue(value);
1462	}
1463
1464	Short::Short(const Reference<Short> &rhs)
1465	{
1466		Value *value = rhs.loadValue();
1467		storeValue(value);
1468	}
1469
1470	RValue<Short> Short::operator=(RValue<Short> rhs) const
1471	{
1472		storeValue(rhs.value);
1473
1474		return rhs;
1475	}
1476
1477	RValue<Short> Short::operator=(const Short &rhs) const
1478	{
1479		Value *value = rhs.loadValue();
1480		storeValue(value);
1481
1482		return RValue<Short>(value);
1483	}
1484
1485	RValue<Short> Short::operator=(const Reference<Short> &rhs) const
1486	{
1487		Value *value = rhs.loadValue();
1488		storeValue(value);
1489
1490		return RValue<Short>(value);
1491	}
1492
1493	RValue<Short> operator+(RValue<Short> lhs, RValue<Short> rhs)
1494	{
1495		return RValue<Short>(Nucleus::createAdd(lhs.value, rhs.value));
1496	}
1497
1498	RValue<Short> operator-(RValue<Short> lhs, RValue<Short> rhs)
1499	{
1500		return RValue<Short>(Nucleus::createSub(lhs.value, rhs.value));
1501	}
1502
1503	RValue<Short> operator*(RValue<Short> lhs, RValue<Short> rhs)
1504	{
1505		return RValue<Short>(Nucleus::createMul(lhs.value, rhs.value));
1506	}
1507
1508	RValue<Short> operator/(RValue<Short> lhs, RValue<Short> rhs)
1509	{
1510		return RValue<Short>(Nucleus::createSDiv(lhs.value, rhs.value));
1511	}
1512
1513	RValue<Short> operator%(RValue<Short> lhs, RValue<Short> rhs)
1514	{
1515		return RValue<Short>(Nucleus::createSRem(lhs.value, rhs.value));
1516	}
1517
1518	RValue<Short> operator&(RValue<Short> lhs, RValue<Short> rhs)
1519	{
1520		return RValue<Short>(Nucleus::createAnd(lhs.value, rhs.value));
1521	}
1522
1523	RValue<Short> operator|(RValue<Short> lhs, RValue<Short> rhs)
1524	{
1525		return RValue<Short>(Nucleus::createOr(lhs.value, rhs.value));
1526	}
1527
1528	RValue<Short> operator^(RValue<Short> lhs, RValue<Short> rhs)
1529	{
1530		return RValue<Short>(Nucleus::createXor(lhs.value, rhs.value));
1531	}
1532
1533	RValue<Short> operator<<(RValue<Short> lhs, RValue<Short> rhs)
1534	{
1535		return RValue<Short>(Nucleus::createShl(lhs.value, rhs.value));
1536	}
1537
1538	RValue<Short> operator>>(RValue<Short> lhs, RValue<Short> rhs)
1539	{
1540		return RValue<Short>(Nucleus::createAShr(lhs.value, rhs.value));
1541	}
1542
1543	RValue<Short> operator+=(const Short &lhs, RValue<Short> rhs)
1544	{
1545		return lhs = lhs + rhs;
1546	}
1547
1548	RValue<Short> operator-=(const Short &lhs, RValue<Short> rhs)
1549	{
1550		return lhs = lhs - rhs;
1551	}
1552
1553	RValue<Short> operator*=(const Short &lhs, RValue<Short> rhs)
1554	{
1555		return lhs = lhs * rhs;
1556	}
1557
1558	RValue<Short> operator/=(const Short &lhs, RValue<Short> rhs)
1559	{
1560		return lhs = lhs / rhs;
1561	}
1562
1563	RValue<Short> operator%=(const Short &lhs, RValue<Short> rhs)
1564	{
1565		return lhs = lhs % rhs;
1566	}
1567
1568	RValue<Short> operator&=(const Short &lhs, RValue<Short> rhs)
1569	{
1570		return lhs = lhs & rhs;
1571	}
1572
1573	RValue<Short> operator|=(const Short &lhs, RValue<Short> rhs)
1574	{
1575		return lhs = lhs | rhs;
1576	}
1577
1578	RValue<Short> operator^=(const Short &lhs, RValue<Short> rhs)
1579	{
1580		return lhs = lhs ^ rhs;
1581	}
1582
1583	RValue<Short> operator<<=(const Short &lhs, RValue<Short> rhs)
1584	{
1585		return lhs = lhs << rhs;
1586	}
1587
1588	RValue<Short> operator>>=(const Short &lhs, RValue<Short> rhs)
1589	{
1590		return lhs = lhs >> rhs;
1591	}
1592
	// Unary plus: returns the operand unchanged; no Nucleus call is made here.
	RValue<Short> operator+(RValue<Short> val)
	{
		return val;
	}
1597
1598	RValue<Short> operator-(RValue<Short> val)
1599	{
1600		return RValue<Short>(Nucleus::createNeg(val.value));
1601	}
1602
1603	RValue<Short> operator~(RValue<Short> val)
1604	{
1605		return RValue<Short>(Nucleus::createNot(val.value));
1606	}
1607
1608	RValue<Short> operator++(const Short &val, int)   // Post-increment
1609	{
1610		RValue<Short> res = val;
1611
1612		Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantShort((short)1)));
1613		val.storeValue(inc);
1614
1615		return res;
1616	}
1617
1618	const Short &operator++(const Short &val)   // Pre-increment
1619	{
1620		Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantShort((short)1)));
1621		val.storeValue(inc);
1622
1623		return val;
1624	}
1625
1626	RValue<Short> operator--(const Short &val, int)   // Post-decrement
1627	{
1628		RValue<Short> res = val;
1629
1630		Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantShort((short)1)));
1631		val.storeValue(inc);
1632
1633		return res;
1634	}
1635
1636	const Short &operator--(const Short &val)   // Pre-decrement
1637	{
1638		Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantShort((short)1)));
1639		val.storeValue(inc);
1640
1641		return val;
1642	}
1643
1644	RValue<Bool> operator<(RValue<Short> lhs, RValue<Short> rhs)
1645	{
1646		return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
1647	}
1648
1649	RValue<Bool> operator<=(RValue<Short> lhs, RValue<Short> rhs)
1650	{
1651		return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
1652	}
1653
1654	RValue<Bool> operator>(RValue<Short> lhs, RValue<Short> rhs)
1655	{
1656		return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
1657	}
1658
1659	RValue<Bool> operator>=(RValue<Short> lhs, RValue<Short> rhs)
1660	{
1661		return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
1662	}
1663
1664	RValue<Bool> operator!=(RValue<Short> lhs, RValue<Short> rhs)
1665	{
1666		return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
1667	}
1668
1669	RValue<Bool> operator==(RValue<Short> lhs, RValue<Short> rhs)
1670	{
1671		return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
1672	}
1673
1674	Type *Short::getType()
1675	{
1676		return T(llvm::Type::getInt16Ty(*::context));
1677	}
1678
1679	UShort::UShort(Argument<UShort> argument)
1680	{
1681		storeValue(argument.value);
1682	}
1683
1684	UShort::UShort(RValue<UInt> cast)
1685	{
1686		Value *integer = Nucleus::createTrunc(cast.value, UShort::getType());
1687
1688		storeValue(integer);
1689	}
1690
1691	UShort::UShort(RValue<Int> cast)
1692	{
1693		Value *integer = Nucleus::createTrunc(cast.value, UShort::getType());
1694
1695		storeValue(integer);
1696	}
1697
	// Default constructor; unlike the other UShort constructors it does not
	// call storeValue().
	UShort::UShort()
	{
	}
1701
1702	UShort::UShort(unsigned short x)
1703	{
1704		storeValue(Nucleus::createConstantShort(x));
1705	}
1706
1707	UShort::UShort(RValue<UShort> rhs)
1708	{
1709		storeValue(rhs.value);
1710	}
1711
1712	UShort::UShort(const UShort &rhs)
1713	{
1714		Value *value = rhs.loadValue();
1715		storeValue(value);
1716	}
1717
1718	UShort::UShort(const Reference<UShort> &rhs)
1719	{
1720		Value *value = rhs.loadValue();
1721		storeValue(value);
1722	}
1723
1724	RValue<UShort> UShort::operator=(RValue<UShort> rhs) const
1725	{
1726		storeValue(rhs.value);
1727
1728		return rhs;
1729	}
1730
1731	RValue<UShort> UShort::operator=(const UShort &rhs) const
1732	{
1733		Value *value = rhs.loadValue();
1734		storeValue(value);
1735
1736		return RValue<UShort>(value);
1737	}
1738
1739	RValue<UShort> UShort::operator=(const Reference<UShort> &rhs) const
1740	{
1741		Value *value = rhs.loadValue();
1742		storeValue(value);
1743
1744		return RValue<UShort>(value);
1745	}
1746
1747	RValue<UShort> operator+(RValue<UShort> lhs, RValue<UShort> rhs)
1748	{
1749		return RValue<UShort>(Nucleus::createAdd(lhs.value, rhs.value));
1750	}
1751
1752	RValue<UShort> operator-(RValue<UShort> lhs, RValue<UShort> rhs)
1753	{
1754		return RValue<UShort>(Nucleus::createSub(lhs.value, rhs.value));
1755	}
1756
1757	RValue<UShort> operator*(RValue<UShort> lhs, RValue<UShort> rhs)
1758	{
1759		return RValue<UShort>(Nucleus::createMul(lhs.value, rhs.value));
1760	}
1761
1762	RValue<UShort> operator/(RValue<UShort> lhs, RValue<UShort> rhs)
1763	{
1764		return RValue<UShort>(Nucleus::createUDiv(lhs.value, rhs.value));
1765	}
1766
1767	RValue<UShort> operator%(RValue<UShort> lhs, RValue<UShort> rhs)
1768	{
1769		return RValue<UShort>(Nucleus::createURem(lhs.value, rhs.value));
1770	}
1771
1772	RValue<UShort> operator&(RValue<UShort> lhs, RValue<UShort> rhs)
1773	{
1774		return RValue<UShort>(Nucleus::createAnd(lhs.value, rhs.value));
1775	}
1776
1777	RValue<UShort> operator|(RValue<UShort> lhs, RValue<UShort> rhs)
1778	{
1779		return RValue<UShort>(Nucleus::createOr(lhs.value, rhs.value));
1780	}
1781
1782	RValue<UShort> operator^(RValue<UShort> lhs, RValue<UShort> rhs)
1783	{
1784		return RValue<UShort>(Nucleus::createXor(lhs.value, rhs.value));
1785	}
1786
1787	RValue<UShort> operator<<(RValue<UShort> lhs, RValue<UShort> rhs)
1788	{
1789		return RValue<UShort>(Nucleus::createShl(lhs.value, rhs.value));
1790	}
1791
1792	RValue<UShort> operator>>(RValue<UShort> lhs, RValue<UShort> rhs)
1793	{
1794		return RValue<UShort>(Nucleus::createLShr(lhs.value, rhs.value));
1795	}
1796
1797	RValue<UShort> operator+=(const UShort &lhs, RValue<UShort> rhs)
1798	{
1799		return lhs = lhs + rhs;
1800	}
1801
1802	RValue<UShort> operator-=(const UShort &lhs, RValue<UShort> rhs)
1803	{
1804		return lhs = lhs - rhs;
1805	}
1806
1807	RValue<UShort> operator*=(const UShort &lhs, RValue<UShort> rhs)
1808	{
1809		return lhs = lhs * rhs;
1810	}
1811
1812	RValue<UShort> operator/=(const UShort &lhs, RValue<UShort> rhs)
1813	{
1814		return lhs = lhs / rhs;
1815	}
1816
1817	RValue<UShort> operator%=(const UShort &lhs, RValue<UShort> rhs)
1818	{
1819		return lhs = lhs % rhs;
1820	}
1821
1822	RValue<UShort> operator&=(const UShort &lhs, RValue<UShort> rhs)
1823	{
1824		return lhs = lhs & rhs;
1825	}
1826
1827	RValue<UShort> operator|=(const UShort &lhs, RValue<UShort> rhs)
1828	{
1829		return lhs = lhs | rhs;
1830	}
1831
1832	RValue<UShort> operator^=(const UShort &lhs, RValue<UShort> rhs)
1833	{
1834		return lhs = lhs ^ rhs;
1835	}
1836
1837	RValue<UShort> operator<<=(const UShort &lhs, RValue<UShort> rhs)
1838	{
1839		return lhs = lhs << rhs;
1840	}
1841
1842	RValue<UShort> operator>>=(const UShort &lhs, RValue<UShort> rhs)
1843	{
1844		return lhs = lhs >> rhs;
1845	}
1846
	// Unary plus: returns the operand unchanged; no Nucleus call is made here.
	RValue<UShort> operator+(RValue<UShort> val)
	{
		return val;
	}
1851
1852	RValue<UShort> operator-(RValue<UShort> val)
1853	{
1854		return RValue<UShort>(Nucleus::createNeg(val.value));
1855	}
1856
1857	RValue<UShort> operator~(RValue<UShort> val)
1858	{
1859		return RValue<UShort>(Nucleus::createNot(val.value));
1860	}
1861
1862	RValue<UShort> operator++(const UShort &val, int)   // Post-increment
1863	{
1864		RValue<UShort> res = val;
1865
1866		Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantShort((unsigned short)1)));
1867		val.storeValue(inc);
1868
1869		return res;
1870	}
1871
1872	const UShort &operator++(const UShort &val)   // Pre-increment
1873	{
1874		Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantShort((unsigned short)1)));
1875		val.storeValue(inc);
1876
1877		return val;
1878	}
1879
1880	RValue<UShort> operator--(const UShort &val, int)   // Post-decrement
1881	{
1882		RValue<UShort> res = val;
1883
1884		Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantShort((unsigned short)1)));
1885		val.storeValue(inc);
1886
1887		return res;
1888	}
1889
1890	const UShort &operator--(const UShort &val)   // Pre-decrement
1891	{
1892		Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantShort((unsigned short)1)));
1893		val.storeValue(inc);
1894
1895		return val;
1896	}
1897
1898	RValue<Bool> operator<(RValue<UShort> lhs, RValue<UShort> rhs)
1899	{
1900		return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
1901	}
1902
1903	RValue<Bool> operator<=(RValue<UShort> lhs, RValue<UShort> rhs)
1904	{
1905		return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
1906	}
1907
1908	RValue<Bool> operator>(RValue<UShort> lhs, RValue<UShort> rhs)
1909	{
1910		return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
1911	}
1912
1913	RValue<Bool> operator>=(RValue<UShort> lhs, RValue<UShort> rhs)
1914	{
1915		return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
1916	}
1917
1918	RValue<Bool> operator!=(RValue<UShort> lhs, RValue<UShort> rhs)
1919	{
1920		return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
1921	}
1922
1923	RValue<Bool> operator==(RValue<UShort> lhs, RValue<UShort> rhs)
1924	{
1925		return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
1926	}
1927
1928	Type *UShort::getType()
1929	{
1930		return T(llvm::Type::getInt16Ty(*::context));
1931	}
1932
1933	Byte4::Byte4(RValue<Byte8> cast)
1934	{
1935	//	xyzw.parent = this;
1936
1937		storeValue(Nucleus::createTrunc(Nucleus::createBitCast(cast.value, Long::getType()), Int::getType()));
1938	}
1939
1940	Byte4::Byte4(const Reference<Byte4> &rhs)
1941	{
1942	//	xyzw.parent = this;
1943
1944		Value *value = rhs.loadValue();
1945		storeValue(value);
1946	}
1947
	// Returns the type used to represent Byte4.
	// The 4 x Byte vector form is compiled out (#if 0); UInt's type is
	// returned instead, for the reason given in the FIXME below.
	Type *Byte4::getType()
	{
		#if 0
			return T(VectorType::get(Byte::getType(), 4));
		#else
			return UInt::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
		#endif
	}
1956
	// Returns the type used to represent SByte4.
	// The 4 x SByte vector form is compiled out (#if 0); Int's type is
	// returned instead, for the reason given in the FIXME below.
	Type *SByte4::getType()
	{
		#if 0
			return T(VectorType::get(SByte::getType(), 4));
		#else
			return Int::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
		#endif
	}
1965
	// Default constructor; it does not call storeValue(). The only line in
	// the body is disabled (commented-out) code.
	Byte8::Byte8()
	{
	//	xyzw.parent = this;
	}
1970
1971	Byte8::Byte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
1972	{
1973	//	xyzw.parent = this;
1974
1975		int64_t constantVector[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
1976		Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Byte::getType(), 8))));
1977
1978		storeValue(Nucleus::createBitCast(vector, getType()));
1979	}
1980
1981	Byte8::Byte8(RValue<Byte8> rhs)
1982	{
1983	//	xyzw.parent = this;
1984
1985		storeValue(rhs.value);
1986	}
1987
1988	Byte8::Byte8(const Byte8 &rhs)
1989	{
1990	//	xyzw.parent = this;
1991
1992		Value *value = rhs.loadValue();
1993		storeValue(value);
1994	}
1995
1996	Byte8::Byte8(const Reference<Byte8> &rhs)
1997	{
1998	//	xyzw.parent = this;
1999
2000		Value *value = rhs.loadValue();
2001		storeValue(value);
2002	}
2003
2004	RValue<Byte8> Byte8::operator=(RValue<Byte8> rhs) const
2005	{
2006		storeValue(rhs.value);
2007
2008		return rhs;
2009	}
2010
2011	RValue<Byte8> Byte8::operator=(const Byte8 &rhs) const
2012	{
2013		Value *value = rhs.loadValue();
2014		storeValue(value);
2015
2016		return RValue<Byte8>(value);
2017	}
2018
2019	RValue<Byte8> Byte8::operator=(const Reference<Byte8> &rhs) const
2020	{
2021		Value *value = rhs.loadValue();
2022		storeValue(value);
2023
2024		return RValue<Byte8>(value);
2025	}
2026
2027	RValue<Byte8> operator+(RValue<Byte8> lhs, RValue<Byte8> rhs)
2028	{
2029		if(CPUID::supportsMMX2())
2030		{
2031			return x86::paddb(lhs, rhs);
2032		}
2033		else
2034		{
2035			return RValue<Byte8>(Nucleus::createAdd(lhs.value, rhs.value));
2036		}
2037	}
2038
2039	RValue<Byte8> operator-(RValue<Byte8> lhs, RValue<Byte8> rhs)
2040	{
2041		if(CPUID::supportsMMX2())
2042		{
2043			return x86::psubb(lhs, rhs);
2044		}
2045		else
2046		{
2047			return RValue<Byte8>(Nucleus::createSub(lhs.value, rhs.value));
2048		}
2049	}
2050
2051//	RValue<Byte8> operator*(RValue<Byte8> lhs, RValue<Byte8> rhs)
2052//	{
2053//		return RValue<Byte8>(Nucleus::createMul(lhs.value, rhs.value));
2054//	}
2055
2056//	RValue<Byte8> operator/(RValue<Byte8> lhs, RValue<Byte8> rhs)
2057//	{
2058//		return RValue<Byte8>(Nucleus::createUDiv(lhs.value, rhs.value));
2059//	}
2060
2061//	RValue<Byte8> operator%(RValue<Byte8> lhs, RValue<Byte8> rhs)
2062//	{
2063//		return RValue<Byte8>(Nucleus::createURem(lhs.value, rhs.value));
2064//	}
2065
2066	RValue<Byte8> operator&(RValue<Byte8> lhs, RValue<Byte8> rhs)
2067	{
2068		if(CPUID::supportsMMX2())
2069		{
2070			return As<Byte8>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
2071		}
2072		else
2073		{
2074			return RValue<Byte8>(Nucleus::createAnd(lhs.value, rhs.value));
2075		}
2076	}
2077
2078	RValue<Byte8> operator|(RValue<Byte8> lhs, RValue<Byte8> rhs)
2079	{
2080		if(CPUID::supportsMMX2())
2081		{
2082			return As<Byte8>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
2083		}
2084		else
2085		{
2086			return RValue<Byte8>(Nucleus::createOr(lhs.value, rhs.value));
2087		}
2088	}
2089
2090	RValue<Byte8> operator^(RValue<Byte8> lhs, RValue<Byte8> rhs)
2091	{
2092		if(CPUID::supportsMMX2())
2093		{
2094			return As<Byte8>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
2095		}
2096		else
2097		{
2098			return RValue<Byte8>(Nucleus::createXor(lhs.value, rhs.value));
2099		}
2100	}
2101
2102//	RValue<Byte8> operator<<(RValue<Byte8> lhs, unsigned char rhs)
2103//	{
2104//		return RValue<Byte8>(Nucleus::createShl(lhs.value, rhs.value));
2105//	}
2106
2107//	RValue<Byte8> operator>>(RValue<Byte8> lhs, unsigned char rhs)
2108//	{
2109//		return RValue<Byte8>(Nucleus::createLShr(lhs.value, rhs.value));
2110//	}
2111
2112	RValue<Byte8> operator+=(const Byte8 &lhs, RValue<Byte8> rhs)
2113	{
2114		return lhs = lhs + rhs;
2115	}
2116
2117	RValue<Byte8> operator-=(const Byte8 &lhs, RValue<Byte8> rhs)
2118	{
2119		return lhs = lhs - rhs;
2120	}
2121
2122//	RValue<Byte8> operator*=(const Byte8 &lhs, RValue<Byte8> rhs)
2123//	{
2124//		return lhs = lhs * rhs;
2125//	}
2126
2127//	RValue<Byte8> operator/=(const Byte8 &lhs, RValue<Byte8> rhs)
2128//	{
2129//		return lhs = lhs / rhs;
2130//	}
2131
2132//	RValue<Byte8> operator%=(const Byte8 &lhs, RValue<Byte8> rhs)
2133//	{
2134//		return lhs = lhs % rhs;
2135//	}
2136
2137	RValue<Byte8> operator&=(const Byte8 &lhs, RValue<Byte8> rhs)
2138	{
2139		return lhs = lhs & rhs;
2140	}
2141
2142	RValue<Byte8> operator|=(const Byte8 &lhs, RValue<Byte8> rhs)
2143	{
2144		return lhs = lhs | rhs;
2145	}
2146
2147	RValue<Byte8> operator^=(const Byte8 &lhs, RValue<Byte8> rhs)
2148	{
2149		return lhs = lhs ^ rhs;
2150	}
2151
2152//	RValue<Byte8> operator<<=(const Byte8 &lhs, RValue<Byte8> rhs)
2153//	{
2154//		return lhs = lhs << rhs;
2155//	}
2156
2157//	RValue<Byte8> operator>>=(const Byte8 &lhs, RValue<Byte8> rhs)
2158//	{
2159//		return lhs = lhs >> rhs;
2160//	}
2161
2162//	RValue<Byte8> operator+(RValue<Byte8> val)
2163//	{
2164//		return val;
2165//	}
2166
2167//	RValue<Byte8> operator-(RValue<Byte8> val)
2168//	{
2169//		return RValue<Byte8>(Nucleus::createNeg(val.value));
2170//	}
2171
2172	RValue<Byte8> operator~(RValue<Byte8> val)
2173	{
2174		if(CPUID::supportsMMX2())
2175		{
2176			return val ^ Byte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
2177		}
2178		else
2179		{
2180			return RValue<Byte8>(Nucleus::createNot(val.value));
2181		}
2182	}
2183
2184	RValue<Byte8> AddSat(RValue<Byte8> x, RValue<Byte8> y)
2185	{
2186		return x86::paddusb(x, y);
2187	}
2188
2189	RValue<Byte8> SubSat(RValue<Byte8> x, RValue<Byte8> y)
2190	{
2191		return x86::psubusb(x, y);
2192	}
2193
2194	RValue<Short4> Unpack(RValue<Byte4> x)
2195	{
2196		Value *int2 = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), x.value, 0);
2197		Value *byte8 = Nucleus::createBitCast(int2, Byte8::getType());
2198
2199		return UnpackLow(RValue<Byte8>(byte8), RValue<Byte8>(byte8));
2200	}
2201
2202	RValue<Short4> UnpackLow(RValue<Byte8> x, RValue<Byte8> y)
2203	{
2204		if(CPUID::supportsMMX2())
2205		{
2206			return x86::punpcklbw(x, y);
2207		}
2208		else
2209		{
2210			int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
2211			Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2212
2213			return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
2214		}
2215	}
2216
2217	RValue<Short4> UnpackHigh(RValue<Byte8> x, RValue<Byte8> y)
2218	{
2219		if(CPUID::supportsMMX2())
2220		{
2221			return x86::punpckhbw(x, y);
2222		}
2223		else
2224		{
2225			int shuffle[8] = {4, 12, 5, 13, 6, 14, 7, 15};
2226			Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2227
2228			return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
2229		}
2230	}
2231
2232	RValue<Int> SignMask(RValue<Byte8> x)
2233	{
2234		return x86::pmovmskb(x);
2235	}
2236
2237//	RValue<Byte8> CmpGT(RValue<Byte8> x, RValue<Byte8> y)
2238//	{
2239//		return x86::pcmpgtb(x, y);   // FIXME: Signedness
2240//	}
2241
2242	RValue<Byte8> CmpEQ(RValue<Byte8> x, RValue<Byte8> y)
2243	{
2244		return x86::pcmpeqb(x, y);
2245	}
2246
2247	Type *Byte8::getType()
2248	{
2249		if(CPUID::supportsMMX2())
2250		{
2251			return MMX::getType();
2252		}
2253		else
2254		{
2255			return T(VectorType::get(Byte::getType(), 8));
2256		}
2257	}
2258
	// Default constructor; it does not call storeValue(). The only line in
	// the body is disabled (commented-out) code.
	SByte8::SByte8()
	{
	//	xyzw.parent = this;
	}
2263
2264	SByte8::SByte8(uint8_t x0, uint8_t x1, uint8_t x2, uint8_t x3, uint8_t x4, uint8_t x5, uint8_t x6, uint8_t x7)
2265	{
2266	//	xyzw.parent = this;
2267
2268		int64_t constantVector[8] = {x0, x1, x2, x3, x4, x5, x6, x7};
2269		Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(SByte::getType(), 8))));
2270
2271		storeValue(Nucleus::createBitCast(vector, getType()));
2272	}
2273
2274	SByte8::SByte8(RValue<SByte8> rhs)
2275	{
2276	//	xyzw.parent = this;
2277
2278		storeValue(rhs.value);
2279	}
2280
2281	SByte8::SByte8(const SByte8 &rhs)
2282	{
2283	//	xyzw.parent = this;
2284
2285		Value *value = rhs.loadValue();
2286		storeValue(value);
2287	}
2288
2289	SByte8::SByte8(const Reference<SByte8> &rhs)
2290	{
2291	//	xyzw.parent = this;
2292
2293		Value *value = rhs.loadValue();
2294		storeValue(value);
2295	}
2296
2297	RValue<SByte8> SByte8::operator=(RValue<SByte8> rhs) const
2298	{
2299		storeValue(rhs.value);
2300
2301		return rhs;
2302	}
2303
2304	RValue<SByte8> SByte8::operator=(const SByte8 &rhs) const
2305	{
2306		Value *value = rhs.loadValue();
2307		storeValue(value);
2308
2309		return RValue<SByte8>(value);
2310	}
2311
2312	RValue<SByte8> SByte8::operator=(const Reference<SByte8> &rhs) const
2313	{
2314		Value *value = rhs.loadValue();
2315		storeValue(value);
2316
2317		return RValue<SByte8>(value);
2318	}
2319
2320	RValue<SByte8> operator+(RValue<SByte8> lhs, RValue<SByte8> rhs)
2321	{
2322		if(CPUID::supportsMMX2())
2323		{
2324			return As<SByte8>(x86::paddb(As<Byte8>(lhs), As<Byte8>(rhs)));
2325		}
2326		else
2327		{
2328			return RValue<SByte8>(Nucleus::createAdd(lhs.value, rhs.value));
2329		}
2330	}
2331
2332	RValue<SByte8> operator-(RValue<SByte8> lhs, RValue<SByte8> rhs)
2333	{
2334		if(CPUID::supportsMMX2())
2335		{
2336			return As<SByte8>(x86::psubb(As<Byte8>(lhs), As<Byte8>(rhs)));
2337		}
2338		else
2339		{
2340			return RValue<SByte8>(Nucleus::createSub(lhs.value, rhs.value));
2341		}
2342	}
2343
2344//	RValue<SByte8> operator*(RValue<SByte8> lhs, RValue<SByte8> rhs)
2345//	{
2346//		return RValue<SByte8>(Nucleus::createMul(lhs.value, rhs.value));
2347//	}
2348
2349//	RValue<SByte8> operator/(RValue<SByte8> lhs, RValue<SByte8> rhs)
2350//	{
2351//		return RValue<SByte8>(Nucleus::createSDiv(lhs.value, rhs.value));
2352//	}
2353
2354//	RValue<SByte8> operator%(RValue<SByte8> lhs, RValue<SByte8> rhs)
2355//	{
2356//		return RValue<SByte8>(Nucleus::createSRem(lhs.value, rhs.value));
2357//	}
2358
2359	RValue<SByte8> operator&(RValue<SByte8> lhs, RValue<SByte8> rhs)
2360	{
2361		return RValue<SByte8>(Nucleus::createAnd(lhs.value, rhs.value));
2362	}
2363
2364	RValue<SByte8> operator|(RValue<SByte8> lhs, RValue<SByte8> rhs)
2365	{
2366		return RValue<SByte8>(Nucleus::createOr(lhs.value, rhs.value));
2367	}
2368
2369	RValue<SByte8> operator^(RValue<SByte8> lhs, RValue<SByte8> rhs)
2370	{
2371		return RValue<SByte8>(Nucleus::createXor(lhs.value, rhs.value));
2372	}
2373
2374//	RValue<SByte8> operator<<(RValue<SByte8> lhs, unsigned char rhs)
2375//	{
2376//		return RValue<SByte8>(Nucleus::createShl(lhs.value, rhs.value));
2377//	}
2378
2379//	RValue<SByte8> operator>>(RValue<SByte8> lhs, unsigned char rhs)
2380//	{
2381//		return RValue<SByte8>(Nucleus::createAShr(lhs.value, rhs.value));
2382//	}
2383
2384	RValue<SByte8> operator+=(const SByte8 &lhs, RValue<SByte8> rhs)
2385	{
2386		return lhs = lhs + rhs;
2387	}
2388
2389	RValue<SByte8> operator-=(const SByte8 &lhs, RValue<SByte8> rhs)
2390	{
2391		return lhs = lhs - rhs;
2392	}
2393
2394//	RValue<SByte8> operator*=(const SByte8 &lhs, RValue<SByte8> rhs)
2395//	{
2396//		return lhs = lhs * rhs;
2397//	}
2398
2399//	RValue<SByte8> operator/=(const SByte8 &lhs, RValue<SByte8> rhs)
2400//	{
2401//		return lhs = lhs / rhs;
2402//	}
2403
2404//	RValue<SByte8> operator%=(const SByte8 &lhs, RValue<SByte8> rhs)
2405//	{
2406//		return lhs = lhs % rhs;
2407//	}
2408
2409	RValue<SByte8> operator&=(const SByte8 &lhs, RValue<SByte8> rhs)
2410	{
2411		return lhs = lhs & rhs;
2412	}
2413
2414	RValue<SByte8> operator|=(const SByte8 &lhs, RValue<SByte8> rhs)
2415	{
2416		return lhs = lhs | rhs;
2417	}
2418
2419	RValue<SByte8> operator^=(const SByte8 &lhs, RValue<SByte8> rhs)
2420	{
2421		return lhs = lhs ^ rhs;
2422	}
2423
2424//	RValue<SByte8> operator<<=(const SByte8 &lhs, RValue<SByte8> rhs)
2425//	{
2426//		return lhs = lhs << rhs;
2427//	}
2428
2429//	RValue<SByte8> operator>>=(const SByte8 &lhs, RValue<SByte8> rhs)
2430//	{
2431//		return lhs = lhs >> rhs;
2432//	}
2433
2434//	RValue<SByte8> operator+(RValue<SByte8> val)
2435//	{
2436//		return val;
2437//	}
2438
2439//	RValue<SByte8> operator-(RValue<SByte8> val)
2440//	{
2441//		return RValue<SByte8>(Nucleus::createNeg(val.value));
2442//	}
2443
2444	RValue<SByte8> operator~(RValue<SByte8> val)
2445	{
2446		if(CPUID::supportsMMX2())
2447		{
2448			return val ^ SByte8(0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF);
2449		}
2450		else
2451		{
2452			return RValue<SByte8>(Nucleus::createNot(val.value));
2453		}
2454	}
2455
2456	RValue<SByte8> AddSat(RValue<SByte8> x, RValue<SByte8> y)
2457	{
2458		return x86::paddsb(x, y);
2459	}
2460
2461	RValue<SByte8> SubSat(RValue<SByte8> x, RValue<SByte8> y)
2462	{
2463		return x86::psubsb(x, y);
2464	}
2465
2466	RValue<Short4> UnpackLow(RValue<SByte8> x, RValue<SByte8> y)
2467	{
2468		if(CPUID::supportsMMX2())
2469		{
2470			return As<Short4>(x86::punpcklbw(As<Byte8>(x), As<Byte8>(y)));
2471		}
2472		else
2473		{
2474			int shuffle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
2475			Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2476
2477			return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
2478		}
2479	}
2480
2481	RValue<Short4> UnpackHigh(RValue<SByte8> x, RValue<SByte8> y)
2482	{
2483		if(CPUID::supportsMMX2())
2484		{
2485			return As<Short4>(x86::punpckhbw(As<Byte8>(x), As<Byte8>(y)));
2486		}
2487		else
2488		{
2489			int shuffle[8] = {4, 12, 5, 13, 6, 14, 7, 15};
2490			Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
2491
2492			return RValue<Short4>(Nucleus::createBitCast(packed, Short4::getType()));
2493		}
2494	}
2495
2496	RValue<Int> SignMask(RValue<SByte8> x)
2497	{
2498		return x86::pmovmskb(As<Byte8>(x));
2499	}
2500
2501	RValue<Byte8> CmpGT(RValue<SByte8> x, RValue<SByte8> y)
2502	{
2503		return x86::pcmpgtb(x, y);
2504	}
2505
2506	RValue<Byte8> CmpEQ(RValue<SByte8> x, RValue<SByte8> y)
2507	{
2508		return x86::pcmpeqb(As<Byte8>(x), As<Byte8>(y));
2509	}
2510
2511	Type *SByte8::getType()
2512	{
2513		if(CPUID::supportsMMX2())
2514		{
2515			return MMX::getType();
2516		}
2517		else
2518		{
2519			return T(VectorType::get(SByte::getType(), 8));
2520		}
2521	}
2522
2523	Byte16::Byte16(RValue<Byte16> rhs)
2524	{
2525	//	xyzw.parent = this;
2526
2527		storeValue(rhs.value);
2528	}
2529
2530	Byte16::Byte16(const Byte16 &rhs)
2531	{
2532	//	xyzw.parent = this;
2533
2534		Value *value = rhs.loadValue();
2535		storeValue(value);
2536	}
2537
2538	Byte16::Byte16(const Reference<Byte16> &rhs)
2539	{
2540	//	xyzw.parent = this;
2541
2542		Value *value = rhs.loadValue();
2543		storeValue(value);
2544	}
2545
2546	RValue<Byte16> Byte16::operator=(RValue<Byte16> rhs) const
2547	{
2548		storeValue(rhs.value);
2549
2550		return rhs;
2551	}
2552
2553	RValue<Byte16> Byte16::operator=(const Byte16 &rhs) const
2554	{
2555		Value *value = rhs.loadValue();
2556		storeValue(value);
2557
2558		return RValue<Byte16>(value);
2559	}
2560
2561	RValue<Byte16> Byte16::operator=(const Reference<Byte16> &rhs) const
2562	{
2563		Value *value = rhs.loadValue();
2564		storeValue(value);
2565
2566		return RValue<Byte16>(value);
2567	}
2568
2569	Type *Byte16::getType()
2570	{
2571		return T(VectorType::get(Byte::getType(), 16));
2572	}
2573
2574	Type *SByte16::getType()
2575	{
2576		return T( VectorType::get(SByte::getType(), 16));
2577	}
2578
2579	Short2::Short2(RValue<Short4> cast)
2580	{
2581		storeValue(Nucleus::createTrunc(Nucleus::createBitCast(cast.value, Long::getType()), UInt::getType()));
2582	}
2583
2584	Type *Short2::getType()
2585	{
2586		#if 0
2587			return T(VectorType::get(Short::getType(), 2));
2588		#else
2589			return UInt::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
2590		#endif
2591	}
2592
2593	UShort2::UShort2(RValue<UShort4> cast)
2594	{
2595		storeValue(Nucleus::createTrunc(Nucleus::createBitCast(cast.value, Long::getType()), UInt::getType()));
2596	}
2597
2598	Type *UShort2::getType()
2599	{
2600		#if 0
2601			return T(VectorType::get(UShort::getType(), 2));
2602		#else
2603			return UInt::getType();   // FIXME: LLVM doesn't manipulate it as one 32-bit block
2604		#endif
2605	}
2606
2607	Short4::Short4(RValue<Int> cast)
2608	{
2609		Value *extend = Nucleus::createZExt(cast.value, Long::getType());
2610		Value *swizzle = Swizzle(RValue<Short4>(extend), 0x00).value;
2611
2612		storeValue(swizzle);
2613	}
2614
	// Constructs a Short4 from an Int4 by keeping 16 bits out of each 32-bit lane
	// (no saturation is applied). Two active strategies exist, chosen by CPUID:
	// a pshuflw/pshufhw-style pair of word shuffles followed by createSwizzle4,
	// or a single byte shuffle when SSSE3 is available. Both "#if 0" sections are
	// disabled alternatives kept for reference.
	Short4::Short4(RValue<Int4> cast)
	{
		// View the four 32-bit lanes as eight 16-bit words (used by the non-SSSE3 path).
		Value *short8 = Nucleus::createBitCast(cast.value, Short8::getType());

		#if 0   // FIXME: Check codegen (pshuflw pshufhw pshufd)
			Constant *pack[8];
			pack[0] = Nucleus::createConstantInt(0);
			pack[1] = Nucleus::createConstantInt(2);
			pack[2] = Nucleus::createConstantInt(4);
			pack[3] = Nucleus::createConstantInt(6);

			Value *short4 = Nucleus::createShuffleVector(short8, short8, Nucleus::createConstantVector(pack, 4));
		#else
			Value *packed;

			// FIXME: Use Swizzle<Short8>
			if(!CPUID::supportsSSSE3())
			{
				// Word indices 0 and 2 are the even words of the low half,
				// 4 and 6 the even words of the high half.
				int pshuflw[8] = {0, 2, 0, 2, 4, 5, 6, 7};
				int pshufhw[8] = {0, 1, 2, 3, 4, 6, 4, 6};

				Value *shuffle1 = Nucleus::createShuffleVector(short8, short8, pshuflw);
				Value *shuffle2 = Nucleus::createShuffleVector(shuffle1, shuffle1, pshufhw);
				Value *int4 = Nucleus::createBitCast(shuffle2, Int4::getType());
				// NOTE(review): 0x88 presumably selects 32-bit elements 0 and 2, bringing
				// the two packed word pairs together; confirm against createSwizzle4.
				packed = createSwizzle4(int4, 0x88);
			}
			else
			{
				// Byte indices 0,1 / 4,5 / 8,9 / 12,13 are the first two bytes of each
				// 32-bit lane; the pattern is repeated for the upper eight bytes.
				int pshufb[16] = {0, 1, 4, 5, 8, 9, 12, 13, 0, 1, 4, 5, 8, 9, 12, 13};
				Value *byte16 = Nucleus::createBitCast(cast.value, Byte16::getType());
				packed = Nucleus::createShuffleVector(byte16, byte16, pshufb);
			}

			#if 0   // FIXME: No optimal instruction selection
				Value *qword2 = Nucleus::createBitCast(packed, T(VectorType::get(Long::getType(), 2)));
				Value *element = Nucleus::createExtractElement(qword2, 0);
				Value *short4 = Nucleus::createBitCast(element, Short4::getType());
			#else   // FIXME: Requires SSE
				// Narrow the 128-bit result to Int2 via the Int2(RValue<Int4>) constructor,
				// then reinterpret it as Short4.
				Value *int2 = RValue<Int2>(Int2(RValue<Int4>(packed))).value;
				Value *short4 = Nucleus::createBitCast(int2, Short4::getType());
			#endif
		#endif

		storeValue(short4);
	}
2660
2661//	Short4::Short4(RValue<Float> cast)
2662//	{
2663//	}
2664
2665	Short4::Short4(RValue<Float4> cast)
2666	{
2667		Int4 v4i32 = Int4(cast);
2668		v4i32 = As<Int4>(x86::packssdw(v4i32, v4i32));
2669
2670		storeValue(As<Short4>(Int2(v4i32)).value);
2671	}
2672
	// Default constructor: no initial value is stored by this body.
	Short4::Short4()
	{
	//	xyzw.parent = this;
	}
2677
2678	Short4::Short4(short xyzw)
2679	{
2680		//	xyzw.parent = this;
2681
2682		int64_t constantVector[4] = {xyzw, xyzw, xyzw, xyzw};
2683		Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Short::getType(), 4))));
2684
2685		storeValue(Nucleus::createBitCast(vector, getType()));
2686	}
2687
2688	Short4::Short4(short x, short y, short z, short w)
2689	{
2690	//	xyzw.parent = this;
2691
2692		int64_t constantVector[4] = {x, y, z, w};
2693		Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Short::getType(), 4))));
2694
2695		storeValue(Nucleus::createBitCast(vector, getType()));
2696	}
2697
2698	Short4::Short4(RValue<Short4> rhs)
2699	{
2700	//	xyzw.parent = this;
2701
2702		storeValue(rhs.value);
2703	}
2704
2705	Short4::Short4(const Short4 &rhs)
2706	{
2707	//	xyzw.parent = this;
2708
2709		Value *value = rhs.loadValue();
2710		storeValue(value);
2711	}
2712
2713	Short4::Short4(const Reference<Short4> &rhs)
2714	{
2715	//	xyzw.parent = this;
2716
2717		Value *value = rhs.loadValue();
2718		storeValue(value);
2719	}
2720
2721	Short4::Short4(RValue<UShort4> rhs)
2722	{
2723	//	xyzw.parent = this;
2724
2725		storeValue(rhs.value);
2726	}
2727
2728	Short4::Short4(const UShort4 &rhs)
2729	{
2730	//	xyzw.parent = this;
2731
2732		storeValue(rhs.loadValue());
2733	}
2734
2735	Short4::Short4(const Reference<UShort4> &rhs)
2736	{
2737	//	xyzw.parent = this;
2738
2739		storeValue(rhs.loadValue());
2740	}
2741
2742	RValue<Short4> Short4::operator=(RValue<Short4> rhs) const
2743	{
2744		storeValue(rhs.value);
2745
2746		return rhs;
2747	}
2748
2749	RValue<Short4> Short4::operator=(const Short4 &rhs) const
2750	{
2751		Value *value = rhs.loadValue();
2752		storeValue(value);
2753
2754		return RValue<Short4>(value);
2755	}
2756
2757	RValue<Short4> Short4::operator=(const Reference<Short4> &rhs) const
2758	{
2759		Value *value = rhs.loadValue();
2760		storeValue(value);
2761
2762		return RValue<Short4>(value);
2763	}
2764
2765	RValue<Short4> Short4::operator=(RValue<UShort4> rhs) const
2766	{
2767		storeValue(rhs.value);
2768
2769		return RValue<Short4>(rhs);
2770	}
2771
2772	RValue<Short4> Short4::operator=(const UShort4 &rhs) const
2773	{
2774		Value *value = rhs.loadValue();
2775		storeValue(value);
2776
2777		return RValue<Short4>(value);
2778	}
2779
2780	RValue<Short4> Short4::operator=(const Reference<UShort4> &rhs) const
2781	{
2782		Value *value = rhs.loadValue();
2783		storeValue(value);
2784
2785		return RValue<Short4>(value);
2786	}
2787
2788	RValue<Short4> operator+(RValue<Short4> lhs, RValue<Short4> rhs)
2789	{
2790		if(CPUID::supportsMMX2())
2791		{
2792			return x86::paddw(lhs, rhs);
2793		}
2794		else
2795		{
2796			return RValue<Short4>(Nucleus::createAdd(lhs.value, rhs.value));
2797		}
2798	}
2799
2800	RValue<Short4> operator-(RValue<Short4> lhs, RValue<Short4> rhs)
2801	{
2802		if(CPUID::supportsMMX2())
2803		{
2804			return x86::psubw(lhs, rhs);
2805		}
2806		else
2807		{
2808			return RValue<Short4>(Nucleus::createSub(lhs.value, rhs.value));
2809		}
2810	}
2811
2812	RValue<Short4> operator*(RValue<Short4> lhs, RValue<Short4> rhs)
2813	{
2814		if(CPUID::supportsMMX2())
2815		{
2816			return x86::pmullw(lhs, rhs);
2817		}
2818		else
2819		{
2820			return RValue<Short4>(Nucleus::createMul(lhs.value, rhs.value));
2821		}
2822	}
2823
2824//	RValue<Short4> operator/(RValue<Short4> lhs, RValue<Short4> rhs)
2825//	{
2826//		return RValue<Short4>(Nucleus::createSDiv(lhs.value, rhs.value));
2827//	}
2828
2829//	RValue<Short4> operator%(RValue<Short4> lhs, RValue<Short4> rhs)
2830//	{
2831//		return RValue<Short4>(Nucleus::createSRem(lhs.value, rhs.value));
2832//	}
2833
2834	RValue<Short4> operator&(RValue<Short4> lhs, RValue<Short4> rhs)
2835	{
2836		if(CPUID::supportsMMX2())
2837		{
2838			return x86::pand(lhs, rhs);
2839		}
2840		else
2841		{
2842			return RValue<Short4>(Nucleus::createAnd(lhs.value, rhs.value));
2843		}
2844	}
2845
2846	RValue<Short4> operator|(RValue<Short4> lhs, RValue<Short4> rhs)
2847	{
2848		if(CPUID::supportsMMX2())
2849		{
2850			return x86::por(lhs, rhs);
2851		}
2852		else
2853		{
2854			return RValue<Short4>(Nucleus::createOr(lhs.value, rhs.value));
2855		}
2856	}
2857
2858	RValue<Short4> operator^(RValue<Short4> lhs, RValue<Short4> rhs)
2859	{
2860		if(CPUID::supportsMMX2())
2861		{
2862			return x86::pxor(lhs, rhs);
2863		}
2864		else
2865		{
2866			return RValue<Short4>(Nucleus::createXor(lhs.value, rhs.value));
2867		}
2868	}
2869
2870	RValue<Short4> operator<<(RValue<Short4> lhs, unsigned char rhs)
2871	{
2872	//	return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
2873
2874		return x86::psllw(lhs, rhs);
2875	}
2876
2877	RValue<Short4> operator>>(RValue<Short4> lhs, unsigned char rhs)
2878	{
2879	//	return RValue<Short4>(Nucleus::createAShr(lhs.value, rhs.value));
2880
2881		return x86::psraw(lhs, rhs);
2882	}
2883
2884	RValue<Short4> operator<<(RValue<Short4> lhs, RValue<Long1> rhs)
2885	{
2886	//	return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
2887
2888		return x86::psllw(lhs, rhs);
2889	}
2890
2891	RValue<Short4> operator>>(RValue<Short4> lhs, RValue<Long1> rhs)
2892	{
2893	//	return RValue<Short4>(Nucleus::createAShr(lhs.value, rhs.value));
2894
2895		return x86::psraw(lhs, rhs);
2896	}
2897
2898	RValue<Short4> operator+=(const Short4 &lhs, RValue<Short4> rhs)
2899	{
2900		return lhs = lhs + rhs;
2901	}
2902
2903	RValue<Short4> operator-=(const Short4 &lhs, RValue<Short4> rhs)
2904	{
2905		return lhs = lhs - rhs;
2906	}
2907
2908	RValue<Short4> operator*=(const Short4 &lhs, RValue<Short4> rhs)
2909	{
2910		return lhs = lhs * rhs;
2911	}
2912
2913//	RValue<Short4> operator/=(const Short4 &lhs, RValue<Short4> rhs)
2914//	{
2915//		return lhs = lhs / rhs;
2916//	}
2917
2918//	RValue<Short4> operator%=(const Short4 &lhs, RValue<Short4> rhs)
2919//	{
2920//		return lhs = lhs % rhs;
2921//	}
2922
2923	RValue<Short4> operator&=(const Short4 &lhs, RValue<Short4> rhs)
2924	{
2925		return lhs = lhs & rhs;
2926	}
2927
2928	RValue<Short4> operator|=(const Short4 &lhs, RValue<Short4> rhs)
2929	{
2930		return lhs = lhs | rhs;
2931	}
2932
2933	RValue<Short4> operator^=(const Short4 &lhs, RValue<Short4> rhs)
2934	{
2935		return lhs = lhs ^ rhs;
2936	}
2937
2938	RValue<Short4> operator<<=(const Short4 &lhs, unsigned char rhs)
2939	{
2940		return lhs = lhs << rhs;
2941	}
2942
2943	RValue<Short4> operator>>=(const Short4 &lhs, unsigned char rhs)
2944	{
2945		return lhs = lhs >> rhs;
2946	}
2947
2948	RValue<Short4> operator<<=(const Short4 &lhs, RValue<Long1> rhs)
2949	{
2950		return lhs = lhs << rhs;
2951	}
2952
2953	RValue<Short4> operator>>=(const Short4 &lhs, RValue<Long1> rhs)
2954	{
2955		return lhs = lhs >> rhs;
2956	}
2957
2958//	RValue<Short4> operator+(RValue<Short4> val)
2959//	{
2960//		return val;
2961//	}
2962
2963	RValue<Short4> operator-(RValue<Short4> val)
2964	{
2965		if(CPUID::supportsMMX2())
2966		{
2967			return Short4(0, 0, 0, 0) - val;
2968		}
2969		else
2970		{
2971			return RValue<Short4>(Nucleus::createNeg(val.value));
2972		}
2973	}
2974
2975	RValue<Short4> operator~(RValue<Short4> val)
2976	{
2977		if(CPUID::supportsMMX2())
2978		{
2979			return val ^ Short4(0xFFFFu, 0xFFFFu, 0xFFFFu, 0xFFFFu);
2980		}
2981		else
2982		{
2983			return RValue<Short4>(Nucleus::createNot(val.value));
2984		}
2985	}
2986
2987	RValue<Short4> RoundShort4(RValue<Float4> cast)
2988	{
2989		RValue<Int4> v4i32 = x86::cvtps2dq(cast);
2990		RValue<Short8> v8i16 = x86::packssdw(v4i32, v4i32);
2991
2992		return As<Short4>(Int2(As<Int4>(v8i16)));
2993	}
2994
2995	RValue<Short4> Max(RValue<Short4> x, RValue<Short4> y)
2996	{
2997		return x86::pmaxsw(x, y);
2998	}
2999
3000	RValue<Short4> Min(RValue<Short4> x, RValue<Short4> y)
3001	{
3002		return x86::pminsw(x, y);
3003	}
3004
3005	RValue<Short4> AddSat(RValue<Short4> x, RValue<Short4> y)
3006	{
3007		return x86::paddsw(x, y);
3008	}
3009
3010	RValue<Short4> SubSat(RValue<Short4> x, RValue<Short4> y)
3011	{
3012		return x86::psubsw(x, y);
3013	}
3014
3015	RValue<Short4> MulHigh(RValue<Short4> x, RValue<Short4> y)
3016	{
3017		return x86::pmulhw(x, y);
3018	}
3019
3020	RValue<Int2> MulAdd(RValue<Short4> x, RValue<Short4> y)
3021	{
3022		return x86::pmaddwd(x, y);
3023	}
3024
3025	RValue<SByte8> Pack(RValue<Short4> x, RValue<Short4> y)
3026	{
3027		return x86::packsswb(x, y);
3028	}
3029
3030	RValue<Int2> UnpackLow(RValue<Short4> x, RValue<Short4> y)
3031	{
3032		if(CPUID::supportsMMX2())
3033		{
3034			return x86::punpcklwd(x, y);
3035		}
3036		else
3037		{
3038			int shuffle[4] = {0, 4, 1, 5};
3039			Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
3040
3041			return RValue<Int2>(Nucleus::createBitCast(packed, Int2::getType()));
3042		}
3043	}
3044
3045	RValue<Int2> UnpackHigh(RValue<Short4> x, RValue<Short4> y)
3046	{
3047		if(CPUID::supportsMMX2())
3048		{
3049			return x86::punpckhwd(x, y);
3050		}
3051		else
3052		{
3053			int shuffle[4] = {2, 6, 3, 7};
3054			Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
3055
3056			return RValue<Int2>(Nucleus::createBitCast(packed, Int2::getType()));
3057		}
3058	}
3059
3060	RValue<Short4> Swizzle(RValue<Short4> x, unsigned char select)
3061	{
3062		if(CPUID::supportsMMX2())
3063		{
3064			return x86::pshufw(x, select);
3065		}
3066		else
3067		{
3068			return RValue<Short4>(createSwizzle4(x.value, select));
3069		}
3070	}
3071
3072	RValue<Short4> Insert(RValue<Short4> val, RValue<Short> element, int i)
3073	{
3074		if(CPUID::supportsMMX2())
3075		{
3076			return x86::pinsrw(val, Int(element), i);
3077		}
3078		else
3079		{
3080			return RValue<Short4>(Nucleus::createInsertElement(val.value, element.value, i));
3081		}
3082	}
3083
3084	RValue<Short> Extract(RValue<Short4> val, int i)
3085	{
3086		if(CPUID::supportsMMX2())
3087		{
3088			return Short(x86::pextrw(val, i));
3089		}
3090		else
3091		{
3092			return RValue<Short>(Nucleus::createExtractElement(val.value, Short::getType(), i));
3093		}
3094	}
3095
3096	RValue<Short4> CmpGT(RValue<Short4> x, RValue<Short4> y)
3097	{
3098		return x86::pcmpgtw(x, y);
3099	}
3100
3101	RValue<Short4> CmpEQ(RValue<Short4> x, RValue<Short4> y)
3102	{
3103		return x86::pcmpeqw(x, y);
3104	}
3105
3106	Type *Short4::getType()
3107	{
3108		if(CPUID::supportsMMX2())
3109		{
3110			return MMX::getType();
3111		}
3112		else
3113		{
3114			return T(VectorType::get(Short::getType(), 4));
3115		}
3116	}
3117
3118	UShort4::UShort4(RValue<Int4> cast)
3119	{
3120		*this = Short4(cast);
3121	}
3122
3123	UShort4::UShort4(RValue<Float4> cast, bool saturate)
3124	{
3125		Float4 sat;
3126
3127		if(saturate)
3128		{
3129			if(CPUID::supportsSSE4_1())
3130			{
3131				sat = Min(cast, Float4(0xFFFF));   // packusdw takes care of 0x0000 saturation
3132			}
3133			else
3134			{
3135				sat = Max(Min(cast, Float4(0xFFFF)), Float4(0x0000));
3136			}
3137		}
3138		else
3139		{
3140			sat = cast;
3141		}
3142
3143		Int4 int4(sat);
3144
3145		if(!saturate || !CPUID::supportsSSE4_1())
3146		{
3147			*this = Short4(Int4(int4));
3148		}
3149		else
3150		{
3151			*this = As<Short4>(Int2(As<Int4>(x86::packusdw(As<UInt4>(int4), As<UInt4>(int4)))));
3152		}
3153	}
3154
	// Default constructor: no initial value is stored by this body.
	UShort4::UShort4()
	{
	//	xyzw.parent = this;
	}
3159
3160	UShort4::UShort4(unsigned short xyzw)
3161	{
3162		//	xyzw.parent = this;
3163
3164		int64_t constantVector[4] = {xyzw, xyzw, xyzw, xyzw};
3165		Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(UShort::getType(), 4))));
3166
3167		storeValue(Nucleus::createBitCast(vector, getType()));
3168	}
3169
3170	UShort4::UShort4(unsigned short x, unsigned short y, unsigned short z, unsigned short w)
3171	{
3172	//	xyzw.parent = this;
3173
3174		int64_t constantVector[4] = {x, y, z, w};
3175		Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(UShort::getType(), 4))));
3176
3177		storeValue(Nucleus::createBitCast(vector, getType()));
3178	}
3179
3180	UShort4::UShort4(RValue<UShort4> rhs)
3181	{
3182	//	xyzw.parent = this;
3183
3184		storeValue(rhs.value);
3185	}
3186
3187	UShort4::UShort4(const UShort4 &rhs)
3188	{
3189	//	xyzw.parent = this;
3190
3191		Value *value = rhs.loadValue();
3192		storeValue(value);
3193	}
3194
3195	UShort4::UShort4(const Reference<UShort4> &rhs)
3196	{
3197	//	xyzw.parent = this;
3198
3199		Value *value = rhs.loadValue();
3200		storeValue(value);
3201	}
3202
3203	UShort4::UShort4(RValue<Short4> rhs)
3204	{
3205	//	xyzw.parent = this;
3206
3207		storeValue(rhs.value);
3208	}
3209
3210	UShort4::UShort4(const Short4 &rhs)
3211	{
3212	//	xyzw.parent = this;
3213
3214		Value *value = rhs.loadValue();
3215		storeValue(value);
3216	}
3217
3218	UShort4::UShort4(const Reference<Short4> &rhs)
3219	{
3220	//	xyzw.parent = this;
3221
3222		Value *value = rhs.loadValue();
3223		storeValue(value);
3224	}
3225
3226	RValue<UShort4> UShort4::operator=(RValue<UShort4> rhs) const
3227	{
3228		storeValue(rhs.value);
3229
3230		return rhs;
3231	}
3232
3233	RValue<UShort4> UShort4::operator=(const UShort4 &rhs) const
3234	{
3235		Value *value = rhs.loadValue();
3236		storeValue(value);
3237
3238		return RValue<UShort4>(value);
3239	}
3240
3241	RValue<UShort4> UShort4::operator=(const Reference<UShort4> &rhs) const
3242	{
3243		Value *value = rhs.loadValue();
3244		storeValue(value);
3245
3246		return RValue<UShort4>(value);
3247	}
3248
3249	RValue<UShort4> UShort4::operator=(RValue<Short4> rhs) const
3250	{
3251		storeValue(rhs.value);
3252
3253		return RValue<UShort4>(rhs);
3254	}
3255
3256	RValue<UShort4> UShort4::operator=(const Short4 &rhs) const
3257	{
3258		Value *value = rhs.loadValue();
3259		storeValue(value);
3260
3261		return RValue<UShort4>(value);
3262	}
3263
3264	RValue<UShort4> UShort4::operator=(const Reference<Short4> &rhs) const
3265	{
3266		Value *value = rhs.loadValue();
3267		storeValue(value);
3268
3269		return RValue<UShort4>(value);
3270	}
3271
3272	RValue<UShort4> operator+(RValue<UShort4> lhs, RValue<UShort4> rhs)
3273	{
3274		if(CPUID::supportsMMX2())
3275		{
3276			return As<UShort4>(x86::paddw(As<Short4>(lhs), As<Short4>(rhs)));
3277		}
3278		else
3279		{
3280			return RValue<UShort4>(Nucleus::createAdd(lhs.value, rhs.value));
3281		}
3282	}
3283
3284	RValue<UShort4> operator-(RValue<UShort4> lhs, RValue<UShort4> rhs)
3285	{
3286		if(CPUID::supportsMMX2())
3287		{
3288			return As<UShort4>(x86::psubw(As<Short4>(lhs), As<Short4>(rhs)));
3289		}
3290		else
3291		{
3292			return RValue<UShort4>(Nucleus::createSub(lhs.value, rhs.value));
3293		}
3294	}
3295
3296	RValue<UShort4> operator*(RValue<UShort4> lhs, RValue<UShort4> rhs)
3297	{
3298		if(CPUID::supportsMMX2())
3299		{
3300			return As<UShort4>(x86::pmullw(As<Short4>(lhs), As<Short4>(rhs)));
3301		}
3302		else
3303		{
3304			return RValue<UShort4>(Nucleus::createMul(lhs.value, rhs.value));
3305		}
3306	}
3307
3308	RValue<UShort4> operator&(RValue<UShort4> lhs, RValue<UShort4> rhs)
3309	{
3310		if(CPUID::supportsMMX2())
3311		{
3312			return As<UShort4>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
3313		}
3314		else
3315		{
3316			return RValue<UShort4>(Nucleus::createAnd(lhs.value, rhs.value));
3317		}
3318	}
3319
3320	RValue<UShort4> operator|(RValue<UShort4> lhs, RValue<UShort4> rhs)
3321	{
3322		if(CPUID::supportsMMX2())
3323		{
3324			return As<UShort4>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
3325		}
3326		else
3327		{
3328			return RValue<UShort4>(Nucleus::createOr(lhs.value, rhs.value));
3329		}
3330	}
3331
3332	RValue<UShort4> operator^(RValue<UShort4> lhs, RValue<UShort4> rhs)
3333	{
3334		if(CPUID::supportsMMX2())
3335		{
3336			return As<UShort4>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
3337		}
3338		else
3339		{
3340			return RValue<UShort4>(Nucleus::createXor(lhs.value, rhs.value));
3341		}
3342	}
3343
3344	RValue<UShort4> operator<<(RValue<UShort4> lhs, unsigned char rhs)
3345	{
3346	//	return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
3347
3348		return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
3349	}
3350
3351	RValue<UShort4> operator>>(RValue<UShort4> lhs, unsigned char rhs)
3352	{
3353	//	return RValue<Short4>(Nucleus::createLShr(lhs.value, rhs.value));
3354
3355		return x86::psrlw(lhs, rhs);
3356	}
3357
3358	RValue<UShort4> operator<<(RValue<UShort4> lhs, RValue<Long1> rhs)
3359	{
3360	//	return RValue<Short4>(Nucleus::createShl(lhs.value, rhs.value));
3361
3362		return As<UShort4>(x86::psllw(As<Short4>(lhs), rhs));
3363	}
3364
3365	RValue<UShort4> operator>>(RValue<UShort4> lhs, RValue<Long1> rhs)
3366	{
3367	//	return RValue<Short4>(Nucleus::createLShr(lhs.value, rhs.value));
3368
3369		return x86::psrlw(lhs, rhs);
3370	}
3371
3372	RValue<UShort4> operator<<=(const UShort4 &lhs, unsigned char rhs)
3373	{
3374		return lhs = lhs << rhs;
3375	}
3376
3377	RValue<UShort4> operator>>=(const UShort4 &lhs, unsigned char rhs)
3378	{
3379		return lhs = lhs >> rhs;
3380	}
3381
3382	RValue<UShort4> operator<<=(const UShort4 &lhs, RValue<Long1> rhs)
3383	{
3384		return lhs = lhs << rhs;
3385	}
3386
3387	RValue<UShort4> operator>>=(const UShort4 &lhs, RValue<Long1> rhs)
3388	{
3389		return lhs = lhs >> rhs;
3390	}
3391
3392	RValue<UShort4> operator~(RValue<UShort4> val)
3393	{
3394		if(CPUID::supportsMMX2())
3395		{
3396			return As<UShort4>(As<Short4>(val) ^ Short4(0xFFFFu, 0xFFFFu, 0xFFFFu, 0xFFFFu));
3397		}
3398		else
3399		{
3400			return RValue<UShort4>(Nucleus::createNot(val.value));
3401		}
3402	}
3403
3404	RValue<UShort4> Max(RValue<UShort4> x, RValue<UShort4> y)
3405	{
3406		return RValue<UShort4>(Max(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
3407	}
3408
3409	RValue<UShort4> Min(RValue<UShort4> x, RValue<UShort4> y)
3410	{
3411		return RValue<UShort4>(Min(As<Short4>(x) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u), As<Short4>(y) - Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u)) + Short4(0x8000u, 0x8000u, 0x8000u, 0x8000u));
3412	}
3413
3414	RValue<UShort4> AddSat(RValue<UShort4> x, RValue<UShort4> y)
3415	{
3416		return x86::paddusw(x, y);
3417	}
3418
3419	RValue<UShort4> SubSat(RValue<UShort4> x, RValue<UShort4> y)
3420	{
3421		return x86::psubusw(x, y);
3422	}
3423
3424	RValue<UShort4> MulHigh(RValue<UShort4> x, RValue<UShort4> y)
3425	{
3426		return x86::pmulhuw(x, y);
3427	}
3428
3429	RValue<UShort4> Average(RValue<UShort4> x, RValue<UShort4> y)
3430	{
3431		return x86::pavgw(x, y);
3432	}
3433
3434	RValue<Byte8> Pack(RValue<UShort4> x, RValue<UShort4> y)
3435	{
3436		return x86::packuswb(x, y);
3437	}
3438
3439	Type *UShort4::getType()
3440	{
3441		if(CPUID::supportsMMX2())
3442		{
3443			return MMX::getType();
3444		}
3445		else
3446		{
3447			return T(VectorType::get(UShort::getType(), 4));
3448		}
3449	}
3450
3451	Short8::Short8(short c0, short c1, short c2, short c3, short c4, short c5, short c6, short c7)
3452	{
3453		int64_t constantVector[8] = {c0, c1, c2, c3, c4, c5, c6, c7};
3454		storeValue(Nucleus::createConstantVector(constantVector, getType()));
3455	}
3456
3457	Short8::Short8(RValue<Short8> rhs)
3458	{
3459		storeValue(rhs.value);
3460	}
3461
3462	Short8::Short8(const Reference<Short8> &rhs)
3463	{
3464		Value *value = rhs.loadValue();
3465		storeValue(value);
3466	}
3467
3468	Short8::Short8(RValue<Short4> lo, RValue<Short4> hi)
3469	{
3470		Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
3471		Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
3472
3473		Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
3474		long2 = Nucleus::createInsertElement(long2, loLong, 0);
3475		long2 = Nucleus::createInsertElement(long2, hiLong, 1);
3476		Value *short8 = Nucleus::createBitCast(long2, Short8::getType());
3477
3478		storeValue(short8);
3479	}
3480
3481	RValue<Short8> operator+(RValue<Short8> lhs, RValue<Short8> rhs)
3482	{
3483		return RValue<Short8>(Nucleus::createAdd(lhs.value, rhs.value));
3484	}
3485
3486	RValue<Short8> operator&(RValue<Short8> lhs, RValue<Short8> rhs)
3487	{
3488		return RValue<Short8>(Nucleus::createAnd(lhs.value, rhs.value));
3489	}
3490
3491	RValue<Short8> operator<<(RValue<Short8> lhs, unsigned char rhs)
3492	{
3493		return x86::psllw(lhs, rhs);   // FIXME: Fallback required
3494	}
3495
3496	RValue<Short8> operator>>(RValue<Short8> lhs, unsigned char rhs)
3497	{
3498		return x86::psraw(lhs, rhs);   // FIXME: Fallback required
3499	}
3500
3501	RValue<Int4> MulAdd(RValue<Short8> x, RValue<Short8> y)
3502	{
3503		return x86::pmaddwd(x, y);   // FIXME: Fallback required
3504	}
3505
3506	RValue<Int4> Abs(RValue<Int4> x)
3507	{
3508		if(CPUID::supportsSSSE3())
3509		{
3510			return x86::pabsd(x);
3511		}
3512		else
3513		{
3514			Int4 mask = (x >> 31);
3515			return (mask ^ x) - mask;
3516		}
3517	}
3518
3519	RValue<Short8> MulHigh(RValue<Short8> x, RValue<Short8> y)
3520	{
3521		return x86::pmulhw(x, y);   // FIXME: Fallback required
3522	}
3523
3524	Type *Short8::getType()
3525	{
3526		return T(VectorType::get(Short::getType(), 8));
3527	}
3528
3529	UShort8::UShort8(unsigned short c0, unsigned short c1, unsigned short c2, unsigned short c3, unsigned short c4, unsigned short c5, unsigned short c6, unsigned short c7)
3530	{
3531		int64_t constantVector[8] = {c0, c1, c2, c3, c4, c5, c6, c7};
3532		storeValue(Nucleus::createConstantVector(constantVector, getType()));
3533	}
3534
3535	UShort8::UShort8(RValue<UShort8> rhs)
3536	{
3537		storeValue(rhs.value);
3538	}
3539
3540	UShort8::UShort8(const Reference<UShort8> &rhs)
3541	{
3542		Value *value = rhs.loadValue();
3543		storeValue(value);
3544	}
3545
3546	UShort8::UShort8(RValue<UShort4> lo, RValue<UShort4> hi)
3547	{
3548		Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
3549		Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
3550
3551		Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
3552		long2 = Nucleus::createInsertElement(long2, loLong, 0);
3553		long2 = Nucleus::createInsertElement(long2, hiLong, 1);
3554		Value *short8 = Nucleus::createBitCast(long2, Short8::getType());
3555
3556		storeValue(short8);
3557	}
3558
3559	RValue<UShort8> UShort8::operator=(RValue<UShort8> rhs) const
3560	{
3561		storeValue(rhs.value);
3562
3563		return rhs;
3564	}
3565
3566	RValue<UShort8> UShort8::operator=(const UShort8 &rhs) const
3567	{
3568		Value *value = rhs.loadValue();
3569		storeValue(value);
3570
3571		return RValue<UShort8>(value);
3572	}
3573
3574	RValue<UShort8> UShort8::operator=(const Reference<UShort8> &rhs) const
3575	{
3576		Value *value = rhs.loadValue();
3577		storeValue(value);
3578
3579		return RValue<UShort8>(value);
3580	}
3581
3582	RValue<UShort8> operator&(RValue<UShort8> lhs, RValue<UShort8> rhs)
3583	{
3584		return RValue<UShort8>(Nucleus::createAnd(lhs.value, rhs.value));
3585	}
3586
3587	RValue<UShort8> operator<<(RValue<UShort8> lhs, unsigned char rhs)
3588	{
3589		return As<UShort8>(x86::psllw(As<Short8>(lhs), rhs));   // FIXME: Fallback required
3590	}
3591
3592	RValue<UShort8> operator>>(RValue<UShort8> lhs, unsigned char rhs)
3593	{
3594		return x86::psrlw(lhs, rhs);   // FIXME: Fallback required
3595	}
3596
3597	RValue<UShort8> operator+(RValue<UShort8> lhs, RValue<UShort8> rhs)
3598	{
3599		return RValue<UShort8>(Nucleus::createAdd(lhs.value, rhs.value));
3600	}
3601
3602	RValue<UShort8> operator*(RValue<UShort8> lhs, RValue<UShort8> rhs)
3603	{
3604		return RValue<UShort8>(Nucleus::createMul(lhs.value, rhs.value));
3605	}
3606
3607	RValue<UShort8> operator+=(const UShort8 &lhs, RValue<UShort8> rhs)
3608	{
3609		return lhs = lhs + rhs;
3610	}
3611
3612	RValue<UShort8> operator~(RValue<UShort8> val)
3613	{
3614		return RValue<UShort8>(Nucleus::createNot(val.value));
3615	}
3616
3617	RValue<UShort8> Swizzle(RValue<UShort8> x, char select0, char select1, char select2, char select3, char select4, char select5, char select6, char select7)
3618	{
3619		int pshufb[16] =
3620		{
3621			select0 + 0,
3622			select0 + 1,
3623			select1 + 0,
3624			select1 + 1,
3625			select2 + 0,
3626			select2 + 1,
3627			select3 + 0,
3628			select3 + 1,
3629			select4 + 0,
3630			select4 + 1,
3631			select5 + 0,
3632			select5 + 1,
3633			select6 + 0,
3634			select6 + 1,
3635			select7 + 0,
3636			select7 + 1,
3637		};
3638
3639		Value *byte16 = Nucleus::createBitCast(x.value, Byte16::getType());
3640		Value *shuffle = Nucleus::createShuffleVector(byte16, byte16, pshufb);
3641		Value *short8 = Nucleus::createBitCast(shuffle, UShort8::getType());
3642
3643		return RValue<UShort8>(short8);
3644	}
3645
3646	RValue<UShort8> MulHigh(RValue<UShort8> x, RValue<UShort8> y)
3647	{
3648		return x86::pmulhuw(x, y);   // FIXME: Fallback required
3649	}
3650
3651	Type *UShort8::getType()
3652	{
3653		return T(VectorType::get(UShort::getType(), 8));
3654	}
3655
3656	Int::Int(Argument<Int> argument)
3657	{
3658		storeValue(argument.value);
3659	}
3660
3661	Int::Int(RValue<Byte> cast)
3662	{
3663		Value *integer = Nucleus::createZExt(cast.value, Int::getType());
3664
3665		storeValue(integer);
3666	}
3667
3668	Int::Int(RValue<SByte> cast)
3669	{
3670		Value *integer = Nucleus::createSExt(cast.value, Int::getType());
3671
3672		storeValue(integer);
3673	}
3674
3675	Int::Int(RValue<Short> cast)
3676	{
3677		Value *integer = Nucleus::createSExt(cast.value, Int::getType());
3678
3679		storeValue(integer);
3680	}
3681
3682	Int::Int(RValue<UShort> cast)
3683	{
3684		Value *integer = Nucleus::createZExt(cast.value, Int::getType());
3685
3686		storeValue(integer);
3687	}
3688
3689	Int::Int(RValue<Int2> cast)
3690	{
3691		*this = Extract(cast, 0);
3692	}
3693
3694	Int::Int(RValue<Long> cast)
3695	{
3696		Value *integer = Nucleus::createTrunc(cast.value, Int::getType());
3697
3698		storeValue(integer);
3699	}
3700
3701	Int::Int(RValue<Float> cast)
3702	{
3703		Value *integer = Nucleus::createFPToSI(cast.value, Int::getType());
3704
3705		storeValue(integer);
3706	}
3707
	Int::Int()
	{
		// Empty body: this constructor stores no value.
	}
3711
3712	Int::Int(int x)
3713	{
3714		storeValue(Nucleus::createConstantInt(x));
3715	}
3716
3717	Int::Int(RValue<Int> rhs)
3718	{
3719		storeValue(rhs.value);
3720	}
3721
3722	Int::Int(RValue<UInt> rhs)
3723	{
3724		storeValue(rhs.value);
3725	}
3726
3727	Int::Int(const Int &rhs)
3728	{
3729		Value *value = rhs.loadValue();
3730		storeValue(value);
3731	}
3732
3733	Int::Int(const Reference<Int> &rhs)
3734	{
3735		Value *value = rhs.loadValue();
3736		storeValue(value);
3737	}
3738
3739	Int::Int(const UInt &rhs)
3740	{
3741		Value *value = rhs.loadValue();
3742		storeValue(value);
3743	}
3744
3745	Int::Int(const Reference<UInt> &rhs)
3746	{
3747		Value *value = rhs.loadValue();
3748		storeValue(value);
3749	}
3750
3751	RValue<Int> Int::operator=(int rhs) const
3752	{
3753		return RValue<Int>(storeValue(Nucleus::createConstantInt(rhs)));
3754	}
3755
3756	RValue<Int> Int::operator=(RValue<Int> rhs) const
3757	{
3758		storeValue(rhs.value);
3759
3760		return rhs;
3761	}
3762
3763	RValue<Int> Int::operator=(RValue<UInt> rhs) const
3764	{
3765		storeValue(rhs.value);
3766
3767		return RValue<Int>(rhs);
3768	}
3769
3770	RValue<Int> Int::operator=(const Int &rhs) const
3771	{
3772		Value *value = rhs.loadValue();
3773		storeValue(value);
3774
3775		return RValue<Int>(value);
3776	}
3777
3778	RValue<Int> Int::operator=(const Reference<Int> &rhs) const
3779	{
3780		Value *value = rhs.loadValue();
3781		storeValue(value);
3782
3783		return RValue<Int>(value);
3784	}
3785
3786	RValue<Int> Int::operator=(const UInt &rhs) const
3787	{
3788		Value *value = rhs.loadValue();
3789		storeValue(value);
3790
3791		return RValue<Int>(value);
3792	}
3793
3794	RValue<Int> Int::operator=(const Reference<UInt> &rhs) const
3795	{
3796		Value *value = rhs.loadValue();
3797		storeValue(value);
3798
3799		return RValue<Int>(value);
3800	}
3801
3802	RValue<Int> operator+(RValue<Int> lhs, RValue<Int> rhs)
3803	{
3804		return RValue<Int>(Nucleus::createAdd(lhs.value, rhs.value));
3805	}
3806
3807	RValue<Int> operator-(RValue<Int> lhs, RValue<Int> rhs)
3808	{
3809		return RValue<Int>(Nucleus::createSub(lhs.value, rhs.value));
3810	}
3811
3812	RValue<Int> operator*(RValue<Int> lhs, RValue<Int> rhs)
3813	{
3814		return RValue<Int>(Nucleus::createMul(lhs.value, rhs.value));
3815	}
3816
3817	RValue<Int> operator/(RValue<Int> lhs, RValue<Int> rhs)
3818	{
3819		return RValue<Int>(Nucleus::createSDiv(lhs.value, rhs.value));
3820	}
3821
3822	RValue<Int> operator%(RValue<Int> lhs, RValue<Int> rhs)
3823	{
3824		return RValue<Int>(Nucleus::createSRem(lhs.value, rhs.value));
3825	}
3826
3827	RValue<Int> operator&(RValue<Int> lhs, RValue<Int> rhs)
3828	{
3829		return RValue<Int>(Nucleus::createAnd(lhs.value, rhs.value));
3830	}
3831
3832	RValue<Int> operator|(RValue<Int> lhs, RValue<Int> rhs)
3833	{
3834		return RValue<Int>(Nucleus::createOr(lhs.value, rhs.value));
3835	}
3836
3837	RValue<Int> operator^(RValue<Int> lhs, RValue<Int> rhs)
3838	{
3839		return RValue<Int>(Nucleus::createXor(lhs.value, rhs.value));
3840	}
3841
3842	RValue<Int> operator<<(RValue<Int> lhs, RValue<Int> rhs)
3843	{
3844		return RValue<Int>(Nucleus::createShl(lhs.value, rhs.value));
3845	}
3846
3847	RValue<Int> operator>>(RValue<Int> lhs, RValue<Int> rhs)
3848	{
3849		return RValue<Int>(Nucleus::createAShr(lhs.value, rhs.value));
3850	}
3851
3852	RValue<Int> operator+=(const Int &lhs, RValue<Int> rhs)
3853	{
3854		return lhs = lhs + rhs;
3855	}
3856
3857	RValue<Int> operator-=(const Int &lhs, RValue<Int> rhs)
3858	{
3859		return lhs = lhs - rhs;
3860	}
3861
3862	RValue<Int> operator*=(const Int &lhs, RValue<Int> rhs)
3863	{
3864		return lhs = lhs * rhs;
3865	}
3866
3867	RValue<Int> operator/=(const Int &lhs, RValue<Int> rhs)
3868	{
3869		return lhs = lhs / rhs;
3870	}
3871
3872	RValue<Int> operator%=(const Int &lhs, RValue<Int> rhs)
3873	{
3874		return lhs = lhs % rhs;
3875	}
3876
3877	RValue<Int> operator&=(const Int &lhs, RValue<Int> rhs)
3878	{
3879		return lhs = lhs & rhs;
3880	}
3881
3882	RValue<Int> operator|=(const Int &lhs, RValue<Int> rhs)
3883	{
3884		return lhs = lhs | rhs;
3885	}
3886
3887	RValue<Int> operator^=(const Int &lhs, RValue<Int> rhs)
3888	{
3889		return lhs = lhs ^ rhs;
3890	}
3891
3892	RValue<Int> operator<<=(const Int &lhs, RValue<Int> rhs)
3893	{
3894		return lhs = lhs << rhs;
3895	}
3896
3897	RValue<Int> operator>>=(const Int &lhs, RValue<Int> rhs)
3898	{
3899		return lhs = lhs >> rhs;
3900	}
3901
	RValue<Int> operator+(RValue<Int> val)
	{
		// Unary plus: the operand is returned unchanged.
		return val;
	}
3906
3907	RValue<Int> operator-(RValue<Int> val)
3908	{
3909		return RValue<Int>(Nucleus::createNeg(val.value));
3910	}
3911
3912	RValue<Int> operator~(RValue<Int> val)
3913	{
3914		return RValue<Int>(Nucleus::createNot(val.value));
3915	}
3916
3917	RValue<Int> operator++(const Int &val, int)   // Post-increment
3918	{
3919		RValue<Int> res = val;
3920
3921		Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantInt(1)));
3922		val.storeValue(inc);
3923
3924		return res;
3925	}
3926
3927	const Int &operator++(const Int &val)   // Pre-increment
3928	{
3929		Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantInt(1)));
3930		val.storeValue(inc);
3931
3932		return val;
3933	}
3934
3935	RValue<Int> operator--(const Int &val, int)   // Post-decrement
3936	{
3937		RValue<Int> res = val;
3938
3939		Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantInt(1)));
3940		val.storeValue(inc);
3941
3942		return res;
3943	}
3944
3945	const Int &operator--(const Int &val)   // Pre-decrement
3946	{
3947		Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantInt(1)));
3948		val.storeValue(inc);
3949
3950		return val;
3951	}
3952
3953	RValue<Bool> operator<(RValue<Int> lhs, RValue<Int> rhs)
3954	{
3955		return RValue<Bool>(Nucleus::createICmpSLT(lhs.value, rhs.value));
3956	}
3957
3958	RValue<Bool> operator<=(RValue<Int> lhs, RValue<Int> rhs)
3959	{
3960		return RValue<Bool>(Nucleus::createICmpSLE(lhs.value, rhs.value));
3961	}
3962
3963	RValue<Bool> operator>(RValue<Int> lhs, RValue<Int> rhs)
3964	{
3965		return RValue<Bool>(Nucleus::createICmpSGT(lhs.value, rhs.value));
3966	}
3967
3968	RValue<Bool> operator>=(RValue<Int> lhs, RValue<Int> rhs)
3969	{
3970		return RValue<Bool>(Nucleus::createICmpSGE(lhs.value, rhs.value));
3971	}
3972
3973	RValue<Bool> operator!=(RValue<Int> lhs, RValue<Int> rhs)
3974	{
3975		return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
3976	}
3977
3978	RValue<Bool> operator==(RValue<Int> lhs, RValue<Int> rhs)
3979	{
3980		return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
3981	}
3982
3983	RValue<Int> Max(RValue<Int> x, RValue<Int> y)
3984	{
3985		return IfThenElse(x > y, x, y);
3986	}
3987
3988	RValue<Int> Min(RValue<Int> x, RValue<Int> y)
3989	{
3990		return IfThenElse(x < y, x, y);
3991	}
3992
3993	RValue<Int> Clamp(RValue<Int> x, RValue<Int> min, RValue<Int> max)
3994	{
3995		return Min(Max(x, min), max);
3996	}
3997
3998	RValue<Int> RoundInt(RValue<Float> cast)
3999	{
4000		return x86::cvtss2si(cast);
4001
4002	//	return IfThenElse(val > 0.0f, Int(val + 0.5f), Int(val - 0.5f));
4003	}
4004
4005	Type *Int::getType()
4006	{
4007		return T(llvm::Type::getInt32Ty(*::context));
4008	}
4009
4010	Long::Long(RValue<Int> cast)
4011	{
4012		Value *integer = Nucleus::createSExt(cast.value, Long::getType());
4013
4014		storeValue(integer);
4015	}
4016
4017	Long::Long(RValue<UInt> cast)
4018	{
4019		Value *integer = Nucleus::createZExt(cast.value, Long::getType());
4020
4021		storeValue(integer);
4022	}
4023
	Long::Long()
	{
		// Empty body: this constructor stores no value.
	}
4027
4028	Long::Long(RValue<Long> rhs)
4029	{
4030		storeValue(rhs.value);
4031	}
4032
4033	RValue<Long> Long::operator=(int64_t rhs) const
4034	{
4035		return RValue<Long>(storeValue(Nucleus::createConstantLong(rhs)));
4036	}
4037
4038	RValue<Long> Long::operator=(RValue<Long> rhs) const
4039	{
4040		storeValue(rhs.value);
4041
4042		return rhs;
4043	}
4044
4045	RValue<Long> Long::operator=(const Long &rhs) const
4046	{
4047		Value *value = rhs.loadValue();
4048		storeValue(value);
4049
4050		return RValue<Long>(value);
4051	}
4052
4053	RValue<Long> Long::operator=(const Reference<Long> &rhs) const
4054	{
4055		Value *value = rhs.loadValue();
4056		storeValue(value);
4057
4058		return RValue<Long>(value);
4059	}
4060
4061	RValue<Long> operator+(RValue<Long> lhs, RValue<Long> rhs)
4062	{
4063		return RValue<Long>(Nucleus::createAdd(lhs.value, rhs.value));
4064	}
4065
4066	RValue<Long> operator-(RValue<Long> lhs, RValue<Long> rhs)
4067	{
4068		return RValue<Long>(Nucleus::createSub(lhs.value, rhs.value));
4069	}
4070
4071	RValue<Long> operator+=(const Long &lhs, RValue<Long> rhs)
4072	{
4073		return lhs = lhs + rhs;
4074	}
4075
4076	RValue<Long> operator-=(const Long &lhs, RValue<Long> rhs)
4077	{
4078		return lhs = lhs - rhs;
4079	}
4080
4081	RValue<Long> AddAtomic(RValue<Pointer<Long> > x, RValue<Long> y)
4082	{
4083		return RValue<Long>(Nucleus::createAtomicAdd(x.value, y.value));
4084	}
4085
4086	Type *Long::getType()
4087	{
4088		return T(llvm::Type::getInt64Ty(*::context));
4089	}
4090
4091	Long1::Long1(const RValue<UInt> cast)
4092	{
4093		Value *undefCast = Nucleus::createInsertElement(V(UndefValue::get(VectorType::get(Int::getType(), 2))), cast.value, 0);
4094		Value *zeroCast = Nucleus::createInsertElement(undefCast, V(Nucleus::createConstantInt(0)), 1);
4095
4096		storeValue(Nucleus::createBitCast(zeroCast, Long1::getType()));
4097	}
4098
4099	Long1::Long1(RValue<Long1> rhs)
4100	{
4101		storeValue(rhs.value);
4102	}
4103
4104	Type *Long1::getType()
4105	{
4106		if(CPUID::supportsMMX2())
4107		{
4108			return MMX::getType();
4109		}
4110		else
4111		{
4112			return T(VectorType::get(Long::getType(), 1));
4113		}
4114	}
4115
	UInt::UInt(Argument<UInt> argument)
	{
		// Initialize this variable by storing the argument's value.
		storeValue(argument.value);
	}
4120
4121	UInt::UInt(RValue<UShort> cast)
4122	{
4123		Value *integer = Nucleus::createZExt(cast.value, UInt::getType());
4124
4125		storeValue(integer);
4126	}
4127
4128	UInt::UInt(RValue<Long> cast)
4129	{
4130		Value *integer = Nucleus::createTrunc(cast.value, UInt::getType());
4131
4132		storeValue(integer);
4133	}
4134
	UInt::UInt(RValue<Float> cast)
	{
		// Float-to-UInt conversion built from the signed conversion Int(Float).
		// Note: createFPToUI is broken, must perform conversion using createFPtoSI
		// Value *integer = Nucleus::createFPToUI(cast.value, UInt::getType());

		// Smallest positive value representable in UInt, but not in Int
		const unsigned int ustart = 0x80000000u;
		const float ustartf = float(ustart);

		// If the value is negative, store 0, otherwise store the result of the conversion.
		// The mask is ~(As<Int>(cast) >> 31); Int's >> is an arithmetic shift, so
		// (presumably, with As<Int> reinterpreting the float's bits -- confirm) the mask
		// is all zeros when the sign bit is set and all ones otherwise.
		storeValue((~(As<Int>(cast) >> 31) &
		// Check if the value can be represented as an Int
			IfThenElse(cast >= ustartf,
		// If the value is too large, subtract ustart and re-add it after conversion.
				As<Int>(As<UInt>(Int(cast - Float(ustartf))) + UInt(ustart)),
		// Otherwise, just convert normally
				Int(cast))).value);
	}
4153
	UInt::UInt()
	{
		// Empty body: this constructor stores no value.
	}
4157
4158	UInt::UInt(int x)
4159	{
4160		storeValue(Nucleus::createConstantInt(x));
4161	}
4162
4163	UInt::UInt(unsigned int x)
4164	{
4165		storeValue(Nucleus::createConstantInt(x));
4166	}
4167
4168	UInt::UInt(RValue<UInt> rhs)
4169	{
4170		storeValue(rhs.value);
4171	}
4172
4173	UInt::UInt(RValue<Int> rhs)
4174	{
4175		storeValue(rhs.value);
4176	}
4177
4178	UInt::UInt(const UInt &rhs)
4179	{
4180		Value *value = rhs.loadValue();
4181		storeValue(value);
4182	}
4183
4184	UInt::UInt(const Reference<UInt> &rhs)
4185	{
4186		Value *value = rhs.loadValue();
4187		storeValue(value);
4188	}
4189
4190	UInt::UInt(const Int &rhs)
4191	{
4192		Value *value = rhs.loadValue();
4193		storeValue(value);
4194	}
4195
4196	UInt::UInt(const Reference<Int> &rhs)
4197	{
4198		Value *value = rhs.loadValue();
4199		storeValue(value);
4200	}
4201
4202	RValue<UInt> UInt::operator=(unsigned int rhs) const
4203	{
4204		return RValue<UInt>(storeValue(Nucleus::createConstantInt(rhs)));
4205	}
4206
4207	RValue<UInt> UInt::operator=(RValue<UInt> rhs) const
4208	{
4209		storeValue(rhs.value);
4210
4211		return rhs;
4212	}
4213
4214	RValue<UInt> UInt::operator=(RValue<Int> rhs) const
4215	{
4216		storeValue(rhs.value);
4217
4218		return RValue<UInt>(rhs);
4219	}
4220
4221	RValue<UInt> UInt::operator=(const UInt &rhs) const
4222	{
4223		Value *value = rhs.loadValue();
4224		storeValue(value);
4225
4226		return RValue<UInt>(value);
4227	}
4228
4229	RValue<UInt> UInt::operator=(const Reference<UInt> &rhs) const
4230	{
4231		Value *value = rhs.loadValue();
4232		storeValue(value);
4233
4234		return RValue<UInt>(value);
4235	}
4236
4237	RValue<UInt> UInt::operator=(const Int &rhs) const
4238	{
4239		Value *value = rhs.loadValue();
4240		storeValue(value);
4241
4242		return RValue<UInt>(value);
4243	}
4244
4245	RValue<UInt> UInt::operator=(const Reference<Int> &rhs) const
4246	{
4247		Value *value = rhs.loadValue();
4248		storeValue(value);
4249
4250		return RValue<UInt>(value);
4251	}
4252
4253	RValue<UInt> operator+(RValue<UInt> lhs, RValue<UInt> rhs)
4254	{
4255		return RValue<UInt>(Nucleus::createAdd(lhs.value, rhs.value));
4256	}
4257
4258	RValue<UInt> operator-(RValue<UInt> lhs, RValue<UInt> rhs)
4259	{
4260		return RValue<UInt>(Nucleus::createSub(lhs.value, rhs.value));
4261	}
4262
4263	RValue<UInt> operator*(RValue<UInt> lhs, RValue<UInt> rhs)
4264	{
4265		return RValue<UInt>(Nucleus::createMul(lhs.value, rhs.value));
4266	}
4267
4268	RValue<UInt> operator/(RValue<UInt> lhs, RValue<UInt> rhs)
4269	{
4270		return RValue<UInt>(Nucleus::createUDiv(lhs.value, rhs.value));
4271	}
4272
4273	RValue<UInt> operator%(RValue<UInt> lhs, RValue<UInt> rhs)
4274	{
4275		return RValue<UInt>(Nucleus::createURem(lhs.value, rhs.value));
4276	}
4277
4278	RValue<UInt> operator&(RValue<UInt> lhs, RValue<UInt> rhs)
4279	{
4280		return RValue<UInt>(Nucleus::createAnd(lhs.value, rhs.value));
4281	}
4282
4283	RValue<UInt> operator|(RValue<UInt> lhs, RValue<UInt> rhs)
4284	{
4285		return RValue<UInt>(Nucleus::createOr(lhs.value, rhs.value));
4286	}
4287
4288	RValue<UInt> operator^(RValue<UInt> lhs, RValue<UInt> rhs)
4289	{
4290		return RValue<UInt>(Nucleus::createXor(lhs.value, rhs.value));
4291	}
4292
4293	RValue<UInt> operator<<(RValue<UInt> lhs, RValue<UInt> rhs)
4294	{
4295		return RValue<UInt>(Nucleus::createShl(lhs.value, rhs.value));
4296	}
4297
4298	RValue<UInt> operator>>(RValue<UInt> lhs, RValue<UInt> rhs)
4299	{
4300		return RValue<UInt>(Nucleus::createLShr(lhs.value, rhs.value));
4301	}
4302
4303	RValue<UInt> operator+=(const UInt &lhs, RValue<UInt> rhs)
4304	{
4305		return lhs = lhs + rhs;
4306	}
4307
4308	RValue<UInt> operator-=(const UInt &lhs, RValue<UInt> rhs)
4309	{
4310		return lhs = lhs - rhs;
4311	}
4312
4313	RValue<UInt> operator*=(const UInt &lhs, RValue<UInt> rhs)
4314	{
4315		return lhs = lhs * rhs;
4316	}
4317
4318	RValue<UInt> operator/=(const UInt &lhs, RValue<UInt> rhs)
4319	{
4320		return lhs = lhs / rhs;
4321	}
4322
4323	RValue<UInt> operator%=(const UInt &lhs, RValue<UInt> rhs)
4324	{
4325		return lhs = lhs % rhs;
4326	}
4327
4328	RValue<UInt> operator&=(const UInt &lhs, RValue<UInt> rhs)
4329	{
4330		return lhs = lhs & rhs;
4331	}
4332
4333	RValue<UInt> operator|=(const UInt &lhs, RValue<UInt> rhs)
4334	{
4335		return lhs = lhs | rhs;
4336	}
4337
4338	RValue<UInt> operator^=(const UInt &lhs, RValue<UInt> rhs)
4339	{
4340		return lhs = lhs ^ rhs;
4341	}
4342
4343	RValue<UInt> operator<<=(const UInt &lhs, RValue<UInt> rhs)
4344	{
4345		return lhs = lhs << rhs;
4346	}
4347
4348	RValue<UInt> operator>>=(const UInt &lhs, RValue<UInt> rhs)
4349	{
4350		return lhs = lhs >> rhs;
4351	}
4352
	RValue<UInt> operator+(RValue<UInt> val)
	{
		// Unary plus: the operand is returned unchanged.
		return val;
	}
4357
4358	RValue<UInt> operator-(RValue<UInt> val)
4359	{
4360		return RValue<UInt>(Nucleus::createNeg(val.value));
4361	}
4362
4363	RValue<UInt> operator~(RValue<UInt> val)
4364	{
4365		return RValue<UInt>(Nucleus::createNot(val.value));
4366	}
4367
4368	RValue<UInt> operator++(const UInt &val, int)   // Post-increment
4369	{
4370		RValue<UInt> res = val;
4371
4372		Value *inc = Nucleus::createAdd(res.value, V(Nucleus::createConstantInt(1)));
4373		val.storeValue(inc);
4374
4375		return res;
4376	}
4377
4378	const UInt &operator++(const UInt &val)   // Pre-increment
4379	{
4380		Value *inc = Nucleus::createAdd(val.loadValue(), V(Nucleus::createConstantInt(1)));
4381		val.storeValue(inc);
4382
4383		return val;
4384	}
4385
4386	RValue<UInt> operator--(const UInt &val, int)   // Post-decrement
4387	{
4388		RValue<UInt> res = val;
4389
4390		Value *inc = Nucleus::createSub(res.value, V(Nucleus::createConstantInt(1)));
4391		val.storeValue(inc);
4392
4393		return res;
4394	}
4395
4396	const UInt &operator--(const UInt &val)   // Pre-decrement
4397	{
4398		Value *inc = Nucleus::createSub(val.loadValue(), V(Nucleus::createConstantInt(1)));
4399		val.storeValue(inc);
4400
4401		return val;
4402	}
4403
4404	RValue<UInt> Max(RValue<UInt> x, RValue<UInt> y)
4405	{
4406		return IfThenElse(x > y, x, y);
4407	}
4408
4409	RValue<UInt> Min(RValue<UInt> x, RValue<UInt> y)
4410	{
4411		return IfThenElse(x < y, x, y);
4412	}
4413
4414	RValue<UInt> Clamp(RValue<UInt> x, RValue<UInt> min, RValue<UInt> max)
4415	{
4416		return Min(Max(x, min), max);
4417	}
4418
4419	RValue<Bool> operator<(RValue<UInt> lhs, RValue<UInt> rhs)
4420	{
4421		return RValue<Bool>(Nucleus::createICmpULT(lhs.value, rhs.value));
4422	}
4423
4424	RValue<Bool> operator<=(RValue<UInt> lhs, RValue<UInt> rhs)
4425	{
4426		return RValue<Bool>(Nucleus::createICmpULE(lhs.value, rhs.value));
4427	}
4428
4429	RValue<Bool> operator>(RValue<UInt> lhs, RValue<UInt> rhs)
4430	{
4431		return RValue<Bool>(Nucleus::createICmpUGT(lhs.value, rhs.value));
4432	}
4433
4434	RValue<Bool> operator>=(RValue<UInt> lhs, RValue<UInt> rhs)
4435	{
4436		return RValue<Bool>(Nucleus::createICmpUGE(lhs.value, rhs.value));
4437	}
4438
4439	RValue<Bool> operator!=(RValue<UInt> lhs, RValue<UInt> rhs)
4440	{
4441		return RValue<Bool>(Nucleus::createICmpNE(lhs.value, rhs.value));
4442	}
4443
4444	RValue<Bool> operator==(RValue<UInt> lhs, RValue<UInt> rhs)
4445	{
4446		return RValue<Bool>(Nucleus::createICmpEQ(lhs.value, rhs.value));
4447	}
4448
4449//	RValue<UInt> RoundUInt(RValue<Float> cast)
4450//	{
4451//		return x86::cvtss2si(val);   // FIXME: Unsigned
4452//
4453//	//	return IfThenElse(val > 0.0f, Int(val + 0.5f), Int(val - 0.5f));
4454//	}
4455
4456	Type *UInt::getType()
4457	{
4458		return T(llvm::Type::getInt32Ty(*::context));
4459	}
4460
4461//	Int2::Int2(RValue<Int> cast)
4462//	{
4463//		Value *extend = Nucleus::createZExt(cast.value, Long::getType());
4464//		Value *vector = Nucleus::createBitCast(extend, Int2::getType());
4465//
4466//		int shuffle[2] = {0, 0};
4467//		Value *replicate = Nucleus::createShuffleVector(vector, vector, shuffle);
4468//
4469//		storeValue(replicate);
4470//	}
4471
4472	Int2::Int2(RValue<Int4> cast)
4473	{
4474		Value *long2 = Nucleus::createBitCast(cast.value, T(VectorType::get(Long::getType(), 2)));
4475		Value *element = Nucleus::createExtractElement(long2, Long::getType(), 0);
4476		Value *int2 = Nucleus::createBitCast(element, Int2::getType());
4477
4478		storeValue(int2);
4479	}
4480
	Int2::Int2()
	{
		// No value is stored by this constructor; the only statement is disabled.
	//	xy.parent = this;
	}
4485
4486	Int2::Int2(int x, int y)
4487	{
4488	//	xy.parent = this;
4489
4490		int64_t constantVector[2] = {x, y};
4491		Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(Int::getType(), 2))));
4492
4493		storeValue(Nucleus::createBitCast(vector, getType()));
4494	}
4495
4496	Int2::Int2(RValue<Int2> rhs)
4497	{
4498	//	xy.parent = this;
4499
4500		storeValue(rhs.value);
4501	}
4502
4503	Int2::Int2(const Int2 &rhs)
4504	{
4505	//	xy.parent = this;
4506
4507		Value *value = rhs.loadValue();
4508		storeValue(value);
4509	}
4510
4511	Int2::Int2(const Reference<Int2> &rhs)
4512	{
4513	//	xy.parent = this;
4514
4515		Value *value = rhs.loadValue();
4516		storeValue(value);
4517	}
4518
4519	Int2::Int2(RValue<Int> lo, RValue<Int> hi)
4520	{
4521		if(CPUID::supportsMMX2())
4522		{
4523			// movd mm0, lo
4524			// movd mm1, hi
4525			// punpckldq mm0, mm1
4526			storeValue(As<Int2>(UnpackLow(As<Int2>(Long1(RValue<UInt>(lo))), As<Int2>(Long1(RValue<UInt>(hi))))).value);
4527		}
4528		else
4529		{
4530			int shuffle[2] = {0, 1};
4531			Value *packed = Nucleus::createShuffleVector(Nucleus::createBitCast(lo.value, T(VectorType::get(Int::getType(), 1))), Nucleus::createBitCast(hi.value, T(VectorType::get(Int::getType(), 1))), shuffle);
4532
4533			storeValue(Nucleus::createBitCast(packed, Int2::getType()));
4534		}
4535	}
4536
4537	RValue<Int2> Int2::operator=(RValue<Int2> rhs) const
4538	{
4539		storeValue(rhs.value);
4540
4541		return rhs;
4542	}
4543
4544	RValue<Int2> Int2::operator=(const Int2 &rhs) const
4545	{
4546		Value *value = rhs.loadValue();
4547		storeValue(value);
4548
4549		return RValue<Int2>(value);
4550	}
4551
4552	RValue<Int2> Int2::operator=(const Reference<Int2> &rhs) const
4553	{
4554		Value *value = rhs.loadValue();
4555		storeValue(value);
4556
4557		return RValue<Int2>(value);
4558	}
4559
4560	RValue<Int2> operator+(RValue<Int2> lhs, RValue<Int2> rhs)
4561	{
4562		if(CPUID::supportsMMX2())
4563		{
4564			return x86::paddd(lhs, rhs);
4565		}
4566		else
4567		{
4568			return RValue<Int2>(Nucleus::createAdd(lhs.value, rhs.value));
4569		}
4570	}
4571
4572	RValue<Int2> operator-(RValue<Int2> lhs, RValue<Int2> rhs)
4573	{
4574		if(CPUID::supportsMMX2())
4575		{
4576			return x86::psubd(lhs, rhs);
4577		}
4578		else
4579		{
4580			return RValue<Int2>(Nucleus::createSub(lhs.value, rhs.value));
4581		}
4582	}
4583
4584//	RValue<Int2> operator*(RValue<Int2> lhs, RValue<Int2> rhs)
4585//	{
4586//		return RValue<Int2>(Nucleus::createMul(lhs.value, rhs.value));
4587//	}
4588
4589//	RValue<Int2> operator/(RValue<Int2> lhs, RValue<Int2> rhs)
4590//	{
4591//		return RValue<Int2>(Nucleus::createSDiv(lhs.value, rhs.value));
4592//	}
4593
4594//	RValue<Int2> operator%(RValue<Int2> lhs, RValue<Int2> rhs)
4595//	{
4596//		return RValue<Int2>(Nucleus::createSRem(lhs.value, rhs.value));
4597//	}
4598
4599	RValue<Int2> operator&(RValue<Int2> lhs, RValue<Int2> rhs)
4600	{
4601		if(CPUID::supportsMMX2())
4602		{
4603			return As<Int2>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
4604		}
4605		else
4606		{
4607			return RValue<Int2>(Nucleus::createAnd(lhs.value, rhs.value));
4608		}
4609	}
4610
4611	RValue<Int2> operator|(RValue<Int2> lhs, RValue<Int2> rhs)
4612	{
4613		if(CPUID::supportsMMX2())
4614		{
4615			return As<Int2>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
4616		}
4617		else
4618		{
4619			return RValue<Int2>(Nucleus::createOr(lhs.value, rhs.value));
4620		}
4621	}
4622
4623	RValue<Int2> operator^(RValue<Int2> lhs, RValue<Int2> rhs)
4624	{
4625		if(CPUID::supportsMMX2())
4626		{
4627			return As<Int2>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
4628		}
4629		else
4630		{
4631			return RValue<Int2>(Nucleus::createXor(lhs.value, rhs.value));
4632		}
4633	}
4634
4635	RValue<Int2> operator<<(RValue<Int2> lhs, unsigned char rhs)
4636	{
4637	//	return RValue<Int2>(Nucleus::createShl(lhs.value, rhs.value));
4638
4639		return x86::pslld(lhs, rhs);
4640	}
4641
4642	RValue<Int2> operator>>(RValue<Int2> lhs, unsigned char rhs)
4643	{
4644	//	return RValue<Int2>(Nucleus::createAShr(lhs.value, rhs.value));
4645
4646		return x86::psrad(lhs, rhs);
4647	}
4648
4649	RValue<Int2> operator<<(RValue<Int2> lhs, RValue<Long1> rhs)
4650	{
4651	//	return RValue<Int2>(Nucleus::createShl(lhs.value, rhs.value));
4652
4653		return x86::pslld(lhs, rhs);
4654	}
4655
4656	RValue<Int2> operator>>(RValue<Int2> lhs, RValue<Long1> rhs)
4657	{
4658	//	return RValue<Int2>(Nucleus::createAShr(lhs.value, rhs.value));
4659
4660		return x86::psrad(lhs, rhs);
4661	}
4662
4663	RValue<Int2> operator+=(const Int2 &lhs, RValue<Int2> rhs)
4664	{
4665		return lhs = lhs + rhs;
4666	}
4667
4668	RValue<Int2> operator-=(const Int2 &lhs, RValue<Int2> rhs)
4669	{
4670		return lhs = lhs - rhs;
4671	}
4672
4673//	RValue<Int2> operator*=(const Int2 &lhs, RValue<Int2> rhs)
4674//	{
4675//		return lhs = lhs * rhs;
4676//	}
4677
4678//	RValue<Int2> operator/=(const Int2 &lhs, RValue<Int2> rhs)
4679//	{
4680//		return lhs = lhs / rhs;
4681//	}
4682
4683//	RValue<Int2> operator%=(const Int2 &lhs, RValue<Int2> rhs)
4684//	{
4685//		return lhs = lhs % rhs;
4686//	}
4687
4688	RValue<Int2> operator&=(const Int2 &lhs, RValue<Int2> rhs)
4689	{
4690		return lhs = lhs & rhs;
4691	}
4692
4693	RValue<Int2> operator|=(const Int2 &lhs, RValue<Int2> rhs)
4694	{
4695		return lhs = lhs | rhs;
4696	}
4697
4698	RValue<Int2> operator^=(const Int2 &lhs, RValue<Int2> rhs)
4699	{
4700		return lhs = lhs ^ rhs;
4701	}
4702
4703	RValue<Int2> operator<<=(const Int2 &lhs, unsigned char rhs)
4704	{
4705		return lhs = lhs << rhs;
4706	}
4707
4708	RValue<Int2> operator>>=(const Int2 &lhs, unsigned char rhs)
4709	{
4710		return lhs = lhs >> rhs;
4711	}
4712
4713	RValue<Int2> operator<<=(const Int2 &lhs, RValue<Long1> rhs)
4714	{
4715		return lhs = lhs << rhs;
4716	}
4717
4718	RValue<Int2> operator>>=(const Int2 &lhs, RValue<Long1> rhs)
4719	{
4720		return lhs = lhs >> rhs;
4721	}
4722
4723//	RValue<Int2> operator+(RValue<Int2> val)
4724//	{
4725//		return val;
4726//	}
4727
4728//	RValue<Int2> operator-(RValue<Int2> val)
4729//	{
4730//		return RValue<Int2>(Nucleus::createNeg(val.value));
4731//	}
4732
4733	RValue<Int2> operator~(RValue<Int2> val)
4734	{
4735		if(CPUID::supportsMMX2())
4736		{
4737			return val ^ Int2(0xFFFFFFFF, 0xFFFFFFFF);
4738		}
4739		else
4740		{
4741			return RValue<Int2>(Nucleus::createNot(val.value));
4742		}
4743	}
4744
4745	RValue<Long1> UnpackLow(RValue<Int2> x, RValue<Int2> y)
4746	{
4747		if(CPUID::supportsMMX2())
4748		{
4749			return x86::punpckldq(x, y);
4750		}
4751		else
4752		{
4753			int shuffle[2] = {0, 2};
4754			Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
4755
4756			return RValue<Long1>(Nucleus::createBitCast(packed, Long1::getType()));
4757		}
4758	}
4759
4760	RValue<Long1> UnpackHigh(RValue<Int2> x, RValue<Int2> y)
4761	{
4762		if(CPUID::supportsMMX2())
4763		{
4764			return x86::punpckhdq(x, y);
4765		}
4766		else
4767		{
4768			int shuffle[2] = {1, 3};
4769			Value *packed = Nucleus::createShuffleVector(x.value, y.value, shuffle);
4770
4771			return RValue<Long1>(Nucleus::createBitCast(packed, Long1::getType()));
4772		}
4773	}
4774
4775	RValue<Int> Extract(RValue<Int2> val, int i)
4776	{
4777		if(false)   // FIXME: LLVM does not generate optimal code
4778		{
4779			return RValue<Int>(Nucleus::createExtractElement(val.value, Int::getType(), i));
4780		}
4781		else
4782		{
4783			if(i == 0)
4784			{
4785				return RValue<Int>(Nucleus::createExtractElement(Nucleus::createBitCast(val.value, T(VectorType::get(Int::getType(), 2))), Int::getType(), 0));
4786			}
4787			else
4788			{
4789				Int2 val2 = As<Int2>(UnpackHigh(val, val));
4790
4791				return Extract(val2, 0);
4792			}
4793		}
4794	}
4795
4796	RValue<Int2> Insert(RValue<Int2> val, RValue<Int> element, int i)
4797	{
4798		return RValue<Int2>(Nucleus::createBitCast(Nucleus::createInsertElement(Nucleus::createBitCast(val.value, T(VectorType::get(Int::getType(), 2))), element.value, i), Int2::getType()));
4799	}
4800
4801	Type *Int2::getType()
4802	{
4803		if(CPUID::supportsMMX2())
4804		{
4805			return MMX::getType();
4806		}
4807		else
4808		{
4809			return T(VectorType::get(Int::getType(), 2));
4810		}
4811	}
4812
	UInt2::UInt2()
	{
		// No value is stored by this constructor; the only statement is disabled.
	//	xy.parent = this;
	}
4817
4818	UInt2::UInt2(unsigned int x, unsigned int y)
4819	{
4820	//	xy.parent = this;
4821
4822		int64_t constantVector[2] = {x, y};
4823		Value *vector = V(Nucleus::createConstantVector(constantVector, T(VectorType::get(UInt::getType(), 2))));
4824
4825		storeValue(Nucleus::createBitCast(vector, getType()));
4826	}
4827
4828	UInt2::UInt2(RValue<UInt2> rhs)
4829	{
4830	//	xy.parent = this;
4831
4832		storeValue(rhs.value);
4833	}
4834
4835	UInt2::UInt2(const UInt2 &rhs)
4836	{
4837	//	xy.parent = this;
4838
4839		Value *value = rhs.loadValue();
4840		storeValue(value);
4841	}
4842
4843	UInt2::UInt2(const Reference<UInt2> &rhs)
4844	{
4845	//	xy.parent = this;
4846
4847		Value *value = rhs.loadValue();
4848		storeValue(value);
4849	}
4850
4851	RValue<UInt2> UInt2::operator=(RValue<UInt2> rhs) const
4852	{
4853		storeValue(rhs.value);
4854
4855		return rhs;
4856	}
4857
4858	RValue<UInt2> UInt2::operator=(const UInt2 &rhs) const
4859	{
4860		Value *value = rhs.loadValue();
4861		storeValue(value);
4862
4863		return RValue<UInt2>(value);
4864	}
4865
4866	RValue<UInt2> UInt2::operator=(const Reference<UInt2> &rhs) const
4867	{
4868		Value *value = rhs.loadValue();
4869		storeValue(value);
4870
4871		return RValue<UInt2>(value);
4872	}
4873
4874	RValue<UInt2> operator+(RValue<UInt2> lhs, RValue<UInt2> rhs)
4875	{
4876		if(CPUID::supportsMMX2())
4877		{
4878			return As<UInt2>(x86::paddd(As<Int2>(lhs), As<Int2>(rhs)));
4879		}
4880		else
4881		{
4882			return RValue<UInt2>(Nucleus::createAdd(lhs.value, rhs.value));
4883		}
4884	}
4885
4886	RValue<UInt2> operator-(RValue<UInt2> lhs, RValue<UInt2> rhs)
4887	{
4888		if(CPUID::supportsMMX2())
4889		{
4890			return As<UInt2>(x86::psubd(As<Int2>(lhs), As<Int2>(rhs)));
4891		}
4892		else
4893		{
4894			return RValue<UInt2>(Nucleus::createSub(lhs.value, rhs.value));
4895		}
4896	}
4897
4898//	RValue<UInt2> operator*(RValue<UInt2> lhs, RValue<UInt2> rhs)
4899//	{
4900//		return RValue<UInt2>(Nucleus::createMul(lhs.value, rhs.value));
4901//	}
4902
4903//	RValue<UInt2> operator/(RValue<UInt2> lhs, RValue<UInt2> rhs)
4904//	{
4905//		return RValue<UInt2>(Nucleus::createUDiv(lhs.value, rhs.value));
4906//	}
4907
4908//	RValue<UInt2> operator%(RValue<UInt2> lhs, RValue<UInt2> rhs)
4909//	{
4910//		return RValue<UInt2>(Nucleus::createURem(lhs.value, rhs.value));
4911//	}
4912
4913	RValue<UInt2> operator&(RValue<UInt2> lhs, RValue<UInt2> rhs)
4914	{
4915		if(CPUID::supportsMMX2())
4916		{
4917			return As<UInt2>(x86::pand(As<Short4>(lhs), As<Short4>(rhs)));
4918		}
4919		else
4920		{
4921			return RValue<UInt2>(Nucleus::createAnd(lhs.value, rhs.value));
4922		}
4923	}
4924
4925	RValue<UInt2> operator|(RValue<UInt2> lhs, RValue<UInt2> rhs)
4926	{
4927		if(CPUID::supportsMMX2())
4928		{
4929			return As<UInt2>(x86::por(As<Short4>(lhs), As<Short4>(rhs)));
4930		}
4931		else
4932		{
4933			return RValue<UInt2>(Nucleus::createOr(lhs.value, rhs.value));
4934		}
4935	}
4936
4937	RValue<UInt2> operator^(RValue<UInt2> lhs, RValue<UInt2> rhs)
4938	{
4939		if(CPUID::supportsMMX2())
4940		{
4941			return As<UInt2>(x86::pxor(As<Short4>(lhs), As<Short4>(rhs)));
4942		}
4943		else
4944		{
4945			return RValue<UInt2>(Nucleus::createXor(lhs.value, rhs.value));
4946		}
4947	}
4948
4949	RValue<UInt2> operator<<(RValue<UInt2> lhs, unsigned char rhs)
4950	{
4951	//	return RValue<UInt2>(Nucleus::createShl(lhs.value, rhs.value));
4952
4953		return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
4954	}
4955
4956	RValue<UInt2> operator>>(RValue<UInt2> lhs, unsigned char rhs)
4957	{
4958	//	return RValue<UInt2>(Nucleus::createLShr(lhs.value, rhs.value));
4959
4960		return x86::psrld(lhs, rhs);
4961	}
4962
4963	RValue<UInt2> operator<<(RValue<UInt2> lhs, RValue<Long1> rhs)
4964	{
4965	//	return RValue<UInt2>(Nucleus::createShl(lhs.value, rhs.value));
4966
4967		return As<UInt2>(x86::pslld(As<Int2>(lhs), rhs));
4968	}
4969
4970	RValue<UInt2> operator>>(RValue<UInt2> lhs, RValue<Long1> rhs)
4971	{
4972	//	return RValue<UInt2>(Nucleus::createLShr(lhs.value, rhs.value));
4973
4974		return x86::psrld(lhs, rhs);
4975	}
4976
4977	RValue<UInt2> operator+=(const UInt2 &lhs, RValue<UInt2> rhs)
4978	{
4979		return lhs = lhs + rhs;
4980	}
4981
4982	RValue<UInt2> operator-=(const UInt2 &lhs, RValue<UInt2> rhs)
4983	{
4984		return lhs = lhs - rhs;
4985	}
4986
4987//	RValue<UInt2> operator*=(const UInt2 &lhs, RValue<UInt2> rhs)
4988//	{
4989//		return lhs = lhs * rhs;
4990//	}
4991
4992//	RValue<UInt2> operator/=(const UInt2 &lhs, RValue<UInt2> rhs)
4993//	{
4994//		return lhs = lhs / rhs;
4995//	}
4996
4997//	RValue<UInt2> operator%=(const UInt2 &lhs, RValue<UInt2> rhs)
4998//	{
4999//		return lhs = lhs % rhs;
5000//	}
5001
5002	RValue<UInt2> operator&=(const UInt2 &lhs, RValue<UInt2> rhs)
5003	{
5004		return lhs = lhs & rhs;
5005	}
5006
5007	RValue<UInt2> operator|=(const UInt2 &lhs, RValue<UInt2> rhs)
5008	{
5009		return lhs = lhs | rhs;
5010	}
5011
5012	RValue<UInt2> operator^=(const UInt2 &lhs, RValue<UInt2> rhs)
5013	{
5014		return lhs = lhs ^ rhs;
5015	}
5016
5017	RValue<UInt2> operator<<=(const UInt2 &lhs, unsigned char rhs)
5018	{
5019		return lhs = lhs << rhs;
5020	}
5021
5022	RValue<UInt2> operator>>=(const UInt2 &lhs, unsigned char rhs)
5023	{
5024		return lhs = lhs >> rhs;
5025	}
5026
5027	RValue<UInt2> operator<<=(const UInt2 &lhs, RValue<Long1> rhs)
5028	{
5029		return lhs = lhs << rhs;
5030	}
5031
5032	RValue<UInt2> operator>>=(const UInt2 &lhs, RValue<Long1> rhs)
5033	{
5034		return lhs = lhs >> rhs;
5035	}
5036
5037//	RValue<UInt2> operator+(RValue<UInt2> val)
5038//	{
5039//		return val;
5040//	}
5041
5042//	RValue<UInt2> operator-(RValue<UInt2> val)
5043//	{
5044//		return RValue<UInt2>(Nucleus::createNeg(val.value));
5045//	}
5046
5047	RValue<UInt2> operator~(RValue<UInt2> val)
5048	{
5049		if(CPUID::supportsMMX2())
5050		{
5051			return val ^ UInt2(0xFFFFFFFF, 0xFFFFFFFF);
5052		}
5053		else
5054		{
5055			return RValue<UInt2>(Nucleus::createNot(val.value));
5056		}
5057	}
5058
5059	Type *UInt2::getType()
5060	{
5061		if(CPUID::supportsMMX2())
5062		{
5063			return MMX::getType();
5064		}
5065		else
5066		{
5067			return T(VectorType::get(UInt::getType(), 2));
5068		}
5069	}
5070
5071	Int4::Int4(RValue<Byte4> cast)
5072	{
5073		Value *x = Nucleus::createBitCast(cast.value, Int::getType());
5074		Value *a = Nucleus::createInsertElement(V(UndefValue::get(Int4::getType())), x, 0);
5075
5076		Value *e;
5077
5078		if (CPUID::supportsSSE4_1())
5079		{
5080			e = x86::pmovzxbd(RValue<Int4>(a)).value;
5081		}
5082		else
5083		{
5084			int swizzle[16] = {0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23};
5085			Value *b = Nucleus::createBitCast(a, Byte16::getType());
5086			Value *c = Nucleus::createShuffleVector(b, V(Nucleus::createNullValue(Byte16::getType())), swizzle);
5087
5088			int swizzle2[8] = {0, 8, 1, 9, 2, 10, 3, 11};
5089			Value *d = Nucleus::createBitCast(c, Short8::getType());
5090			e = Nucleus::createShuffleVector(d, V(Nucleus::createNullValue(Short8::getType())), swizzle2);
5091		}
5092
5093		Value *f = Nucleus::createBitCast(e, Int4::getType());
5094		storeValue(f);
5095	}
5096
5097	Int4::Int4(RValue<SByte4> cast)
5098	{
5099		Value *x = Nucleus::createBitCast(cast.value, Int::getType());
5100		Value *a = Nucleus::createInsertElement(V(UndefValue::get(Int4::getType())), x, 0);
5101
5102		Value *g;
5103
5104		if (CPUID::supportsSSE4_1())
5105		{
5106			g = x86::pmovsxbd(RValue<Int4>(a)).value;
5107		}
5108		else
5109		{
5110			int	swizzle[16] = {0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7};
5111			Value *b = Nucleus::createBitCast(a, Byte16::getType());
5112			Value *c = Nucleus::createShuffleVector(b, b, swizzle);
5113
5114			int swizzle2[8] = {0, 0, 1, 1, 2, 2, 3, 3};
5115			Value *d = Nucleus::createBitCast(c, Short8::getType());
5116			Value *e = Nucleus::createShuffleVector(d, d, swizzle2);
5117
5118			Value *f = Nucleus::createBitCast(e, Int4::getType());
5119			//	g = Nucleus::createAShr(f, Nucleus::createConstantInt(24));
5120			g = x86::psrad(RValue<Int4>(f), 24).value;
5121		}
5122
5123		storeValue(g);
5124	}
5125
5126	Int4::Int4(RValue<Float4> cast)
5127	{
5128	//	xyzw.parent = this;
5129
5130		Value *xyzw = Nucleus::createFPToSI(cast.value, Int4::getType());
5131
5132		storeValue(xyzw);
5133	}
5134
5135	Int4::Int4(RValue<Short4> cast)
5136	{
5137		Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
5138		Value *element = Nucleus::createBitCast(cast.value, Long::getType());
5139		long2 = Nucleus::createInsertElement(long2, element, 0);
5140		RValue<Int4> vector = RValue<Int4>(Nucleus::createBitCast(long2, Int4::getType()));
5141
5142		if(CPUID::supportsSSE4_1())
5143		{
5144			storeValue(x86::pmovsxwd(vector).value);
5145		}
5146		else
5147		{
5148			Value *b = Nucleus::createBitCast(vector.value, Short8::getType());
5149
5150			int swizzle[8] = {0, 0, 1, 1, 2, 2, 3, 3};
5151			Value *c = Nucleus::createShuffleVector(b, b, swizzle);
5152			Value *d = Nucleus::createBitCast(c, Int4::getType());
5153			storeValue(d);
5154
5155			// Each Short is packed into each Int in the (Short | Short) format.
5156			// Shifting by 16 will retrieve the original Short value.
5157			// Shitfing an Int will propagate the sign bit, which will work
5158			// for both positive and negative values of a Short.
5159			*this >>= 16;
5160		}
5161	}
5162
5163	Int4::Int4(RValue<UShort4> cast)
5164	{
5165		Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
5166		Value *element = Nucleus::createBitCast(cast.value, Long::getType());
5167		long2 = Nucleus::createInsertElement(long2, element, 0);
5168		RValue<Int4> vector = RValue<Int4>(Nucleus::createBitCast(long2, Int4::getType()));
5169
5170		if(CPUID::supportsSSE4_1())
5171		{
5172			storeValue(x86::pmovzxwd(RValue<Int4>(vector)).value);
5173		}
5174		else
5175		{
5176			Value *b = Nucleus::createBitCast(vector.value, Short8::getType());
5177
5178			int swizzle[8] = {0, 8, 1, 9, 2, 10, 3, 11};
5179			Value *c = Nucleus::createShuffleVector(b, V(Nucleus::createNullValue(Short8::getType())), swizzle);
5180			Value *d = Nucleus::createBitCast(c, Int4::getType());
5181			storeValue(d);
5182		}
5183	}
5184
5185	Int4::Int4()
5186	{
5187	//	xyzw.parent = this;
5188	}
5189
5190	Int4::Int4(int xyzw)
5191	{
5192		constant(xyzw, xyzw, xyzw, xyzw);
5193	}
5194
5195	Int4::Int4(int x, int yzw)
5196	{
5197		constant(x, yzw, yzw, yzw);
5198	}
5199
5200	Int4::Int4(int x, int y, int zw)
5201	{
5202		constant(x, y, zw, zw);
5203	}
5204
5205	Int4::Int4(int x, int y, int z, int w)
5206	{
5207		constant(x, y, z, w);
5208	}
5209
5210	void Int4::constant(int x, int y, int z, int w)
5211	{
5212	//	xyzw.parent = this;
5213
5214		int64_t constantVector[4] = {x, y, z, w};
5215		storeValue(Nucleus::createConstantVector(constantVector, getType()));
5216	}
5217
5218	Int4::Int4(RValue<Int4> rhs)
5219	{
5220	//	xyzw.parent = this;
5221
5222		storeValue(rhs.value);
5223	}
5224
5225	Int4::Int4(const Int4 &rhs)
5226	{
5227	//	xyzw.parent = this;
5228
5229		Value *value = rhs.loadValue();
5230		storeValue(value);
5231	}
5232
5233	Int4::Int4(const Reference<Int4> &rhs)
5234	{
5235	//	xyzw.parent = this;
5236
5237		Value *value = rhs.loadValue();
5238		storeValue(value);
5239	}
5240
5241	Int4::Int4(RValue<UInt4> rhs)
5242	{
5243	//	xyzw.parent = this;
5244
5245		storeValue(rhs.value);
5246	}
5247
5248	Int4::Int4(const UInt4 &rhs)
5249	{
5250	//	xyzw.parent = this;
5251
5252		Value *value = rhs.loadValue();
5253		storeValue(value);
5254	}
5255
5256	Int4::Int4(const Reference<UInt4> &rhs)
5257	{
5258	//	xyzw.parent = this;
5259
5260		Value *value = rhs.loadValue();
5261		storeValue(value);
5262	}
5263
5264	Int4::Int4(RValue<Int2> lo, RValue<Int2> hi)
5265	{
5266	//	xyzw.parent = this;
5267
5268		Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
5269		Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
5270
5271		Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
5272		long2 = Nucleus::createInsertElement(long2, loLong, 0);
5273		long2 = Nucleus::createInsertElement(long2, hiLong, 1);
5274		Value *int4 = Nucleus::createBitCast(long2, Int4::getType());
5275
5276		storeValue(int4);
5277	}
5278
5279	Int4::Int4(RValue<Int> rhs)
5280	{
5281	//	xyzw.parent = this;
5282
5283		Value *vector = loadValue();
5284		Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
5285
5286		int swizzle[4] = {0, 0, 0, 0};
5287		Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
5288
5289		storeValue(replicate);
5290	}
5291
5292	Int4::Int4(const Int &rhs)
5293	{
5294	//	xyzw.parent = this;
5295
5296		*this = RValue<Int>(rhs.loadValue());
5297	}
5298
5299	Int4::Int4(const Reference<Int> &rhs)
5300	{
5301	//	xyzw.parent = this;
5302
5303		*this = RValue<Int>(rhs.loadValue());
5304	}
5305
5306	RValue<Int4> Int4::operator=(RValue<Int4> rhs) const
5307	{
5308		storeValue(rhs.value);
5309
5310		return rhs;
5311	}
5312
5313	RValue<Int4> Int4::operator=(const Int4 &rhs) const
5314	{
5315		Value *value = rhs.loadValue();
5316		storeValue(value);
5317
5318		return RValue<Int4>(value);
5319	}
5320
5321	RValue<Int4> Int4::operator=(const Reference<Int4> &rhs) const
5322	{
5323		Value *value = rhs.loadValue();
5324		storeValue(value);
5325
5326		return RValue<Int4>(value);
5327	}
5328
5329	RValue<Int4> operator+(RValue<Int4> lhs, RValue<Int4> rhs)
5330	{
5331		return RValue<Int4>(Nucleus::createAdd(lhs.value, rhs.value));
5332	}
5333
5334	RValue<Int4> operator-(RValue<Int4> lhs, RValue<Int4> rhs)
5335	{
5336		return RValue<Int4>(Nucleus::createSub(lhs.value, rhs.value));
5337	}
5338
5339	RValue<Int4> operator*(RValue<Int4> lhs, RValue<Int4> rhs)
5340	{
5341		return RValue<Int4>(Nucleus::createMul(lhs.value, rhs.value));
5342	}
5343
5344	RValue<Int4> operator/(RValue<Int4> lhs, RValue<Int4> rhs)
5345	{
5346		return RValue<Int4>(Nucleus::createSDiv(lhs.value, rhs.value));
5347	}
5348
5349	RValue<Int4> operator%(RValue<Int4> lhs, RValue<Int4> rhs)
5350	{
5351		return RValue<Int4>(Nucleus::createSRem(lhs.value, rhs.value));
5352	}
5353
5354	RValue<Int4> operator&(RValue<Int4> lhs, RValue<Int4> rhs)
5355	{
5356		return RValue<Int4>(Nucleus::createAnd(lhs.value, rhs.value));
5357	}
5358
5359	RValue<Int4> operator|(RValue<Int4> lhs, RValue<Int4> rhs)
5360	{
5361		return RValue<Int4>(Nucleus::createOr(lhs.value, rhs.value));
5362	}
5363
5364	RValue<Int4> operator^(RValue<Int4> lhs, RValue<Int4> rhs)
5365	{
5366		return RValue<Int4>(Nucleus::createXor(lhs.value, rhs.value));
5367	}
5368
5369	RValue<Int4> operator<<(RValue<Int4> lhs, unsigned char rhs)
5370	{
5371		return x86::pslld(lhs, rhs);
5372	}
5373
5374	RValue<Int4> operator>>(RValue<Int4> lhs, unsigned char rhs)
5375	{
5376		return x86::psrad(lhs, rhs);
5377	}
5378
5379	RValue<Int4> operator<<(RValue<Int4> lhs, RValue<Int4> rhs)
5380	{
5381		return RValue<Int4>(Nucleus::createShl(lhs.value, rhs.value));
5382	}
5383
5384	RValue<Int4> operator>>(RValue<Int4> lhs, RValue<Int4> rhs)
5385	{
5386		return RValue<Int4>(Nucleus::createAShr(lhs.value, rhs.value));
5387	}
5388
5389	RValue<Int4> operator+=(const Int4 &lhs, RValue<Int4> rhs)
5390	{
5391		return lhs = lhs + rhs;
5392	}
5393
5394	RValue<Int4> operator-=(const Int4 &lhs, RValue<Int4> rhs)
5395	{
5396		return lhs = lhs - rhs;
5397	}
5398
5399	RValue<Int4> operator*=(const Int4 &lhs, RValue<Int4> rhs)
5400	{
5401		return lhs = lhs * rhs;
5402	}
5403
5404//	RValue<Int4> operator/=(const Int4 &lhs, RValue<Int4> rhs)
5405//	{
5406//		return lhs = lhs / rhs;
5407//	}
5408
5409//	RValue<Int4> operator%=(const Int4 &lhs, RValue<Int4> rhs)
5410//	{
5411//		return lhs = lhs % rhs;
5412//	}
5413
5414	RValue<Int4> operator&=(const Int4 &lhs, RValue<Int4> rhs)
5415	{
5416		return lhs = lhs & rhs;
5417	}
5418
5419	RValue<Int4> operator|=(const Int4 &lhs, RValue<Int4> rhs)
5420	{
5421		return lhs = lhs | rhs;
5422	}
5423
5424	RValue<Int4> operator^=(const Int4 &lhs, RValue<Int4> rhs)
5425	{
5426		return lhs = lhs ^ rhs;
5427	}
5428
5429	RValue<Int4> operator<<=(const Int4 &lhs, unsigned char rhs)
5430	{
5431		return lhs = lhs << rhs;
5432	}
5433
5434	RValue<Int4> operator>>=(const Int4 &lhs, unsigned char rhs)
5435	{
5436		return lhs = lhs >> rhs;
5437	}
5438
5439	RValue<Int4> operator+(RValue<Int4> val)
5440	{
5441		return val;
5442	}
5443
5444	RValue<Int4> operator-(RValue<Int4> val)
5445	{
5446		return RValue<Int4>(Nucleus::createNeg(val.value));
5447	}
5448
5449	RValue<Int4> operator~(RValue<Int4> val)
5450	{
5451		return RValue<Int4>(Nucleus::createNot(val.value));
5452	}
5453
5454	RValue<Int4> CmpEQ(RValue<Int4> x, RValue<Int4> y)
5455	{
5456		// FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5457		//        Restore the following line when LLVM is updated to a version where this issue is fixed.
5458		// return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
5459		return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
5460	}
5461
5462	RValue<Int4> CmpLT(RValue<Int4> x, RValue<Int4> y)
5463	{
5464		return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value, y.value), Int4::getType()));
5465	}
5466
5467	RValue<Int4> CmpLE(RValue<Int4> x, RValue<Int4> y)
5468	{
5469		// FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5470		//        Restore the following line when LLVM is updated to a version where this issue is fixed.
5471		// return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLE(x.value, y.value), Int4::getType()));
5472		return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
5473	}
5474
5475	RValue<Int4> CmpNEQ(RValue<Int4> x, RValue<Int4> y)
5476	{
5477		return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
5478	}
5479
5480	RValue<Int4> CmpNLT(RValue<Int4> x, RValue<Int4> y)
5481	{
5482		// FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5483		//        Restore the following line when LLVM is updated to a version where this issue is fixed.
5484		// return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGE(x.value, y.value), Int4::getType()));
5485		return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSLT(x.value, y.value), Int4::getType())) ^ Int4(0xFFFFFFFF);
5486	}
5487
5488	RValue<Int4> CmpNLE(RValue<Int4> x, RValue<Int4> y)
5489	{
5490		return RValue<Int4>(Nucleus::createSExt(Nucleus::createICmpSGT(x.value, y.value), Int4::getType()));
5491	}
5492
5493	RValue<Int4> Max(RValue<Int4> x, RValue<Int4> y)
5494	{
5495		if(CPUID::supportsSSE4_1())
5496		{
5497			return x86::pmaxsd(x, y);
5498		}
5499		else
5500		{
5501			RValue<Int4> greater = CmpNLE(x, y);
5502			return x & greater | y & ~greater;
5503		}
5504	}
5505
5506	RValue<Int4> Min(RValue<Int4> x, RValue<Int4> y)
5507	{
5508		if(CPUID::supportsSSE4_1())
5509		{
5510			return x86::pminsd(x, y);
5511		}
5512		else
5513		{
5514			RValue<Int4> less = CmpLT(x, y);
5515			return x & less | y & ~less;
5516		}
5517	}
5518
5519	RValue<Int4> RoundInt(RValue<Float4> cast)
5520	{
5521		return x86::cvtps2dq(cast);
5522	}
5523
5524	RValue<Short8> Pack(RValue<Int4> x, RValue<Int4> y)
5525	{
5526		return x86::packssdw(x, y);
5527	}
5528
5529	RValue<Int> Extract(RValue<Int4> x, int i)
5530	{
5531		return RValue<Int>(Nucleus::createExtractElement(x.value, Int::getType(), i));
5532	}
5533
5534	RValue<Int4> Insert(RValue<Int4> x, RValue<Int> element, int i)
5535	{
5536		return RValue<Int4>(Nucleus::createInsertElement(x.value, element.value, i));
5537	}
5538
5539	RValue<Int> SignMask(RValue<Int4> x)
5540	{
5541		return x86::movmskps(As<Float4>(x));
5542	}
5543
5544	RValue<Int4> Swizzle(RValue<Int4> x, unsigned char select)
5545	{
5546		return RValue<Int4>(createSwizzle4(x.value, select));
5547	}
5548
5549	Type *Int4::getType()
5550	{
5551		return T(VectorType::get(Int::getType(), 4));
5552	}
5553
5554	UInt4::UInt4(RValue<Float4> cast)
5555	{
5556	//	xyzw.parent = this;
5557
5558		// Note: createFPToUI is broken, must perform conversion using createFPtoSI
5559		// Value *xyzw = Nucleus::createFPToUI(cast.value, UInt4::getType());
5560
5561		// Smallest positive value representable in UInt, but not in Int
5562		const unsigned int ustart = 0x80000000u;
5563		const float ustartf = float(ustart);
5564
5565		// Check if the value can be represented as an Int
5566		Int4 uiValue = CmpNLT(cast, Float4(ustartf));
5567		// If the value is too large, subtract ustart and re-add it after conversion.
5568		uiValue = (uiValue & As<Int4>(As<UInt4>(Int4(cast - Float4(ustartf))) + UInt4(ustart))) |
5569		// Otherwise, just convert normally
5570		          (~uiValue & Int4(cast));
5571		// If the value is negative, store 0, otherwise store the result of the conversion
5572		storeValue((~(As<Int4>(cast) >> 31) & uiValue).value);
5573	}
5574
5575	UInt4::UInt4()
5576	{
5577	//	xyzw.parent = this;
5578	}
5579
5580	UInt4::UInt4(int xyzw)
5581	{
5582		constant(xyzw, xyzw, xyzw, xyzw);
5583	}
5584
5585	UInt4::UInt4(int x, int yzw)
5586	{
5587		constant(x, yzw, yzw, yzw);
5588	}
5589
5590	UInt4::UInt4(int x, int y, int zw)
5591	{
5592		constant(x, y, zw, zw);
5593	}
5594
5595	UInt4::UInt4(int x, int y, int z, int w)
5596	{
5597		constant(x, y, z, w);
5598	}
5599
5600	void UInt4::constant(int x, int y, int z, int w)
5601	{
5602	//	xyzw.parent = this;
5603
5604		int64_t constantVector[4] = {x, y, z, w};
5605		storeValue(Nucleus::createConstantVector(constantVector, getType()));
5606	}
5607
5608	UInt4::UInt4(RValue<UInt4> rhs)
5609	{
5610	//	xyzw.parent = this;
5611
5612		storeValue(rhs.value);
5613	}
5614
5615	UInt4::UInt4(const UInt4 &rhs)
5616	{
5617	//	xyzw.parent = this;
5618
5619		Value *value = rhs.loadValue();
5620		storeValue(value);
5621	}
5622
5623	UInt4::UInt4(const Reference<UInt4> &rhs)
5624	{
5625	//	xyzw.parent = this;
5626
5627		Value *value = rhs.loadValue();
5628		storeValue(value);
5629	}
5630
5631	UInt4::UInt4(RValue<Int4> rhs)
5632	{
5633	//	xyzw.parent = this;
5634
5635		storeValue(rhs.value);
5636	}
5637
5638	UInt4::UInt4(const Int4 &rhs)
5639	{
5640	//	xyzw.parent = this;
5641
5642		Value *value = rhs.loadValue();
5643		storeValue(value);
5644	}
5645
5646	UInt4::UInt4(const Reference<Int4> &rhs)
5647	{
5648	//	xyzw.parent = this;
5649
5650		Value *value = rhs.loadValue();
5651		storeValue(value);
5652	}
5653
5654	UInt4::UInt4(RValue<UInt2> lo, RValue<UInt2> hi)
5655	{
5656		Value *loLong = Nucleus::createBitCast(lo.value, Long::getType());
5657		Value *hiLong = Nucleus::createBitCast(hi.value, Long::getType());
5658
5659		Value *long2 = V(UndefValue::get(VectorType::get(Long::getType(), 2)));
5660		long2 = Nucleus::createInsertElement(long2, loLong, 0);
5661		long2 = Nucleus::createInsertElement(long2, hiLong, 1);
5662		Value *uint4 = Nucleus::createBitCast(long2, Int4::getType());
5663
5664		storeValue(uint4);
5665	}
5666
5667	RValue<UInt4> UInt4::operator=(RValue<UInt4> rhs) const
5668	{
5669		storeValue(rhs.value);
5670
5671		return rhs;
5672	}
5673
5674	RValue<UInt4> UInt4::operator=(const UInt4 &rhs) const
5675	{
5676		Value *value = rhs.loadValue();
5677		storeValue(value);
5678
5679		return RValue<UInt4>(value);
5680	}
5681
5682	RValue<UInt4> UInt4::operator=(const Reference<UInt4> &rhs) const
5683	{
5684		Value *value = rhs.loadValue();
5685		storeValue(value);
5686
5687		return RValue<UInt4>(value);
5688	}
5689
5690	RValue<UInt4> operator+(RValue<UInt4> lhs, RValue<UInt4> rhs)
5691	{
5692		return RValue<UInt4>(Nucleus::createAdd(lhs.value, rhs.value));
5693	}
5694
5695	RValue<UInt4> operator-(RValue<UInt4> lhs, RValue<UInt4> rhs)
5696	{
5697		return RValue<UInt4>(Nucleus::createSub(lhs.value, rhs.value));
5698	}
5699
5700	RValue<UInt4> operator*(RValue<UInt4> lhs, RValue<UInt4> rhs)
5701	{
5702		return RValue<UInt4>(Nucleus::createMul(lhs.value, rhs.value));
5703	}
5704
5705	RValue<UInt4> operator/(RValue<UInt4> lhs, RValue<UInt4> rhs)
5706	{
5707		return RValue<UInt4>(Nucleus::createUDiv(lhs.value, rhs.value));
5708	}
5709
5710	RValue<UInt4> operator%(RValue<UInt4> lhs, RValue<UInt4> rhs)
5711	{
5712		return RValue<UInt4>(Nucleus::createURem(lhs.value, rhs.value));
5713	}
5714
5715	RValue<UInt4> operator&(RValue<UInt4> lhs, RValue<UInt4> rhs)
5716	{
5717		return RValue<UInt4>(Nucleus::createAnd(lhs.value, rhs.value));
5718	}
5719
5720	RValue<UInt4> operator|(RValue<UInt4> lhs, RValue<UInt4> rhs)
5721	{
5722		return RValue<UInt4>(Nucleus::createOr(lhs.value, rhs.value));
5723	}
5724
5725	RValue<UInt4> operator^(RValue<UInt4> lhs, RValue<UInt4> rhs)
5726	{
5727		return RValue<UInt4>(Nucleus::createXor(lhs.value, rhs.value));
5728	}
5729
5730	RValue<UInt4> operator<<(RValue<UInt4> lhs, unsigned char rhs)
5731	{
5732		return As<UInt4>(x86::pslld(As<Int4>(lhs), rhs));
5733	}
5734
5735	RValue<UInt4> operator>>(RValue<UInt4> lhs, unsigned char rhs)
5736	{
5737		return x86::psrld(lhs, rhs);
5738	}
5739
5740	RValue<UInt4> operator<<(RValue<UInt4> lhs, RValue<UInt4> rhs)
5741	{
5742		return RValue<UInt4>(Nucleus::createShl(lhs.value, rhs.value));
5743	}
5744
5745	RValue<UInt4> operator>>(RValue<UInt4> lhs, RValue<UInt4> rhs)
5746	{
5747		return RValue<UInt4>(Nucleus::createLShr(lhs.value, rhs.value));
5748	}
5749
5750	RValue<UInt4> operator+=(const UInt4 &lhs, RValue<UInt4> rhs)
5751	{
5752		return lhs = lhs + rhs;
5753	}
5754
5755	RValue<UInt4> operator-=(const UInt4 &lhs, RValue<UInt4> rhs)
5756	{
5757		return lhs = lhs - rhs;
5758	}
5759
5760	RValue<UInt4> operator*=(const UInt4 &lhs, RValue<UInt4> rhs)
5761	{
5762		return lhs = lhs * rhs;
5763	}
5764
5765//	RValue<UInt4> operator/=(const UInt4 &lhs, RValue<UInt4> rhs)
5766//	{
5767//		return lhs = lhs / rhs;
5768//	}
5769
5770//	RValue<UInt4> operator%=(const UInt4 &lhs, RValue<UInt4> rhs)
5771//	{
5772//		return lhs = lhs % rhs;
5773//	}
5774
5775	RValue<UInt4> operator&=(const UInt4 &lhs, RValue<UInt4> rhs)
5776	{
5777		return lhs = lhs & rhs;
5778	}
5779
5780	RValue<UInt4> operator|=(const UInt4 &lhs, RValue<UInt4> rhs)
5781	{
5782		return lhs = lhs | rhs;
5783	}
5784
5785	RValue<UInt4> operator^=(const UInt4 &lhs, RValue<UInt4> rhs)
5786	{
5787		return lhs = lhs ^ rhs;
5788	}
5789
5790	RValue<UInt4> operator<<=(const UInt4 &lhs, unsigned char rhs)
5791	{
5792		return lhs = lhs << rhs;
5793	}
5794
5795	RValue<UInt4> operator>>=(const UInt4 &lhs, unsigned char rhs)
5796	{
5797		return lhs = lhs >> rhs;
5798	}
5799
5800	RValue<UInt4> operator+(RValue<UInt4> val)
5801	{
5802		return val;
5803	}
5804
5805	RValue<UInt4> operator-(RValue<UInt4> val)
5806	{
5807		return RValue<UInt4>(Nucleus::createNeg(val.value));
5808	}
5809
5810	RValue<UInt4> operator~(RValue<UInt4> val)
5811	{
5812		return RValue<UInt4>(Nucleus::createNot(val.value));
5813	}
5814
5815	RValue<UInt4> CmpEQ(RValue<UInt4> x, RValue<UInt4> y)
5816	{
5817		// FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5818		//        Restore the following line when LLVM is updated to a version where this issue is fixed.
5819		// return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpEQ(x.value, y.value), Int4::getType()));
5820		return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
5821	}
5822
5823	RValue<UInt4> CmpLT(RValue<UInt4> x, RValue<UInt4> y)
5824	{
5825		return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value, y.value), Int4::getType()));
5826	}
5827
5828	RValue<UInt4> CmpLE(RValue<UInt4> x, RValue<UInt4> y)
5829	{
5830		// FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5831		//        Restore the following line when LLVM is updated to a version where this issue is fixed.
5832		// return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULE(x.value, y.value), Int4::getType()));
5833		return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
5834	}
5835
5836	RValue<UInt4> CmpNEQ(RValue<UInt4> x, RValue<UInt4> y)
5837	{
5838		return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpNE(x.value, y.value), Int4::getType()));
5839	}
5840
5841	RValue<UInt4> CmpNLT(RValue<UInt4> x, RValue<UInt4> y)
5842	{
5843		// FIXME: An LLVM bug causes SExt(ICmpCC()) to produce 0 or 1 instead of 0 or ~0
5844		//        Restore the following line when LLVM is updated to a version where this issue is fixed.
5845		// return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGE(x.value, y.value), Int4::getType()));
5846		return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpULT(x.value, y.value), Int4::getType())) ^ UInt4(0xFFFFFFFF);
5847	}
5848
5849	RValue<UInt4> CmpNLE(RValue<UInt4> x, RValue<UInt4> y)
5850	{
5851		return RValue<UInt4>(Nucleus::createSExt(Nucleus::createICmpUGT(x.value, y.value), Int4::getType()));
5852	}
5853
5854	RValue<UInt4> Max(RValue<UInt4> x, RValue<UInt4> y)
5855	{
5856		if(CPUID::supportsSSE4_1())
5857		{
5858			return x86::pmaxud(x, y);
5859		}
5860		else
5861		{
5862			RValue<UInt4> greater = CmpNLE(x, y);
5863			return x & greater | y & ~greater;
5864		}
5865	}
5866
5867	RValue<UInt4> Min(RValue<UInt4> x, RValue<UInt4> y)
5868	{
5869		if(CPUID::supportsSSE4_1())
5870		{
5871			return x86::pminud(x, y);
5872		}
5873		else
5874		{
5875			RValue<UInt4> less = CmpLT(x, y);
5876			return x & less | y & ~less;
5877		}
5878	}
5879
5880	RValue<UShort8> Pack(RValue<UInt4> x, RValue<UInt4> y)
5881	{
5882		return x86::packusdw(x, y);   // FIXME: Fallback required
5883	}
5884
5885	Type *UInt4::getType()
5886	{
5887		return T(VectorType::get(UInt::getType(), 4));
5888	}
5889
5890	Float::Float(RValue<Int> cast)
5891	{
5892		Value *integer = Nucleus::createSIToFP(cast.value, Float::getType());
5893
5894		storeValue(integer);
5895	}
5896
5897	Float::Float()
5898	{
5899
5900	}
5901
5902	Float::Float(float x)
5903	{
5904		storeValue(Nucleus::createConstantFloat(x));
5905	}
5906
5907	Float::Float(RValue<Float> rhs)
5908	{
5909		storeValue(rhs.value);
5910	}
5911
5912	Float::Float(const Float &rhs)
5913	{
5914		Value *value = rhs.loadValue();
5915		storeValue(value);
5916	}
5917
5918	Float::Float(const Reference<Float> &rhs)
5919	{
5920		Value *value = rhs.loadValue();
5921		storeValue(value);
5922	}
5923
5924	RValue<Float> Float::operator=(RValue<Float> rhs) const
5925	{
5926		storeValue(rhs.value);
5927
5928		return rhs;
5929	}
5930
5931	RValue<Float> Float::operator=(const Float &rhs) const
5932	{
5933		Value *value = rhs.loadValue();
5934		storeValue(value);
5935
5936		return RValue<Float>(value);
5937	}
5938
5939	RValue<Float> Float::operator=(const Reference<Float> &rhs) const
5940	{
5941		Value *value = rhs.loadValue();
5942		storeValue(value);
5943
5944		return RValue<Float>(value);
5945	}
5946
5947	RValue<Float> operator+(RValue<Float> lhs, RValue<Float> rhs)
5948	{
5949		return RValue<Float>(Nucleus::createFAdd(lhs.value, rhs.value));
5950	}
5951
5952	RValue<Float> operator-(RValue<Float> lhs, RValue<Float> rhs)
5953	{
5954		return RValue<Float>(Nucleus::createFSub(lhs.value, rhs.value));
5955	}
5956
5957	RValue<Float> operator*(RValue<Float> lhs, RValue<Float> rhs)
5958	{
5959		return RValue<Float>(Nucleus::createFMul(lhs.value, rhs.value));
5960	}
5961
5962	RValue<Float> operator/(RValue<Float> lhs, RValue<Float> rhs)
5963	{
5964		return RValue<Float>(Nucleus::createFDiv(lhs.value, rhs.value));
5965	}
5966
5967	RValue<Float> operator+=(const Float &lhs, RValue<Float> rhs)
5968	{
5969		return lhs = lhs + rhs;
5970	}
5971
5972	RValue<Float> operator-=(const Float &lhs, RValue<Float> rhs)
5973	{
5974		return lhs = lhs - rhs;
5975	}
5976
5977	RValue<Float> operator*=(const Float &lhs, RValue<Float> rhs)
5978	{
5979		return lhs = lhs * rhs;
5980	}
5981
5982	RValue<Float> operator/=(const Float &lhs, RValue<Float> rhs)
5983	{
5984		return lhs = lhs / rhs;
5985	}
5986
5987	RValue<Float> operator+(RValue<Float> val)
5988	{
5989		return val;
5990	}
5991
5992	RValue<Float> operator-(RValue<Float> val)
5993	{
5994		return RValue<Float>(Nucleus::createFNeg(val.value));
5995	}
5996
5997	RValue<Bool> operator<(RValue<Float> lhs, RValue<Float> rhs)
5998	{
5999		return RValue<Bool>(Nucleus::createFCmpOLT(lhs.value, rhs.value));
6000	}
6001
6002	RValue<Bool> operator<=(RValue<Float> lhs, RValue<Float> rhs)
6003	{
6004		return RValue<Bool>(Nucleus::createFCmpOLE(lhs.value, rhs.value));
6005	}
6006
6007	RValue<Bool> operator>(RValue<Float> lhs, RValue<Float> rhs)
6008	{
6009		return RValue<Bool>(Nucleus::createFCmpOGT(lhs.value, rhs.value));
6010	}
6011
6012	RValue<Bool> operator>=(RValue<Float> lhs, RValue<Float> rhs)
6013	{
6014		return RValue<Bool>(Nucleus::createFCmpOGE(lhs.value, rhs.value));
6015	}
6016
6017	RValue<Bool> operator!=(RValue<Float> lhs, RValue<Float> rhs)
6018	{
6019		return RValue<Bool>(Nucleus::createFCmpONE(lhs.value, rhs.value));
6020	}
6021
6022	RValue<Bool> operator==(RValue<Float> lhs, RValue<Float> rhs)
6023	{
6024		return RValue<Bool>(Nucleus::createFCmpOEQ(lhs.value, rhs.value));
6025	}
6026
6027	RValue<Float> Abs(RValue<Float> x)
6028	{
6029		return IfThenElse(x > 0.0f, x, -x);
6030	}
6031
6032	RValue<Float> Max(RValue<Float> x, RValue<Float> y)
6033	{
6034		return IfThenElse(x > y, x, y);
6035	}
6036
6037	RValue<Float> Min(RValue<Float> x, RValue<Float> y)
6038	{
6039		return IfThenElse(x < y, x, y);
6040	}
6041
6042	RValue<Float> Rcp_pp(RValue<Float> x, bool exactAtPow2)
6043	{
6044		if(exactAtPow2)
6045		{
6046			// rcpss uses a piecewise-linear approximation which minimizes the relative error
6047			// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
6048			return x86::rcpss(x) * Float(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
6049		}
6050		else
6051		{
6052			return x86::rcpss(x);
6053		}
6054	}
6055
6056	RValue<Float> RcpSqrt_pp(RValue<Float> x)
6057	{
6058		return x86::rsqrtss(x);
6059	}
6060
6061	RValue<Float> Sqrt(RValue<Float> x)
6062	{
6063		return x86::sqrtss(x);
6064	}
6065
6066	RValue<Float> Round(RValue<Float> x)
6067	{
6068		if(CPUID::supportsSSE4_1())
6069		{
6070			return x86::roundss(x, 0);
6071		}
6072		else
6073		{
6074			return Float4(Round(Float4(x))).x;
6075		}
6076	}
6077
6078	RValue<Float> Trunc(RValue<Float> x)
6079	{
6080		if(CPUID::supportsSSE4_1())
6081		{
6082			return x86::roundss(x, 3);
6083		}
6084		else
6085		{
6086			return Float(Int(x));   // Rounded toward zero
6087		}
6088	}
6089
6090	RValue<Float> Frac(RValue<Float> x)
6091	{
6092		if(CPUID::supportsSSE4_1())
6093		{
6094			return x - x86::floorss(x);
6095		}
6096		else
6097		{
6098			return Float4(Frac(Float4(x))).x;
6099		}
6100	}
6101
6102	RValue<Float> Floor(RValue<Float> x)
6103	{
6104		if(CPUID::supportsSSE4_1())
6105		{
6106			return x86::floorss(x);
6107		}
6108		else
6109		{
6110			return Float4(Floor(Float4(x))).x;
6111		}
6112	}
6113
6114	RValue<Float> Ceil(RValue<Float> x)
6115	{
6116		if(CPUID::supportsSSE4_1())
6117		{
6118			return x86::ceilss(x);
6119		}
6120		else
6121		{
6122			return Float4(Ceil(Float4(x))).x;
6123		}
6124	}
6125
6126	Type *Float::getType()
6127	{
6128		return T(llvm::Type::getFloatTy(*::context));
6129	}
6130
6131	Float2::Float2(RValue<Float4> cast)
6132	{
6133	//	xyzw.parent = this;
6134
6135		Value *int64x2 = Nucleus::createBitCast(cast.value, T(VectorType::get(Long::getType(), 2)));
6136		Value *int64 = Nucleus::createExtractElement(int64x2, Long::getType(), 0);
6137		Value *float2 = Nucleus::createBitCast(int64, Float2::getType());
6138
6139		storeValue(float2);
6140	}
6141
6142	Type *Float2::getType()
6143	{
6144		return T(VectorType::get(Float::getType(), 2));
6145	}
6146
6147	Float4::Float4(RValue<Byte4> cast)
6148	{
6149		xyzw.parent = this;
6150
6151		#if 0
6152			Value *xyzw = Nucleus::createUIToFP(cast.value, Float4::getType());   // FIXME: Crashes
6153		#elif 0
6154			Value *vector = loadValue();
6155
6156			Value *i8x = Nucleus::createExtractElement(cast.value, 0);
6157			Value *f32x = Nucleus::createUIToFP(i8x, Float::getType());
6158			Value *x = Nucleus::createInsertElement(vector, f32x, 0);
6159
6160			Value *i8y = Nucleus::createExtractElement(cast.value, V(Nucleus::createConstantInt(1)));
6161			Value *f32y = Nucleus::createUIToFP(i8y, Float::getType());
6162			Value *xy = Nucleus::createInsertElement(x, f32y, V(Nucleus::createConstantInt(1)));
6163
6164			Value *i8z = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(2));
6165			Value *f32z = Nucleus::createUIToFP(i8z, Float::getType());
6166			Value *xyz = Nucleus::createInsertElement(xy, f32z, Nucleus::createConstantInt(2));
6167
6168			Value *i8w = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(3));
6169			Value *f32w = Nucleus::createUIToFP(i8w, Float::getType());
6170			Value *xyzw = Nucleus::createInsertElement(xyz, f32w, Nucleus::createConstantInt(3));
6171		#else
6172			Value *a = Int4(cast).loadValue();
6173			Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
6174		#endif
6175
6176		storeValue(xyzw);
6177	}
6178
6179	Float4::Float4(RValue<SByte4> cast)
6180	{
6181		xyzw.parent = this;
6182
6183		#if 0
6184			Value *xyzw = Nucleus::createSIToFP(cast.value, Float4::getType());   // FIXME: Crashes
6185		#elif 0
6186			Value *vector = loadValue();
6187
6188			Value *i8x = Nucleus::createExtractElement(cast.value, 0);
6189			Value *f32x = Nucleus::createSIToFP(i8x, Float::getType());
6190			Value *x = Nucleus::createInsertElement(vector, f32x, 0);
6191
6192			Value *i8y = Nucleus::createExtractElement(cast.value, V(Nucleus::createConstantInt(1)));
6193			Value *f32y = Nucleus::createSIToFP(i8y, Float::getType());
6194			Value *xy = Nucleus::createInsertElement(x, f32y, V(Nucleus::createConstantInt(1)));
6195
6196			Value *i8z = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(2));
6197			Value *f32z = Nucleus::createSIToFP(i8z, Float::getType());
6198			Value *xyz = Nucleus::createInsertElement(xy, f32z, Nucleus::createConstantInt(2));
6199
6200			Value *i8w = Nucleus::createExtractElement(cast.value, Nucleus::createConstantInt(3));
6201			Value *f32w = Nucleus::createSIToFP(i8w, Float::getType());
6202			Value *xyzw = Nucleus::createInsertElement(xyz, f32w, Nucleus::createConstantInt(3));
6203		#else
6204			Value *a = Int4(cast).loadValue();
6205			Value *xyzw = Nucleus::createSIToFP(a, Float4::getType());
6206		#endif
6207
6208		storeValue(xyzw);
6209	}
6210
6211	Float4::Float4(RValue<Short4> cast)
6212	{
6213		xyzw.parent = this;
6214
6215		Int4 c(cast);
6216		storeValue(Nucleus::createSIToFP(RValue<Int4>(c).value, Float4::getType()));
6217	}
6218
6219	Float4::Float4(RValue<UShort4> cast)
6220	{
6221		xyzw.parent = this;
6222
6223		Int4 c(cast);
6224		storeValue(Nucleus::createSIToFP(RValue<Int4>(c).value, Float4::getType()));
6225	}
6226
6227	Float4::Float4(RValue<Int4> cast)
6228	{
6229		xyzw.parent = this;
6230
6231		Value *xyzw = Nucleus::createSIToFP(cast.value, Float4::getType());
6232
6233		storeValue(xyzw);
6234	}
6235
6236	Float4::Float4(RValue<UInt4> cast)
6237	{
6238		xyzw.parent = this;
6239
6240		Value *xyzw = Nucleus::createUIToFP(cast.value, Float4::getType());
6241
6242		storeValue(xyzw);
6243	}
6244
6245	Float4::Float4()
6246	{
6247		xyzw.parent = this;
6248	}
6249
6250	Float4::Float4(float xyzw)
6251	{
6252		constant(xyzw, xyzw, xyzw, xyzw);
6253	}
6254
6255	Float4::Float4(float x, float yzw)
6256	{
6257		constant(x, yzw, yzw, yzw);
6258	}
6259
6260	Float4::Float4(float x, float y, float zw)
6261	{
6262		constant(x, y, zw, zw);
6263	}
6264
6265	Float4::Float4(float x, float y, float z, float w)
6266	{
6267		constant(x, y, z, w);
6268	}
6269
6270	void Float4::constant(float x, float y, float z, float w)
6271	{
6272		xyzw.parent = this;
6273
6274		double constantVector[4] = {x, y, z, w};
6275		storeValue(Nucleus::createConstantVector(constantVector, getType()));
6276	}
6277
6278	Float4::Float4(RValue<Float4> rhs)
6279	{
6280		xyzw.parent = this;
6281
6282		storeValue(rhs.value);
6283	}
6284
6285	Float4::Float4(const Float4 &rhs)
6286	{
6287		xyzw.parent = this;
6288
6289		Value *value = rhs.loadValue();
6290		storeValue(value);
6291	}
6292
6293	Float4::Float4(const Reference<Float4> &rhs)
6294	{
6295		xyzw.parent = this;
6296
6297		Value *value = rhs.loadValue();
6298		storeValue(value);
6299	}
6300
6301	Float4::Float4(RValue<Float> rhs)
6302	{
6303		xyzw.parent = this;
6304
6305		Value *vector = loadValue();
6306		Value *insert = Nucleus::createInsertElement(vector, rhs.value, 0);
6307
6308		int swizzle[4] = {0, 0, 0, 0};
6309		Value *replicate = Nucleus::createShuffleVector(insert, insert, swizzle);
6310
6311		storeValue(replicate);
6312	}
6313
6314	Float4::Float4(const Float &rhs)
6315	{
6316		xyzw.parent = this;
6317
6318		*this = RValue<Float>(rhs.loadValue());
6319	}
6320
6321	Float4::Float4(const Reference<Float> &rhs)
6322	{
6323		xyzw.parent = this;
6324
6325		*this = RValue<Float>(rhs.loadValue());
6326	}
6327
6328	RValue<Float4> Float4::operator=(float x) const
6329	{
6330		return *this = Float4(x, x, x, x);
6331	}
6332
6333	RValue<Float4> Float4::operator=(RValue<Float4> rhs) const
6334	{
6335		storeValue(rhs.value);
6336
6337		return rhs;
6338	}
6339
6340	RValue<Float4> Float4::operator=(const Float4 &rhs) const
6341	{
6342		Value *value = rhs.loadValue();
6343		storeValue(value);
6344
6345		return RValue<Float4>(value);
6346	}
6347
6348	RValue<Float4> Float4::operator=(const Reference<Float4> &rhs) const
6349	{
6350		Value *value = rhs.loadValue();
6351		storeValue(value);
6352
6353		return RValue<Float4>(value);
6354	}
6355
6356	RValue<Float4> Float4::operator=(RValue<Float> rhs) const
6357	{
6358		return *this = Float4(rhs);
6359	}
6360
6361	RValue<Float4> Float4::operator=(const Float &rhs) const
6362	{
6363		return *this = Float4(rhs);
6364	}
6365
6366	RValue<Float4> Float4::operator=(const Reference<Float> &rhs) const
6367	{
6368		return *this = Float4(rhs);
6369	}
6370
6371	RValue<Float4> operator+(RValue<Float4> lhs, RValue<Float4> rhs)
6372	{
6373		return RValue<Float4>(Nucleus::createFAdd(lhs.value, rhs.value));
6374	}
6375
6376	RValue<Float4> operator-(RValue<Float4> lhs, RValue<Float4> rhs)
6377	{
6378		return RValue<Float4>(Nucleus::createFSub(lhs.value, rhs.value));
6379	}
6380
6381	RValue<Float4> operator*(RValue<Float4> lhs, RValue<Float4> rhs)
6382	{
6383		return RValue<Float4>(Nucleus::createFMul(lhs.value, rhs.value));
6384	}
6385
6386	RValue<Float4> operator/(RValue<Float4> lhs, RValue<Float4> rhs)
6387	{
6388		return RValue<Float4>(Nucleus::createFDiv(lhs.value, rhs.value));
6389	}
6390
6391	RValue<Float4> operator%(RValue<Float4> lhs, RValue<Float4> rhs)
6392	{
6393		return RValue<Float4>(Nucleus::createFRem(lhs.value, rhs.value));
6394	}
6395
6396	RValue<Float4> operator+=(const Float4 &lhs, RValue<Float4> rhs)
6397	{
6398		return lhs = lhs + rhs;
6399	}
6400
6401	RValue<Float4> operator-=(const Float4 &lhs, RValue<Float4> rhs)
6402	{
6403		return lhs = lhs - rhs;
6404	}
6405
6406	RValue<Float4> operator*=(const Float4 &lhs, RValue<Float4> rhs)
6407	{
6408		return lhs = lhs * rhs;
6409	}
6410
6411	RValue<Float4> operator/=(const Float4 &lhs, RValue<Float4> rhs)
6412	{
6413		return lhs = lhs / rhs;
6414	}
6415
6416	RValue<Float4> operator%=(const Float4 &lhs, RValue<Float4> rhs)
6417	{
6418		return lhs = lhs % rhs;
6419	}
6420
6421	RValue<Float4> operator+(RValue<Float4> val)
6422	{
6423		return val;
6424	}
6425
6426	RValue<Float4> operator-(RValue<Float4> val)
6427	{
6428		return RValue<Float4>(Nucleus::createFNeg(val.value));
6429	}
6430
6431	RValue<Float4> Abs(RValue<Float4> x)
6432	{
6433		Value *vector = Nucleus::createBitCast(x.value, Int4::getType());
6434		int64_t constantVector[4] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
6435		Value *result = Nucleus::createAnd(vector, V(Nucleus::createConstantVector(constantVector, Int4::getType())));
6436
6437		return RValue<Float4>(Nucleus::createBitCast(result, Float4::getType()));
6438	}
6439
6440	RValue<Float4> Max(RValue<Float4> x, RValue<Float4> y)
6441	{
6442		return x86::maxps(x, y);
6443	}
6444
6445	RValue<Float4> Min(RValue<Float4> x, RValue<Float4> y)
6446	{
6447		return x86::minps(x, y);
6448	}
6449
6450	RValue<Float4> Rcp_pp(RValue<Float4> x, bool exactAtPow2)
6451	{
6452		if(exactAtPow2)
6453		{
6454			// rcpps uses a piecewise-linear approximation which minimizes the relative error
6455			// but is not exact at power-of-two values. Rectify by multiplying by the inverse.
6456			return x86::rcpps(x) * Float4(1.0f / _mm_cvtss_f32(_mm_rcp_ss(_mm_set_ps1(1.0f))));
6457		}
6458		else
6459		{
6460			return x86::rcpps(x);
6461		}
6462	}
6463
6464	RValue<Float4> RcpSqrt_pp(RValue<Float4> x)
6465	{
6466		return x86::rsqrtps(x);
6467	}
6468
6469	RValue<Float4> Sqrt(RValue<Float4> x)
6470	{
6471		return x86::sqrtps(x);
6472	}
6473
6474	RValue<Float4> Insert(RValue<Float4> val, RValue<Float> element, int i)
6475	{
6476		return RValue<Float4>(Nucleus::createInsertElement(val.value, element.value, i));
6477	}
6478
6479	RValue<Float> Extract(RValue<Float4> x, int i)
6480	{
6481		return RValue<Float>(Nucleus::createExtractElement(x.value, Float::getType(), i));
6482	}
6483
6484	RValue<Float4> Swizzle(RValue<Float4> x, unsigned char select)
6485	{
6486		return RValue<Float4>(createSwizzle4(x.value, select));
6487	}
6488
6489	RValue<Float4> ShuffleLowHigh(RValue<Float4> x, RValue<Float4> y, unsigned char imm)
6490	{
6491		int shuffle[4] =
6492		{
6493			((imm >> 0) & 0x03) + 0,
6494			((imm >> 2) & 0x03) + 0,
6495			((imm >> 4) & 0x03) + 4,
6496			((imm >> 6) & 0x03) + 4,
6497		};
6498
6499		return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
6500	}
6501
6502	RValue<Float4> UnpackLow(RValue<Float4> x, RValue<Float4> y)
6503	{
6504		int shuffle[4] = {0, 4, 1, 5};
6505		return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
6506	}
6507
6508	RValue<Float4> UnpackHigh(RValue<Float4> x, RValue<Float4> y)
6509	{
6510		int shuffle[4] = {2, 6, 3, 7};
6511		return RValue<Float4>(Nucleus::createShuffleVector(x.value, y.value, shuffle));
6512	}
6513
6514	RValue<Float4> Mask(Float4 &lhs, RValue<Float4> rhs, unsigned char select)
6515	{
6516		Value *vector = lhs.loadValue();
6517		Value *shuffle = createMask4(vector, rhs.value, select);
6518		lhs.storeValue(shuffle);
6519
6520		return RValue<Float4>(shuffle);
6521	}
6522
6523	RValue<Int> SignMask(RValue<Float4> x)
6524	{
6525		return x86::movmskps(x);
6526	}
6527
6528	RValue<Int4> CmpEQ(RValue<Float4> x, RValue<Float4> y)
6529	{
6530	//	return As<Int4>(x86::cmpeqps(x, y));
6531		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOEQ(x.value, y.value), Int4::getType()));
6532	}
6533
6534	RValue<Int4> CmpLT(RValue<Float4> x, RValue<Float4> y)
6535	{
6536	//	return As<Int4>(x86::cmpltps(x, y));
6537		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLT(x.value, y.value), Int4::getType()));
6538	}
6539
6540	RValue<Int4> CmpLE(RValue<Float4> x, RValue<Float4> y)
6541	{
6542	//	return As<Int4>(x86::cmpleps(x, y));
6543		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOLE(x.value, y.value), Int4::getType()));
6544	}
6545
6546	RValue<Int4> CmpNEQ(RValue<Float4> x, RValue<Float4> y)
6547	{
6548	//	return As<Int4>(x86::cmpneqps(x, y));
6549		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpONE(x.value, y.value), Int4::getType()));
6550	}
6551
6552	RValue<Int4> CmpNLT(RValue<Float4> x, RValue<Float4> y)
6553	{
6554	//	return As<Int4>(x86::cmpnltps(x, y));
6555		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGE(x.value, y.value), Int4::getType()));
6556	}
6557
6558	RValue<Int4> CmpNLE(RValue<Float4> x, RValue<Float4> y)
6559	{
6560	//	return As<Int4>(x86::cmpnleps(x, y));
6561		return RValue<Int4>(Nucleus::createSExt(Nucleus::createFCmpOGT(x.value, y.value), Int4::getType()));
6562	}
6563
6564	RValue<Float4> Round(RValue<Float4> x)
6565	{
6566		if(CPUID::supportsSSE4_1())
6567		{
6568			return x86::roundps(x, 0);
6569		}
6570		else
6571		{
6572			return Float4(RoundInt(x));
6573		}
6574	}
6575
6576	RValue<Float4> Trunc(RValue<Float4> x)
6577	{
6578		if(CPUID::supportsSSE4_1())
6579		{
6580			return x86::roundps(x, 3);
6581		}
6582		else
6583		{
6584			return Float4(Int4(x));   // Rounded toward zero
6585		}
6586	}
6587
6588	RValue<Float4> Frac(RValue<Float4> x)
6589	{
6590		if(CPUID::supportsSSE4_1())
6591		{
6592			return x - x86::floorps(x);
6593		}
6594		else
6595		{
6596			Float4 frc = x - Float4(Int4(x));   // Signed fractional part
6597
6598			return frc + As<Float4>(As<Int4>(CmpNLE(Float4(0.0f), frc)) & As<Int4>(Float4(1, 1, 1, 1)));
6599		}
6600	}
6601
6602	RValue<Float4> Floor(RValue<Float4> x)
6603	{
6604		if(CPUID::supportsSSE4_1())
6605		{
6606			return x86::floorps(x);
6607		}
6608		else
6609		{
6610			return x - Frac(x);
6611		}
6612	}
6613
6614	RValue<Float4> Ceil(RValue<Float4> x)
6615	{
6616		if(CPUID::supportsSSE4_1())
6617		{
6618			return x86::ceilps(x);
6619		}
6620		else
6621		{
6622			return -Floor(-x);
6623		}
6624	}
6625
6626	Type *Float4::getType()
6627	{
6628		return T(VectorType::get(Float::getType(), 4));
6629	}
6630
6631	RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, int offset)
6632	{
6633		return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), V(Nucleus::createConstantInt(offset))));
6634	}
6635
6636	RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<Int> offset)
6637	{
6638		return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value));
6639	}
6640
6641	RValue<Pointer<Byte>> operator+(RValue<Pointer<Byte>> lhs, RValue<UInt> offset)
6642	{
6643		return RValue<Pointer<Byte>>(Nucleus::createGEP(lhs.value, Byte::getType(), offset.value));
6644	}
6645
6646	RValue<Pointer<Byte>> operator+=(const Pointer<Byte> &lhs, int offset)
6647	{
6648		return lhs = lhs + offset;
6649	}
6650
6651	RValue<Pointer<Byte>> operator+=(const Pointer<Byte> &lhs, RValue<Int> offset)
6652	{
6653		return lhs = lhs + offset;
6654	}
6655
6656	RValue<Pointer<Byte>> operator+=(const Pointer<Byte> &lhs, RValue<UInt> offset)
6657	{
6658		return lhs = lhs + offset;
6659	}
6660
6661	RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, int offset)
6662	{
6663		return lhs + -offset;
6664	}
6665
6666	RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<Int> offset)
6667	{
6668		return lhs + -offset;
6669	}
6670
6671	RValue<Pointer<Byte>> operator-(RValue<Pointer<Byte>> lhs, RValue<UInt> offset)
6672	{
6673		return lhs + -offset;
6674	}
6675
6676	RValue<Pointer<Byte>> operator-=(const Pointer<Byte> &lhs, int offset)
6677	{
6678		return lhs = lhs - offset;
6679	}
6680
6681	RValue<Pointer<Byte>> operator-=(const Pointer<Byte> &lhs, RValue<Int> offset)
6682	{
6683		return lhs = lhs - offset;
6684	}
6685
6686	RValue<Pointer<Byte>> operator-=(const Pointer<Byte> &lhs, RValue<UInt> offset)
6687	{
6688		return lhs = lhs - offset;
6689	}
6690
6691	void Return()
6692	{
6693		Nucleus::createRetVoid();
6694		Nucleus::setInsertBlock(Nucleus::createBasicBlock());
6695		Nucleus::createUnreachable();
6696	}
6697
6698	void Return(bool ret)
6699	{
6700		Nucleus::createRet(V(Nucleus::createConstantBool(ret)));
6701		Nucleus::setInsertBlock(Nucleus::createBasicBlock());
6702		Nucleus::createUnreachable();
6703	}
6704
6705	void Return(const Int &ret)
6706	{
6707		Nucleus::createRet(ret.loadValue());
6708		Nucleus::setInsertBlock(Nucleus::createBasicBlock());
6709		Nucleus::createUnreachable();
6710	}
6711
6712	bool branch(RValue<Bool> cmp, BasicBlock *bodyBB, BasicBlock *endBB)
6713	{
6714		Nucleus::createCondBr(cmp.value, bodyBB, endBB);
6715		Nucleus::setInsertBlock(bodyBB);
6716
6717		return true;
6718	}
6719
6720	void endIf(BasicBlock *falseBB)
6721	{
6722		::falseBB = falseBB;
6723	}
6724
6725	bool elseBlock(BasicBlock *falseBB)
6726	{
6727		assert(falseBB && "Else not preceded by If");
6728		falseBB->back().eraseFromParent();
6729		Nucleus::setInsertBlock(falseBB);
6730
6731		return true;
6732	}
6733
6734	BasicBlock *beginElse()
6735	{
6736		BasicBlock *falseBB = ::falseBB;
6737		::falseBB = nullptr;
6738
6739		return falseBB;
6740	}
6741
6742	RValue<Long> Ticks()
6743	{
6744		llvm::Function *rdtsc = Intrinsic::getDeclaration(::module, Intrinsic::readcyclecounter);
6745
6746		return RValue<Long>(V(::builder->CreateCall(rdtsc)));
6747	}
6748}
6749
6750namespace sw
6751{
6752	namespace x86
6753	{
6754		RValue<Int> cvtss2si(RValue<Float> val)
6755		{
6756			llvm::Function *cvtss2si = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cvtss2si);
6757
6758			Float4 vector;
6759			vector.x = val;
6760
6761			return RValue<Int>(V(::builder->CreateCall(cvtss2si, RValue<Float4>(vector).value)));
6762		}
6763
6764		RValue<Int2> cvtps2pi(RValue<Float4> val)
6765		{
6766			llvm::Function *cvtps2pi = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cvtps2pi);
6767
6768			return RValue<Int2>(V(::builder->CreateCall(cvtps2pi, val.value)));
6769		}
6770
6771		RValue<Int2> cvttps2pi(RValue<Float4> val)
6772		{
6773			llvm::Function *cvttps2pi = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cvttps2pi);
6774
6775			return RValue<Int2>(V(::builder->CreateCall(cvttps2pi, val.value)));
6776		}
6777
6778		RValue<Int4> cvtps2dq(RValue<Float4> val)
6779		{
6780			if(CPUID::supportsSSE2())
6781			{
6782				llvm::Function *cvtps2dq = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_cvtps2dq);
6783
6784				return RValue<Int4>(V(::builder->CreateCall(cvtps2dq, val.value)));
6785			}
6786			else
6787			{
6788				Int2 lo = x86::cvtps2pi(val);
6789				Int2 hi = x86::cvtps2pi(Swizzle(val, 0xEE));
6790
6791				return Int4(lo, hi);
6792			}
6793		}
6794
6795		RValue<Float> rcpss(RValue<Float> val)
6796		{
6797			llvm::Function *rcpss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rcp_ss);
6798
6799			Value *vector = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), val.value, 0);
6800
6801			return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(rcpss, vector)), Float::getType(), 0));
6802		}
6803
6804		RValue<Float> sqrtss(RValue<Float> val)
6805		{
6806			llvm::Function *sqrtss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_sqrt_ss);
6807
6808			Value *vector = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), val.value, 0);
6809
6810			return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(sqrtss, vector)), Float::getType(), 0));
6811		}
6812
6813		RValue<Float> rsqrtss(RValue<Float> val)
6814		{
6815			llvm::Function *rsqrtss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rsqrt_ss);
6816
6817			Value *vector = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), val.value, 0);
6818
6819			return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall(rsqrtss, vector)), Float::getType(), 0));
6820		}
6821
6822		RValue<Float4> rcpps(RValue<Float4> val)
6823		{
6824			llvm::Function *rcpps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rcp_ps);
6825
6826			return RValue<Float4>(V(::builder->CreateCall(rcpps, val.value)));
6827		}
6828
6829		RValue<Float4> sqrtps(RValue<Float4> val)
6830		{
6831			llvm::Function *sqrtps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_sqrt_ps);
6832
6833			return RValue<Float4>(V(::builder->CreateCall(sqrtps, val.value)));
6834		}
6835
6836		RValue<Float4> rsqrtps(RValue<Float4> val)
6837		{
6838			llvm::Function *rsqrtps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_rsqrt_ps);
6839
6840			return RValue<Float4>(V(::builder->CreateCall(rsqrtps, val.value)));
6841		}
6842
6843		RValue<Float4> maxps(RValue<Float4> x, RValue<Float4> y)
6844		{
6845			llvm::Function *maxps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_max_ps);
6846
6847			return RValue<Float4>(V(::builder->CreateCall2(maxps, x.value, y.value)));
6848		}
6849
6850		RValue<Float4> minps(RValue<Float4> x, RValue<Float4> y)
6851		{
6852			llvm::Function *minps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_min_ps);
6853
6854			return RValue<Float4>(V(::builder->CreateCall2(minps, x.value, y.value)));
6855		}
6856
6857		RValue<Float> roundss(RValue<Float> val, unsigned char imm)
6858		{
6859			llvm::Function *roundss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_round_ss);
6860
6861			Value *undef = V(UndefValue::get(Float4::getType()));
6862			Value *vector = Nucleus::createInsertElement(undef, val.value, 0);
6863
6864			return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall3(roundss, undef, vector, V(Nucleus::createConstantInt(imm)))), Float::getType(), 0));
6865		}
6866
6867		RValue<Float> floorss(RValue<Float> val)
6868		{
6869			return roundss(val, 1);
6870		}
6871
6872		RValue<Float> ceilss(RValue<Float> val)
6873		{
6874			return roundss(val, 2);
6875		}
6876
6877		RValue<Float4> roundps(RValue<Float4> val, unsigned char imm)
6878		{
6879			llvm::Function *roundps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_round_ps);
6880
6881			return RValue<Float4>(V(::builder->CreateCall2(roundps, val.value, V(Nucleus::createConstantInt(imm)))));
6882		}
6883
6884		RValue<Float4> floorps(RValue<Float4> val)
6885		{
6886			return roundps(val, 1);
6887		}
6888
6889		RValue<Float4> ceilps(RValue<Float4> val)
6890		{
6891			return roundps(val, 2);
6892		}
6893
6894		RValue<Float4> cmpps(RValue<Float4> x, RValue<Float4> y, unsigned char imm)
6895		{
6896			llvm::Function *cmpps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cmp_ps);
6897
6898			return RValue<Float4>(V(::builder->CreateCall3(cmpps, x.value, y.value, V(Nucleus::createConstantByte(imm)))));
6899		}
6900
6901		RValue<Float4> cmpeqps(RValue<Float4> x, RValue<Float4> y)
6902		{
6903			return cmpps(x, y, 0);
6904		}
6905
6906		RValue<Float4> cmpltps(RValue<Float4> x, RValue<Float4> y)
6907		{
6908			return cmpps(x, y, 1);
6909		}
6910
6911		RValue<Float4> cmpleps(RValue<Float4> x, RValue<Float4> y)
6912		{
6913			return cmpps(x, y, 2);
6914		}
6915
6916		RValue<Float4> cmpunordps(RValue<Float4> x, RValue<Float4> y)
6917		{
6918			return cmpps(x, y, 3);
6919		}
6920
6921		RValue<Float4> cmpneqps(RValue<Float4> x, RValue<Float4> y)
6922		{
6923			return cmpps(x, y, 4);
6924		}
6925
6926		RValue<Float4> cmpnltps(RValue<Float4> x, RValue<Float4> y)
6927		{
6928			return cmpps(x, y, 5);
6929		}
6930
6931		RValue<Float4> cmpnleps(RValue<Float4> x, RValue<Float4> y)
6932		{
6933			return cmpps(x, y, 6);
6934		}
6935
6936		RValue<Float4> cmpordps(RValue<Float4> x, RValue<Float4> y)
6937		{
6938			return cmpps(x, y, 7);
6939		}
6940
6941		RValue<Float> cmpss(RValue<Float> x, RValue<Float> y, unsigned char imm)
6942		{
6943			llvm::Function *cmpss = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_cmp_ss);
6944
6945			Value *vector1 = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), x.value, 0);
6946			Value *vector2 = Nucleus::createInsertElement(V(UndefValue::get(Float4::getType())), y.value, 0);
6947
6948			return RValue<Float>(Nucleus::createExtractElement(V(::builder->CreateCall3(cmpss, vector1, vector2, V(Nucleus::createConstantByte(imm)))), Float::getType(), 0));
6949		}
6950
6951		RValue<Float> cmpeqss(RValue<Float> x, RValue<Float> y)
6952		{
6953			return cmpss(x, y, 0);
6954		}
6955
6956		RValue<Float> cmpltss(RValue<Float> x, RValue<Float> y)
6957		{
6958			return cmpss(x, y, 1);
6959		}
6960
6961		RValue<Float> cmpless(RValue<Float> x, RValue<Float> y)
6962		{
6963			return cmpss(x, y, 2);
6964		}
6965
6966		RValue<Float> cmpunordss(RValue<Float> x, RValue<Float> y)
6967		{
6968			return cmpss(x, y, 3);
6969		}
6970
6971		RValue<Float> cmpneqss(RValue<Float> x, RValue<Float> y)
6972		{
6973			return cmpss(x, y, 4);
6974		}
6975
6976		RValue<Float> cmpnltss(RValue<Float> x, RValue<Float> y)
6977		{
6978			return cmpss(x, y, 5);
6979		}
6980
6981		RValue<Float> cmpnless(RValue<Float> x, RValue<Float> y)
6982		{
6983			return cmpss(x, y, 6);
6984		}
6985
6986		RValue<Float> cmpordss(RValue<Float> x, RValue<Float> y)
6987		{
6988			return cmpss(x, y, 7);
6989		}
6990
6991		RValue<Int4> pabsd(RValue<Int4> x)
6992		{
6993			llvm::Function *pabsd = Intrinsic::getDeclaration(::module, Intrinsic::x86_ssse3_pabs_d_128);
6994
6995			return RValue<Int4>(V(::builder->CreateCall(pabsd, x.value)));
6996		}
6997
6998		RValue<Short4> paddsw(RValue<Short4> x, RValue<Short4> y)
6999		{
7000			llvm::Function *paddsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padds_w);
7001
7002			return As<Short4>(V(::builder->CreateCall2(paddsw, As<MMX>(x).value, As<MMX>(y).value)));
7003		}
7004
7005		RValue<Short4> psubsw(RValue<Short4> x, RValue<Short4> y)
7006		{
7007			llvm::Function *psubsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubs_w);
7008
7009			return As<Short4>(V(::builder->CreateCall2(psubsw, As<MMX>(x).value, As<MMX>(y).value)));
7010		}
7011
7012		RValue<UShort4> paddusw(RValue<UShort4> x, RValue<UShort4> y)
7013		{
7014			llvm::Function *paddusw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_paddus_w);
7015
7016			return As<UShort4>(V(::builder->CreateCall2(paddusw, As<MMX>(x).value, As<MMX>(y).value)));
7017		}
7018
7019		RValue<UShort4> psubusw(RValue<UShort4> x, RValue<UShort4> y)
7020		{
7021			llvm::Function *psubusw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubus_w);
7022
7023			return As<UShort4>(V(::builder->CreateCall2(psubusw, As<MMX>(x).value, As<MMX>(y).value)));
7024		}
7025
7026		RValue<SByte8> paddsb(RValue<SByte8> x, RValue<SByte8> y)
7027		{
7028			llvm::Function *paddsb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padds_b);
7029
7030			return As<SByte8>(V(::builder->CreateCall2(paddsb, As<MMX>(x).value, As<MMX>(y).value)));
7031		}
7032
7033		RValue<SByte8> psubsb(RValue<SByte8> x, RValue<SByte8> y)
7034		{
7035			llvm::Function *psubsb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubs_b);
7036
7037			return As<SByte8>(V(::builder->CreateCall2(psubsb, As<MMX>(x).value, As<MMX>(y).value)));
7038		}
7039
7040		RValue<Byte8> paddusb(RValue<Byte8> x, RValue<Byte8> y)
7041		{
7042			llvm::Function *paddusb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_paddus_b);
7043
7044			return As<Byte8>(V(::builder->CreateCall2(paddusb, As<MMX>(x).value, As<MMX>(y).value)));
7045		}
7046
7047		RValue<Byte8> psubusb(RValue<Byte8> x, RValue<Byte8> y)
7048		{
7049			llvm::Function *psubusb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psubus_b);
7050
7051			return As<Byte8>(V(::builder->CreateCall2(psubusb, As<MMX>(x).value, As<MMX>(y).value)));
7052		}
7053
7054		RValue<Short4> paddw(RValue<Short4> x, RValue<Short4> y)
7055		{
7056			llvm::Function *paddw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padd_w);
7057
7058			return As<Short4>(V(::builder->CreateCall2(paddw, As<MMX>(x).value, As<MMX>(y).value)));
7059		}
7060
7061		RValue<Short4> psubw(RValue<Short4> x, RValue<Short4> y)
7062		{
7063			llvm::Function *psubw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psub_w);
7064
7065			return As<Short4>(V(::builder->CreateCall2(psubw, As<MMX>(x).value, As<MMX>(y).value)));
7066		}
7067
7068		RValue<Short4> pmullw(RValue<Short4> x, RValue<Short4> y)
7069		{
7070			llvm::Function *pmullw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmull_w);
7071
7072			return As<Short4>(V(::builder->CreateCall2(pmullw, As<MMX>(x).value, As<MMX>(y).value)));
7073		}
7074
7075		RValue<Short4> pand(RValue<Short4> x, RValue<Short4> y)
7076		{
7077			llvm::Function *pand = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pand);
7078
7079			return As<Short4>(V(::builder->CreateCall2(pand, As<MMX>(x).value, As<MMX>(y).value)));
7080		}
7081
7082		RValue<Short4> por(RValue<Short4> x, RValue<Short4> y)
7083		{
7084			llvm::Function *por = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_por);
7085
7086			return As<Short4>(V(::builder->CreateCall2(por, As<MMX>(x).value, As<MMX>(y).value)));
7087		}
7088
7089		RValue<Short4> pxor(RValue<Short4> x, RValue<Short4> y)
7090		{
7091			llvm::Function *pxor = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pxor);
7092
7093			return As<Short4>(V(::builder->CreateCall2(pxor, As<MMX>(x).value, As<MMX>(y).value)));
7094		}
7095
7096		RValue<Short4> pshufw(RValue<Short4> x, unsigned char y)
7097		{
7098			llvm::Function *pshufw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_pshuf_w);
7099
7100			return As<Short4>(V(::builder->CreateCall2(pshufw, As<MMX>(x).value, V(Nucleus::createConstantByte(y)))));
7101		}
7102
7103		RValue<Int2> punpcklwd(RValue<Short4> x, RValue<Short4> y)
7104		{
7105			llvm::Function *punpcklwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpcklwd);
7106
7107			return As<Int2>(V(::builder->CreateCall2(punpcklwd, As<MMX>(x).value, As<MMX>(y).value)));
7108		}
7109
7110		RValue<Int2> punpckhwd(RValue<Short4> x, RValue<Short4> y)
7111		{
7112			llvm::Function *punpckhwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckhwd);
7113
7114			return As<Int2>(V(::builder->CreateCall2(punpckhwd, As<MMX>(x).value, As<MMX>(y).value)));
7115		}
7116
7117		RValue<Short4> pinsrw(RValue<Short4> x, RValue<Int> y, unsigned int i)
7118		{
7119			llvm::Function *pinsrw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pinsr_w);
7120
7121			return As<Short4>(V(::builder->CreateCall3(pinsrw, As<MMX>(x).value, y.value, V(Nucleus::createConstantInt(i)))));
7122		}
7123
7124		RValue<Int> pextrw(RValue<Short4> x, unsigned int i)
7125		{
7126			llvm::Function *pextrw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pextr_w);
7127
7128			return RValue<Int>(V(::builder->CreateCall2(pextrw, As<MMX>(x).value, V(Nucleus::createConstantInt(i)))));
7129		}
7130
7131		RValue<Long1> punpckldq(RValue<Int2> x, RValue<Int2> y)
7132		{
7133			llvm::Function *punpckldq = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckldq);
7134
7135			return As<Long1>(V(::builder->CreateCall2(punpckldq, As<MMX>(x).value, As<MMX>(y).value)));
7136		}
7137
7138		RValue<Long1> punpckhdq(RValue<Int2> x, RValue<Int2> y)
7139		{
7140			llvm::Function *punpckhdq = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckhdq);
7141
7142			return As<Long1>(V(::builder->CreateCall2(punpckhdq, As<MMX>(x).value, As<MMX>(y).value)));
7143		}
7144
7145		RValue<Short4> punpcklbw(RValue<Byte8> x, RValue<Byte8> y)
7146		{
7147			llvm::Function *punpcklbw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpcklbw);
7148
7149			return As<Short4>(V(::builder->CreateCall2(punpcklbw, As<MMX>(x).value, As<MMX>(y).value)));
7150		}
7151
7152		RValue<Short4> punpckhbw(RValue<Byte8> x, RValue<Byte8> y)
7153		{
7154			llvm::Function *punpckhbw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_punpckhbw);
7155
7156			return As<Short4>(V(::builder->CreateCall2(punpckhbw, As<MMX>(x).value, As<MMX>(y).value)));
7157		}
7158
7159		RValue<Byte8> paddb(RValue<Byte8> x, RValue<Byte8> y)
7160		{
7161			llvm::Function *paddb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padd_b);
7162
7163			return As<Byte8>(V(::builder->CreateCall2(paddb, As<MMX>(x).value, As<MMX>(y).value)));
7164		}
7165
7166		RValue<Byte8> psubb(RValue<Byte8> x, RValue<Byte8> y)
7167		{
7168			llvm::Function *psubb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psub_b);
7169
7170			return As<Byte8>(V(::builder->CreateCall2(psubb, As<MMX>(x).value, As<MMX>(y).value)));
7171		}
7172
7173		RValue<Int2> paddd(RValue<Int2> x, RValue<Int2> y)
7174		{
7175			llvm::Function *paddd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_padd_d);
7176
7177			return As<Int2>(V(::builder->CreateCall2(paddd, As<MMX>(x).value, As<MMX>(y).value)));
7178		}
7179
7180		RValue<Int2> psubd(RValue<Int2> x, RValue<Int2> y)
7181		{
7182			llvm::Function *psubd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psub_d);
7183
7184			return As<Int2>(V(::builder->CreateCall2(psubd, As<MMX>(x).value, As<MMX>(y).value)));
7185		}
7186
7187		RValue<UShort4> pavgw(RValue<UShort4> x, RValue<UShort4> y)
7188		{
7189			llvm::Function *pavgw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pavg_w);
7190
7191			return As<UShort4>(V(::builder->CreateCall2(pavgw, As<MMX>(x).value, As<MMX>(y).value)));
7192		}
7193
7194		RValue<Short4> pmaxsw(RValue<Short4> x, RValue<Short4> y)
7195		{
7196			llvm::Function *pmaxsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmaxs_w);
7197
7198			return As<Short4>(V(::builder->CreateCall2(pmaxsw, As<MMX>(x).value, As<MMX>(y).value)));
7199		}
7200
7201		RValue<Short4> pminsw(RValue<Short4> x, RValue<Short4> y)
7202		{
7203			llvm::Function *pminsw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmins_w);
7204
7205			return As<Short4>(V(::builder->CreateCall2(pminsw, As<MMX>(x).value, As<MMX>(y).value)));
7206		}
7207
7208		RValue<Short4> pcmpgtw(RValue<Short4> x, RValue<Short4> y)
7209		{
7210			llvm::Function *pcmpgtw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpgt_w);
7211
7212			return As<Short4>(V(::builder->CreateCall2(pcmpgtw, As<MMX>(x).value, As<MMX>(y).value)));
7213		}
7214
7215		RValue<Short4> pcmpeqw(RValue<Short4> x, RValue<Short4> y)
7216		{
7217			llvm::Function *pcmpeqw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpeq_w);
7218
7219			return As<Short4>(V(::builder->CreateCall2(pcmpeqw, As<MMX>(x).value, As<MMX>(y).value)));
7220		}
7221
7222		RValue<Byte8> pcmpgtb(RValue<SByte8> x, RValue<SByte8> y)
7223		{
7224			llvm::Function *pcmpgtb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpgt_b);
7225
7226			return As<Byte8>(V(::builder->CreateCall2(pcmpgtb, As<MMX>(x).value, As<MMX>(y).value)));
7227		}
7228
7229		RValue<Byte8> pcmpeqb(RValue<Byte8> x, RValue<Byte8> y)
7230		{
7231			llvm::Function *pcmpeqb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pcmpeq_b);
7232
7233			return As<Byte8>(V(::builder->CreateCall2(pcmpeqb, As<MMX>(x).value, As<MMX>(y).value)));
7234		}
7235
7236		RValue<Short4> packssdw(RValue<Int2> x, RValue<Int2> y)
7237		{
7238			llvm::Function *packssdw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_packssdw);
7239
7240			return As<Short4>(V(::builder->CreateCall2(packssdw, As<MMX>(x).value, As<MMX>(y).value)));
7241		}
7242
7243		RValue<Short8> packssdw(RValue<Int4> x, RValue<Int4> y)
7244		{
7245			if(CPUID::supportsSSE2())
7246			{
7247				llvm::Function *packssdw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_packssdw_128);
7248
7249				return RValue<Short8>(V(::builder->CreateCall2(packssdw, x.value, y.value)));
7250			}
7251			else
7252			{
7253				Int2 loX = Int2(x);
7254				Int2 hiX = Int2(Swizzle(x, 0xEE));
7255
7256				Int2 loY = Int2(y);
7257				Int2 hiY = Int2(Swizzle(y, 0xEE));
7258
7259				Short4 lo = x86::packssdw(loX, hiX);
7260				Short4 hi = x86::packssdw(loY, hiY);
7261
7262				return Short8(lo, hi);
7263			}
7264		}
7265
7266		RValue<SByte8> packsswb(RValue<Short4> x, RValue<Short4> y)
7267		{
7268			llvm::Function *packsswb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_packsswb);
7269
7270			return As<SByte8>(V(::builder->CreateCall2(packsswb, As<MMX>(x).value, As<MMX>(y).value)));
7271		}
7272
7273		RValue<Byte8> packuswb(RValue<UShort4> x, RValue<UShort4> y)
7274		{
7275			llvm::Function *packuswb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_packuswb);
7276
7277			return As<Byte8>(V(::builder->CreateCall2(packuswb, As<MMX>(x).value, As<MMX>(y).value)));
7278		}
7279
7280		RValue<UShort8> packusdw(RValue<UInt4> x, RValue<UInt4> y)
7281		{
7282			if(CPUID::supportsSSE4_1())
7283			{
7284				llvm::Function *packusdw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_packusdw);
7285
7286				return RValue<UShort8>(V(::builder->CreateCall2(packusdw, x.value, y.value)));
7287			}
7288			else
7289			{
7290				// FIXME: Not an exact replacement!
7291				return As<UShort8>(packssdw(As<Int4>(x - UInt4(0x00008000, 0x00008000, 0x00008000, 0x00008000)), As<Int4>(y - UInt4(0x00008000, 0x00008000, 0x00008000, 0x00008000))) + Short8(0x8000u, 0x8000u, 0x8000u, 0x8000u, 0x8000u, 0x8000u, 0x8000u, 0x8000u));
7292			}
7293		}
7294
7295		RValue<UShort4> psrlw(RValue<UShort4> x, unsigned char y)
7296		{
7297			llvm::Function *psrlw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrli_w);
7298
7299			return As<UShort4>(V(::builder->CreateCall2(psrlw, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7300		}
7301
7302		RValue<UShort8> psrlw(RValue<UShort8> x, unsigned char y)
7303		{
7304			llvm::Function *psrlw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrli_w);
7305
7306			return RValue<UShort8>(V(::builder->CreateCall2(psrlw, x.value, V(Nucleus::createConstantInt(y)))));
7307		}
7308
7309		RValue<Short4> psraw(RValue<Short4> x, unsigned char y)
7310		{
7311			llvm::Function *psraw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrai_w);
7312
7313			return As<Short4>(V(::builder->CreateCall2(psraw, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7314		}
7315
7316		RValue<Short8> psraw(RValue<Short8> x, unsigned char y)
7317		{
7318			llvm::Function *psraw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrai_w);
7319
7320			return RValue<Short8>(V(::builder->CreateCall2(psraw, x.value, V(Nucleus::createConstantInt(y)))));
7321		}
7322
7323		RValue<Short4> psllw(RValue<Short4> x, unsigned char y)
7324		{
7325			llvm::Function *psllw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pslli_w);
7326
7327			return As<Short4>(V(::builder->CreateCall2(psllw, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7328		}
7329
7330		RValue<Short8> psllw(RValue<Short8> x, unsigned char y)
7331		{
7332			llvm::Function *psllw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pslli_w);
7333
7334			return RValue<Short8>(V(::builder->CreateCall2(psllw, x.value, V(Nucleus::createConstantInt(y)))));
7335		}
7336
7337		RValue<Int2> pslld(RValue<Int2> x, unsigned char y)
7338		{
7339			llvm::Function *pslld = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pslli_d);
7340
7341			return As<Int2>(V(::builder->CreateCall2(pslld, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7342		}
7343
7344		RValue<Int4> pslld(RValue<Int4> x, unsigned char y)
7345		{
7346			if(CPUID::supportsSSE2())
7347			{
7348				llvm::Function *pslld = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pslli_d);
7349
7350				return RValue<Int4>(V(::builder->CreateCall2(pslld, x.value, V(Nucleus::createConstantInt(y)))));
7351			}
7352			else
7353			{
7354				Int2 lo = Int2(x);
7355				Int2 hi = Int2(Swizzle(x, 0xEE));
7356
7357				lo = x86::pslld(lo, y);
7358				hi = x86::pslld(hi, y);
7359
7360				return Int4(lo, hi);
7361			}
7362		}
7363
7364		RValue<Int2> psrad(RValue<Int2> x, unsigned char y)
7365		{
7366			llvm::Function *psrad = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrai_d);
7367
7368			return As<Int2>(V(::builder->CreateCall2(psrad, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7369		}
7370
7371		RValue<Int4> psrad(RValue<Int4> x, unsigned char y)
7372		{
7373			if(CPUID::supportsSSE2())
7374			{
7375				llvm::Function *psrad = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrai_d);
7376
7377				return RValue<Int4>(V(::builder->CreateCall2(psrad, x.value, V(Nucleus::createConstantInt(y)))));
7378			}
7379			else
7380			{
7381				Int2 lo = Int2(x);
7382				Int2 hi = Int2(Swizzle(x, 0xEE));
7383
7384				lo = x86::psrad(lo, y);
7385				hi = x86::psrad(hi, y);
7386
7387				return Int4(lo, hi);
7388			}
7389		}
7390
7391		RValue<UInt2> psrld(RValue<UInt2> x, unsigned char y)
7392		{
7393			llvm::Function *psrld = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrli_d);
7394
7395			return As<UInt2>(V(::builder->CreateCall2(psrld, As<MMX>(x).value, V(Nucleus::createConstantInt(y)))));
7396		}
7397
7398		RValue<UInt4> psrld(RValue<UInt4> x, unsigned char y)
7399		{
7400			if(CPUID::supportsSSE2())
7401			{
7402				llvm::Function *psrld = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_psrli_d);
7403
7404				return RValue<UInt4>(V(::builder->CreateCall2(psrld, x.value, V(Nucleus::createConstantInt(y)))));
7405			}
7406			else
7407			{
7408				UInt2 lo = As<UInt2>(Int2(As<Int4>(x)));
7409				UInt2 hi = As<UInt2>(Int2(Swizzle(As<Int4>(x), 0xEE)));
7410
7411				lo = x86::psrld(lo, y);
7412				hi = x86::psrld(hi, y);
7413
7414				return UInt4(lo, hi);
7415			}
7416		}
7417
7418		RValue<UShort4> psrlw(RValue<UShort4> x, RValue<Long1> y)
7419		{
7420			llvm::Function *psrlw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrl_w);
7421
7422			return As<UShort4>(V(::builder->CreateCall2(psrlw, As<MMX>(x).value, As<MMX>(y).value)));
7423		}
7424
7425		RValue<Short4> psraw(RValue<Short4> x, RValue<Long1> y)
7426		{
7427			llvm::Function *psraw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psra_w);
7428
7429			return As<Short4>(V(::builder->CreateCall2(psraw, As<MMX>(x).value, As<MMX>(y).value)));
7430		}
7431
7432		RValue<Short4> psllw(RValue<Short4> x, RValue<Long1> y)
7433		{
7434			llvm::Function *psllw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psll_w);
7435
7436			return As<Short4>(V(::builder->CreateCall2(psllw, As<MMX>(x).value, As<MMX>(y).value)));
7437		}
7438
7439		RValue<Int2> pslld(RValue<Int2> x, RValue<Long1> y)
7440		{
7441			llvm::Function *pslld = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psll_d);
7442
7443			return As<Int2>(V(::builder->CreateCall2(pslld, As<MMX>(x).value, As<MMX>(y).value)));
7444		}
7445
7446		RValue<UInt2> psrld(RValue<UInt2> x, RValue<Long1> y)
7447		{
7448			llvm::Function *psrld = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psrl_d);
7449
7450			return As<UInt2>(V(::builder->CreateCall2(psrld, As<MMX>(x).value, As<MMX>(y).value)));
7451		}
7452
7453		RValue<Int2> psrad(RValue<Int2> x, RValue<Long1> y)
7454		{
7455			llvm::Function *psrld = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_psra_d);
7456
7457			return As<Int2>(V(::builder->CreateCall2(psrld, As<MMX>(x).value, As<MMX>(y).value)));
7458		}
7459
7460		RValue<Int4> pmaxsd(RValue<Int4> x, RValue<Int4> y)
7461		{
7462			llvm::Function *pmaxsd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmaxsd);
7463
7464			return RValue<Int4>(V(::builder->CreateCall2(pmaxsd, x.value, y.value)));
7465		}
7466
7467		RValue<Int4> pminsd(RValue<Int4> x, RValue<Int4> y)
7468		{
7469			llvm::Function *pminsd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pminsd);
7470
7471			return RValue<Int4>(V(::builder->CreateCall2(pminsd, x.value, y.value)));
7472		}
7473
7474		RValue<UInt4> pmaxud(RValue<UInt4> x, RValue<UInt4> y)
7475		{
7476			llvm::Function *pmaxud = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmaxud);
7477
7478			return RValue<UInt4>(V(::builder->CreateCall2(pmaxud, x.value, y.value)));
7479		}
7480
7481		RValue<UInt4> pminud(RValue<UInt4> x, RValue<UInt4> y)
7482		{
7483			llvm::Function *pminud = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pminud);
7484
7485			return RValue<UInt4>(V(::builder->CreateCall2(pminud, x.value, y.value)));
7486		}
7487
7488		RValue<Short4> pmulhw(RValue<Short4> x, RValue<Short4> y)
7489		{
7490			llvm::Function *pmulhw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmulh_w);
7491
7492			return As<Short4>(V(::builder->CreateCall2(pmulhw, As<MMX>(x).value, As<MMX>(y).value)));
7493		}
7494
7495		RValue<UShort4> pmulhuw(RValue<UShort4> x, RValue<UShort4> y)
7496		{
7497			llvm::Function *pmulhuw = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmulhu_w);
7498
7499			return As<UShort4>(V(::builder->CreateCall2(pmulhuw, As<MMX>(x).value, As<MMX>(y).value)));
7500		}
7501
7502		RValue<Int2> pmaddwd(RValue<Short4> x, RValue<Short4> y)
7503		{
7504			llvm::Function *pmaddwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmadd_wd);
7505
7506			return As<Int2>(V(::builder->CreateCall2(pmaddwd, As<MMX>(x).value, As<MMX>(y).value)));
7507		}
7508
7509		RValue<Short8> pmulhw(RValue<Short8> x, RValue<Short8> y)
7510		{
7511			llvm::Function *pmulhw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pmulh_w);
7512
7513			return RValue<Short8>(V(::builder->CreateCall2(pmulhw, x.value, y.value)));
7514		}
7515
7516		RValue<UShort8> pmulhuw(RValue<UShort8> x, RValue<UShort8> y)
7517		{
7518			llvm::Function *pmulhuw = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pmulhu_w);
7519
7520			return RValue<UShort8>(V(::builder->CreateCall2(pmulhuw, x.value, y.value)));
7521		}
7522
7523		RValue<Int4> pmaddwd(RValue<Short8> x, RValue<Short8> y)
7524		{
7525			llvm::Function *pmaddwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse2_pmadd_wd);
7526
7527			return RValue<Int4>(V(::builder->CreateCall2(pmaddwd, x.value, y.value)));
7528		}
7529
7530		RValue<Int> movmskps(RValue<Float4> x)
7531		{
7532			llvm::Function *movmskps = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse_movmsk_ps);
7533
7534			return RValue<Int>(V(::builder->CreateCall(movmskps, x.value)));
7535		}
7536
7537		RValue<Int> pmovmskb(RValue<Byte8> x)
7538		{
7539			llvm::Function *pmovmskb = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_pmovmskb);
7540
7541			return RValue<Int>(V(::builder->CreateCall(pmovmskb, As<MMX>(x).value)));
7542		}
7543
7544		//RValue<Int2> movd(RValue<Pointer<Int>> x)
7545		//{
7546		//	Value *element = Nucleus::createLoad(x.value);
7547
7548		////	Value *int2 = UndefValue::get(Int2::getType());
7549		////	int2 = Nucleus::createInsertElement(int2, element, ConstantInt::get(Int::getType(), 0));
7550
7551		//	Value *int2 = Nucleus::createBitCast(Nucleus::createZExt(element, Long::getType()), Int2::getType());
7552
7553		//	return RValue<Int2>(int2);
7554		//}
7555
7556		//RValue<Int2> movdq2q(RValue<Int4> x)
7557		//{
7558		//	Value *long2 = Nucleus::createBitCast(x.value, T(VectorType::get(Long::getType(), 2)));
7559		//	Value *element = Nucleus::createExtractElement(long2, ConstantInt::get(Int::getType(), 0));
7560
7561		//	return RValue<Int2>(Nucleus::createBitCast(element, Int2::getType()));
7562		//}
7563
7564		RValue<Int4> pmovzxbd(RValue<Int4> x)
7565		{
7566			llvm::Function *pmovzxbd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovzxbd);
7567
7568			return RValue<Int4>(V(::builder->CreateCall(pmovzxbd, Nucleus::createBitCast(x.value, Byte16::getType()))));
7569		}
7570
7571		RValue<Int4> pmovsxbd(RValue<Int4> x)
7572		{
7573			llvm::Function *pmovsxbd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovsxbd);
7574
7575			return RValue<Int4>(V(::builder->CreateCall(pmovsxbd, Nucleus::createBitCast(x.value, SByte16::getType()))));
7576		}
7577
7578		RValue<Int4> pmovzxwd(RValue<Int4> x)
7579		{
7580			llvm::Function *pmovzxwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovzxwd);
7581
7582			return RValue<Int4>(V(::builder->CreateCall(pmovzxwd, Nucleus::createBitCast(x.value, UShort8::getType()))));
7583		}
7584
7585		RValue<Int4> pmovsxwd(RValue<Int4> x)
7586		{
7587			llvm::Function *pmovsxwd = Intrinsic::getDeclaration(::module, Intrinsic::x86_sse41_pmovsxwd);
7588
7589			return RValue<Int4>(V(::builder->CreateCall(pmovsxwd, Nucleus::createBitCast(x.value, Short8::getType()))));
7590		}
7591
7592		void emms()
7593		{
7594			llvm::Function *emms = Intrinsic::getDeclaration(::module, Intrinsic::x86_mmx_emms);
7595
7596			V(::builder->CreateCall(emms));
7597		}
7598	}
7599}
7600