blend_jit.cpp revision cee66dd2aa182ba30130bef3298444667753b051
1/****************************************************************************
2* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3*
4* Permission is hereby granted, free of charge, to any person obtaining a
5* copy of this software and associated documentation files (the "Software"),
6* to deal in the Software without restriction, including without limitation
7* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8* and/or sell copies of the Software, and to permit persons to whom the
9* Software is furnished to do so, subject to the following conditions:
10*
11* The above copyright notice and this permission notice (including the next
12* paragraph) shall be included in all copies or substantial portions of the
13* Software.
14*
15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21* IN THE SOFTWARE.
22*
23* @file blend_jit.cpp
24*
25* @brief Implementation of the blend jitter
26*
27* Notes:
28*
29******************************************************************************/
30#include "jit_api.h"
31#include "blend_jit.h"
32#include "builder.h"
33#include "state_llvm.h"
34
35#include <sstream>
36
37// components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized
38#define QUANTIZE_THRESHOLD 2
39
40using namespace llvm;
41using namespace SwrJit;
42
43//////////////////////////////////////////////////////////////////////////
44/// Interface to Jitting a blend shader
45//////////////////////////////////////////////////////////////////////////
46struct BlendJit : public Builder
47{
48    BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){};
49
50    template<bool Color, bool Alpha>
51    void GenerateBlendFactor(SWR_BLEND_FACTOR factor, Value* constColor[4], Value* src[4], Value* src1[4], Value* dst[4], Value* result[4])
52    {
53        Value* out[4];
54
55        switch (factor)
56        {
57        case BLENDFACTOR_ONE:
58            out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f);
59            break;
60        case BLENDFACTOR_SRC_COLOR:
61            out[0] = src[0];
62            out[1] = src[1];
63            out[2] = src[2];
64            out[3] = src[3];
65            break;
66        case BLENDFACTOR_SRC_ALPHA:
67            out[0] = out[1] = out[2] = out[3] = src[3];
68            break;
69        case BLENDFACTOR_DST_ALPHA:
70            out[0] = out[1] = out[2] = out[3] = dst[3];
71            break;
72        case BLENDFACTOR_DST_COLOR:
73            out[0] = dst[0];
74            out[1] = dst[1];
75            out[2] = dst[2];
76            out[3] = dst[3];
77            break;
78        case BLENDFACTOR_SRC_ALPHA_SATURATE:
79            out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3]));
80            out[3] = VIMMED1(1.0f);
81            break;
82        case BLENDFACTOR_CONST_COLOR:
83            out[0] = constColor[0];
84            out[1] = constColor[1];
85            out[2] = constColor[2];
86            out[3] = constColor[3];
87            break;
88        case BLENDFACTOR_CONST_ALPHA:
89            out[0] = out[1] = out[2] = out[3] = constColor[3];
90            break;
91        case BLENDFACTOR_SRC1_COLOR:
92            out[0] = src1[0];
93            out[1] = src1[1];
94            out[2] = src1[2];
95            out[3] = src1[3];
96            break;
97        case BLENDFACTOR_SRC1_ALPHA:
98            out[0] = out[1] = out[2] = out[3] = src1[3];
99            break;
100        case BLENDFACTOR_ZERO:
101            out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
102            break;
103        case BLENDFACTOR_INV_SRC_COLOR:
104            out[0] = FSUB(VIMMED1(1.0f), src[0]);
105            out[1] = FSUB(VIMMED1(1.0f), src[1]);
106            out[2] = FSUB(VIMMED1(1.0f), src[2]);
107            out[3] = FSUB(VIMMED1(1.0f), src[3]);
108            break;
109        case BLENDFACTOR_INV_SRC_ALPHA:
110            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]);
111            break;
112        case BLENDFACTOR_INV_DST_ALPHA:
113            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]);
114            break;
115        case BLENDFACTOR_INV_DST_COLOR:
116            out[0] = FSUB(VIMMED1(1.0f), dst[0]);
117            out[1] = FSUB(VIMMED1(1.0f), dst[1]);
118            out[2] = FSUB(VIMMED1(1.0f), dst[2]);
119            out[3] = FSUB(VIMMED1(1.0f), dst[3]);
120            break;
121        case BLENDFACTOR_INV_CONST_COLOR:
122            out[0] = FSUB(VIMMED1(1.0f), constColor[0]);
123            out[1] = FSUB(VIMMED1(1.0f), constColor[1]);
124            out[2] = FSUB(VIMMED1(1.0f), constColor[2]);
125            out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
126            break;
127        case BLENDFACTOR_INV_CONST_ALPHA:
128            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
129            break;
130        case BLENDFACTOR_INV_SRC1_COLOR:
131            out[0] = FSUB(VIMMED1(1.0f), src1[0]);
132            out[1] = FSUB(VIMMED1(1.0f), src1[1]);
133            out[2] = FSUB(VIMMED1(1.0f), src1[2]);
134            out[3] = FSUB(VIMMED1(1.0f), src1[3]);
135            break;
136        case BLENDFACTOR_INV_SRC1_ALPHA:
137            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]);
138            break;
139        default:
140            SWR_ASSERT(false, "Unsupported blend factor: %d", factor);
141            out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
142            break;
143        }
144
145        if (Color)
146        {
147            result[0] = out[0];
148            result[1] = out[1];
149            result[2] = out[2];
150        }
151
152        if (Alpha)
153        {
154            result[3] = out[3];
155        }
156    }
157
158    void Clamp(SWR_FORMAT format, Value* src[4])
159    {
160        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
161        SWR_TYPE type = info.type[0];
162
163        switch (type)
164        {
165        case SWR_TYPE_FLOAT:
166            break;
167
168        case SWR_TYPE_UNORM:
169            src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f));
170            src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f));
171            src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f));
172            src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f));
173            break;
174
175        case SWR_TYPE_SNORM:
176            src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f));
177            src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f));
178            src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f));
179            src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f));
180            break;
181
182        default: SWR_ASSERT(false, "Unsupport format type: %d", type);
183        }
184    }
185
186    void ApplyDefaults(SWR_FORMAT format, Value* src[4])
187    {
188        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
189
190        bool valid[] = { false, false, false, false };
191        for (uint32_t c = 0; c < info.numComps; ++c)
192        {
193            valid[info.swizzle[c]] = true;
194        }
195
196        for (uint32_t c = 0; c < 4; ++c)
197        {
198            if (!valid[c])
199            {
200                src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty);
201            }
202        }
203    }
204
205    void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4])
206    {
207        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
208
209        for (uint32_t c = 0; c < info.numComps; ++c)
210        {
211            if (info.type[c] == SWR_TYPE_UNUSED)
212            {
213                src[info.swizzle[c]] = BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);
214            }
215        }
216    }
217
218    void Quantize(SWR_FORMAT format, Value* src[4])
219    {
220        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
221        for (uint32_t c = 0; c < info.numComps; ++c)
222        {
223            if (info.bpc[c] <= QUANTIZE_THRESHOLD)
224            {
225                uint32_t swizComp = info.swizzle[c];
226                float factor = (float)((1 << info.bpc[c]) - 1);
227                switch (info.type[c])
228                {
229                case SWR_TYPE_UNORM:
230                    src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f));
231                    src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO));
232                    src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f /factor));
233                    break;
234                default: SWR_ASSERT(false, "Unsupported format type: %d", info.type[c]);
235                }
236            }
237        }
238    }
239
240    template<bool Color, bool Alpha>
241    void BlendFunc(SWR_BLEND_OP blendOp, Value* src[4], Value* srcFactor[4], Value* dst[4], Value* dstFactor[4], Value* result[4])
242    {
243        Value* out[4];
244        Value* srcBlend[4];
245        Value* dstBlend[4];
246        for (uint32_t i = 0; i < 4; ++i)
247        {
248            srcBlend[i] = FMUL(src[i], srcFactor[i]);
249            dstBlend[i] = FMUL(dst[i], dstFactor[i]);
250        }
251
252        switch (blendOp)
253        {
254        case BLENDOP_ADD:
255            out[0] = FADD(srcBlend[0], dstBlend[0]);
256            out[1] = FADD(srcBlend[1], dstBlend[1]);
257            out[2] = FADD(srcBlend[2], dstBlend[2]);
258            out[3] = FADD(srcBlend[3], dstBlend[3]);
259            break;
260
261        case BLENDOP_SUBTRACT:
262            out[0] = FSUB(srcBlend[0], dstBlend[0]);
263            out[1] = FSUB(srcBlend[1], dstBlend[1]);
264            out[2] = FSUB(srcBlend[2], dstBlend[2]);
265            out[3] = FSUB(srcBlend[3], dstBlend[3]);
266            break;
267
268        case BLENDOP_REVSUBTRACT:
269            out[0] = FSUB(dstBlend[0], srcBlend[0]);
270            out[1] = FSUB(dstBlend[1], srcBlend[1]);
271            out[2] = FSUB(dstBlend[2], srcBlend[2]);
272            out[3] = FSUB(dstBlend[3], srcBlend[3]);
273            break;
274
275        case BLENDOP_MIN:
276            out[0] = VMINPS(src[0], dst[0]);
277            out[1] = VMINPS(src[1], dst[1]);
278            out[2] = VMINPS(src[2], dst[2]);
279            out[3] = VMINPS(src[3], dst[3]);
280            break;
281
282        case BLENDOP_MAX:
283            out[0] = VMAXPS(src[0], dst[0]);
284            out[1] = VMAXPS(src[1], dst[1]);
285            out[2] = VMAXPS(src[2], dst[2]);
286            out[3] = VMAXPS(src[3], dst[3]);
287            break;
288
289        default:
290            SWR_ASSERT(false, "Unsupported blend operation: %d", blendOp);
291            out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
292            break;
293        }
294
295        if (Color)
296        {
297            result[0] = out[0];
298            result[1] = out[1];
299            result[2] = out[2];
300        }
301
302        if (Alpha)
303        {
304            result[3] = out[3];
305        }
306    }
307
308    void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4])
309    {
310        // Op: (s == PS output, d = RT contents)
311        switch(logicOp)
312        {
313        case LOGICOP_CLEAR:
314            result[0] = VIMMED1(0);
315            result[1] = VIMMED1(0);
316            result[2] = VIMMED1(0);
317            result[3] = VIMMED1(0);
318            break;
319
320        case LOGICOP_NOR:
321            // ~(s | d)
322            result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
323            result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
324            result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
325            result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
326            break;
327
328        case LOGICOP_AND_INVERTED:
329            // ~s & d
330            // todo: use avx andnot instr when I can find the intrinsic to call
331            result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
332            result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
333            result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
334            result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
335            break;
336
337        case LOGICOP_COPY_INVERTED:
338            // ~s
339            result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF));
340            result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF));
341            result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF));
342            result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF));
343            break;
344
345        case LOGICOP_AND_REVERSE:
346            // s & ~d
347            // todo: use avx andnot instr when I can find the intrinsic to call
348            result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
349            result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
350            result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
351            result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
352            break;
353
354        case LOGICOP_INVERT:
355            // ~d
356            result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF));
357            result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF));
358            result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF));
359            result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF));
360            break;
361
362        case LOGICOP_XOR:
363            // s ^ d
364            result[0] = XOR(src[0], dst[0]);
365            result[1] = XOR(src[1], dst[1]);
366            result[2] = XOR(src[2], dst[2]);
367            result[3] = XOR(src[3], dst[3]);
368            break;
369
370        case LOGICOP_NAND:
371            // ~(s & d)
372            result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
373            result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
374            result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
375            result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
376            break;
377
378        case LOGICOP_AND:
379            // s & d
380            result[0] = AND(src[0], dst[0]);
381            result[1] = AND(src[1], dst[1]);
382            result[2] = AND(src[2], dst[2]);
383            result[3] = AND(src[3], dst[3]);
384            break;
385
386        case LOGICOP_EQUIV:
387            // ~(s ^ d)
388            result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
389            result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
390            result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
391            result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
392            break;
393
394        case LOGICOP_NOOP:
395            result[0] = dst[0];
396            result[1] = dst[1];
397            result[2] = dst[2];
398            result[3] = dst[3];
399            break;
400
401        case LOGICOP_OR_INVERTED:
402            // ~s | d
403            result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
404            result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
405            result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
406            result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
407            break;
408
409        case LOGICOP_COPY:
410            result[0] = src[0];
411            result[1] = src[1];
412            result[2] = src[2];
413            result[3] = src[3];
414            break;
415
416        case LOGICOP_OR_REVERSE:
417            // s | ~d
418            result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
419            result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
420            result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
421            result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
422            break;
423
424        case LOGICOP_OR:
425            // s | d
426            result[0] = OR(src[0], dst[0]);
427            result[1] = OR(src[1], dst[1]);
428            result[2] = OR(src[2], dst[2]);
429            result[3] = OR(src[3], dst[3]);
430            break;
431
432        case LOGICOP_SET:
433            result[0] = VIMMED1(0xFFFFFFFF);
434            result[1] = VIMMED1(0xFFFFFFFF);
435            result[2] = VIMMED1(0xFFFFFFFF);
436            result[3] = VIMMED1(0xFFFFFFFF);
437            break;
438
439        default:
440            SWR_ASSERT(false, "Unsupported logic operation: %d", logicOp);
441            result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f);
442            break;
443        }
444    }
445
446    void AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* ppAlpha, Value* ppMask)
447    {
448        // load uint32_t reference
449        Value* pRef = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_alphaTestReference }));
450
451        // load alpha
452        Value* pAlpha = LOAD(ppAlpha);
453
454        Value* pTest = nullptr;
455        if (state.alphaTestFormat == ALPHA_TEST_UNORM8)
456        {
457            // convert float alpha to unorm8
458            Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f));
459            pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty);
460
461            // compare
462            switch (state.alphaTestFunction)
463            {
464            case ZFUNC_ALWAYS:  pTest = VIMMED1(true); break;
465            case ZFUNC_NEVER:   pTest = VIMMED1(false); break;
466            case ZFUNC_LT:      pTest = ICMP_ULT(pAlphaU8, pRef); break;
467            case ZFUNC_EQ:      pTest = ICMP_EQ(pAlphaU8, pRef); break;
468            case ZFUNC_LE:      pTest = ICMP_ULE(pAlphaU8, pRef); break;
469            case ZFUNC_GT:      pTest = ICMP_UGT(pAlphaU8, pRef); break;
470            case ZFUNC_NE:      pTest = ICMP_NE(pAlphaU8, pRef); break;
471            case ZFUNC_GE:      pTest = ICMP_UGE(pAlphaU8, pRef); break;
472            default:
473                SWR_ASSERT(false, "Invalid alpha test function");
474                break;
475            }
476        }
477        else
478        {
479            // cast ref to float
480            pRef = BITCAST(pRef, mSimdFP32Ty);
481
482            // compare
483            switch (state.alphaTestFunction)
484            {
485            case ZFUNC_ALWAYS:  pTest = VIMMED1(true); break;
486            case ZFUNC_NEVER:   pTest = VIMMED1(false); break;
487            case ZFUNC_LT:      pTest = FCMP_OLT(pAlpha, pRef); break;
488            case ZFUNC_EQ:      pTest = FCMP_OEQ(pAlpha, pRef); break;
489            case ZFUNC_LE:      pTest = FCMP_OLE(pAlpha, pRef); break;
490            case ZFUNC_GT:      pTest = FCMP_OGT(pAlpha, pRef); break;
491            case ZFUNC_NE:      pTest = FCMP_ONE(pAlpha, pRef); break;
492            case ZFUNC_GE:      pTest = FCMP_OGE(pAlpha, pRef); break;
493            default:
494                SWR_ASSERT(false, "Invalid alpha test function");
495                break;
496            }
497        }
498
499        // load current mask
500        Value* pMask = LOAD(ppMask);
501
502        // convert to int1 mask
503        pMask = MASK(pMask);
504
505        // and with alpha test result
506        pMask = AND(pMask, pTest);
507
508        // convert back to vector mask
509        pMask = VMASK(pMask);
510
511        // store new mask
512        STORE(pMask, ppMask);
513    }
514
515    Function* Create(const BLEND_COMPILE_STATE& state)
516    {
517        static std::size_t jitNum = 0;
518
519        std::stringstream fnName("BlendShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
520        fnName << jitNum++;
521
522        // blend function signature
523        //typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*);
524
525        std::vector<Type*> args{
526            PointerType::get(Gen_SWR_BLEND_STATE(JM()), 0), // SWR_BLEND_STATE*
527            PointerType::get(mSimdFP32Ty, 0),               // simdvector& src
528            PointerType::get(mSimdFP32Ty, 0),               // simdvector& src1
529            PointerType::get(mSimdFP32Ty, 0),               // src0alpha
530            Type::getInt32Ty(JM()->mContext),               // sampleNum
531            PointerType::get(mSimdFP32Ty, 0),               // uint8_t* pDst
532            PointerType::get(mSimdFP32Ty, 0),               // simdvector& result
533            PointerType::get(mSimdInt32Ty, 0),              // simdscalari* oMask
534            PointerType::get(mSimdInt32Ty, 0),              // simdscalari* pMask
535        };
536
537        FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
538        Function* blendFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
539
540        BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc);
541
542        IRB()->SetInsertPoint(entry);
543
544        // arguments
545        auto argitr = blendFunc->getArgumentList().begin();
546        Value* pBlendState = &*argitr++;
547        pBlendState->setName("pBlendState");
548        Value* pSrc = &*argitr++;
549        pSrc->setName("src");
550        Value* pSrc1 = &*argitr++;
551        pSrc1->setName("src1");
552        Value* pSrc0Alpha = &*argitr++;
553        pSrc0Alpha->setName("src0alpha");
554        Value* sampleNum = &*argitr++;
555        sampleNum->setName("sampleNum");
556        Value* pDst = &*argitr++;
557        pDst->setName("pDst");
558        Value* pResult = &*argitr++;
559        pResult->setName("result");
560        Value* ppoMask = &*argitr++;
561        ppoMask->setName("ppoMask");
562        Value* ppMask = &*argitr++;
563        ppMask->setName("pMask");
564
565        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
566        Value* dst[4];
567        Value* constantColor[4];
568        Value* src[4];
569        Value* src1[4];
570        Value* result[4];
571        for (uint32_t i = 0; i < 4; ++i)
572        {
573            // load hot tile
574            dst[i] = LOAD(pDst, { i });
575
576            // load constant color
577            constantColor[i] = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_constantColor, i }));
578
579            // load src
580            src[i] = LOAD(pSrc, { i });
581
582            // load src1
583            src1[i] = LOAD(pSrc1, { i });
584        }
585        Value* currentMask = VIMMED1(-1);
586        if (state.desc.alphaToCoverageEnable)
587        {
588            Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
589            uint32_t bits = (1 << state.desc.numSamples) - 1;
590            currentMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
591            currentMask = FP_TO_SI(FADD(currentMask, VIMMED1(0.5f)), mSimdInt32Ty);
592        }
593
594        // alpha test
595        if (state.desc.alphaTestEnable)
596        {
597            AlphaTest(state, pBlendState, pSrc0Alpha, ppMask);
598        }
599
600        // color blend
601        if (state.blendState.blendEnable)
602        {
603            // clamp sources
604            Clamp(state.format, src);
605            Clamp(state.format, src1);
606            Clamp(state.format, dst);
607            Clamp(state.format, constantColor);
608
609            // apply defaults to hottile contents to take into account missing components
610            ApplyDefaults(state.format, dst);
611
612            // Force defaults for unused 'X' components
613            ApplyUnusedDefaults(state.format, dst);
614
615            // Quantize low precision components
616            Quantize(state.format, dst);
617
618            // special case clamping for R11G11B10_float which has no sign bit
619            if (state.format == R11G11B10_FLOAT)
620            {
621                dst[0] = VMAXPS(dst[0], VIMMED1(0.0f));
622                dst[1] = VMAXPS(dst[1], VIMMED1(0.0f));
623                dst[2] = VMAXPS(dst[2], VIMMED1(0.0f));
624                dst[3] = VMAXPS(dst[3], VIMMED1(0.0f));
625            }
626
627            Value* srcFactor[4];
628            Value* dstFactor[4];
629            if (state.desc.independentAlphaBlendEnable)
630            {
631                GenerateBlendFactor<true, false>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
632                GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor, constantColor, src, src1, dst, srcFactor);
633
634                GenerateBlendFactor<true, false>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
635                GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor, constantColor, src, src1, dst, dstFactor);
636
637                BlendFunc<true, false>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
638                BlendFunc<false, true>(state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
639            }
640            else
641            {
642                GenerateBlendFactor<true, true>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
643                GenerateBlendFactor<true, true>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
644
645                BlendFunc<true, true>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
646            }
647
648            // store results out
649            for (uint32_t i = 0; i < 4; ++i)
650            {
651                STORE(result[i], pResult, { i });
652            }
653        }
654
655        if(state.blendState.logicOpEnable)
656        {
657            const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);
658            Value* vMask[4];
659            float scale[4];
660
661            if (!state.blendState.blendEnable)
662            {
663                Clamp(state.format, src);
664                Clamp(state.format, dst);
665            }
666
667            for(uint32_t i = 0; i < 4; i++)
668            {
669                if (info.type[i] == SWR_TYPE_UNUSED)
670                {
671                    continue;
672                }
673
674                if (info.bpc[i] >= 32) {
675                    vMask[i] = VIMMED1(0xFFFFFFFF);
676                    scale[i] = 0xFFFFFFFF;
677                } else {
678                    vMask[i] = VIMMED1((1 << info.bpc[i]) - 1);
679                    if (info.type[i] == SWR_TYPE_SNORM)
680                        scale[i] = (1 << (info.bpc[i] - 1)) - 1;
681                    else
682                        scale[i] = (1 << info.bpc[i]) - 1;
683                }
684
685                switch (info.type[i]) {
686                default:
687                    SWR_ASSERT(0, "Unsupported type for logic op\n");
688                    /* fallthrough */
689                case SWR_TYPE_UINT:
690                case SWR_TYPE_SINT:
691                    src[i] = BITCAST(src[i], mSimdInt32Ty);
692                    dst[i] = BITCAST(dst[i], mSimdInt32Ty);
693                    break;
694                case SWR_TYPE_SNORM:
695                    src[i] = FADD(src[i], VIMMED1(0.5f));
696                    dst[i] = FADD(dst[i], VIMMED1(0.5f));
697                    /* fallthrough */
698                case SWR_TYPE_UNORM:
699                    src[i] = FP_TO_UI(
700                        FMUL(src[i], VIMMED1(scale[i])),
701                        mSimdInt32Ty);
702                    dst[i] = FP_TO_UI(
703                        FMUL(dst[i], VIMMED1(scale[i])),
704                        mSimdInt32Ty);
705                    break;
706                }
707            }
708
709            LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);
710
711            // store results out
712            for(uint32_t i = 0; i < 4; ++i)
713            {
714                if (info.type[i] == SWR_TYPE_UNUSED)
715                {
716                    continue;
717                }
718
719                // clear upper bits from PS output not in RT format after doing logic op
720                result[i] = AND(result[i], vMask[i]);
721
722                switch (info.type[i]) {
723                default:
724                    SWR_ASSERT(0, "Unsupported type for logic op\n");
725                    /* fallthrough */
726                case SWR_TYPE_UINT:
727                case SWR_TYPE_SINT:
728                    result[i] = BITCAST(result[i], mSimdFP32Ty);
729                    break;
730                case SWR_TYPE_SNORM:
731                case SWR_TYPE_UNORM:
732                    result[i] = FMUL(UI_TO_FP(result[i], mSimdFP32Ty),
733                                     VIMMED1(1.0f / scale[i]));
734                    if (info.type[i] == SWR_TYPE_SNORM)
735                        result[i] = FADD(result[i], VIMMED1(-0.5f));
736                    break;
737                }
738
739                STORE(result[i], pResult, {i});
740            }
741        }
742
743        if(state.desc.oMaskEnable)
744        {
745            assert(!(state.desc.alphaToCoverageEnable));
746            // load current mask
747            Value* oMask = LOAD(ppoMask);
748            Value* sampleMasked = VBROADCAST(SHL(C(1), sampleNum));
749            oMask = AND(oMask, sampleMasked);
750            currentMask = AND(oMask, currentMask);
751        }
752
753        if(state.desc.sampleMaskEnable)
754        {
755            Value* sampleMask = LOAD(pBlendState, { 0, SWR_BLEND_STATE_sampleMask});
756            Value* sampleMasked = SHL(C(1), sampleNum);
757            sampleMask = AND(sampleMask, sampleMasked);
758            sampleMask = VBROADCAST(ICMP_SGT(sampleMask, C(0)));
759            sampleMask = S_EXT(sampleMask, mSimdInt32Ty);
760            currentMask = AND(sampleMask, currentMask);
761        }
762
763        if (state.desc.alphaToCoverageEnable)
764        {
765            Value* sampleMasked = SHL(C(1), sampleNum);
766            currentMask = AND(currentMask, VBROADCAST(sampleMasked));
767        }
768
769        if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
770           state.desc.oMaskEnable)
771        {
772            // load current mask
773            Value* pMask = LOAD(ppMask);
774            currentMask = S_EXT(ICMP_SGT(currentMask, VBROADCAST(C(0))), mSimdInt32Ty);
775            Value* outputMask = AND(pMask, currentMask);
776            // store new mask
777            STORE(outputMask, GEP(ppMask, C(0)));
778        }
779
780        RET_VOID();
781
782        JitManager::DumpToFile(blendFunc, "");
783
784        ::FunctionPassManager passes(JM()->mpCurrentModule);
785
786        passes.add(createBreakCriticalEdgesPass());
787        passes.add(createCFGSimplificationPass());
788        passes.add(createEarlyCSEPass());
789        passes.add(createPromoteMemoryToRegisterPass());
790        passes.add(createCFGSimplificationPass());
791        passes.add(createEarlyCSEPass());
792        passes.add(createInstructionCombiningPass());
793        passes.add(createInstructionSimplifierPass());
794        passes.add(createConstantPropagationPass());
795        passes.add(createSCCPPass());
796        passes.add(createAggressiveDCEPass());
797
798        passes.run(*blendFunc);
799
800        JitManager::DumpToFile(blendFunc, "optimized");
801
802        return blendFunc;
803    }
804};
805
806//////////////////////////////////////////////////////////////////////////
807/// @brief JITs from fetch shader IR
808/// @param hJitMgr - JitManager handle
809/// @param func   - LLVM function IR
810/// @return PFN_FETCH_FUNC - pointer to fetch code
811PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc)
812{
813    const llvm::Function *func = (const llvm::Function*)hFunc;
814    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
815    PFN_BLEND_JIT_FUNC pfnBlend;
816    pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
817    // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
818    pJitMgr->mIsModuleFinalized = true;
819
820    return pfnBlend;
821}
822
823//////////////////////////////////////////////////////////////////////////
824/// @brief JIT compiles blend shader
825/// @param hJitMgr - JitManager handle
826/// @param state   - blend state to build function from
827extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, const BLEND_COMPILE_STATE& state)
828{
829    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
830
831    pJitMgr->SetupNewModule();
832
833    BlendJit theJit(pJitMgr);
834    HANDLE hFunc = theJit.Create(state);
835
836    return JitBlendFunc(hJitMgr, hFunc);
837}
838