blend_jit.cpp revision d3d97f8395513bf365d2fe8e4292c8098290586f
1/****************************************************************************
2* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
3*
4* Permission is hereby granted, free of charge, to any person obtaining a
5* copy of this software and associated documentation files (the "Software"),
6* to deal in the Software without restriction, including without limitation
7* the rights to use, copy, modify, merge, publish, distribute, sublicense,
8* and/or sell copies of the Software, and to permit persons to whom the
9* Software is furnished to do so, subject to the following conditions:
10*
11* The above copyright notice and this permission notice (including the next
12* paragraph) shall be included in all copies or substantial portions of the
13* Software.
14*
15* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21* IN THE SOFTWARE.
22*
23* @file blend_jit.cpp
24*
25* @brief Implementation of the blend jitter
26*
27* Notes:
28*
29******************************************************************************/
30#include "jit_api.h"
31#include "blend_jit.h"
32#include "builder.h"
33#include "state_llvm.h"
34
35#include <sstream>
36
37// components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized
38#define QUANTIZE_THRESHOLD 2
39
40//////////////////////////////////////////////////////////////////////////
41/// Interface to Jitting a blend shader
42//////////////////////////////////////////////////////////////////////////
43struct BlendJit : public Builder
44{
45    BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){};
46
47    template<bool Color, bool Alpha>
48    void GenerateBlendFactor(SWR_BLEND_FACTOR factor, Value* constColor[4], Value* src[4], Value* src1[4], Value* dst[4], Value* result[4])
49    {
50        Value* out[4];
51
52        switch (factor)
53        {
54        case BLENDFACTOR_ONE:
55            out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f);
56            break;
57        case BLENDFACTOR_SRC_COLOR:
58            out[0] = src[0];
59            out[1] = src[1];
60            out[2] = src[2];
61            out[3] = src[3];
62            break;
63        case BLENDFACTOR_SRC_ALPHA:
64            out[0] = out[1] = out[2] = out[3] = src[3];
65            break;
66        case BLENDFACTOR_DST_ALPHA:
67            out[0] = out[1] = out[2] = out[3] = dst[3];
68            break;
69        case BLENDFACTOR_DST_COLOR:
70            out[0] = dst[0];
71            out[1] = dst[1];
72            out[2] = dst[2];
73            out[3] = dst[3];
74            break;
75        case BLENDFACTOR_SRC_ALPHA_SATURATE:
76            out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3]));
77            out[3] = VIMMED1(1.0f);
78            break;
79        case BLENDFACTOR_CONST_COLOR:
80            out[0] = constColor[0];
81            out[1] = constColor[1];
82            out[2] = constColor[2];
83            out[3] = constColor[3];
84            break;
85        case BLENDFACTOR_CONST_ALPHA:
86            out[0] = out[1] = out[2] = out[3] = constColor[3];
87            break;
88        case BLENDFACTOR_SRC1_COLOR:
89            out[0] = src1[0];
90            out[1] = src1[1];
91            out[2] = src1[2];
92            out[3] = src1[3];
93            break;
94        case BLENDFACTOR_SRC1_ALPHA:
95            out[0] = out[1] = out[2] = out[3] = src1[3];
96            break;
97        case BLENDFACTOR_ZERO:
98            out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
99            break;
100        case BLENDFACTOR_INV_SRC_COLOR:
101            out[0] = FSUB(VIMMED1(1.0f), src[0]);
102            out[1] = FSUB(VIMMED1(1.0f), src[1]);
103            out[2] = FSUB(VIMMED1(1.0f), src[2]);
104            out[3] = FSUB(VIMMED1(1.0f), src[3]);
105            break;
106        case BLENDFACTOR_INV_SRC_ALPHA:
107            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]);
108            break;
109        case BLENDFACTOR_INV_DST_ALPHA:
110            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]);
111            break;
112        case BLENDFACTOR_INV_DST_COLOR:
113            out[0] = FSUB(VIMMED1(1.0f), dst[0]);
114            out[1] = FSUB(VIMMED1(1.0f), dst[1]);
115            out[2] = FSUB(VIMMED1(1.0f), dst[2]);
116            out[3] = FSUB(VIMMED1(1.0f), dst[3]);
117            break;
118        case BLENDFACTOR_INV_CONST_COLOR:
119            out[0] = FSUB(VIMMED1(1.0f), constColor[0]);
120            out[1] = FSUB(VIMMED1(1.0f), constColor[1]);
121            out[2] = FSUB(VIMMED1(1.0f), constColor[2]);
122            out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
123            break;
124        case BLENDFACTOR_INV_CONST_ALPHA:
125            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]);
126            break;
127        case BLENDFACTOR_INV_SRC1_COLOR:
128            out[0] = FSUB(VIMMED1(1.0f), src1[0]);
129            out[1] = FSUB(VIMMED1(1.0f), src1[1]);
130            out[2] = FSUB(VIMMED1(1.0f), src1[2]);
131            out[3] = FSUB(VIMMED1(1.0f), src1[3]);
132            break;
133        case BLENDFACTOR_INV_SRC1_ALPHA:
134            out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]);
135            break;
136        default:
137            SWR_ASSERT(false, "Unsupported blend factor: %d", factor);
138            out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
139            break;
140        }
141
142        if (Color)
143        {
144            result[0] = out[0];
145            result[1] = out[1];
146            result[2] = out[2];
147        }
148
149        if (Alpha)
150        {
151            result[3] = out[3];
152        }
153    }
154
155    void Clamp(SWR_FORMAT format, Value* src[4])
156    {
157        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
158        SWR_TYPE type = info.type[0];
159
160        switch (type)
161        {
162        case SWR_TYPE_FLOAT:
163            break;
164
165        case SWR_TYPE_UNORM:
166            src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f));
167            src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f));
168            src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f));
169            src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f));
170            break;
171
172        case SWR_TYPE_SNORM:
173            src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f));
174            src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f));
175            src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f));
176            src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f));
177            break;
178
179        default: SWR_ASSERT(false, "Unsupport format type: %d", type);
180        }
181    }
182
183    void ApplyDefaults(SWR_FORMAT format, Value* src[4])
184    {
185        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
186
187        bool valid[] = { false, false, false, false };
188        for (uint32_t c = 0; c < info.numComps; ++c)
189        {
190            valid[info.swizzle[c]] = true;
191        }
192
193        for (uint32_t c = 0; c < 4; ++c)
194        {
195            if (!valid[c])
196            {
197                src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty);
198            }
199        }
200    }
201
202    void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4])
203    {
204        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
205
206        for (uint32_t c = 0; c < info.numComps; ++c)
207        {
208            if (info.type[c] == SWR_TYPE_UNUSED)
209            {
210                src[info.swizzle[c]] = BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);
211            }
212        }
213    }
214
215    void Quantize(SWR_FORMAT format, Value* src[4])
216    {
217        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
218        for (uint32_t c = 0; c < info.numComps; ++c)
219        {
220            if (info.bpc[c] <= QUANTIZE_THRESHOLD)
221            {
222                uint32_t swizComp = info.swizzle[c];
223                float factor = (float)((1 << info.bpc[c]) - 1);
224                switch (info.type[c])
225                {
226                case SWR_TYPE_UNORM:
227                    src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f));
228                    src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO));
229                    src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f /factor));
230                    break;
231                default: SWR_ASSERT(false, "Unsupported format type: %d", info.type[c]);
232                }
233            }
234        }
235    }
236
237    template<bool Color, bool Alpha>
238    void BlendFunc(SWR_BLEND_OP blendOp, Value* src[4], Value* srcFactor[4], Value* dst[4], Value* dstFactor[4], Value* result[4])
239    {
240        Value* out[4];
241        Value* srcBlend[4];
242        Value* dstBlend[4];
243        for (uint32_t i = 0; i < 4; ++i)
244        {
245            srcBlend[i] = FMUL(src[i], srcFactor[i]);
246            dstBlend[i] = FMUL(dst[i], dstFactor[i]);
247        }
248
249        switch (blendOp)
250        {
251        case BLENDOP_ADD:
252            out[0] = FADD(srcBlend[0], dstBlend[0]);
253            out[1] = FADD(srcBlend[1], dstBlend[1]);
254            out[2] = FADD(srcBlend[2], dstBlend[2]);
255            out[3] = FADD(srcBlend[3], dstBlend[3]);
256            break;
257
258        case BLENDOP_SUBTRACT:
259            out[0] = FSUB(srcBlend[0], dstBlend[0]);
260            out[1] = FSUB(srcBlend[1], dstBlend[1]);
261            out[2] = FSUB(srcBlend[2], dstBlend[2]);
262            out[3] = FSUB(srcBlend[3], dstBlend[3]);
263            break;
264
265        case BLENDOP_REVSUBTRACT:
266            out[0] = FSUB(dstBlend[0], srcBlend[0]);
267            out[1] = FSUB(dstBlend[1], srcBlend[1]);
268            out[2] = FSUB(dstBlend[2], srcBlend[2]);
269            out[3] = FSUB(dstBlend[3], srcBlend[3]);
270            break;
271
272        case BLENDOP_MIN:
273            out[0] = VMINPS(src[0], dst[0]);
274            out[1] = VMINPS(src[1], dst[1]);
275            out[2] = VMINPS(src[2], dst[2]);
276            out[3] = VMINPS(src[3], dst[3]);
277            break;
278
279        case BLENDOP_MAX:
280            out[0] = VMAXPS(src[0], dst[0]);
281            out[1] = VMAXPS(src[1], dst[1]);
282            out[2] = VMAXPS(src[2], dst[2]);
283            out[3] = VMAXPS(src[3], dst[3]);
284            break;
285
286        default:
287            SWR_ASSERT(false, "Unsupported blend operation: %d", blendOp);
288            out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f);
289            break;
290        }
291
292        if (Color)
293        {
294            result[0] = out[0];
295            result[1] = out[1];
296            result[2] = out[2];
297        }
298
299        if (Alpha)
300        {
301            result[3] = out[3];
302        }
303    }
304
305    void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4])
306    {
307        // Op: (s == PS output, d = RT contents)
308        switch(logicOp)
309        {
310        case LOGICOP_CLEAR:
311            result[0] = VIMMED1(0);
312            result[1] = VIMMED1(0);
313            result[2] = VIMMED1(0);
314            result[3] = VIMMED1(0);
315            break;
316
317        case LOGICOP_NOR:
318            // ~(s | d)
319            result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
320            result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
321            result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
322            result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
323            break;
324
325        case LOGICOP_AND_INVERTED:
326            // ~s & d
327            // todo: use avx andnot instr when I can find the intrinsic to call
328            result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
329            result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
330            result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
331            result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
332            break;
333
334        case LOGICOP_COPY_INVERTED:
335            // ~s
336            result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF));
337            result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF));
338            result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF));
339            result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF));
340            break;
341
342        case LOGICOP_AND_REVERSE:
343            // s & ~d
344            // todo: use avx andnot instr when I can find the intrinsic to call
345            result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
346            result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
347            result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
348            result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
349            break;
350
351        case LOGICOP_INVERT:
352            // ~d
353            result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF));
354            result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF));
355            result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF));
356            result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF));
357            break;
358
359        case LOGICOP_XOR:
360            // s ^ d
361            result[0] = XOR(src[0], dst[0]);
362            result[1] = XOR(src[1], dst[1]);
363            result[2] = XOR(src[2], dst[2]);
364            result[3] = XOR(src[3], dst[3]);
365            break;
366
367        case LOGICOP_NAND:
368            // ~(s & d)
369            result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
370            result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
371            result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
372            result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
373            break;
374
375        case LOGICOP_AND:
376            // s & d
377            result[0] = AND(src[0], dst[0]);
378            result[1] = AND(src[1], dst[1]);
379            result[2] = AND(src[2], dst[2]);
380            result[3] = AND(src[3], dst[3]);
381            break;
382
383        case LOGICOP_EQUIV:
384            // ~(s ^ d)
385            result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF));
386            result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF));
387            result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF));
388            result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF));
389            break;
390
391        case LOGICOP_NOOP:
392            result[0] = dst[0];
393            result[1] = dst[1];
394            result[2] = dst[2];
395            result[3] = dst[3];
396            break;
397
398        case LOGICOP_OR_INVERTED:
399            // ~s | d
400            result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]);
401            result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]);
402            result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]);
403            result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]);
404            break;
405
406        case LOGICOP_COPY:
407            result[0] = src[0];
408            result[1] = src[1];
409            result[2] = src[2];
410            result[3] = src[3];
411            break;
412
413        case LOGICOP_OR_REVERSE:
414            // s | ~d
415            result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]);
416            result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]);
417            result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]);
418            result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]);
419            break;
420
421        case LOGICOP_OR:
422            // s | d
423            result[0] = OR(src[0], dst[0]);
424            result[1] = OR(src[1], dst[1]);
425            result[2] = OR(src[2], dst[2]);
426            result[3] = OR(src[3], dst[3]);
427            break;
428
429        case LOGICOP_SET:
430            result[0] = VIMMED1(0xFFFFFFFF);
431            result[1] = VIMMED1(0xFFFFFFFF);
432            result[2] = VIMMED1(0xFFFFFFFF);
433            result[3] = VIMMED1(0xFFFFFFFF);
434            break;
435
436        default:
437            SWR_ASSERT(false, "Unsupported logic operation: %d", logicOp);
438            result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f);
439            break;
440        }
441    }
442
443    void AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* pAlpha, Value* ppMask)
444    {
445        // load uint32_t reference
446        Value* pRef = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_alphaTestReference }));
447
448        Value* pTest = nullptr;
449        if (state.alphaTestFormat == ALPHA_TEST_UNORM8)
450        {
451            // convert float alpha to unorm8
452            Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f));
453            pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty);
454
455            // compare
456            switch (state.alphaTestFunction)
457            {
458            case ZFUNC_ALWAYS:  pTest = VIMMED1(true); break;
459            case ZFUNC_NEVER:   pTest = VIMMED1(false); break;
460            case ZFUNC_LT:      pTest = ICMP_ULT(pAlphaU8, pRef); break;
461            case ZFUNC_EQ:      pTest = ICMP_EQ(pAlphaU8, pRef); break;
462            case ZFUNC_LE:      pTest = ICMP_ULE(pAlphaU8, pRef); break;
463            case ZFUNC_GT:      pTest = ICMP_UGT(pAlphaU8, pRef); break;
464            case ZFUNC_NE:      pTest = ICMP_NE(pAlphaU8, pRef); break;
465            case ZFUNC_GE:      pTest = ICMP_UGE(pAlphaU8, pRef); break;
466            default:
467                SWR_ASSERT(false, "Invalid alpha test function");
468                break;
469            }
470        }
471        else
472        {
473            // cast ref to float
474            pRef = BITCAST(pRef, mSimdFP32Ty);
475
476            // compare
477            switch (state.alphaTestFunction)
478            {
479            case ZFUNC_ALWAYS:  pTest = VIMMED1(true); break;
480            case ZFUNC_NEVER:   pTest = VIMMED1(false); break;
481            case ZFUNC_LT:      pTest = FCMP_OLT(pAlpha, pRef); break;
482            case ZFUNC_EQ:      pTest = FCMP_OEQ(pAlpha, pRef); break;
483            case ZFUNC_LE:      pTest = FCMP_OLE(pAlpha, pRef); break;
484            case ZFUNC_GT:      pTest = FCMP_OGT(pAlpha, pRef); break;
485            case ZFUNC_NE:      pTest = FCMP_ONE(pAlpha, pRef); break;
486            case ZFUNC_GE:      pTest = FCMP_OGE(pAlpha, pRef); break;
487            default:
488                SWR_ASSERT(false, "Invalid alpha test function");
489                break;
490            }
491        }
492
493        // load current mask
494        Value* pMask = LOAD(ppMask);
495
496        // convert to int1 mask
497        pMask = MASK(pMask);
498
499        // and with alpha test result
500        pMask = AND(pMask, pTest);
501
502        // convert back to vector mask
503        pMask = VMASK(pMask);
504
505        // store new mask
506        STORE(pMask, ppMask);
507    }
508
509    Function* Create(const BLEND_COMPILE_STATE& state)
510    {
511        static std::size_t jitNum = 0;
512
513        std::stringstream fnName("BlendShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
514        fnName << jitNum++;
515
516        // blend function signature
517        //typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*);
518
519        std::vector<Type*> args{
520            PointerType::get(Gen_SWR_BLEND_STATE(JM()), 0), // SWR_BLEND_STATE*
521            PointerType::get(mSimdFP32Ty, 0),               // simdvector& src
522            PointerType::get(mSimdFP32Ty, 0),               // simdvector& src1
523            Type::getInt32Ty(JM()->mContext),               // sampleNum
524            PointerType::get(mSimdFP32Ty, 0),               // uint8_t* pDst
525            PointerType::get(mSimdFP32Ty, 0),               // simdvector& result
526            PointerType::get(mSimdInt32Ty, 0),              // simdscalari* oMask
527            PointerType::get(mSimdInt32Ty, 0),              // simdscalari* pMask
528        };
529
530        FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false);
531        Function* blendFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
532
533        BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc);
534
535        IRB()->SetInsertPoint(entry);
536
537        // arguments
538        auto argitr = blendFunc->getArgumentList().begin();
539        Value* pBlendState = &*argitr++;
540        pBlendState->setName("pBlendState");
541        Value* pSrc = &*argitr++;
542        pSrc->setName("src");
543        Value* pSrc1 = &*argitr++;
544        pSrc1->setName("src1");
545        Value* sampleNum = &*argitr++;
546        sampleNum->setName("sampleNum");
547        Value* pDst = &*argitr++;
548        pDst->setName("pDst");
549        Value* pResult = &*argitr++;
550        pResult->setName("result");
551        Value* ppoMask = &*argitr++;
552        ppoMask->setName("ppoMask");
553        Value* ppMask = &*argitr++;
554        ppMask->setName("pMask");
555
556        static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format");
557        Value* dst[4];
558        Value* constantColor[4];
559        Value* src[4];
560        Value* src1[4];
561        Value* result[4];
562        for (uint32_t i = 0; i < 4; ++i)
563        {
564            // load hot tile
565            dst[i] = LOAD(pDst, { i });
566
567            // load constant color
568            constantColor[i] = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_constantColor, i }));
569
570            // load src
571            src[i] = LOAD(pSrc, { i });
572
573            // load src1
574            src1[i] = LOAD(pSrc1, { i });
575        }
576        Value* currentMask = VIMMED1(-1);
577        if (state.desc.alphaToCoverageEnable)
578        {
579            Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
580            uint32_t bits = (1 << state.desc.numSamples) - 1;
581            currentMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
582            currentMask = FP_TO_SI(FADD(currentMask, VIMMED1(0.5f)), mSimdInt32Ty);
583        }
584
585        // alpha test
586        if (state.desc.alphaTestEnable)
587        {
588            AlphaTest(state, pBlendState, src[3], ppMask);
589        }
590
591        // color blend
592        if (state.blendState.blendEnable)
593        {
594            // clamp sources
595            Clamp(state.format, src);
596            Clamp(state.format, src1);
597            Clamp(state.format, dst);
598            Clamp(state.format, constantColor);
599
600            // apply defaults to hottile contents to take into account missing components
601            ApplyDefaults(state.format, dst);
602
603            // Force defaults for unused 'X' components
604            ApplyUnusedDefaults(state.format, dst);
605
606            // Quantize low precision components
607            Quantize(state.format, dst);
608
609            // special case clamping for R11G11B10_float which has no sign bit
610            if (state.format == R11G11B10_FLOAT)
611            {
612                dst[0] = VMAXPS(dst[0], VIMMED1(0.0f));
613                dst[1] = VMAXPS(dst[1], VIMMED1(0.0f));
614                dst[2] = VMAXPS(dst[2], VIMMED1(0.0f));
615                dst[3] = VMAXPS(dst[3], VIMMED1(0.0f));
616            }
617
618            Value* srcFactor[4];
619            Value* dstFactor[4];
620            if (state.desc.independentAlphaBlendEnable)
621            {
622                GenerateBlendFactor<true, false>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
623                GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor, constantColor, src, src1, dst, srcFactor);
624
625                GenerateBlendFactor<true, false>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
626                GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor, constantColor, src, src1, dst, dstFactor);
627
628                BlendFunc<true, false>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
629                BlendFunc<false, true>(state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result);
630            }
631            else
632            {
633                GenerateBlendFactor<true, true>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor);
634                GenerateBlendFactor<true, true>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
635
636                BlendFunc<true, true>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
637            }
638
639            // store results out
640            for (uint32_t i = 0; i < 4; ++i)
641            {
642                STORE(result[i], pResult, { i });
643            }
644        }
645
646        if(state.blendState.logicOpEnable)
647        {
648            const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);
649            SWR_ASSERT(info.type[0] == SWR_TYPE_UINT);
650            Value* vMask[4];
651            for(uint32_t i = 0; i < 4; i++)
652            {
653                switch(info.bpc[i])
654                {
655                case 0: vMask[i] = VIMMED1(0x00000000); break;
656                case 2: vMask[i] = VIMMED1(0x00000003); break;
657                case 5: vMask[i] = VIMMED1(0x0000001F); break;
658                case 6: vMask[i] = VIMMED1(0x0000003F); break;
659                case 8: vMask[i] = VIMMED1(0x000000FF); break;
660                case 10: vMask[i] = VIMMED1(0x000003FF); break;
661                case 11: vMask[i] = VIMMED1(0x000007FF); break;
662                case 16: vMask[i] = VIMMED1(0x0000FFFF); break;
663                case 24: vMask[i] = VIMMED1(0x00FFFFFF); break;
664                case 32: vMask[i] = VIMMED1(0xFFFFFFFF); break;
665                default:
666                    vMask[i] = VIMMED1(0x0);
667                    SWR_ASSERT(0, "Unsupported bpc for logic op\n");
668                    break;
669                }
670                src[i] = BITCAST(src[i], mSimdInt32Ty);//, vMask[i]);
671                dst[i] = BITCAST(dst[i], mSimdInt32Ty);
672            }
673
674            LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);
675
676            // store results out
677            for(uint32_t i = 0; i < 4; ++i)
678            {
679                // clear upper bits from PS output not in RT format after doing logic op
680                result[i] = AND(result[i], vMask[i]);
681
682                STORE(BITCAST(result[i], mSimdFP32Ty), pResult, {i});
683            }
684        }
685
686        if(state.desc.oMaskEnable)
687        {
688            assert(!(state.desc.alphaToCoverageEnable));
689            // load current mask
690            Value* oMask = LOAD(ppoMask);
691            Value* sampleMasked = VBROADCAST(SHL(C(1), sampleNum));
692            oMask = AND(oMask, sampleMasked);
693            currentMask = AND(oMask, currentMask);
694        }
695
696        if(state.desc.sampleMaskEnable)
697        {
698            Value* sampleMask = LOAD(pBlendState, { 0, SWR_BLEND_STATE_sampleMask});
699            Value* sampleMasked = SHL(C(1), sampleNum);
700            sampleMask = AND(sampleMask, sampleMasked);
701            sampleMask = VBROADCAST(ICMP_SGT(sampleMask, C(0)));
702            sampleMask = S_EXT(sampleMask, mSimdInt32Ty);
703            currentMask = AND(sampleMask, currentMask);
704        }
705
706        if (state.desc.alphaToCoverageEnable)
707        {
708            Value* sampleMasked = SHL(C(1), sampleNum);
709            currentMask = AND(currentMask, VBROADCAST(sampleMasked));
710        }
711
712        if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
713           state.desc.oMaskEnable)
714        {
715            // load current mask
716            Value* pMask = LOAD(ppMask);
717            currentMask = S_EXT(ICMP_SGT(currentMask, VBROADCAST(C(0))), mSimdInt32Ty);
718            Value* outputMask = AND(pMask, currentMask);
719            // store new mask
720            STORE(outputMask, GEP(ppMask, C(0)));
721        }
722
723        RET_VOID();
724
725        JitManager::DumpToFile(blendFunc, "");
726
727        ::FunctionPassManager passes(JM()->mpCurrentModule);
728
729        passes.add(createBreakCriticalEdgesPass());
730        passes.add(createCFGSimplificationPass());
731        passes.add(createEarlyCSEPass());
732        passes.add(createPromoteMemoryToRegisterPass());
733        passes.add(createCFGSimplificationPass());
734        passes.add(createEarlyCSEPass());
735        passes.add(createInstructionCombiningPass());
736        passes.add(createInstructionSimplifierPass());
737        passes.add(createConstantPropagationPass());
738        passes.add(createSCCPPass());
739        passes.add(createAggressiveDCEPass());
740
741        passes.run(*blendFunc);
742
743        JitManager::DumpToFile(blendFunc, "optimized");
744
745        return blendFunc;
746    }
747};
748
749//////////////////////////////////////////////////////////////////////////
750/// @brief JITs from fetch shader IR
751/// @param hJitMgr - JitManager handle
752/// @param func   - LLVM function IR
753/// @return PFN_FETCH_FUNC - pointer to fetch code
754PFN_BLEND_JIT_FUNC JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc)
755{
756    const llvm::Function *func = (const llvm::Function*)hFunc;
757    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
758    PFN_BLEND_JIT_FUNC pfnBlend;
759    pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
760    // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
761    pJitMgr->mIsModuleFinalized = true;
762
763    return pfnBlend;
764}
765
766//////////////////////////////////////////////////////////////////////////
767/// @brief JIT compiles blend shader
768/// @param hJitMgr - JitManager handle
769/// @param state   - blend state to build function from
770extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, const BLEND_COMPILE_STATE& state)
771{
772    JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
773
774    pJitMgr->SetupNewModule();
775
776    BlendJit theJit(pJitMgr);
777    HANDLE hFunc = theJit.Create(state);
778
779    return JitBlendFunc(hJitMgr, hFunc);
780}
781