rsCpuIntrinsicColorMatrix.cpp revision 9e4a96af136dab5b21a37580d17cbcb89872114e
1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <sys/mman.h>
18#include <unistd.h>
19
20#include "rsCpuIntrinsic.h"
21#include "rsCpuIntrinsicInlines.h"
22#include "linkloader/include/MemChunk.h"
23
24#include <sys/mman.h>
25#include <stddef.h>
26#include <stdint.h>
27#include <stdlib.h>
28//#include <utils/StopWatch.h>
29
30
31/*  uint kernel
32 *  Q0  D0:  Load slot for R
33 *      D1:  Load slot for G
34 *  Q1  D2:  Load slot for B
35 *      D3:  Load slot for A
36 *  Q2  D4:  Matrix
37 *      D5:  =
38 *  Q3  D6:  =
39 *      D7:  =
40 *  Q4  D8:  Add R
41 *      D9:
42 *  Q5  D10: Add G
43 *      D11:
44 *  Q6  D12: Add B
45 *      D13:
46 *  Q7  D14: Add A
47 *      D15:
48 *  Q8  D16:  I32: R Sum
49 *      D17:
50 *  Q9  D18:  I32: G Sum
51 *      D19:
52 *  Q10 D20:  I32: B Sum
53 *      D21:
54 *  Q11 D22:  I32: A Sum
55 *      D23:
56 *  Q12 D24:  U16: expanded R
57 *      D25:
58 *  Q13 D26:  U16: expanded G
59 *      D27:
60 *  Q14 D28:  U16: expanded B
61 *      D29:
62 *  Q15 D30:  U16: expanded A
63 *      D31:
64 *
65 */
66
67/*  float kernel
68 *  Q0  D0:  Load slot for R
69 *      D1:  =
70 *  Q1  D2:  Load slot for G
71 *      D3:  =
72 *  Q2  D4:  Load slot for B
73 *      D5:  =
74 *  Q3  D6:  Load slot for A
75 *      D7:  =
76 *  Q4  D8:  Matrix
77 *      D9:  =
78 *  Q5  D10: =
79 *      D11: =
80 *  Q6  D12: =
81 *      D13: =
82 *  Q7  D14: =
83 *      D15: =
84 *  Q8  D16: Add R
85 *      D17: =
86 *  Q9  D18: Add G
87 *      D19: =
88 *  Q10 D20: Add B
89 *      D21: =
90 *  Q11 D22: Add A
91 *      D23: =
92 *  Q12 D24: Sum R
93 *      D25: =
94 *  Q13 D26: Sum G
95 *      D27: =
96 *  Q14 D28: Sum B
97 *      D29: =
98 *  Q15 D30: Sum A
99 *      D31: =
100 *
101 */
102
103
104
105using namespace android;
106using namespace android::renderscript;
107
108namespace android {
109namespace renderscript {
110
// Identifies one variant of the color-matrix kernel. computeKey() fills the
// bitfields in, build() reads them to choose what code to emit, and
// preLaunch() compares `key` with the previous value to decide whether the
// generated code has to be rebuilt.
typedef union {
    // Every bitfield below viewed as one integer; used to zero the key and
    // to compare two keys in a single operation.
    uint64_t key;
    struct {
        // Vector sizes are stored as (component count - 1).
        uint32_t inVecSize          :2;  // [0 - 1]
        uint32_t outVecSize         :2;  // [2 - 3]
        // Element types: RS_TYPE_FLOAT_32 for float data, left at 0 otherwise.
        // computeKey() asserts that RS_TYPE_FLOAT_32 fits in these 4 bits.
        uint32_t inType             :4;  // [4 - 7]
        uint32_t outType            :4;  // [8 - 11]
        // Set when the r,g,b columns of the matrix are equal and no add is needed.
        uint32_t dot                :1;  // [12]
        uint32_t _unused1           :1;  // [13]
        // Set when alpha passes through unchanged (integer in and out only).
        uint32_t copyAlpha          :1;  // [14]
        uint32_t _unused2           :1;  // [15]
        // Bit i is set when matrix coefficient i is needed (non-zero and not
        // masked off by the input/output vector sizes).
        uint32_t coeffMask          :16; // [16-31]
        // Bit j is set when output channel j needs its add term applied.
        uint32_t addMask            :4;  // [32-35]
    } u;
} Key_t;
126
// CPU implementation of the color-matrix intrinsic. Each launch multiplies
// every input element by a 4x4 matrix and adds a 4-component vector. When
// build() succeeds a generated kernel handles groups of four elements;
// remaining elements go through a scalar C++ path.
class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
public:
    // Publishes the number of exported variables (the matrix and the add vector).
    virtual void populateScript(Script *);

    // Slot 0 replaces the matrix (fp); slot 1 replaces the add vector (fpa).
    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);

    virtual ~RsdCpuScriptIntrinsicColorMatrix();
    RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);

    // Refreshes the derived coefficient tables and (re)builds the generated
    // kernel when the key for this launch differs from the previous one.
    virtual void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
                           const void * usr, uint32_t usrLen, const RsScriptCall *sc);
    // Nothing to do after a launch; the body is empty.
    virtual void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
                            const void * usr, uint32_t usrLen, const RsScriptCall *sc);

protected:
    // Matrix and add vector exactly as supplied through setGlobalVar().
    float fp[16];
    float fpa[4];

    // The following four fields are read as constants
    // by the SIMD assembly code.
    // kernel() hands `ip` to the generated code as its coefficient pointer.
    // NOTE(review): presumably the assembly reaches ipa/tmpFp/tmpFpa by fixed
    // offsets from ip, so the order and sizes below should not be changed
    // without checking the assembly source.
    short ip[16];       // fp scaled by 256 (see updateCoeffCache)
    int ipa[16];        // fpa scaled by 65536 (see updateCoeffCache)
    float tmpFp[16];    // fp multiplied by the per-launch fpMul
    float tmpFpa[16];   // scaled add terms, each repeated in four consecutive slots

    // Per-row worker installed as mRootPtr by setGlobalVar().
    static void kernel(const RsForEachStubParamStruct *p,
                       uint32_t xstart, uint32_t xend,
                       uint32_t instep, uint32_t outstep);
    // Recomputes ip, ipa, tmpFp and tmpFpa from fp and fpa.
    void updateCoeffCache(float fpMul, float addMul);

    // Key of the most recently stored launch configuration; kernel() reads
    // the vector sizes and element types from it.
    Key_t mLastKey;
    // Memory holding the generated kernel, and its size in bytes.
    unsigned char *mBuf;
    size_t mBufSize;

    // Derives the key for an input/output element pair.
    Key_t computeKey(const Element *ein, const Element *eout);

    // Generates the kernel for `key` into a fresh mBuf; false when unavailable.
    bool build(Key_t key);

    // Entry point of the generated kernel, or NULL when none is available.
    void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count);

};
168
169}
170}
171
172
173Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey(
174        const Element *ein, const Element *eout) {
175
176    Key_t key;
177    key.key = 0;
178
179    // Compute a unique code key for this operation
180
181    // Add to the key the input and output types
182    bool hasFloat = false;
183    if (ein->getType() == RS_TYPE_FLOAT_32) {
184        hasFloat = true;
185        key.u.inType = RS_TYPE_FLOAT_32;
186        rsAssert(key.u.inType == RS_TYPE_FLOAT_32);
187    }
188    if (eout->getType() == RS_TYPE_FLOAT_32) {
189        hasFloat = true;
190        key.u.outType = RS_TYPE_FLOAT_32;
191        rsAssert(key.u.outType == RS_TYPE_FLOAT_32);
192    }
193
194    // Mask in the bits indicating which coefficients in the
195    // color matrix are needed.
196    if (hasFloat) {
197        for (uint32_t i=0; i < 16; i++) {
198            if (fabs(fp[i]) != 0.f) {
199                key.u.coeffMask |= 1 << i;
200            }
201        }
202        if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1;
203        if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2;
204        if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4;
205        if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8;
206
207    } else {
208        for (uint32_t i=0; i < 16; i++) {
209            if (ip[i] != 0) {
210                key.u.coeffMask |= 1 << i;
211            }
212        }
213        if (ipa[0] != 0) key.u.addMask |= 0x1;
214        if (ipa[4] != 0) key.u.addMask |= 0x2;
215        if (ipa[8] != 0) key.u.addMask |= 0x4;
216        if (ipa[12] != 0) key.u.addMask |= 0x8;
217    }
218
219    // Look for a dot product where the r,g,b colums are the same
220    if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
221        (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
222        (ip[8] == ip[9]) && (ip[8] == ip[10]) &&
223        (ip[12] == ip[13]) && (ip[12] == ip[14])) {
224
225        if (!key.u.addMask) key.u.dot = 1;
226    }
227
228    // Is alpha a simple copy
229    if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) {
230        key.u.copyAlpha = !(key.u.inType || key.u.outType);
231    }
232
233    //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
234
235    switch (ein->getVectorSize()) {
236    case 4:
237        key.u.inVecSize = 3;
238        break;
239    case 3:
240        key.u.inVecSize = 2;
241        key.u.coeffMask &= ~0xF000;
242        break;
243    case 2:
244        key.u.inVecSize = 1;
245        key.u.coeffMask &= ~0xFF00;
246        break;
247    default:
248        key.u.coeffMask &= ~0xFFF0;
249        break;
250    }
251
252    switch (eout->getVectorSize()) {
253    case 4:
254        key.u.outVecSize = 3;
255        break;
256    case 3:
257        key.u.outVecSize = 2;
258        key.u.coeffMask &= ~0x8888;
259        break;
260    case 2:
261        key.u.outVecSize = 1;
262        key.u.coeffMask &= ~0xCCCC;
263        break;
264    default:
265        key.u.coeffMask &= ~0xEEEE;
266        break;
267    }
268
269    if (key.u.inType && !key.u.outType) {
270        key.u.addMask |= 1;
271        if (key.u.outVecSize > 0) key.u.addMask |= 2;
272        if (key.u.outVecSize > 1) key.u.addMask |= 4;
273        if (key.u.outVecSize > 2) key.u.addMask |= 8;
274    }
275
276    //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
277    return key;
278}
279
280#if defined(ARCH_ARM_HAVE_NEON)
281
// Declares the three external C-linkage symbols that describe one code chunk:
// the chunk itself, a matching _end symbol, and a _len symbol. ADD_CHUNK
// copies from the address of the first and uses the value of _len as the
// byte count; the _end symbol is not referenced in this file.
// NOTE(review): the symbols are presumably defined in the accompanying NEON
// assembly source - confirm there.
#define DEF_SYM(x)                                  \
    extern "C" uint32_t _N_ColorMatrix_##x;      \
    extern "C" uint32_t _N_ColorMatrix_##x##_end;  \
    extern "C" uint32_t _N_ColorMatrix_##x##_len;

// Function prologues (integer / float pipeline) and the two epilogue pieces
// that build() places around the loop branch.
DEF_SYM(prefix_i)
DEF_SYM(prefix_f)
DEF_SYM(postfix1)
DEF_SYM(postfix2)

// Loads, by input kind and component count: u8 for the integer pipeline,
// u8f for u8 input to the float pipeline, f32 for float input.
DEF_SYM(load_u8_4)
DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)
DEF_SYM(load_f32_4)
DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)

// Stores, by output kind and component count. There is no 3-component
// variant for store_u8 or store_f32u; build() uses the _4 variant for
// 3-component output in those cases.
DEF_SYM(store_u8_4)
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)
DEF_SYM(store_f32_4)
DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
DEF_SYM(store_f32u_2)
DEF_SYM(store_f32u_1)

// Integer pipeline helpers used by build() after the load and before the store.
DEF_SYM(unpack_u8_4)
DEF_SYM(unpack_u8_3)
DEF_SYM(unpack_u8_2)
DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)
DEF_SYM(pack_u8_3)
DEF_SYM(pack_u8_2)
DEF_SYM(pack_u8_1)
DEF_SYM(dot)
// Declared here but not referenced by the code generator in this file.
DEF_SYM(add_0_u8)
DEF_SYM(add_1_u8)
DEF_SYM(add_2_u8)
DEF_SYM(add_3_u8)
329
// Appends the pre-assembled chunk `x` to the code being generated: copies
// _N_ColorMatrix_<x>_len bytes starting at _N_ColorMatrix_<x> to `buf` and
// advances `buf` past them. Expects a byte pointer named `buf` in scope.
// Wrapped in do { } while (0) so that the two statements always act as one,
// including under an unbraced if/else.
#define ADD_CHUNK(x) \
    do { \
        memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
        buf += _N_ColorMatrix_##x##_len; \
    } while (0)
333
334
335static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
336    size_t off = (target - buf - 8) >> 2;
337    rsAssert(((off & 0xff000000) == 0) ||
338           ((off & 0xff000000) == 0xff000000));
339
340    uint32_t op = (condition << 28);
341    op |= 0xa << 24;  // branch
342    op |= 0xffffff & off;
343    ((uint32_t *)buf)[0] = op;
344    return buf + 4;
345}
346
347static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
348    rsAssert(vd < 32);
349    rsAssert(vm < 32);
350    rsAssert(vn < 32);
351
352    uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
353    op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
354    op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
355    return op;
356}
357
358static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
359    //vmlal.s16 Q#1, D#1, D#2[#]
360    uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
361    ((uint32_t *)buf)[0] = op;
362    return buf + 4;
363}
364
365static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
366    //vmull.s16 Q#1, D#1, D#2[#]
367    uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
368    ((uint32_t *)buf)[0] = op;
369    return buf + 4;
370}
371
372static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
373    //vqadd.s32 Q#1, D#1, D#2
374    uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
375    ((uint32_t *)buf)[0] = op;
376    return buf + 4;
377}
378
379static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
380    //vmlal.f32 Q#1, D#1, D#2[#]
381    uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
382    ((uint32_t *)buf)[0] = op;
383    return buf + 4;
384}
385
386static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
387    //vmull.f32 Q#1, D#1, D#2[#]
388    uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
389    ((uint32_t *)buf)[0] = op;
390    return buf + 4;
391}
392
393static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
394    //vadd.f32 Q#1, D#1, D#2
395    uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
396    ((uint32_t *)buf)[0] = op;
397    return buf + 4;
398}
399
400static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
401    //vadd.f32 Q#1, D#1, D#2
402    uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
403    ((uint32_t *)buf)[0] = op;
404    return buf + 4;
405}
406#endif
407
408
409bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
410#if defined(ARCH_ARM_HAVE_NEON)
411    mBufSize = 4096;
412    //StopWatch build_time("rs cm: build time");
413    mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
414                                  MAP_PRIVATE | MAP_ANON, -1, 0);
415    if (!mBuf) {
416        return false;
417    }
418
419    uint8_t *buf = mBuf;
420    uint8_t *buf2 = NULL;
421
422    int ops[5][4];  // 0=unused, 1 = set, 2 = accumulate, 3 = final
423    int opInit[4] = {0, 0, 0, 0};
424
425    memset(ops, 0, sizeof(ops));
426    for (int i=0; i < 4; i++) {
427        if (key.u.coeffMask & (1 << (i*4))) {
428            ops[i][0] = 0x2 | opInit[0];
429            opInit[0] = 1;
430        }
431        if (!key.u.dot) {
432            if (key.u.coeffMask & (1 << (1 + i*4))) {
433                ops[i][1] = 0x2 | opInit[1];
434                opInit[1] = 1;
435            }
436            if (key.u.coeffMask & (1 << (2 + i*4))) {
437                ops[i][2] = 0x2 | opInit[2];
438                opInit[2] = 1;
439            }
440        }
441        if (!key.u.copyAlpha) {
442            if (key.u.coeffMask & (1 << (3 + i*4))) {
443                ops[i][3] = 0x2 | opInit[3];
444                opInit[3] = 1;
445            }
446        }
447    }
448
449    if (key.u.inType || key.u.outType) {
450        key.u.copyAlpha = 0;
451        ADD_CHUNK(prefix_f);
452        buf2 = buf;
453
454        // Load the incoming r,g,b,a as needed
455        if (key.u.inType) {
456            switch(key.u.inVecSize) {
457            case 3:
458                ADD_CHUNK(load_f32_4);
459                break;
460            case 2:
461                ADD_CHUNK(load_f32_3);
462                break;
463            case 1:
464                ADD_CHUNK(load_f32_2);
465                break;
466            case 0:
467                ADD_CHUNK(load_f32_1);
468                break;
469            }
470        } else {
471            switch(key.u.inVecSize) {
472            case 3:
473                ADD_CHUNK(load_u8f_4);
474                break;
475            case 2:
476                ADD_CHUNK(load_u8f_3);
477                break;
478            case 1:
479                ADD_CHUNK(load_u8f_2);
480                break;
481            case 0:
482                ADD_CHUNK(load_u8f_1);
483                break;
484            }
485        }
486
487        for (int i=0; i < 4; i++) {
488            for (int j=0; j < 4; j++) {
489                switch(ops[i][j]) {
490                case 0:
491                    break;
492                case 2:
493                    buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
494                    break;
495                case 3:
496                    buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
497                    break;
498                }
499            }
500        }
501        for (int j=0; j < 4; j++) {
502            if (opInit[j]) {
503                if (key.u.addMask & (1 << j)) {
504                    buf = addVADD_F32(buf, j, 12+j, 8+j);
505                } else {
506                    buf = addVORR_32(buf, j, 12+j, 12+j);
507                }
508            } else {
509                if (key.u.addMask & (1 << j)) {
510                    buf = addVADD_F32(buf, j, j, 8+j);
511                }
512            }
513        }
514
515        if (key.u.outType) {
516            switch(key.u.outVecSize) {
517            case 3:
518                ADD_CHUNK(store_f32_4);
519                break;
520            case 2:
521                ADD_CHUNK(store_f32_3);
522                break;
523            case 1:
524                ADD_CHUNK(store_f32_2);
525                break;
526            case 0:
527                ADD_CHUNK(store_f32_1);
528                break;
529            }
530        } else {
531            switch(key.u.outVecSize) {
532            case 3:
533            case 2:
534                ADD_CHUNK(store_f32u_4);
535                break;
536            case 1:
537                ADD_CHUNK(store_f32u_2);
538                break;
539            case 0:
540                ADD_CHUNK(store_f32u_1);
541                break;
542            }
543        }
544
545
546    } else {
547        // Add the function prefix
548        // Store the address for the loop return
549        ADD_CHUNK(prefix_i);
550        buf2 = buf;
551
552        // Load the incoming r,g,b,a as needed
553        switch(key.u.inVecSize) {
554        case 3:
555            ADD_CHUNK(load_u8_4);
556            if (key.u.copyAlpha) {
557                ADD_CHUNK(unpack_u8_3);
558            } else {
559                ADD_CHUNK(unpack_u8_4);
560            }
561            break;
562        case 2:
563            ADD_CHUNK(load_u8_3);
564            ADD_CHUNK(unpack_u8_3);
565            break;
566        case 1:
567            ADD_CHUNK(load_u8_2);
568            ADD_CHUNK(unpack_u8_2);
569            break;
570        case 0:
571            ADD_CHUNK(load_u8_1);
572            ADD_CHUNK(unpack_u8_1);
573            break;
574        }
575
576        // Add multiply and accumulate
577        // use MULL to init the output register,
578        // use MLAL from there
579        for (int i=0; i < 4; i++) {
580            for (int j=0; j < 4; j++) {
581                switch(ops[i][j]) {
582                case 0:
583                    break;
584                case 2:
585                    buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
586                    break;
587                case 3:
588                    buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
589                    break;
590                }
591            }
592        }
593        for (int j=0; j < 4; j++) {
594            if (key.u.addMask & (1 << j)) {
595                buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
596            }
597        }
598
599        // If we have a dot product, perform the special pack.
600        if (key.u.dot) {
601            ADD_CHUNK(pack_u8_1);
602            ADD_CHUNK(dot);
603        } else {
604            switch(key.u.outVecSize) {
605            case 3:
606                if (key.u.copyAlpha) {
607                    ADD_CHUNK(pack_u8_3);
608                } else {
609                    ADD_CHUNK(pack_u8_4);
610                }
611                break;
612            case 2:
613                ADD_CHUNK(pack_u8_3);
614                break;
615            case 1:
616                ADD_CHUNK(pack_u8_2);
617                break;
618            case 0:
619                ADD_CHUNK(pack_u8_1);
620                break;
621            }
622        }
623
624        // Write out result
625        switch(key.u.outVecSize) {
626        case 3:
627        case 2:
628            ADD_CHUNK(store_u8_4);
629            break;
630        case 1:
631            ADD_CHUNK(store_u8_2);
632            break;
633        case 0:
634            ADD_CHUNK(store_u8_1);
635            break;
636        }
637    }
638
639    if (key.u.inType != key.u.outType) {
640        key.u.copyAlpha = 0;
641        key.u.dot = 0;
642    }
643
644    // Loop, branch, and cleanup
645    ADD_CHUNK(postfix1);
646    buf = addBranch(buf, buf2, 0x01);
647    ADD_CHUNK(postfix2);
648
649    int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
650    if (ret == -1) {
651        ALOGE("mprotect error %i", ret);
652        return false;
653    }
654
655    cacheflush((long)mBuf, (long)mBuf + mBufSize, 0);
656    return true;
657#else
658    return false;
659#endif
660}
661
662void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float adMul) {
663    for(int ct=0; ct < 16; ct++) {
664        ip[ct] = (short)(fp[ct] * 256.f + 0.5f);
665        tmpFp[ct] = fp[ct] * fpMul;
666        //ALOGE("mat %i %f  %f", ct, fp[ct], tmpFp[ct]);
667    }
668
669    float ad = 0.f;
670    if (fpMul > 254.f) ad = 0.5f;
671    for(int ct=0; ct < 4; ct++) {
672        tmpFpa[ct * 4 + 0] = fpa[ct] * adMul + ad;
673        //ALOGE("fpa %i %f  %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
674        tmpFpa[ct * 4 + 1] = tmpFpa[ct * 4];
675        tmpFpa[ct * 4 + 2] = tmpFpa[ct * 4];
676        tmpFpa[ct * 4 + 3] = tmpFpa[ct * 4];
677    }
678
679    for(int ct=0; ct < 16; ct++) {
680        ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f);
681    }
682
683}
684
685void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
686                                                    size_t dataLength) {
687    switch(slot) {
688    case 0:
689        memcpy (fp, data, sizeof(fp));
690        break;
691    case 1:
692        memcpy (fpa, data, sizeof(fpa));
693        break;
694    default:
695        rsAssert(0);
696        break;
697    }
698    mRootPtr = &kernel;
699}
700
701
702static void One(const RsForEachStubParamStruct *p, void *out,
703                const void *py, const float* coeff, const float *add,
704                uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
705
706    float4 f = 0.f;
707    if (fin) {
708        switch(vsin) {
709        case 3:
710            f = ((const float4 *)py)[0];
711            break;
712        case 2:
713            f = ((const float4 *)py)[0];
714            f.w = 0.f;
715            break;
716        case 1:
717            f.xy = ((const float2 *)py)[0];
718            break;
719        case 0:
720            f.x = ((const float *)py)[0];
721            break;
722        }
723    } else {
724        switch(vsin) {
725        case 3:
726            f = convert_float4(((const uchar4 *)py)[0]);
727            break;
728        case 2:
729            f = convert_float4(((const uchar4 *)py)[0]);
730            f.w = 0.f;
731            break;
732        case 1:
733            f.xy = convert_float2(((const uchar2 *)py)[0]);
734            break;
735        case 0:
736            f.x = (float)(((const uchar *)py)[0]);
737            break;
738        }
739    }
740    //ALOGE("f1  %f %f %f %f", f.x, f.y, f.z, f.w);
741
742    float4 sum;
743    sum.x = f.x * coeff[0] +
744            f.y * coeff[4] +
745            f.z * coeff[8] +
746            f.w * coeff[12];
747    sum.y = f.x * coeff[1] +
748            f.y * coeff[5] +
749            f.z * coeff[9] +
750            f.w * coeff[13];
751    sum.z = f.x * coeff[2] +
752            f.y * coeff[6] +
753            f.z * coeff[10] +
754            f.w * coeff[14];
755    sum.w = f.x * coeff[3] +
756            f.y * coeff[7] +
757            f.z * coeff[11] +
758            f.w * coeff[15];
759    //ALOGE("f2  %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
760
761    sum.x += add[0];
762    sum.y += add[1];
763    sum.z += add[2];
764    sum.w += add[3];
765
766
767    //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
768    if (fout) {
769        switch(vsout) {
770        case 3:
771        case 2:
772            ((float4 *)out)[0] = sum;
773            break;
774        case 1:
775            ((float2 *)out)[0] = sum.xy;
776            break;
777        case 0:
778            ((float *)out)[0] = sum.x;
779            break;
780        }
781    } else {
782        sum += 0.5f;
783        sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
784        sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
785        sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
786        sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
787
788        switch(vsout) {
789        case 3:
790        case 2:
791            ((uchar4 *)out)[0] = convert_uchar4(sum);
792            break;
793        case 1:
794            ((uchar2 *)out)[0] = convert_uchar2(sum.xy);
795            break;
796        case 0:
797            ((uchar *)out)[0] = sum.x;
798            break;
799        }
800    }
801    //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
802}
803
804void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsForEachStubParamStruct *p,
805                                              uint32_t xstart, uint32_t xend,
806                                              uint32_t instep, uint32_t outstep) {
807    RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
808    uchar *out = (uchar *)p->out;
809    uchar *in = (uchar *)p->in;
810    uint32_t x1 = xstart;
811    uint32_t x2 = xend;
812
813    uint32_t vsin = cp->mLastKey.u.inVecSize;
814    uint32_t vsout = cp->mLastKey.u.outVecSize;
815    bool floatIn = !!cp->mLastKey.u.inType;
816    bool floatOut = !!cp->mLastKey.u.outType;
817
818    //if (!p->y) ALOGE("steps %i %i   %i %i", instep, outstep, vsin, vsout);
819
820    if(x2 > x1) {
821        int32_t len = (x2 - x1) >> 2;
822        if((cp->mOptKernel != NULL) && (len > 0)) {
823            cp->mOptKernel(out, in, cp->ip, len);
824            x1 += len << 2;
825            out += outstep * (len << 2);
826            in += instep * (len << 2);
827        }
828
829        while(x1 != x2) {
830            One(p, out, in, cp->tmpFp, cp->fpa, vsin, vsout, floatIn, floatOut);
831            out += outstep;
832            in += instep;
833            x1++;
834        }
835    }
836}
837
838void RsdCpuScriptIntrinsicColorMatrix::preLaunch(
839        uint32_t slot, const Allocation * ain, Allocation * aout,
840        const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
841
842    const Element *ein = ain->mHal.state.type->getElement();
843    const Element *eout = aout->mHal.state.type->getElement();
844
845    if (ein->getType() == eout->getType()) {
846        updateCoeffCache(1.f, 1.f);
847    } else {
848        if (eout->getType() == RS_TYPE_UNSIGNED_8) {
849            updateCoeffCache(255.f, 255.f);
850        } else {
851            updateCoeffCache(1.f / 255.f, 1.f);
852        }
853    }
854
855    Key_t key = computeKey(ain->mHal.state.type->getElement(),
856                           aout->mHal.state.type->getElement());
857    if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
858        if (mBuf) munmap(mBuf, mBufSize);
859        mBuf = NULL;
860        mOptKernel = NULL;
861        if (build(key)) {
862            mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
863            mLastKey = key;
864        }
865    }
866}
867
// Post-launch hook. This intrinsic has nothing to do after a launch, so the
// body is empty and every argument is ignored.
void RsdCpuScriptIntrinsicColorMatrix::postLaunch(
        uint32_t slot, const Allocation * ain, Allocation * aout,
        const void * usr, uint32_t usrLen, const RsScriptCall *sc) {

}
873
874RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
875            RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
876            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
877
878    mLastKey.key = 0;
879    mBuf = NULL;
880    mBufSize = 0;
881    mOptKernel = NULL;
882    const static float defaultMatrix[] = {
883        1.f, 0.f, 0.f, 0.f,
884        0.f, 1.f, 0.f, 0.f,
885        0.f, 0.f, 1.f, 0.f,
886        0.f, 0.f, 0.f, 1.f
887    };
888    const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f};
889    setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));
890    setGlobalVar(1, defaultAdd, sizeof(defaultAdd));
891}
892
893RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
894    if (mBuf) munmap(mBuf, mBufSize);
895    mBuf = NULL;
896    mOptKernel = NULL;
897}
898
// Publishes the number of exported variables on the script: two, matching
// the slots accepted by setGlobalVar() (0 = matrix, 1 = add vector).
void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
    s->mHal.info.exportedVariableCount = 2;
}
902
// Factory for the color-matrix intrinsic. Returns a newly allocated
// RsdCpuScriptIntrinsicColorMatrix built from the given arguments.
// NOTE(review): the caller presumably takes ownership of the returned
// object - confirm at the call site.
RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
                                            const Script *s, const Element *e) {

    return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e);
}
908
909
910
911