1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <sys/mman.h>
18#include <unistd.h>
19
20#include "rsCpuIntrinsic.h"
21#include "rsCpuIntrinsicInlines.h"
22
23#include <sys/mman.h>
24#include <stddef.h>
25#include <stdint.h>
26#include <stdlib.h>
27//#include <utils/StopWatch.h>
28
29
30/*  uint kernel
31 *  Q0  D0:  Load slot for R
32 *      D1:  Load slot for G
33 *  Q1  D2:  Load slot for B
34 *      D3:  Load slot for A
35 *  Q2  D4:  Matrix
36 *      D5:  =
37 *  Q3  D6:  =
38 *      D7:  =
39 *  Q4  D8:  Add R
40 *      D9:
41 *  Q5  D10: Add G
42 *      D11:
43 *  Q6  D12: Add B
44 *      D13:
45 *  Q7  D14: Add A
46 *      D15:
47 *  Q8  D16:  I32: R Sum
48 *      D17:
49 *  Q9  D18:  I32: G Sum
50 *      D19:
51 *  Q10 D20:  I32: B Sum
52 *      D21:
53 *  Q11 D22:  I32: A Sum
54 *      D23:
55 *  Q12 D24:  U16: expanded R
56 *      D25:
57 *  Q13 D26:  U16: expanded G
58 *      D27:
59 *  Q14 D28:  U16: expanded B
60 *      D29:
61 *  Q15 D30:  U16: expanded A
62 *      D31:
63 *
64 */
65
66/*  float kernel
67 *  Q0  D0:  Load slot for R
68 *      D1:  =
69 *  Q1  D2:  Load slot for G
70 *      D3:  =
71 *  Q2  D4:  Load slot for B
72 *      D5:  =
73 *  Q3  D6:  Load slot for A
74 *      D7:  =
75 *  Q4  D8:  Matrix
76 *      D9:  =
77 *  Q5  D10: =
78 *      D11: =
79 *  Q6  D12: =
80 *      D13: =
81 *  Q7  D14: =
82 *      D15: =
83 *  Q8  D16: Add R
84 *      D17: =
85 *  Q9  D18: Add G
86 *      D19: =
87 *  Q10 D20: Add B
88 *      D21: =
89 *  Q11 D22: Add A
90 *      D23: =
91 *  Q12 D24: Sum R
92 *      D25: =
93 *  Q13 D26: Sum G
94 *      D27: =
95 *  Q14 D28: Sum B
96 *      D29: =
97 *  Q15 D30: Sum A
98 *      D31: =
99 *
100 */
101
102
103
104using namespace android;
105using namespace android::renderscript;
106
107namespace android {
108namespace renderscript {
109
// Compact description of one color-matrix operation.  The individual
// properties live in the bit-fields of `u`; `key` overlays them so that two
// descriptions can be compared (or cleared) with a single integer operation.
union Key_t {
    uint64_t key;
    struct {
        // Input / output vector size minus one (0 = scalar ... 3 = 4 lanes).
        uint32_t inVecSize          :2;  // [0 - 1]
        uint32_t outVecSize         :2;  // [2 - 3]
        // Element type, only ever set to RS_TYPE_FLOAT_32; zero means "not float".
        uint32_t inType             :4;  // [4 - 7]
        uint32_t outType            :4;  // [8 - 11]
        // Set when the r, g and b matrix columns match and nothing is added.
        uint32_t dot                :1;  // [12]
        uint32_t _unused1           :1;  // [13]
        // Set when alpha passes through unchanged (integer kernels only).
        uint32_t copyAlpha          :1;  // [14]
        uint32_t _unused2           :1;  // [15]
        // Bit i is set when matrix coefficient i is non-zero.
        uint32_t coeffMask          :16; // [16-31]
        // Bit j is set when output channel j needs its add term applied.
        uint32_t addMask            :4;  // [32-35]
    } u;
};
125
126//Re-enable when intrinsic is fixed
#if defined(ARCH_ARM64_USE_INTRINSICS)
// Table of code-fragment entry points shared with the ARM64 assembly below.
// It is filled in by the rsdIntrinsicColorMatrixSetup_*_K routines and then
// handed, read-only, to the matching rsdIntrinsicColorMatrix_*_K kernel.
// NOTE(review): the field order is presumably part of the contract with the
// assembly side -- confirm before reordering or resizing anything here.
typedef struct {
    void (*column[4])(void);
    void (*store)(void);
    void (*load)(void);
    void (*store_end)(void);
    void (*load_end)(void);
} FunctionTab_t;

// Integer kernel: processes `count` elements from `in` to `out` using the
// fragments in `fns`, 16-bit matrix coefficients `mult` and 32-bit add
// terms `add` (kernel() passes the class's ip[] and ipa[] arrays).
extern "C" void rsdIntrinsicColorMatrix_int_K(
             void *out, void const *in, size_t count,
             FunctionTab_t const *fns,
             int16_t const *mult, int32_t const *add);

// Float kernel: same shape as the integer kernel but with float coefficients
// and add terms.  The only call site in this file is currently commented out.
extern "C" void rsdIntrinsicColorMatrix_float_K(
             void *out, void const *in, size_t count,
             FunctionTab_t const *fns,
             float const *mult, float const *add);

/* The setup functions fill in function tables to be used by above functions;
 * this code also eliminates jump-to-another-jump cases by short-circuiting
 * empty functions.  While it's not performance critical, it works out easier
 * to write the set-up code in assembly than to try to expose the same symbols
 * and write the code in C.
 *
 * preLaunch() builds `mask` as four 5-bit groups (one per output channel:
 * four coefficient-present bits plus one add-present bit) and passes
 * `dt` / `st` as (vector size - 1), plus 4 when the destination / source
 * element type is float.
 */
extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
             FunctionTab_t *fns,
             uint32_t mask, int dt, int st);

extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
             FunctionTab_t *fns,
             uint32_t mask, int dt, int st);
#endif
160
// CPU implementation of the color-matrix intrinsic: multiplies each input
// element by a 4x4 matrix and adds a 4-element offset.  Depending on the
// build configuration the work is done by run-time generated ARM code,
// ARM64 assembly kernels, or the portable per-element fallback.
class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
public:
    // Reports the two exported variables (matrix and add vector).
    void populateScript(Script *) override;

    // Slot 0 receives the 4x4 float matrix, slot 1 the 4-float add vector.
    void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;

    ~RsdCpuScriptIntrinsicColorMatrix() override;
    RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);

    // Refreshes the coefficient caches for the launch's element types and
    // (re)selects the optimized kernel when the operation key changed.
    void preLaunch(uint32_t slot, const Allocation ** ains,
                   uint32_t inLen, Allocation * aout, const void * usr,
                   uint32_t usrLen, const RsScriptCall *sc) override;

protected:
    // Matrix and add vector exactly as supplied through setGlobalVar().
    float fp[16];
    float fpa[4];

    // The following four fields are read as constants
    // by the SIMD assembly code.
    short ip[16];       // fp scaled by 256 (fixed point), see updateCoeffCache()
    int ipa[4];         // fpa scaled by 65536 (fixed point)
    float tmpFp[16];    // fp scaled for the current in/out type combination
    float tmpFpa[4];    // fpa scaled (and rounding-biased) for the current output type
#if defined(ARCH_ARM64_USE_INTRINSICS)
    // Fragment table handed to the ARM64 assembly kernels.
    FunctionTab_t mFnTab;
#endif

    // Per-row worker installed as the script's root function.
    static void kernel(const RsExpandKernelDriverInfo *info,
                       uint32_t xstart, uint32_t xend,
                       uint32_t outstep);
    // Recomputes ip, ipa, tmpFp and tmpFpa from fp and fpa.
    void updateCoeffCache(float fpMul, float addMul);

    // Key describing the operation the current kernel state was set up for.
    Key_t mLastKey;
    // mmap'ed buffer holding generated code (null when none), and its size.
    unsigned char *mBuf;
    size_t mBufSize;

    // Summarizes element types, vector sizes and non-zero coefficients.
    Key_t computeKey(const Element *ein, const Element *eout);

    // Generates machine code for `key` into mBuf; returns false when no code
    // was produced (always the case outside the 32-bit ARM configuration).
    bool build(Key_t key);

    // Optimized kernel entry point, or null to use the generic path.
    void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count);

};
204
205}
206}
207
208
209Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey(
210        const Element *ein, const Element *eout) {
211
212    Key_t key;
213    key.key = 0;
214
215    // Compute a unique code key for this operation
216
217    // Add to the key the input and output types
218    bool hasFloat = false;
219    if (ein->getType() == RS_TYPE_FLOAT_32) {
220        hasFloat = true;
221        key.u.inType = RS_TYPE_FLOAT_32;
222        rsAssert(key.u.inType == RS_TYPE_FLOAT_32);
223    }
224    if (eout->getType() == RS_TYPE_FLOAT_32) {
225        hasFloat = true;
226        key.u.outType = RS_TYPE_FLOAT_32;
227        rsAssert(key.u.outType == RS_TYPE_FLOAT_32);
228    }
229
230    // Mask in the bits indicating which coefficients in the
231    // color matrix are needed.
232    if (hasFloat) {
233        for (uint32_t i=0; i < 16; i++) {
234            if (fabs(fp[i]) != 0.f) {
235                key.u.coeffMask |= 1 << i;
236            }
237        }
238        if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1;
239        if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2;
240        if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4;
241        if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8;
242
243    } else {
244        for (uint32_t i=0; i < 16; i++) {
245            if (ip[i] != 0) {
246                key.u.coeffMask |= 1 << i;
247            }
248        }
249        if (ipa[0] != 0) key.u.addMask |= 0x1;
250        if (ipa[1] != 0) key.u.addMask |= 0x2;
251        if (ipa[2] != 0) key.u.addMask |= 0x4;
252        if (ipa[3] != 0) key.u.addMask |= 0x8;
253    }
254
255    // Look for a dot product where the r,g,b colums are the same
256    if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
257        (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
258        (ip[8] == ip[9]) && (ip[8] == ip[10]) &&
259        (ip[12] == ip[13]) && (ip[12] == ip[14])) {
260
261        if (!key.u.addMask) key.u.dot = 1;
262    }
263
264    // Is alpha a simple copy
265    if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) {
266        key.u.copyAlpha = !(key.u.inType || key.u.outType);
267    }
268
269    //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
270
271    switch (ein->getVectorSize()) {
272    case 4:
273        key.u.inVecSize = 3;
274        break;
275    case 3:
276        key.u.inVecSize = 2;
277        key.u.coeffMask &= ~0xF000;
278        break;
279    case 2:
280        key.u.inVecSize = 1;
281        key.u.coeffMask &= ~0xFF00;
282        break;
283    default:
284        key.u.coeffMask &= ~0xFFF0;
285        break;
286    }
287
288    switch (eout->getVectorSize()) {
289    case 4:
290        key.u.outVecSize = 3;
291        break;
292    case 3:
293        key.u.outVecSize = 2;
294        key.u.coeffMask &= ~0x8888;
295        key.u.addMask &= 7;
296        break;
297    case 2:
298        key.u.outVecSize = 1;
299        key.u.coeffMask &= ~0xCCCC;
300        key.u.addMask &= 3;
301        break;
302    default:
303        key.u.coeffMask &= ~0xEEEE;
304        key.u.addMask &= 1;
305        break;
306    }
307
308    if (key.u.inType && !key.u.outType) {
309        key.u.addMask |= 1;
310        if (key.u.outVecSize > 0) key.u.addMask |= 2;
311        if (key.u.outVecSize > 1) key.u.addMask |= 4;
312        if (key.u.outVecSize > 2) key.u.addMask |= 8;
313    }
314
315    //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
316    return key;
317}
318
319#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
320
// Declares the three linker symbols that describe one pre-assembled code
// fragment named `x`: its start, its end, and a word holding its length.
// build() copies `_len` bytes starting at the first symbol.
#define DEF_SYM(x)                                  \
    extern "C" uint32_t _N_ColorMatrix_##x;      \
    extern "C" uint32_t _N_ColorMatrix_##x##_end;  \
    extern "C" uint32_t _N_ColorMatrix_##x##_len;

// Function prologues (integer / float) and the two epilogue pieces that
// build() places around the loop-back branch.
DEF_SYM(prefix_i)
DEF_SYM(prefix_f)
DEF_SYM(postfix1)
DEF_SYM(postfix2)

// Loads, by element kind and vector size.
DEF_SYM(load_u8_4)
DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)
DEF_SYM(load_f32_4)
DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)

// Stores.  There is no 3-wide u8 / f32u variant; build() uses the 4-wide
// fragment for 3-element outputs of those kinds.
DEF_SYM(store_u8_4)
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)
DEF_SYM(store_f32_4)
DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
DEF_SYM(store_f32u_2)
DEF_SYM(store_f32u_1)

// Integer-path helpers.  Only the unpack_*, pack_* and dot fragments are
// referenced by build(); the add_*_u8 symbols are declared but not used in
// this file.
DEF_SYM(unpack_u8_4)
DEF_SYM(unpack_u8_3)
DEF_SYM(unpack_u8_2)
DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)
DEF_SYM(pack_u8_3)
DEF_SYM(pack_u8_2)
DEF_SYM(pack_u8_1)
DEF_SYM(dot)
DEF_SYM(add_0_u8)
DEF_SYM(add_1_u8)
DEF_SYM(add_2_u8)
DEF_SYM(add_3_u8)

// Appends fragment `x` to the code being generated and advances the local
// write cursor `buf`.  Expands to two statements, so it must only be used
// where a plain statement sequence is valid (e.g. inside braces).
#define ADD_CHUNK(x) \
    memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
    buf += _N_ColorMatrix_##x##_len

373
374static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
375    size_t off = (target - buf - 8) >> 2;
376    rsAssert(((off & 0xff000000) == 0) ||
377           ((off & 0xff000000) == 0xff000000));
378
379    uint32_t op = (condition << 28);
380    op |= 0xa << 24;  // branch
381    op |= 0xffffff & off;
382    ((uint32_t *)buf)[0] = op;
383    return buf + 4;
384}
385
386static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
387    rsAssert(vd < 32);
388    rsAssert(vm < 32);
389    rsAssert(vn < 32);
390
391    uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
392    op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
393    op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
394    return op;
395}
396
397static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
398    //vmlal.s16 Q#1, D#1, D#2[#]
399    uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
400    ((uint32_t *)buf)[0] = op;
401    return buf + 4;
402}
403
404static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
405    //vmull.s16 Q#1, D#1, D#2[#]
406    uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
407    ((uint32_t *)buf)[0] = op;
408    return buf + 4;
409}
410
411static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
412    //vqadd.s32 Q#1, Q#1, Q#2
413    uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
414    ((uint32_t *)buf)[0] = op;
415    return buf + 4;
416}
417
418static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
419    //vmlal.f32 Q#1, D#1, D#2[#]
420    uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
421    ((uint32_t *)buf)[0] = op;
422    return buf + 4;
423}
424
425static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
426    //vmull.f32 Q#1, D#1, D#2[#]
427    uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
428    ((uint32_t *)buf)[0] = op;
429    return buf + 4;
430}
431
432static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
433    //vadd.f32 Q#1, D#1, D#2
434    uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
435    ((uint32_t *)buf)[0] = op;
436    return buf + 4;
437}
438
439static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
440    //vmov.32 Q#1, #imm
441    rsAssert(imm == 0);
442    uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
443    ((uint32_t *)buf)[0] = op;
444    return buf + 4;
445}
446
447static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
448    //vadd.f32 Q#1, D#1, D#2
449    uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
450    ((uint32_t *)buf)[0] = op;
451    return buf + 4;
452}
453#endif
454
455#if defined(ARCH_X86_HAVE_SSSE3)
// SSSE3 kernels defined outside this file.  All three share the signature of
// mOptKernel: destination, source, 16-bit coefficient table and a count
// (kernel() passes the number of 4-element groups).  selectKernel() picks
// between them.
extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
                                  const short *coef, uint32_t count);
extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
                                  const short *coef, uint32_t count);
extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
                                  const short *coef, uint32_t count);
462
463void * selectKernel(Key_t key)
464{
465    void * kernel = nullptr;
466
467    // inType, outType float if nonzero
468    if (!(key.u.inType || key.u.outType)) {
469        if (key.u.dot)
470            kernel = (void *)rsdIntrinsicColorMatrixDot_K;
471        else if (key.u.copyAlpha)
472            kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
473        else
474            kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
475    }
476
477    return kernel;
478}
479#endif
480
// Generates a specialized kernel for `key` into a freshly mapped buffer.
//
// On 32-bit ARM builds the kernel is assembled from pre-built fragments
// (ADD_CHUNK) plus individually encoded multiply/add instructions, and the
// buffer is then switched from writable to executable.  On every other
// configuration nothing is generated.
//
// key - operation description from computeKey() (a by-value copy).
// Returns true when mBuf holds executable code; false when the mapping or
// the protection change failed, or on configurations without a generator.
// When mprotect fails mBuf stays mapped; preLaunch() and the destructor
// unmap it later.
bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
    // NOTE(review): nothing below checks the generated code against this
    // size; it is presumably large enough for every fragment combination.
    mBufSize = 4096;
    //StopWatch build_time("rs cm: build time");
    mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
                                  MAP_PRIVATE | MAP_ANON, -1, 0);
    if (mBuf == MAP_FAILED) {
        mBuf = NULL;
        return false;
    }

    // buf is the write cursor; buf2 remembers where the loop body starts so
    // the final branch can jump back to it.
    uint8_t *buf = mBuf;
    uint8_t *buf2 = nullptr;

    // ops[i][j] says what to emit for input channel i feeding output channel
    // j: 0 = nothing, 2 = first contribution (plain multiply initializes the
    // sum), 3 = later contribution (multiply-accumulate).  Only rows 0-3 are
    // used.  opInit[j] becomes 1 once output channel j has a contribution.
    int ops[5][4];  // 0=unused, 1 = set, 2 = accumulate, 3 = final
    int opInit[4] = {0, 0, 0, 0};

    memset(ops, 0, sizeof(ops));
    for (int i=0; i < 4; i++) {
        if (key.u.coeffMask & (1 << (i*4))) {
            ops[i][0] = 0x2 | opInit[0];
            opInit[0] = 1;
        }
        // For a dot product only the first output column is computed.
        if (!key.u.dot) {
            if (key.u.coeffMask & (1 << (1 + i*4))) {
                ops[i][1] = 0x2 | opInit[1];
                opInit[1] = 1;
            }
            if (key.u.coeffMask & (1 << (2 + i*4))) {
                ops[i][2] = 0x2 | opInit[2];
                opInit[2] = 1;
            }
        }
        // When alpha is copied through, no alpha column is computed.
        if (!key.u.copyAlpha) {
            if (key.u.coeffMask & (1 << (3 + i*4))) {
                ops[i][3] = 0x2 | opInit[3];
                opInit[3] = 1;
            }
        }
    }

    if (key.u.inType || key.u.outType) {
        // Float kernel (either side is float).
        key.u.copyAlpha = 0;
        ADD_CHUNK(prefix_f);
        buf2 = buf;

        // Load the incoming r,g,b,a as needed
        if (key.u.inType) {
            switch(key.u.inVecSize) {
            case 3:
                ADD_CHUNK(load_f32_4);
                break;
            case 2:
                ADD_CHUNK(load_f32_3);
                break;
            case 1:
                ADD_CHUNK(load_f32_2);
                break;
            case 0:
                ADD_CHUNK(load_f32_1);
                break;
            }
        } else {
            switch(key.u.inVecSize) {
            case 3:
                ADD_CHUNK(load_u8f_4);
                break;
            case 2:
                ADD_CHUNK(load_u8f_3);
                break;
            case 1:
                ADD_CHUNK(load_u8f_2);
                break;
            case 0:
                ADD_CHUNK(load_u8f_1);
                break;
            }
        }

        // Multiply input channel i (Q<i>) by its matrix lane and collect the
        // result for output channel j in the sum register Q<12+j>.
        for (int i=0; i < 4; i++) {
            for (int j=0; j < 4; j++) {
                switch(ops[i][j]) {
                case 0:
                    break;
                case 2:
                    buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
                    break;
                case 3:
                    buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
                    break;
                }
            }
        }
        // Produce the final value of output channel j in Q<j>: sum plus add
        // term, the sum alone, the add term alone, or zero.
        for (int j=0; j < 4; j++) {
            if (opInit[j]) {
                if (key.u.addMask & (1 << j)) {
                    buf = addVADD_F32(buf, j, 12+j, 8+j);
                } else {
                    buf = addVORR_32(buf, j, 12+j, 12+j);
                }
            } else {
                if (key.u.addMask & (1 << j)) {
                    buf = addVORR_32(buf, j, 8+j, 8+j);
                } else {
                    buf = addVMOV_32(buf, j, 0);
                }
            }
        }

        if (key.u.outType) {
            switch(key.u.outVecSize) {
            case 3:
                ADD_CHUNK(store_f32_4);
                break;
            case 2:
                ADD_CHUNK(store_f32_3);
                break;
            case 1:
                ADD_CHUNK(store_f32_2);
                break;
            case 0:
                ADD_CHUNK(store_f32_1);
                break;
            }
        } else {
            // Float math with a non-float output; 3-element outputs share the
            // 4-wide store fragment.
            switch(key.u.outVecSize) {
            case 3:
            case 2:
                ADD_CHUNK(store_f32u_4);
                break;
            case 1:
                ADD_CHUNK(store_f32u_2);
                break;
            case 0:
                ADD_CHUNK(store_f32u_1);
                break;
            }
        }


    } else {
        // Add the function prefix
        // Store the address for the loop return
        ADD_CHUNK(prefix_i);
        buf2 = buf;

        // Load the incoming r,g,b,a as needed
        switch(key.u.inVecSize) {
        case 3:
            ADD_CHUNK(load_u8_4);
            if (key.u.copyAlpha) {
                ADD_CHUNK(unpack_u8_3);
            } else {
                ADD_CHUNK(unpack_u8_4);
            }
            break;
        case 2:
            ADD_CHUNK(load_u8_3);
            ADD_CHUNK(unpack_u8_3);
            break;
        case 1:
            ADD_CHUNK(load_u8_2);
            ADD_CHUNK(unpack_u8_2);
            break;
        case 0:
            ADD_CHUNK(load_u8_1);
            ADD_CHUNK(unpack_u8_1);
            break;
        }

        // Add multiply and accumulate
        // use MULL to init the output register,
        // use MLAL from there
        // Input channel i is the expanded 16-bit value in D<24+i*2>, its
        // matrix row is D<4+i> (lane j), and output channel j accumulates in
        // Q<8+j>.
        for (int i=0; i < 4; i++) {
            for (int j=0; j < 4; j++) {
                switch(ops[i][j]) {
                case 0:
                    break;
                case 2:
                    buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
                    break;
                case 3:
                    buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
                    break;
                }
            }
        }
        // Apply the add terms (Q<4+j>): saturating add onto an existing sum,
        // or a plain copy when the channel has no matrix contribution.
        for (int j=0; j < 4; j++) {
            if (opInit[j]) {
                if (key.u.addMask & (1 << j)) {
                    buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
                }
            } else {
                if (key.u.addMask & (1 << j)) {
                    buf = addVORR_32(buf, 8+j, 4+j, 4+j);
                }
            }
        }

        // If we have a dot product, perform the special pack.
        if (key.u.dot) {
            ADD_CHUNK(pack_u8_1);
            ADD_CHUNK(dot);
        } else {
            switch(key.u.outVecSize) {
            case 3:
                if (key.u.copyAlpha) {
                    ADD_CHUNK(pack_u8_3);
                } else {
                    ADD_CHUNK(pack_u8_4);
                }
                break;
            case 2:
                ADD_CHUNK(pack_u8_3);
                break;
            case 1:
                ADD_CHUNK(pack_u8_2);
                break;
            case 0:
                ADD_CHUNK(pack_u8_1);
                break;
            }
        }

        // Write out result
        switch(key.u.outVecSize) {
        case 3:
        case 2:
            ADD_CHUNK(store_u8_4);
            break;
        case 1:
            ADD_CHUNK(store_u8_2);
            break;
        case 0:
            ADD_CHUNK(store_u8_1);
            break;
        }
    }

    // NOTE(review): `key` is a by-value parameter and is not read again
    // below, so these two assignments currently have no effect on the
    // generated code.
    if (key.u.inType != key.u.outType) {
        key.u.copyAlpha = 0;
        key.u.dot = 0;
    }

    // Loop, branch, and cleanup
    ADD_CHUNK(postfix1);
    // Condition code 0x01 branches back to the start of the loop body.
    buf = addBranch(buf, buf2, 0x01);
    ADD_CHUNK(postfix2);

    // Flip the buffer from writable to executable before it is called.
    int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
    if (ret == -1) {
        // Note: this logs the return value (always -1 here), not errno.
        ALOGE("mprotect error %i", ret);
        return false;
    }

    // Make sure the instruction cache sees the freshly written code.
    __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize);
    return true;
#else
    return false;
#endif
}
742
743void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) {
744    for(int ct=0; ct < 16; ct++) {
745        ip[ct] = (short)(fp[ct] * 256.f + 0.5f);
746        tmpFp[ct] = fp[ct] * fpMul;
747        //ALOGE("mat %i %f  %f", ct, fp[ct], tmpFp[ct]);
748    }
749
750    float add = 0.f;
751    if (fpMul > 254.f) add = 0.5f;
752    for(int ct=0; ct < 4; ct++) {
753        tmpFpa[ct] = fpa[ct] * addMul + add;
754        //ALOGE("fpa %i %f  %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
755    }
756
757    for(int ct=0; ct < 4; ct++) {
758        ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f);
759    }
760}
761
762void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
763                                                    size_t dataLength) {
764    switch(slot) {
765    case 0:
766        memcpy (fp, data, sizeof(fp));
767        break;
768    case 1:
769        memcpy (fpa, data, sizeof(fpa));
770        break;
771    default:
772        rsAssert(0);
773        break;
774    }
775    mRootPtr = &kernel;
776}
777
778
779static void One(const RsExpandKernelDriverInfo *info, void *out,
780                const void *py, const float* coeff, const float *add,
781                uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
782
783    float4 f = 0.f;
784    if (fin) {
785        switch(vsin) {
786        case 3:
787            f = ((const float4 *)py)[0];
788            break;
789        case 2:
790            f = ((const float4 *)py)[0];
791            f.w = 0.f;
792            break;
793        case 1:
794            f.xy = ((const float2 *)py)[0];
795            break;
796        case 0:
797            f.x = ((const float *)py)[0];
798            break;
799        }
800    } else {
801        switch(vsin) {
802        case 3:
803            f = convert_float4(((const uchar4 *)py)[0]);
804            break;
805        case 2:
806            f = convert_float4(((const uchar4 *)py)[0]);
807            f.w = 0.f;
808            break;
809        case 1:
810            f.xy = convert_float2(((const uchar2 *)py)[0]);
811            break;
812        case 0:
813            f.x = (float)(((const uchar *)py)[0]);
814            break;
815        }
816    }
817    //ALOGE("f1  %f %f %f %f", f.x, f.y, f.z, f.w);
818
819    float4 sum;
820    sum.x = f.x * coeff[0] +
821            f.y * coeff[4] +
822            f.z * coeff[8] +
823            f.w * coeff[12];
824    sum.y = f.x * coeff[1] +
825            f.y * coeff[5] +
826            f.z * coeff[9] +
827            f.w * coeff[13];
828    sum.z = f.x * coeff[2] +
829            f.y * coeff[6] +
830            f.z * coeff[10] +
831            f.w * coeff[14];
832    sum.w = f.x * coeff[3] +
833            f.y * coeff[7] +
834            f.z * coeff[11] +
835            f.w * coeff[15];
836    //ALOGE("f2  %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
837
838    sum.x += add[0];
839    sum.y += add[1];
840    sum.z += add[2];
841    sum.w += add[3];
842
843
844    //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
845    if (fout) {
846        switch(vsout) {
847        case 3:
848        case 2:
849            ((float4 *)out)[0] = sum;
850            break;
851        case 1:
852            ((float2 *)out)[0] = sum.xy;
853            break;
854        case 0:
855            ((float *)out)[0] = sum.x;
856            break;
857        }
858    } else {
859        sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
860        sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
861        sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
862        sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
863
864        switch(vsout) {
865        case 3:
866        case 2:
867            ((uchar4 *)out)[0] = convert_uchar4(sum);
868            break;
869        case 1:
870            ((uchar2 *)out)[0] = convert_uchar2(sum.xy);
871            break;
872        case 0:
873            ((uchar *)out)[0] = sum.x;
874            break;
875        }
876    }
877    //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
878}
879
880void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelDriverInfo *info,
881                                              uint32_t xstart, uint32_t xend,
882                                              uint32_t outstep) {
883    RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)info->usr;
884
885    uint32_t instep = info->inStride[0];
886
887    uchar *out = (uchar *)info->outPtr[0];
888    uchar *in = (uchar *)info->inPtr[0];
889    uint32_t x1 = xstart;
890    uint32_t x2 = xend;
891
892    uint32_t vsin = cp->mLastKey.u.inVecSize;
893    uint32_t vsout = cp->mLastKey.u.outVecSize;
894    bool floatIn = !!cp->mLastKey.u.inType;
895    bool floatOut = !!cp->mLastKey.u.outType;
896
897    //if (!info->current.y) ALOGE("steps %i %i   %i %i", instep, outstep, vsin, vsout);
898
899    if(x2 > x1) {
900        int32_t len = x2 - x1;
901        if (gArchUseSIMD) {
902            if((cp->mOptKernel != nullptr) && (len >= 4)) {
903                // The optimized kernel processes 4 pixels at once
904                // and requires a minimum of 1 chunk of 4
905                cp->mOptKernel(out, in, cp->ip, len >> 2);
906                // Update the len and pointers so the generic code can
907                // finish any leftover pixels
908                len &= ~3;
909                x1 += len;
910                out += outstep * len;
911                in += instep * len;
912            }
913#if defined(ARCH_ARM64_USE_INTRINSICS)
914            else {
915                if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
916                    // Currently this generates off by one errors.
917                    //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
918                    //x1 += len;
919                    //out += outstep * len;
920                    //in += instep * len;
921                } else {
922                    rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
923                    x1 += len;
924                    out += outstep * len;
925                    in += instep * len;
926                }
927            }
928#endif
929        }
930
931        while(x1 != x2) {
932            One(info, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut);
933            out += outstep;
934            in += instep;
935            x1++;
936        }
937    }
938}
939
940void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot,
941                                                 const Allocation ** ains,
942                                                 uint32_t inLen,
943                                                 Allocation * aout,
944                                                 const void * usr,
945                                                 uint32_t usrLen,
946                                                 const RsScriptCall *sc) {
947
948    const Element *ein = ains[0]->mHal.state.type->getElement();
949    const Element *eout = aout->mHal.state.type->getElement();
950
951    if (ein->getType() == eout->getType()) {
952        if (eout->getType() == RS_TYPE_UNSIGNED_8) {
953            updateCoeffCache(1.f, 255.f);
954        } else {
955            updateCoeffCache(1.f, 1.f);
956        }
957    } else {
958        if (eout->getType() == RS_TYPE_UNSIGNED_8) {
959            updateCoeffCache(255.f, 255.f);
960        } else {
961            updateCoeffCache(1.f / 255.f, 1.f);
962        }
963    }
964
965    Key_t key = computeKey(ein, eout);
966
967#if defined(ARCH_X86_HAVE_SSSE3)
968    if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
969        // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
970        // mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) selectKernel(key);
971        mLastKey = key;
972    }
973
974#else //if !defined(ARCH_X86_HAVE_SSSE3)
975    if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
976        if (mBuf) munmap(mBuf, mBufSize);
977        mBuf = nullptr;
978        mOptKernel = nullptr;
979        if (build(key)) {
980            mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
981        }
982#if defined(ARCH_ARM64_USE_INTRINSICS)
983        else {
984            int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
985            int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
986            uint32_t mm = 0;
987            int i;
988            for (i = 0; i < 4; i++)
989            {
990                uint32_t m = (key.u.coeffMask >> i) & 0x1111;
991                m = ((m * 0x249) >> 9) & 15;
992                m |= ((key.u.addMask >> i) & 1) << 4;
993                mm |= m << (i * 5);
994            }
995
996            if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
997                rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
998            } else {
999                rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
1000            }
1001        }
1002#endif
1003        mLastKey = key;
1004    }
1005#endif //if !defined(ARCH_X86_HAVE_SSSE3)
1006}
1007
1008RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
1009            RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
1010            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
1011
1012    mLastKey.key = 0;
1013    mBuf = nullptr;
1014    mBufSize = 0;
1015    mOptKernel = nullptr;
1016    const static float defaultMatrix[] = {
1017        1.f, 0.f, 0.f, 0.f,
1018        0.f, 1.f, 0.f, 0.f,
1019        0.f, 0.f, 1.f, 0.f,
1020        0.f, 0.f, 0.f, 1.f
1021    };
1022    const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f};
1023    setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));
1024    setGlobalVar(1, defaultAdd, sizeof(defaultAdd));
1025}
1026
1027RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
1028    if (mBuf) munmap(mBuf, mBufSize);
1029    mBuf = nullptr;
1030    mOptKernel = nullptr;
1031}
1032
1033void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
1034    s->mHal.info.exportedVariableCount = 2;
1035}
1036
1037RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
1038                                            const Script *s, const Element *e) {
1039
1040    return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e);
1041}
1042