1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
#include <sys/mman.h>
#include <unistd.h>

#include "rsCpuIntrinsic.h"
#include "rsCpuIntrinsicInlines.h"
#include "linkloader/include/MemChunk.h"
#include "linkloader/utils/flush_cpu_cache.h"

#include <errno.h>
#include <sys/mman.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
29//#include <utils/StopWatch.h>
30
31
32/*  uint kernel
33 *  Q0  D0:  Load slot for R
34 *      D1:  Load slot for G
35 *  Q1  D2:  Load slot for B
36 *      D3:  Load slot for A
37 *  Q2  D4:  Matrix
38 *      D5:  =
39 *  Q3  D6:  =
40 *      D7:  =
41 *  Q4  D8:  Add R
42 *      D9:
43 *  Q5  D10: Add G
44 *      D11:
45 *  Q6  D12: Add B
46 *      D13:
47 *  Q7  D14: Add A
48 *      D15:
49 *  Q8  D16:  I32: R Sum
50 *      D17:
51 *  Q9  D18:  I32: G Sum
52 *      D19:
53 *  Q10 D20:  I32: B Sum
54 *      D21:
55 *  Q11 D22:  I32: A Sum
56 *      D23:
57 *  Q12 D24:  U16: expanded R
58 *      D25:
59 *  Q13 D26:  U16: expanded G
60 *      D27:
61 *  Q14 D28:  U16: expanded B
62 *      D29:
63 *  Q15 D30:  U16: expanded A
64 *      D31:
65 *
66 */
67
68/*  float kernel
69 *  Q0  D0:  Load slot for R
70 *      D1:  =
71 *  Q1  D2:  Load slot for G
72 *      D3:  =
73 *  Q2  D4:  Load slot for B
74 *      D5:  =
75 *  Q3  D6:  Load slot for A
76 *      D7:  =
77 *  Q4  D8:  Matrix
78 *      D9:  =
79 *  Q5  D10: =
80 *      D11: =
81 *  Q6  D12: =
82 *      D13: =
83 *  Q7  D14: =
84 *      D15: =
85 *  Q8  D16: Add R
86 *      D17: =
87 *  Q9  D18: Add G
88 *      D19: =
89 *  Q10 D20: Add B
90 *      D21: =
91 *  Q11 D22: Add A
92 *      D23: =
93 *  Q12 D24: Sum R
94 *      D25: =
95 *  Q13 D26: Sum G
96 *      D27: =
97 *  Q14 D28: Sum B
98 *      D29: =
99 *  Q15 D30: Sum A
100 *      D31: =
101 *
102 */
103
104
105
106using namespace android;
107using namespace android::renderscript;
108
109namespace android {
110namespace renderscript {
111
// Identifies one specialization of the color-matrix kernel.  The bit-fields in
// `u` and the raw 64-bit `key` share storage, so two configurations can be
// compared (and reset) through `key` alone.
union Key_t {
    uint64_t key;                        // every field below viewed as one integer
    struct {
        uint32_t inVecSize          :2;  // [0 - 1]   input vector size minus one
        uint32_t outVecSize         :2;  // [2 - 3]   output vector size minus one
        uint32_t inType             :4;  // [4 - 7]   non-zero when the input is float
        uint32_t outType            :4;  // [8 - 11]  non-zero when the output is float
        uint32_t dot                :1;  // [12]      r,g,b columns are identical
        uint32_t _unused1           :1;  // [13]
        uint32_t copyAlpha          :1;  // [14]      alpha passes through unchanged
        uint32_t _unused2           :1;  // [15]
        uint32_t coeffMask          :16; // [16-31]   bit i set when matrix entry i is used
        uint32_t addMask            :4;  // [32-35]   bit j set when add entry j is used
    } u;
};
127
128//Re-enable when intrinsic is fixed
#if defined(ARCH_ARM64_USE_INTRINSICS)
// Table of code entry points handed to the external kernels below.  It is
// filled in by the rsdIntrinsicColorMatrixSetup_*_K routines and passed to the
// kernels through their `fns` argument.
// NOTE(review): the layout is presumably fixed by the assembly side -- confirm
// before reordering or resizing any member.
typedef struct {
    void (*column[4])(void);
    void (*store)(void);
    void (*load)(void);
    void (*store_end)(void);
    void (*load_end)(void);
} FunctionTab_t;

// Integer kernel: called from kernel() with the fixed-point matrix (ip) as
// `mult` and the fixed-point add vector (ipa) as `add`.
extern "C" void rsdIntrinsicColorMatrix_int_K(
             void *out, void const *in, size_t count,
             FunctionTab_t const *fns,
             int16_t const *mult, int32_t const *add);

// Float kernel: same shape as the integer kernel but with float coefficients.
// Its only call site in kernel() is currently commented out.
extern "C" void rsdIntrinsicColorMatrix_float_K(
             void *out, void const *in, size_t count,
             FunctionTab_t const *fns,
             float const *mult, float const *add);

/* The setup functions fill in function tables to be used by above functions;
 * this code also eliminates jump-to-another-jump cases by short-circuiting
 * empty functions.  While it's not performance critical, it works out easier
 * to write the set-up code in assembly than to try to expose the same symbols
 * and write the code in C.
 *
 * preLaunch() builds the arguments: `mask` packs five bits per output channel
 * (four coefficient-used bits plus one add-used bit), `dt` is the key's
 * outVecSize (+4 when the output is float) and `st` is the key's inVecSize
 * (+4 when the input is float).
 */
extern "C" void rsdIntrinsicColorMatrixSetup_int_K(
             FunctionTab_t *fns,
             uint32_t mask, int dt, int st);

extern "C" void rsdIntrinsicColorMatrixSetup_float_K(
             FunctionTab_t *fns,
             uint32_t mask, int dt, int st);
#endif
162
// CPU implementation of the color-matrix intrinsic: multiplies each input
// element by a 4x4 matrix and adds a 4-element vector.  Coefficients arrive
// through setGlobalVar(); preLaunch() derives the cached fixed-point / scaled
// copies and (where supported) a specialized kernel for the current
// input/output element types.
class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
public:
    // Reports the number of exported script variables (matrix and add vector).
    virtual void populateScript(Script *);

    // Slot 0 receives the 4x4 float matrix, slot 1 the 4-float add vector.
    virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength);

    virtual ~RsdCpuScriptIntrinsicColorMatrix();
    RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);

    // Refreshes the coefficient caches and the specialization key for the
    // given allocations before a launch.
    virtual void preLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
                           const void * usr, uint32_t usrLen, const RsScriptCall *sc);
    // Currently a no-op.
    virtual void postLaunch(uint32_t slot, const Allocation * ain, Allocation * aout,
                            const void * usr, uint32_t usrLen, const RsScriptCall *sc);

protected:
    float fp[16];    // matrix as supplied through setGlobalVar slot 0
    float fpa[4];    // add vector as supplied through setGlobalVar slot 1

    // The following four fields are read as constants
    // by the SIMD assembly code.
    short ip[16];     // fp scaled by 256 (see updateCoeffCache)
    int ipa[4];       // fpa scaled by 65536 (see updateCoeffCache)
    float tmpFp[16];  // fp scaled by the fpMul chosen in preLaunch
    float tmpFpa[4];  // fpa scaled by addMul, plus a 0.5 bias when fpMul > 254
#if defined(ARCH_ARM64_USE_INTRINSICS)
    FunctionTab_t mFnTab;  // filled by rsdIntrinsicColorMatrixSetup_*_K in preLaunch
#endif

    // Per-row worker installed as mRootPtr by setGlobalVar().
    static void kernel(const RsForEachStubParamStruct *p,
                       uint32_t xstart, uint32_t xend,
                       uint32_t instep, uint32_t outstep);
    // Recomputes ip, ipa, tmpFp and tmpFpa from fp and fpa.
    void updateCoeffCache(float fpMul, float addMul);

    Key_t mLastKey;        // key of the most recent preLaunch
    unsigned char *mBuf;   // mmap'ed buffer holding generated code, or NULL
    size_t mBufSize;       // size of the mapping at mBuf

    // Derives the specialization key from the element types and coefficients.
    Key_t computeKey(const Element *ein, const Element *eout);

    // Generates a specialized kernel into mBuf; false when unavailable.
    bool build(Key_t key);

    // Specialized kernel, or NULL; kernel() passes ip as `coef` and the number
    // of 4-pixel groups as `count`.
    void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count);

};
207
208}
209}
210
211
212Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey(
213        const Element *ein, const Element *eout) {
214
215    Key_t key;
216    key.key = 0;
217
218    // Compute a unique code key for this operation
219
220    // Add to the key the input and output types
221    bool hasFloat = false;
222    if (ein->getType() == RS_TYPE_FLOAT_32) {
223        hasFloat = true;
224        key.u.inType = RS_TYPE_FLOAT_32;
225        rsAssert(key.u.inType == RS_TYPE_FLOAT_32);
226    }
227    if (eout->getType() == RS_TYPE_FLOAT_32) {
228        hasFloat = true;
229        key.u.outType = RS_TYPE_FLOAT_32;
230        rsAssert(key.u.outType == RS_TYPE_FLOAT_32);
231    }
232
233    // Mask in the bits indicating which coefficients in the
234    // color matrix are needed.
235    if (hasFloat) {
236        for (uint32_t i=0; i < 16; i++) {
237            if (fabs(fp[i]) != 0.f) {
238                key.u.coeffMask |= 1 << i;
239            }
240        }
241        if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1;
242        if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2;
243        if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4;
244        if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8;
245
246    } else {
247        for (uint32_t i=0; i < 16; i++) {
248            if (ip[i] != 0) {
249                key.u.coeffMask |= 1 << i;
250            }
251        }
252        if (ipa[0] != 0) key.u.addMask |= 0x1;
253        if (ipa[1] != 0) key.u.addMask |= 0x2;
254        if (ipa[2] != 0) key.u.addMask |= 0x4;
255        if (ipa[3] != 0) key.u.addMask |= 0x8;
256    }
257
258    // Look for a dot product where the r,g,b colums are the same
259    if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
260        (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
261        (ip[8] == ip[9]) && (ip[8] == ip[10]) &&
262        (ip[12] == ip[13]) && (ip[12] == ip[14])) {
263
264        if (!key.u.addMask) key.u.dot = 1;
265    }
266
267    // Is alpha a simple copy
268    if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) {
269        key.u.copyAlpha = !(key.u.inType || key.u.outType);
270    }
271
272    //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
273
274    switch (ein->getVectorSize()) {
275    case 4:
276        key.u.inVecSize = 3;
277        break;
278    case 3:
279        key.u.inVecSize = 2;
280        key.u.coeffMask &= ~0xF000;
281        break;
282    case 2:
283        key.u.inVecSize = 1;
284        key.u.coeffMask &= ~0xFF00;
285        break;
286    default:
287        key.u.coeffMask &= ~0xFFF0;
288        break;
289    }
290
291    switch (eout->getVectorSize()) {
292    case 4:
293        key.u.outVecSize = 3;
294        break;
295    case 3:
296        key.u.outVecSize = 2;
297        key.u.coeffMask &= ~0x8888;
298        key.u.addMask &= 7;
299        break;
300    case 2:
301        key.u.outVecSize = 1;
302        key.u.coeffMask &= ~0xCCCC;
303        key.u.addMask &= 3;
304        break;
305    default:
306        key.u.coeffMask &= ~0xEEEE;
307        key.u.addMask &= 1;
308        break;
309    }
310
311    if (key.u.inType && !key.u.outType) {
312        key.u.addMask |= 1;
313        if (key.u.outVecSize > 0) key.u.addMask |= 2;
314        if (key.u.outVecSize > 1) key.u.addMask |= 4;
315        if (key.u.outVecSize > 2) key.u.addMask |= 8;
316    }
317
318    //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
319    return key;
320}
321
322#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
323
// Declares the three external symbols that describe one pre-assembled code
// chunk named x: its first word (_N_ColorMatrix_<x>), an end marker
// (_N_ColorMatrix_<x>_end) and a word holding its byte length
// (_N_ColorMatrix_<x>_len).  ADD_CHUNK copies <x>_len bytes starting at <x>.
// NOTE(review): the symbols are presumably provided by the NEON assembly
// source for this intrinsic -- confirm against that file.
#define DEF_SYM(x)                                  \
    extern "C" uint32_t _N_ColorMatrix_##x;      \
    extern "C" uint32_t _N_ColorMatrix_##x##_end;  \
    extern "C" uint32_t _N_ColorMatrix_##x##_len;

// Function prologue (integer / float variants) and the two epilogue pieces
// that build() places around the loop-back branch.
DEF_SYM(prefix_i)
DEF_SYM(prefix_f)
DEF_SYM(postfix1)
DEF_SYM(postfix2)

// Loads, by input type and vector size: u8 for the integer path, u8f for u8
// input on the float path, f32 for float input.
DEF_SYM(load_u8_4)
DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)
DEF_SYM(load_f32_4)
DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)

// Stores, by output type and vector size.  There is no 3-wide u8 / f32u
// variant: build() uses the 4-wide store for 3-element u8 output.
DEF_SYM(store_u8_4)
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)
DEF_SYM(store_f32_4)
DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
DEF_SYM(store_f32u_2)
DEF_SYM(store_f32u_1)

// Integer-path helpers: unpack after load, pack before store, the dot-product
// special case, and the add_N_u8 chunks.
DEF_SYM(unpack_u8_4)
DEF_SYM(unpack_u8_3)
DEF_SYM(unpack_u8_2)
DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)
DEF_SYM(pack_u8_3)
DEF_SYM(pack_u8_2)
DEF_SYM(pack_u8_1)
DEF_SYM(dot)
DEF_SYM(add_0_u8)
DEF_SYM(add_1_u8)
DEF_SYM(add_2_u8)
DEF_SYM(add_3_u8)
371
// Appends the pre-assembled chunk x to the code being generated: copies
// _N_ColorMatrix_<x>_len bytes starting at _N_ColorMatrix_<x> to `buf` and
// advances `buf` past them.  Expects a local `uint8_t *buf` at the use site.
// Wrapped in do/while(0) so that both statements act as a single statement,
// even as the body of an unbraced if/else.
#define ADD_CHUNK(x) \
    do { \
        memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
        buf += _N_ColorMatrix_##x##_len; \
    } while (0)
375
376
377static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
378    size_t off = (target - buf - 8) >> 2;
379    rsAssert(((off & 0xff000000) == 0) ||
380           ((off & 0xff000000) == 0xff000000));
381
382    uint32_t op = (condition << 28);
383    op |= 0xa << 24;  // branch
384    op |= 0xffffff & off;
385    ((uint32_t *)buf)[0] = op;
386    return buf + 4;
387}
388
389static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
390    rsAssert(vd < 32);
391    rsAssert(vm < 32);
392    rsAssert(vn < 32);
393
394    uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
395    op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
396    op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
397    return op;
398}
399
400static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
401    //vmlal.s16 Q#1, D#1, D#2[#]
402    uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
403    ((uint32_t *)buf)[0] = op;
404    return buf + 4;
405}
406
407static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
408    //vmull.s16 Q#1, D#1, D#2[#]
409    uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
410    ((uint32_t *)buf)[0] = op;
411    return buf + 4;
412}
413
414static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
415    //vqadd.s32 Q#1, Q#1, Q#2
416    uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
417    ((uint32_t *)buf)[0] = op;
418    return buf + 4;
419}
420
421static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
422    //vmlal.f32 Q#1, D#1, D#2[#]
423    uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
424    ((uint32_t *)buf)[0] = op;
425    return buf + 4;
426}
427
428static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
429    //vmull.f32 Q#1, D#1, D#2[#]
430    uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
431    ((uint32_t *)buf)[0] = op;
432    return buf + 4;
433}
434
435static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
436    //vadd.f32 Q#1, D#1, D#2
437    uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
438    ((uint32_t *)buf)[0] = op;
439    return buf + 4;
440}
441
442static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
443    //vmov.32 Q#1, #imm
444    rsAssert(imm == 0);
445    uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
446    ((uint32_t *)buf)[0] = op;
447    return buf + 4;
448}
449
450static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
451    //vadd.f32 Q#1, D#1, D#2
452    uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
453    ((uint32_t *)buf)[0] = op;
454    return buf + 4;
455}
456#endif
457
458#if defined(ARCH_X86_HAVE_SSSE3)
459extern "C" void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
460                                  const short *coef, uint32_t count);
461extern "C" void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
462                                  const short *coef, uint32_t count);
463extern "C" void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
464                                  const short *coef, uint32_t count);
465
466void * selectKernel(Key_t key)
467{
468    void * kernel = NULL;
469
470    // inType, outType float if nonzero
471    if (!(key.u.inType || key.u.outType)) {
472        if (key.u.dot)
473            kernel = (void *)rsdIntrinsicColorMatrixDot_K;
474        else if (key.u.copyAlpha)
475            kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
476        else
477            kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
478    }
479
480    return kernel;
481}
482#endif
483
484bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
485#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
486    mBufSize = 4096;
487    //StopWatch build_time("rs cm: build time");
488    mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
489                                  MAP_PRIVATE | MAP_ANON, -1, 0);
490    if (mBuf == MAP_FAILED) {
491        mBuf = NULL;
492        return false;
493    }
494
495    uint8_t *buf = mBuf;
496    uint8_t *buf2 = NULL;
497
498    int ops[5][4];  // 0=unused, 1 = set, 2 = accumulate, 3 = final
499    int opInit[4] = {0, 0, 0, 0};
500
501    memset(ops, 0, sizeof(ops));
502    for (int i=0; i < 4; i++) {
503        if (key.u.coeffMask & (1 << (i*4))) {
504            ops[i][0] = 0x2 | opInit[0];
505            opInit[0] = 1;
506        }
507        if (!key.u.dot) {
508            if (key.u.coeffMask & (1 << (1 + i*4))) {
509                ops[i][1] = 0x2 | opInit[1];
510                opInit[1] = 1;
511            }
512            if (key.u.coeffMask & (1 << (2 + i*4))) {
513                ops[i][2] = 0x2 | opInit[2];
514                opInit[2] = 1;
515            }
516        }
517        if (!key.u.copyAlpha) {
518            if (key.u.coeffMask & (1 << (3 + i*4))) {
519                ops[i][3] = 0x2 | opInit[3];
520                opInit[3] = 1;
521            }
522        }
523    }
524
525    if (key.u.inType || key.u.outType) {
526        key.u.copyAlpha = 0;
527        ADD_CHUNK(prefix_f);
528        buf2 = buf;
529
530        // Load the incoming r,g,b,a as needed
531        if (key.u.inType) {
532            switch(key.u.inVecSize) {
533            case 3:
534                ADD_CHUNK(load_f32_4);
535                break;
536            case 2:
537                ADD_CHUNK(load_f32_3);
538                break;
539            case 1:
540                ADD_CHUNK(load_f32_2);
541                break;
542            case 0:
543                ADD_CHUNK(load_f32_1);
544                break;
545            }
546        } else {
547            switch(key.u.inVecSize) {
548            case 3:
549                ADD_CHUNK(load_u8f_4);
550                break;
551            case 2:
552                ADD_CHUNK(load_u8f_3);
553                break;
554            case 1:
555                ADD_CHUNK(load_u8f_2);
556                break;
557            case 0:
558                ADD_CHUNK(load_u8f_1);
559                break;
560            }
561        }
562
563        for (int i=0; i < 4; i++) {
564            for (int j=0; j < 4; j++) {
565                switch(ops[i][j]) {
566                case 0:
567                    break;
568                case 2:
569                    buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
570                    break;
571                case 3:
572                    buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
573                    break;
574                }
575            }
576        }
577        for (int j=0; j < 4; j++) {
578            if (opInit[j]) {
579                if (key.u.addMask & (1 << j)) {
580                    buf = addVADD_F32(buf, j, 12+j, 8+j);
581                } else {
582                    buf = addVORR_32(buf, j, 12+j, 12+j);
583                }
584            } else {
585                if (key.u.addMask & (1 << j)) {
586                    buf = addVORR_32(buf, j, 8+j, 8+j);
587                } else {
588                    buf = addVMOV_32(buf, j, 0);
589                }
590            }
591        }
592
593        if (key.u.outType) {
594            switch(key.u.outVecSize) {
595            case 3:
596                ADD_CHUNK(store_f32_4);
597                break;
598            case 2:
599                ADD_CHUNK(store_f32_3);
600                break;
601            case 1:
602                ADD_CHUNK(store_f32_2);
603                break;
604            case 0:
605                ADD_CHUNK(store_f32_1);
606                break;
607            }
608        } else {
609            switch(key.u.outVecSize) {
610            case 3:
611            case 2:
612                ADD_CHUNK(store_f32u_4);
613                break;
614            case 1:
615                ADD_CHUNK(store_f32u_2);
616                break;
617            case 0:
618                ADD_CHUNK(store_f32u_1);
619                break;
620            }
621        }
622
623
624    } else {
625        // Add the function prefix
626        // Store the address for the loop return
627        ADD_CHUNK(prefix_i);
628        buf2 = buf;
629
630        // Load the incoming r,g,b,a as needed
631        switch(key.u.inVecSize) {
632        case 3:
633            ADD_CHUNK(load_u8_4);
634            if (key.u.copyAlpha) {
635                ADD_CHUNK(unpack_u8_3);
636            } else {
637                ADD_CHUNK(unpack_u8_4);
638            }
639            break;
640        case 2:
641            ADD_CHUNK(load_u8_3);
642            ADD_CHUNK(unpack_u8_3);
643            break;
644        case 1:
645            ADD_CHUNK(load_u8_2);
646            ADD_CHUNK(unpack_u8_2);
647            break;
648        case 0:
649            ADD_CHUNK(load_u8_1);
650            ADD_CHUNK(unpack_u8_1);
651            break;
652        }
653
654        // Add multiply and accumulate
655        // use MULL to init the output register,
656        // use MLAL from there
657        for (int i=0; i < 4; i++) {
658            for (int j=0; j < 4; j++) {
659                switch(ops[i][j]) {
660                case 0:
661                    break;
662                case 2:
663                    buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
664                    break;
665                case 3:
666                    buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
667                    break;
668                }
669            }
670        }
671        for (int j=0; j < 4; j++) {
672            if (opInit[j]) {
673                if (key.u.addMask & (1 << j)) {
674                    buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
675                }
676            } else {
677                if (key.u.addMask & (1 << j)) {
678                    buf = addVORR_32(buf, 8+j, 4+j, 4+j);
679                }
680            }
681        }
682
683        // If we have a dot product, perform the special pack.
684        if (key.u.dot) {
685            ADD_CHUNK(pack_u8_1);
686            ADD_CHUNK(dot);
687        } else {
688            switch(key.u.outVecSize) {
689            case 3:
690                if (key.u.copyAlpha) {
691                    ADD_CHUNK(pack_u8_3);
692                } else {
693                    ADD_CHUNK(pack_u8_4);
694                }
695                break;
696            case 2:
697                ADD_CHUNK(pack_u8_3);
698                break;
699            case 1:
700                ADD_CHUNK(pack_u8_2);
701                break;
702            case 0:
703                ADD_CHUNK(pack_u8_1);
704                break;
705            }
706        }
707
708        // Write out result
709        switch(key.u.outVecSize) {
710        case 3:
711        case 2:
712            ADD_CHUNK(store_u8_4);
713            break;
714        case 1:
715            ADD_CHUNK(store_u8_2);
716            break;
717        case 0:
718            ADD_CHUNK(store_u8_1);
719            break;
720        }
721    }
722
723    if (key.u.inType != key.u.outType) {
724        key.u.copyAlpha = 0;
725        key.u.dot = 0;
726    }
727
728    // Loop, branch, and cleanup
729    ADD_CHUNK(postfix1);
730    buf = addBranch(buf, buf2, 0x01);
731    ADD_CHUNK(postfix2);
732
733    int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
734    if (ret == -1) {
735        ALOGE("mprotect error %i", ret);
736        return false;
737    }
738
739    FLUSH_CPU_CACHE(mBuf, (char*) mBuf + mBufSize);
740    return true;
741#else
742    return false;
743#endif
744}
745
746void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) {
747    for(int ct=0; ct < 16; ct++) {
748        ip[ct] = (short)(fp[ct] * 256.f + 0.5f);
749        tmpFp[ct] = fp[ct] * fpMul;
750        //ALOGE("mat %i %f  %f", ct, fp[ct], tmpFp[ct]);
751    }
752
753    float add = 0.f;
754    if (fpMul > 254.f) add = 0.5f;
755    for(int ct=0; ct < 4; ct++) {
756        tmpFpa[ct] = fpa[ct] * addMul + add;
757        //ALOGE("fpa %i %f  %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
758    }
759
760    for(int ct=0; ct < 4; ct++) {
761        ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f);
762    }
763}
764
765void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
766                                                    size_t dataLength) {
767    switch(slot) {
768    case 0:
769        memcpy (fp, data, sizeof(fp));
770        break;
771    case 1:
772        memcpy (fpa, data, sizeof(fpa));
773        break;
774    default:
775        rsAssert(0);
776        break;
777    }
778    mRootPtr = &kernel;
779}
780
781
782static void One(const RsForEachStubParamStruct *p, void *out,
783                const void *py, const float* coeff, const float *add,
784                uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
785
786    float4 f = 0.f;
787    if (fin) {
788        switch(vsin) {
789        case 3:
790            f = ((const float4 *)py)[0];
791            break;
792        case 2:
793            f = ((const float4 *)py)[0];
794            f.w = 0.f;
795            break;
796        case 1:
797            f.xy = ((const float2 *)py)[0];
798            break;
799        case 0:
800            f.x = ((const float *)py)[0];
801            break;
802        }
803    } else {
804        switch(vsin) {
805        case 3:
806            f = convert_float4(((const uchar4 *)py)[0]);
807            break;
808        case 2:
809            f = convert_float4(((const uchar4 *)py)[0]);
810            f.w = 0.f;
811            break;
812        case 1:
813            f.xy = convert_float2(((const uchar2 *)py)[0]);
814            break;
815        case 0:
816            f.x = (float)(((const uchar *)py)[0]);
817            break;
818        }
819    }
820    //ALOGE("f1  %f %f %f %f", f.x, f.y, f.z, f.w);
821
822    float4 sum;
823    sum.x = f.x * coeff[0] +
824            f.y * coeff[4] +
825            f.z * coeff[8] +
826            f.w * coeff[12];
827    sum.y = f.x * coeff[1] +
828            f.y * coeff[5] +
829            f.z * coeff[9] +
830            f.w * coeff[13];
831    sum.z = f.x * coeff[2] +
832            f.y * coeff[6] +
833            f.z * coeff[10] +
834            f.w * coeff[14];
835    sum.w = f.x * coeff[3] +
836            f.y * coeff[7] +
837            f.z * coeff[11] +
838            f.w * coeff[15];
839    //ALOGE("f2  %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
840
841    sum.x += add[0];
842    sum.y += add[1];
843    sum.z += add[2];
844    sum.w += add[3];
845
846
847    //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
848    if (fout) {
849        switch(vsout) {
850        case 3:
851        case 2:
852            ((float4 *)out)[0] = sum;
853            break;
854        case 1:
855            ((float2 *)out)[0] = sum.xy;
856            break;
857        case 0:
858            ((float *)out)[0] = sum.x;
859            break;
860        }
861    } else {
862        sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
863        sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
864        sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
865        sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
866
867        switch(vsout) {
868        case 3:
869        case 2:
870            ((uchar4 *)out)[0] = convert_uchar4(sum);
871            break;
872        case 1:
873            ((uchar2 *)out)[0] = convert_uchar2(sum.xy);
874            break;
875        case 0:
876            ((uchar *)out)[0] = sum.x;
877            break;
878        }
879    }
880    //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
881}
882
883void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsForEachStubParamStruct *p,
884                                              uint32_t xstart, uint32_t xend,
885                                              uint32_t instep, uint32_t outstep) {
886    RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)p->usr;
887    uchar *out = (uchar *)p->out;
888    uchar *in = (uchar *)p->in;
889    uint32_t x1 = xstart;
890    uint32_t x2 = xend;
891
892    uint32_t vsin = cp->mLastKey.u.inVecSize;
893    uint32_t vsout = cp->mLastKey.u.outVecSize;
894    bool floatIn = !!cp->mLastKey.u.inType;
895    bool floatOut = !!cp->mLastKey.u.outType;
896
897    //if (!p->y) ALOGE("steps %i %i   %i %i", instep, outstep, vsin, vsout);
898
899    if(x2 > x1) {
900        int32_t len = x2 - x1;
901        if (gArchUseSIMD) {
902            if((cp->mOptKernel != NULL) && (len >= 4)) {
903                // The optimized kernel processes 4 pixels at once
904                // and requires a minimum of 1 chunk of 4
905                cp->mOptKernel(out, in, cp->ip, len >> 2);
906                // Update the len and pointers so the generic code can
907                // finish any leftover pixels
908                len &= ~3;
909                x1 += len;
910                out += outstep * len;
911                in += instep * len;
912            }
913#if defined(ARCH_ARM64_USE_INTRINSICS)
914            else {
915                if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
916                    // Currently this generates off by one errors.
917                    //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
918                    //x1 += len;
919                    //out += outstep * len;
920                    //in += instep * len;
921                } else {
922                    rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
923                    x1 += len;
924                    out += outstep * len;
925                    in += instep * len;
926                }
927            }
928#endif
929        }
930
931        while(x1 != x2) {
932            One(p, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut);
933            out += outstep;
934            in += instep;
935            x1++;
936        }
937    }
938}
939
940void RsdCpuScriptIntrinsicColorMatrix::preLaunch(
941        uint32_t slot, const Allocation * ain, Allocation * aout,
942        const void * usr, uint32_t usrLen, const RsScriptCall *sc) {
943
944    const Element *ein = ain->mHal.state.type->getElement();
945    const Element *eout = aout->mHal.state.type->getElement();
946
947    if (ein->getType() == eout->getType()) {
948        if (eout->getType() == RS_TYPE_UNSIGNED_8) {
949            updateCoeffCache(1.f, 255.f);
950        } else {
951            updateCoeffCache(1.f, 1.f);
952        }
953    } else {
954        if (eout->getType() == RS_TYPE_UNSIGNED_8) {
955            updateCoeffCache(255.f, 255.f);
956        } else {
957            updateCoeffCache(1.f / 255.f, 1.f);
958        }
959    }
960
961    Key_t key = computeKey(ain->mHal.state.type->getElement(),
962                           aout->mHal.state.type->getElement());
963#if defined(ARCH_X86_HAVE_SSSE3)
964    if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
965        // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
966        // mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) selectKernel(key);
967        mLastKey = key;
968    }
969
970#else //if !defined(ARCH_X86_HAVE_SSSE3)
971    if ((mOptKernel == NULL) || (mLastKey.key != key.key)) {
972        if (mBuf) munmap(mBuf, mBufSize);
973        mBuf = NULL;
974        mOptKernel = NULL;
975        if (build(key)) {
976            mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
977        }
978#if defined(ARCH_ARM64_USE_INTRINSICS)
979        else {
980            int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
981            int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
982            uint32_t mm = 0;
983            int i;
984            for (i = 0; i < 4; i++)
985            {
986                uint32_t m = (key.u.coeffMask >> i) & 0x1111;
987                m = ((m * 0x249) >> 9) & 15;
988                m |= ((key.u.addMask >> i) & 1) << 4;
989                mm |= m << (i * 5);
990            }
991
992            if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
993                rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
994            } else {
995                rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
996            }
997        }
998#endif
999        mLastKey = key;
1000    }
1001#endif //if !defined(ARCH_X86_HAVE_SSSE3)
1002}
1003
// Counterpart of preLaunch().  Intentionally empty: the state set up by
// preLaunch (coefficient caches, key, generated code) is kept so that a later
// launch with the same key can reuse it; the generated code is released in
// preLaunch when the key changes and in the destructor.
void RsdCpuScriptIntrinsicColorMatrix::postLaunch(
        uint32_t slot, const Allocation * ain, Allocation * aout,
        const void * usr, uint32_t usrLen, const RsScriptCall *sc) {

}
1009
1010RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
1011            RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
1012            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
1013
1014    mLastKey.key = 0;
1015    mBuf = NULL;
1016    mBufSize = 0;
1017    mOptKernel = NULL;
1018    const static float defaultMatrix[] = {
1019        1.f, 0.f, 0.f, 0.f,
1020        0.f, 1.f, 0.f, 0.f,
1021        0.f, 0.f, 1.f, 0.f,
1022        0.f, 0.f, 0.f, 1.f
1023    };
1024    const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f};
1025    setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));
1026    setGlobalVar(1, defaultAdd, sizeof(defaultAdd));
1027}
1028
1029RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
1030    if (mBuf) munmap(mBuf, mBufSize);
1031    mBuf = NULL;
1032    mOptKernel = NULL;
1033}
1034
1035void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
1036    s->mHal.info.exportedVariableCount = 2;
1037}
1038
1039RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
1040                                            const Script *s, const Element *e) {
1041
1042    return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e);
1043}
1044