1/*
2 * Copyright (C) 2012 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include <sys/mman.h>
18#include <unistd.h>
19
20#include "rsCpuIntrinsic.h"
21#include "rsCpuIntrinsicInlines.h"
22
23#include <sys/mman.h>
24#include <stddef.h>
25#include <stdint.h>
26#include <stdlib.h>
27//#include <utils/StopWatch.h>
28
29
30/*  uint kernel
31 *  Q0  D0:  Load slot for R
32 *      D1:  Load slot for G
33 *  Q1  D2:  Load slot for B
34 *      D3:  Load slot for A
35 *  Q2  D4:  Matrix
36 *      D5:  =
37 *  Q3  D6:  =
38 *      D7:  =
39 *  Q4  D8:  Add R
40 *      D9:
41 *  Q5  D10: Add G
42 *      D11:
43 *  Q6  D12: Add B
44 *      D13:
45 *  Q7  D14: Add A
46 *      D15:
47 *  Q8  D16:  I32: R Sum
48 *      D17:
49 *  Q9  D18:  I32: G Sum
50 *      D19:
51 *  Q10 D20:  I32: B Sum
52 *      D21:
53 *  Q11 D22:  I32: A Sum
54 *      D23:
55 *  Q12 D24:  U16: expanded R
56 *      D25:
57 *  Q13 D26:  U16: expanded G
58 *      D27:
59 *  Q14 D28:  U16: expanded B
60 *      D29:
61 *  Q15 D30:  U16: expanded A
62 *      D31:
63 *
64 */
65
66/*  float kernel
67 *  Q0  D0:  Load slot for R
68 *      D1:  =
69 *  Q1  D2:  Load slot for G
70 *      D3:  =
71 *  Q2  D4:  Load slot for B
72 *      D5:  =
73 *  Q3  D6:  Load slot for A
74 *      D7:  =
75 *  Q4  D8:  Matrix
76 *      D9:  =
77 *  Q5  D10: =
78 *      D11: =
79 *  Q6  D12: =
80 *      D13: =
81 *  Q7  D14: =
82 *      D15: =
83 *  Q8  D16: Add R
84 *      D17: =
85 *  Q9  D18: Add G
86 *      D19: =
87 *  Q10 D20: Add B
88 *      D21: =
89 *  Q11 D22: Add A
90 *      D23: =
91 *  Q12 D24: Sum R
92 *      D25: =
93 *  Q13 D26: Sum G
94 *      D27: =
95 *  Q14 D28: Sum B
96 *      D29: =
97 *  Q15 D30: Sum A
98 *      D31: =
99 *
100 */
101
102
103
104namespace android {
105namespace renderscript {
106
// Describes one color-matrix kernel configuration.  `key` views the whole
// description as a single 64-bit integer so two configurations can be
// compared at once; `u` exposes the individual fields, which computeKey()
// fills in.
union Key_t {
    uint64_t key;
    struct {
        // Element vector sizes, stored as (component count - 1).
        uint32_t inVecSize          :2;  // [0 - 1]
        uint32_t outVecSize         :2;  // [2 - 3]
        // Zero for non-float elements, RS_TYPE_FLOAT_32 for float elements.
        uint32_t inType             :4;  // [4 - 7]
        uint32_t outType            :4;  // [8 - 11]
        // Set when the r, g and b columns of the matrix are identical and
        // no add term is present.
        uint32_t dot                :1;  // [12]
        uint32_t _unused1           :1;  // [13]
        // Set when alpha passes through unchanged (integer kernels only).
        uint32_t copyAlpha          :1;  // [14]
        uint32_t _unused2           :1;  // [15]
        // Bit i is set when matrix coefficient i is non-zero.
        uint32_t coeffMask          :16; // [16-31]
        // Bit j is set when add term j is needed.
        uint32_t addMask            :4;  // [32-35]
    } u;
};
122
123//Re-enable when intrinsic is fixed
124#if defined(ARCH_ARM64_USE_INTRINSICS)
// Table of code entry points describing one kernel configuration.  It is
// filled in by the rsdIntrinsicColorMatrixSetup_*_K routines and handed to
// the rsdIntrinsicColorMatrix_*_K kernels declared below.
struct FunctionTab_t {
    void (*column[4])(void);
    void (*store)(void);
    void (*load)(void);
    void (*store_end)(void);
    void (*load_end)(void);
};

extern "C" {

// Kernels, implemented outside this file.  `mult` and `add` receive the
// matrix coefficients and add terms in the element type of the kernel.
void rsdIntrinsicColorMatrix_int_K(
             void *out, void const *in, size_t count,
             FunctionTab_t const *fns,
             int16_t const *mult, int32_t const *add);

void rsdIntrinsicColorMatrix_float_K(
             void *out, void const *in, size_t count,
             FunctionTab_t const *fns,
             float const *mult, float const *add);

/* The setup functions fill in the function tables used by the kernels above;
 * that code also eliminates jump-to-another-jump cases by short-circuiting
 * empty functions.  Although it is not performance critical, it is easier to
 * write the set-up code in assembly than to expose the same symbols and
 * write it in C.
 */
void rsdIntrinsicColorMatrixSetup_int_K(
             FunctionTab_t *fns,
             uint32_t mask, int dt, int st);

void rsdIntrinsicColorMatrixSetup_float_K(
             FunctionTab_t *fns,
             uint32_t mask, int dt, int st);

}  // extern "C"
156#endif
157
158class RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
159public:
160    void populateScript(Script *) override;
161
162    void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
163
164    ~RsdCpuScriptIntrinsicColorMatrix() override;
165    RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
166
167    void preLaunch(uint32_t slot, const Allocation ** ains,
168                   uint32_t inLen, Allocation * aout, const void * usr,
169                   uint32_t usrLen, const RsScriptCall *sc) override;
170
171protected:
172    float fp[16];
173    float fpa[4];
174
175    // The following four fields are read as constants
176    // by the SIMD assembly code.
177    short ip[16];
178    int ipa[4];
179    float tmpFp[16];
180    float tmpFpa[4];
181#if defined(ARCH_ARM64_USE_INTRINSICS)
182    FunctionTab_t mFnTab;
183#endif
184
185    static void kernel(const RsExpandKernelDriverInfo *info,
186                       uint32_t xstart, uint32_t xend,
187                       uint32_t outstep);
188    void updateCoeffCache(float fpMul, float addMul);
189
190    Key_t mLastKey;
191    unsigned char *mBuf;
192    size_t mBufSize;
193
194    Key_t computeKey(const Element *ein, const Element *eout);
195
196    bool build(Key_t key);
197
198    void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count);
199
200};
201
202
203Key_t RsdCpuScriptIntrinsicColorMatrix::computeKey(
204        const Element *ein, const Element *eout) {
205
206    Key_t key;
207    key.key = 0;
208
209    // Compute a unique code key for this operation
210
211    // Add to the key the input and output types
212    bool hasFloat = false;
213    if (ein->getType() == RS_TYPE_FLOAT_32) {
214        hasFloat = true;
215        key.u.inType = RS_TYPE_FLOAT_32;
216        rsAssert(key.u.inType == RS_TYPE_FLOAT_32);
217    }
218    if (eout->getType() == RS_TYPE_FLOAT_32) {
219        hasFloat = true;
220        key.u.outType = RS_TYPE_FLOAT_32;
221        rsAssert(key.u.outType == RS_TYPE_FLOAT_32);
222    }
223
224    // Mask in the bits indicating which coefficients in the
225    // color matrix are needed.
226    if (hasFloat) {
227        for (uint32_t i=0; i < 16; i++) {
228            if (fabs(fp[i]) != 0.f) {
229                key.u.coeffMask |= 1 << i;
230            }
231        }
232        if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1;
233        if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2;
234        if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4;
235        if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8;
236
237    } else {
238        for (uint32_t i=0; i < 16; i++) {
239            if (ip[i] != 0) {
240                key.u.coeffMask |= 1 << i;
241            }
242        }
243        if (ipa[0] != 0) key.u.addMask |= 0x1;
244        if (ipa[1] != 0) key.u.addMask |= 0x2;
245        if (ipa[2] != 0) key.u.addMask |= 0x4;
246        if (ipa[3] != 0) key.u.addMask |= 0x8;
247    }
248
249    // Look for a dot product where the r,g,b colums are the same
250    if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
251        (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
252        (ip[8] == ip[9]) && (ip[8] == ip[10]) &&
253        (ip[12] == ip[13]) && (ip[12] == ip[14])) {
254
255        if (!key.u.addMask) key.u.dot = 1;
256    }
257
258    // Is alpha a simple copy
259    if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) {
260        key.u.copyAlpha = !(key.u.inType || key.u.outType);
261    }
262
263    //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
264
265    switch (ein->getVectorSize()) {
266    case 4:
267        key.u.inVecSize = 3;
268        break;
269    case 3:
270        key.u.inVecSize = 2;
271        key.u.coeffMask &= ~0xF000;
272        break;
273    case 2:
274        key.u.inVecSize = 1;
275        key.u.coeffMask &= ~0xFF00;
276        break;
277    default:
278        key.u.coeffMask &= ~0xFFF0;
279        break;
280    }
281
282    switch (eout->getVectorSize()) {
283    case 4:
284        key.u.outVecSize = 3;
285        break;
286    case 3:
287        key.u.outVecSize = 2;
288        key.u.coeffMask &= ~0x8888;
289        key.u.addMask &= 7;
290        break;
291    case 2:
292        key.u.outVecSize = 1;
293        key.u.coeffMask &= ~0xCCCC;
294        key.u.addMask &= 3;
295        break;
296    default:
297        key.u.coeffMask &= ~0xEEEE;
298        key.u.addMask &= 1;
299        break;
300    }
301
302    if (key.u.inType && !key.u.outType) {
303        key.u.addMask |= 1;
304        if (key.u.outVecSize > 0) key.u.addMask |= 2;
305        if (key.u.outVecSize > 1) key.u.addMask |= 4;
306        if (key.u.outVecSize > 2) key.u.addMask |= 8;
307    }
308
309    //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
310    return key;
311}
312
313} // namespace renderscript
314} // namespace android
315
316#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
317
// Declares the symbols associated with one code fragment:
//   _N_ColorMatrix_<frag>      first word of the fragment,
//   _N_ColorMatrix_<frag>_end  (declared but not referenced in this file),
//   _N_ColorMatrix_<frag>_len  byte count that ADD_CHUNK copies.
#define DEF_SYM(frag)                                  \
    extern "C" uint32_t _N_ColorMatrix_##frag;         \
    extern "C" uint32_t _N_ColorMatrix_##frag##_end;   \
    extern "C" uint32_t _N_ColorMatrix_##frag##_len;

// Function prologues (integer / float) and the two epilogue pieces.
DEF_SYM(prefix_i)
DEF_SYM(prefix_f)
DEF_SYM(postfix1)
DEF_SYM(postfix2)

// Loads: u8 for the integer kernel, u8f and f32 for the float kernel.
DEF_SYM(load_u8_4)
DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)
DEF_SYM(load_f32_4)
DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)

// Stores: u8 for the integer kernel, f32 and f32u for the float kernel.
DEF_SYM(store_u8_4)
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)
DEF_SYM(store_f32_4)
DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
DEF_SYM(store_f32u_2)
DEF_SYM(store_f32u_1)

// Integer kernel helpers.
DEF_SYM(unpack_u8_4)
DEF_SYM(unpack_u8_3)
DEF_SYM(unpack_u8_2)
DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)
DEF_SYM(pack_u8_3)
DEF_SYM(pack_u8_2)
DEF_SYM(pack_u8_1)
DEF_SYM(dot)
DEF_SYM(add_0_u8)
DEF_SYM(add_1_u8)
DEF_SYM(add_2_u8)
DEF_SYM(add_3_u8)
365
// Appends the code fragment named `x` to the output cursor `buf` (a byte
// pointer that must be in scope at the point of use) and advances `buf` past
// it.  The body is wrapped in do { } while (0) so the macro expands to a
// single statement and stays correct under an unbraced if/else; invoke it as
// `ADD_CHUNK(name);`.
#define ADD_CHUNK(x) \
    do { \
        memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
        buf += _N_ColorMatrix_##x##_len; \
    } while (0)
369
370
371static uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
372    size_t off = (target - buf - 8) >> 2;
373    rsAssert(((off & 0xff000000) == 0) ||
374           ((off & 0xff000000) == 0xff000000));
375
376    uint32_t op = (condition << 28);
377    op |= 0xa << 24;  // branch
378    op |= 0xffffff & off;
379    ((uint32_t *)buf)[0] = op;
380    return buf + 4;
381}
382
383static uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
384    rsAssert(vd < 32);
385    rsAssert(vm < 32);
386    rsAssert(vn < 32);
387
388    uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
389    op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
390    op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
391    return op;
392}
393
394static uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
395    //vmlal.s16 Q#1, D#1, D#2[#]
396    uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
397    ((uint32_t *)buf)[0] = op;
398    return buf + 4;
399}
400
401static uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
402    //vmull.s16 Q#1, D#1, D#2[#]
403    uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
404    ((uint32_t *)buf)[0] = op;
405    return buf + 4;
406}
407
408static uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
409    //vqadd.s32 Q#1, Q#1, Q#2
410    uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
411    ((uint32_t *)buf)[0] = op;
412    return buf + 4;
413}
414
415static uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
416    //vmlal.f32 Q#1, D#1, D#2[#]
417    uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
418    ((uint32_t *)buf)[0] = op;
419    return buf + 4;
420}
421
422static uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
423    //vmull.f32 Q#1, D#1, D#2[#]
424    uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
425    ((uint32_t *)buf)[0] = op;
426    return buf + 4;
427}
428
429static uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
430    //vadd.f32 Q#1, D#1, D#2
431    uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
432    ((uint32_t *)buf)[0] = op;
433    return buf + 4;
434}
435
436static uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
437    //vmov.32 Q#1, #imm
438    rsAssert(imm == 0);
439    uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
440    ((uint32_t *)buf)[0] = op;
441    return buf + 4;
442}
443
444static uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
445    //vadd.f32 Q#1, D#1, D#2
446    uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
447    ((uint32_t *)buf)[0] = op;
448    return buf + 4;
449}
450#endif
451
452#if defined(ARCH_X86_HAVE_SSSE3)
453extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
454                                  const short *coef, uint32_t count);
455extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
456                                  const short *coef, uint32_t count);
457extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
458                                  const short *coef, uint32_t count);
459
460using android::renderscript::Key_t;
461
462void * selectKernel(Key_t key)
463{
464    void * kernel = nullptr;
465
466    // inType, outType float if nonzero
467    if (!(key.u.inType || key.u.outType)) {
468        if (key.u.dot)
469            kernel = (void *)rsdIntrinsicColorMatrixDot_K;
470        else if (key.u.copyAlpha)
471            kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
472        else
473            kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
474    }
475
476    return kernel;
477}
478#endif
479
480namespace android {
481namespace renderscript {
482
483bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
484#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
485    mBufSize = 4096;
486    //StopWatch build_time("rs cm: build time");
487    mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
488                                  MAP_PRIVATE | MAP_ANON, -1, 0);
489    if (mBuf == MAP_FAILED) {
490        mBuf = NULL;
491        return false;
492    }
493
494    uint8_t *buf = mBuf;
495    uint8_t *buf2 = nullptr;
496
497    int ops[5][4];  // 0=unused, 1 = set, 2 = accumulate, 3 = final
498    int opInit[4] = {0, 0, 0, 0};
499
500    memset(ops, 0, sizeof(ops));
501    for (int i=0; i < 4; i++) {
502        if (key.u.coeffMask & (1 << (i*4))) {
503            ops[i][0] = 0x2 | opInit[0];
504            opInit[0] = 1;
505        }
506        if (!key.u.dot) {
507            if (key.u.coeffMask & (1 << (1 + i*4))) {
508                ops[i][1] = 0x2 | opInit[1];
509                opInit[1] = 1;
510            }
511            if (key.u.coeffMask & (1 << (2 + i*4))) {
512                ops[i][2] = 0x2 | opInit[2];
513                opInit[2] = 1;
514            }
515        }
516        if (!key.u.copyAlpha) {
517            if (key.u.coeffMask & (1 << (3 + i*4))) {
518                ops[i][3] = 0x2 | opInit[3];
519                opInit[3] = 1;
520            }
521        }
522    }
523
524    if (key.u.inType || key.u.outType) {
525        key.u.copyAlpha = 0;
526        ADD_CHUNK(prefix_f);
527        buf2 = buf;
528
529        // Load the incoming r,g,b,a as needed
530        if (key.u.inType) {
531            switch(key.u.inVecSize) {
532            case 3:
533                ADD_CHUNK(load_f32_4);
534                break;
535            case 2:
536                ADD_CHUNK(load_f32_3);
537                break;
538            case 1:
539                ADD_CHUNK(load_f32_2);
540                break;
541            case 0:
542                ADD_CHUNK(load_f32_1);
543                break;
544            }
545        } else {
546            switch(key.u.inVecSize) {
547            case 3:
548                ADD_CHUNK(load_u8f_4);
549                break;
550            case 2:
551                ADD_CHUNK(load_u8f_3);
552                break;
553            case 1:
554                ADD_CHUNK(load_u8f_2);
555                break;
556            case 0:
557                ADD_CHUNK(load_u8f_1);
558                break;
559            }
560        }
561
562        for (int i=0; i < 4; i++) {
563            for (int j=0; j < 4; j++) {
564                switch(ops[i][j]) {
565                case 0:
566                    break;
567                case 2:
568                    buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
569                    break;
570                case 3:
571                    buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
572                    break;
573                }
574            }
575        }
576        for (int j=0; j < 4; j++) {
577            if (opInit[j]) {
578                if (key.u.addMask & (1 << j)) {
579                    buf = addVADD_F32(buf, j, 12+j, 8+j);
580                } else {
581                    buf = addVORR_32(buf, j, 12+j, 12+j);
582                }
583            } else {
584                if (key.u.addMask & (1 << j)) {
585                    buf = addVORR_32(buf, j, 8+j, 8+j);
586                } else {
587                    buf = addVMOV_32(buf, j, 0);
588                }
589            }
590        }
591
592        if (key.u.outType) {
593            switch(key.u.outVecSize) {
594            case 3:
595                ADD_CHUNK(store_f32_4);
596                break;
597            case 2:
598                ADD_CHUNK(store_f32_3);
599                break;
600            case 1:
601                ADD_CHUNK(store_f32_2);
602                break;
603            case 0:
604                ADD_CHUNK(store_f32_1);
605                break;
606            }
607        } else {
608            switch(key.u.outVecSize) {
609            case 3:
610            case 2:
611                ADD_CHUNK(store_f32u_4);
612                break;
613            case 1:
614                ADD_CHUNK(store_f32u_2);
615                break;
616            case 0:
617                ADD_CHUNK(store_f32u_1);
618                break;
619            }
620        }
621
622
623    } else {
624        // Add the function prefix
625        // Store the address for the loop return
626        ADD_CHUNK(prefix_i);
627        buf2 = buf;
628
629        // Load the incoming r,g,b,a as needed
630        switch(key.u.inVecSize) {
631        case 3:
632            ADD_CHUNK(load_u8_4);
633            if (key.u.copyAlpha) {
634                ADD_CHUNK(unpack_u8_3);
635            } else {
636                ADD_CHUNK(unpack_u8_4);
637            }
638            break;
639        case 2:
640            ADD_CHUNK(load_u8_3);
641            ADD_CHUNK(unpack_u8_3);
642            break;
643        case 1:
644            ADD_CHUNK(load_u8_2);
645            ADD_CHUNK(unpack_u8_2);
646            break;
647        case 0:
648            ADD_CHUNK(load_u8_1);
649            ADD_CHUNK(unpack_u8_1);
650            break;
651        }
652
653        // Add multiply and accumulate
654        // use MULL to init the output register,
655        // use MLAL from there
656        for (int i=0; i < 4; i++) {
657            for (int j=0; j < 4; j++) {
658                switch(ops[i][j]) {
659                case 0:
660                    break;
661                case 2:
662                    buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
663                    break;
664                case 3:
665                    buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
666                    break;
667                }
668            }
669        }
670        for (int j=0; j < 4; j++) {
671            if (opInit[j]) {
672                if (key.u.addMask & (1 << j)) {
673                    buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
674                }
675            } else {
676                if (key.u.addMask & (1 << j)) {
677                    buf = addVORR_32(buf, 8+j, 4+j, 4+j);
678                }
679            }
680        }
681
682        // If we have a dot product, perform the special pack.
683        if (key.u.dot) {
684            ADD_CHUNK(pack_u8_1);
685            ADD_CHUNK(dot);
686        } else {
687            switch(key.u.outVecSize) {
688            case 3:
689                if (key.u.copyAlpha) {
690                    ADD_CHUNK(pack_u8_3);
691                } else {
692                    ADD_CHUNK(pack_u8_4);
693                }
694                break;
695            case 2:
696                ADD_CHUNK(pack_u8_3);
697                break;
698            case 1:
699                ADD_CHUNK(pack_u8_2);
700                break;
701            case 0:
702                ADD_CHUNK(pack_u8_1);
703                break;
704            }
705        }
706
707        // Write out result
708        switch(key.u.outVecSize) {
709        case 3:
710        case 2:
711            ADD_CHUNK(store_u8_4);
712            break;
713        case 1:
714            ADD_CHUNK(store_u8_2);
715            break;
716        case 0:
717            ADD_CHUNK(store_u8_1);
718            break;
719        }
720    }
721
722    if (key.u.inType != key.u.outType) {
723        key.u.copyAlpha = 0;
724        key.u.dot = 0;
725    }
726
727    // Loop, branch, and cleanup
728    ADD_CHUNK(postfix1);
729    buf = addBranch(buf, buf2, 0x01);
730    ADD_CHUNK(postfix2);
731
732    int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
733    if (ret == -1) {
734        ALOGE("mprotect error %i", ret);
735        return false;
736    }
737
738    __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize);
739    return true;
740#else
741    return false;
742#endif
743}
744
745void RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) {
746    for(int ct=0; ct < 16; ct++) {
747        ip[ct] = (short)(fp[ct] * 256.f + 0.5f);
748        tmpFp[ct] = fp[ct] * fpMul;
749        //ALOGE("mat %i %f  %f", ct, fp[ct], tmpFp[ct]);
750    }
751
752    float add = 0.f;
753    if (fpMul > 254.f) add = 0.5f;
754    for(int ct=0; ct < 4; ct++) {
755        tmpFpa[ct] = fpa[ct] * addMul + add;
756        //ALOGE("fpa %i %f  %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
757    }
758
759    for(int ct=0; ct < 4; ct++) {
760        ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f);
761    }
762}
763
764void RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
765                                                    size_t dataLength) {
766    switch(slot) {
767    case 0:
768        memcpy (fp, data, sizeof(fp));
769        break;
770    case 1:
771        memcpy (fpa, data, sizeof(fpa));
772        break;
773    default:
774        rsAssert(0);
775        break;
776    }
777    mRootPtr = &kernel;
778}
779
780
781static void One(const RsExpandKernelDriverInfo *info, void *out,
782                const void *py, const float* coeff, const float *add,
783                uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
784
785    float4 f = 0.f;
786    if (fin) {
787        switch(vsin) {
788        case 3:
789            f = ((const float4 *)py)[0];
790            break;
791        case 2:
792            f = ((const float4 *)py)[0];
793            f.w = 0.f;
794            break;
795        case 1:
796            f.xy = ((const float2 *)py)[0];
797            break;
798        case 0:
799            f.x = ((const float *)py)[0];
800            break;
801        }
802    } else {
803        switch(vsin) {
804        case 3:
805            f = convert_float4(((const uchar4 *)py)[0]);
806            break;
807        case 2:
808            f = convert_float4(((const uchar4 *)py)[0]);
809            f.w = 0.f;
810            break;
811        case 1:
812            f.xy = convert_float2(((const uchar2 *)py)[0]);
813            break;
814        case 0:
815            f.x = (float)(((const uchar *)py)[0]);
816            break;
817        }
818    }
819    //ALOGE("f1  %f %f %f %f", f.x, f.y, f.z, f.w);
820
821    float4 sum;
822    sum.x = f.x * coeff[0] +
823            f.y * coeff[4] +
824            f.z * coeff[8] +
825            f.w * coeff[12];
826    sum.y = f.x * coeff[1] +
827            f.y * coeff[5] +
828            f.z * coeff[9] +
829            f.w * coeff[13];
830    sum.z = f.x * coeff[2] +
831            f.y * coeff[6] +
832            f.z * coeff[10] +
833            f.w * coeff[14];
834    sum.w = f.x * coeff[3] +
835            f.y * coeff[7] +
836            f.z * coeff[11] +
837            f.w * coeff[15];
838    //ALOGE("f2  %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
839
840    sum.x += add[0];
841    sum.y += add[1];
842    sum.z += add[2];
843    sum.w += add[3];
844
845
846    //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
847    if (fout) {
848        switch(vsout) {
849        case 3:
850        case 2:
851            ((float4 *)out)[0] = sum;
852            break;
853        case 1:
854            ((float2 *)out)[0] = sum.xy;
855            break;
856        case 0:
857            ((float *)out)[0] = sum.x;
858            break;
859        }
860    } else {
861        sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
862        sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
863        sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
864        sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
865
866        switch(vsout) {
867        case 3:
868        case 2:
869            ((uchar4 *)out)[0] = convert_uchar4(sum);
870            break;
871        case 1:
872            ((uchar2 *)out)[0] = convert_uchar2(sum.xy);
873            break;
874        case 0:
875            ((uchar *)out)[0] = sum.x;
876            break;
877        }
878    }
879    //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
880}
881
882void RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelDriverInfo *info,
883                                              uint32_t xstart, uint32_t xend,
884                                              uint32_t outstep) {
885    RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)info->usr;
886
887    uint32_t instep = info->inStride[0];
888
889    uchar *out = (uchar *)info->outPtr[0];
890    uchar *in = (uchar *)info->inPtr[0];
891    uint32_t x1 = xstart;
892    uint32_t x2 = xend;
893
894    uint32_t vsin = cp->mLastKey.u.inVecSize;
895    uint32_t vsout = cp->mLastKey.u.outVecSize;
896    bool floatIn = !!cp->mLastKey.u.inType;
897    bool floatOut = !!cp->mLastKey.u.outType;
898
899    //if (!info->current.y) ALOGE("steps %i %i   %i %i", instep, outstep, vsin, vsout);
900
901    if(x2 > x1) {
902        int32_t len = x2 - x1;
903        if (gArchUseSIMD) {
904            if((cp->mOptKernel != nullptr) && (len >= 4)) {
905                // The optimized kernel processes 4 pixels at once
906                // and requires a minimum of 1 chunk of 4
907                cp->mOptKernel(out, in, cp->ip, len >> 2);
908                // Update the len and pointers so the generic code can
909                // finish any leftover pixels
910                len &= ~3;
911                x1 += len;
912                out += outstep * len;
913                in += instep * len;
914            }
915#if defined(ARCH_ARM64_USE_INTRINSICS)
916            else {
917                if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
918                    // Currently this generates off by one errors.
919                    //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
920                    //x1 += len;
921                    //out += outstep * len;
922                    //in += instep * len;
923                } else {
924                    rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
925                    x1 += len;
926                    out += outstep * len;
927                    in += instep * len;
928                }
929            }
930#endif
931        }
932
933        while(x1 != x2) {
934            One(info, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut);
935            out += outstep;
936            in += instep;
937            x1++;
938        }
939    }
940}
941
942void RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot,
943                                                 const Allocation ** ains,
944                                                 uint32_t inLen,
945                                                 Allocation * aout,
946                                                 const void * usr,
947                                                 uint32_t usrLen,
948                                                 const RsScriptCall *sc) {
949
950    const Element *ein = ains[0]->mHal.state.type->getElement();
951    const Element *eout = aout->mHal.state.type->getElement();
952
953    if (ein->getType() == eout->getType()) {
954        if (eout->getType() == RS_TYPE_UNSIGNED_8) {
955            updateCoeffCache(1.f, 255.f);
956        } else {
957            updateCoeffCache(1.f, 1.f);
958        }
959    } else {
960        if (eout->getType() == RS_TYPE_UNSIGNED_8) {
961            updateCoeffCache(255.f, 255.f);
962        } else {
963            updateCoeffCache(1.f / 255.f, 1.f);
964        }
965    }
966
967    Key_t key = computeKey(ein, eout);
968
969#if defined(ARCH_X86_HAVE_SSSE3)
970    if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
971        // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
972        // mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) selectKernel(key);
973        mLastKey = key;
974    }
975
976#else //if !defined(ARCH_X86_HAVE_SSSE3)
977    if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
978        if (mBuf) munmap(mBuf, mBufSize);
979        mBuf = nullptr;
980        mOptKernel = nullptr;
981        if (build(key)) {
982            mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
983        }
984#if defined(ARCH_ARM64_USE_INTRINSICS)
985        else {
986            int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
987            int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
988            uint32_t mm = 0;
989            int i;
990            for (i = 0; i < 4; i++)
991            {
992                uint32_t m = (key.u.coeffMask >> i) & 0x1111;
993                m = ((m * 0x249) >> 9) & 15;
994                m |= ((key.u.addMask >> i) & 1) << 4;
995                mm |= m << (i * 5);
996            }
997
998            if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
999                rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
1000            } else {
1001                rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
1002            }
1003        }
1004#endif
1005        mLastKey = key;
1006    }
1007#endif //if !defined(ARCH_X86_HAVE_SSSE3)
1008}
1009
1010RsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
1011            RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
1012            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
1013
1014    mLastKey.key = 0;
1015    mBuf = nullptr;
1016    mBufSize = 0;
1017    mOptKernel = nullptr;
1018    const static float defaultMatrix[] = {
1019        1.f, 0.f, 0.f, 0.f,
1020        0.f, 1.f, 0.f, 0.f,
1021        0.f, 0.f, 1.f, 0.f,
1022        0.f, 0.f, 0.f, 1.f
1023    };
1024    const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f};
1025    setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));
1026    setGlobalVar(1, defaultAdd, sizeof(defaultAdd));
1027}
1028
1029RsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
1030    if (mBuf) munmap(mBuf, mBufSize);
1031    mBuf = nullptr;
1032    mOptKernel = nullptr;
1033}
1034
1035void RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
1036    s->mHal.info.exportedVariableCount = 2;
1037}
1038
1039RsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
1040                                            const Script *s, const Element *e) {
1041
1042    return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e);
1043}
1044
1045} // namespace renderscript
1046} // namespace android
1047