1709a0978ae141198018ca9769f8d96292a8928e6Jason Sams/*
2709a0978ae141198018ca9769f8d96292a8928e6Jason Sams * Copyright (C) 2012 The Android Open Source Project
3709a0978ae141198018ca9769f8d96292a8928e6Jason Sams *
4709a0978ae141198018ca9769f8d96292a8928e6Jason Sams * Licensed under the Apache License, Version 2.0 (the "License");
5709a0978ae141198018ca9769f8d96292a8928e6Jason Sams * you may not use this file except in compliance with the License.
6709a0978ae141198018ca9769f8d96292a8928e6Jason Sams * You may obtain a copy of the License at
7709a0978ae141198018ca9769f8d96292a8928e6Jason Sams *
8709a0978ae141198018ca9769f8d96292a8928e6Jason Sams *      http://www.apache.org/licenses/LICENSE-2.0
9709a0978ae141198018ca9769f8d96292a8928e6Jason Sams *
10709a0978ae141198018ca9769f8d96292a8928e6Jason Sams * Unless required by applicable law or agreed to in writing, software
11709a0978ae141198018ca9769f8d96292a8928e6Jason Sams * distributed under the License is distributed on an "AS IS" BASIS,
12709a0978ae141198018ca9769f8d96292a8928e6Jason Sams * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13709a0978ae141198018ca9769f8d96292a8928e6Jason Sams * See the License for the specific language governing permissions and
14709a0978ae141198018ca9769f8d96292a8928e6Jason Sams * limitations under the License.
15709a0978ae141198018ca9769f8d96292a8928e6Jason Sams */
16709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
179b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams#include <sys/mman.h>
189b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams#include <unistd.h>
19709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
20709a0978ae141198018ca9769f8d96292a8928e6Jason Sams#include "rsCpuIntrinsic.h"
21709a0978ae141198018ca9769f8d96292a8928e6Jason Sams#include "rsCpuIntrinsicInlines.h"
229b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
239b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams#include <sys/mman.h>
249b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams#include <stddef.h>
259b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams#include <stdint.h>
269b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams#include <stdlib.h>
279b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams//#include <utils/StopWatch.h>
289b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
29709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
30a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams/*  uint kernel
31a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q0  D0:  Load slot for R
32a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D1:  Load slot for G
33a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q1  D2:  Load slot for B
34a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D3:  Load slot for A
35a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q2  D4:  Matrix
36a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D5:  =
37a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q3  D6:  =
38a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D7:  =
39a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q4  D8:  Add R
40a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D9:
41a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q5  D10: Add G
42a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D11:
43a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q6  D12: Add B
44a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D13:
45a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q7  D14: Add A
46a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D15:
47a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q8  D16:  I32: R Sum
48a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D17:
49a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q9  D18:  I32: G Sum
50a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D19:
51a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q10 D20:  I32: B Sum
52a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D21:
53a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q11 D22:  I32: A Sum
54a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D23:
55a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q12 D24:  U16: expanded R
56a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D25:
57a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q13 D26:  U16: expanded G
58a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D27:
59a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q14 D28:  U16: expanded B
60a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D29:
61a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q15 D30:  U16: expanded A
62a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D31:
63a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *
64a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams */
65a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams
66a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams/*  float kernel
67a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q0  D0:  Load slot for R
68a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D1:  =
69a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q1  D2:  Load slot for G
70a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D3:  =
71a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q2  D4:  Load slot for B
72a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D5:  =
73a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q3  D6:  Load slot for A
74a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D7:  =
75a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q4  D8:  Matrix
76a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D9:  =
77a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q5  D10: =
78a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D11: =
79a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q6  D12: =
80a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D13: =
81a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q7  D14: =
82a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D15: =
83a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q8  D16: Add R
84a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D17: =
85a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q9  D18: Add G
86a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D19: =
87a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q10 D20: Add B
88a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D21: =
89a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q11 D22: Add A
90a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D23: =
91a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q12 D24: Sum R
92a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D25: =
93a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q13 D26: Sum G
94a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D27: =
95a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q14 D28: Sum B
96a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D29: =
97a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *  Q15 D30: Sum A
98a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *      D31: =
99a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams *
100a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams */
101a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams
102a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams
103a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams
104709a0978ae141198018ca9769f8d96292a8928e6Jason Samsnamespace android {
105709a0978ae141198018ca9769f8d96292a8928e6Jason Samsnamespace renderscript {
106709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
/*
 * Key describing one specialization of the color-matrix kernel.
 *
 * The individual properties live in the bit-fields of `u`; `key` overlays the
 * same storage as a single 64-bit integer so that all fields can be cleared
 * with one store (computeKey() starts from `key.key = 0`).
 *
 * The bit ranges in the trailing comments describe the intended packing; the
 * actual placement of bit-fields is implementation-defined.
 */
typedef union {
    uint64_t key;
    struct {
        uint32_t inVecSize          :2;  // [0 - 1]   input vector size minus one
        uint32_t outVecSize         :2;  // [2 - 3]   output vector size minus one
        uint32_t inType             :4;  // [4 - 7]   non-zero when the input is float
        uint32_t outType            :4;  // [8 - 11]  non-zero when the output is float
        uint32_t dot                :1;  // [12]      r,g,b columns identical and no add
        uint32_t _unused1           :1;  // [13]
        uint32_t copyAlpha          :1;  // [14]      alpha passes through unchanged
        uint32_t _unused2           :1;  // [15]
        uint32_t coeffMask          :16; // [16-31]   one bit per matrix entry in use
        uint32_t addMask            :4;  // [32-35]   one bit per add term in use
    } u;
} Key_t;

// Writing `key` must cover every bit-field; otherwise `key.key = 0` would
// leave part of `u` uninitialized.
static_assert(sizeof(Key_t) == sizeof(uint64_t),
              "Key_t bit-fields must fit inside the 64-bit key");
122709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
1236a45ddb32f391060aa05da6ff09c4814d450586eTim Murray//Re-enable when intrinsic is fixed
12432f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams#if defined(ARCH_ARM64_USE_INTRINSICS)
/*
 * Table of code entry points handed to the rsdIntrinsicColorMatrix_*_K
 * kernels and filled in by the rsdIntrinsicColorMatrixSetup_*_K functions.
 *
 * NOTE(review): the member order is presumably relied upon by the assembly
 * implementation - do not reorder without checking it.
 */
typedef struct {
    void (*column[4])(void);   // one entry per matrix column
    void (*store)(void);       // store routine
    void (*load)(void);        // load routine
    void (*store_end)(void);   // store routine, "_end" variant
    void (*load_end)(void);    // load routine, "_end" variant
} FunctionTab_t;
1320462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie
1336e7e258316f72be95039278e88e3bc1daea1668fSimon Hosieextern "C" void rsdIntrinsicColorMatrix_int_K(
1340462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie             void *out, void const *in, size_t count,
1350462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie             FunctionTab_t const *fns,
1360462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie             int16_t const *mult, int32_t const *add);
1370462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie
1386e7e258316f72be95039278e88e3bc1daea1668fSimon Hosieextern "C" void rsdIntrinsicColorMatrix_float_K(
1390462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie             void *out, void const *in, size_t count,
1400462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie             FunctionTab_t const *fns,
1410462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie             float const *mult, float const *add);
1420462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie
1436e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie/* The setup functions fill in function tables to be used by above functions;
1446e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie * this code also eliminates jump-to-another-jump cases by short-circuiting
1456e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie * empty functions.  While it's not performance critical, it works out easier
1466e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie * to write the set-up code in assembly than to try to expose the same symbols
1476e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie * and write the code in C.
1486e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie */
1496e7e258316f72be95039278e88e3bc1daea1668fSimon Hosieextern "C" void rsdIntrinsicColorMatrixSetup_int_K(
1506e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie             FunctionTab_t *fns,
1516e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie             uint32_t mask, int dt, int st);
1526e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie
1530462a39371659d1eeed5eb48dd6d507760301c22Simon Hosieextern "C" void rsdIntrinsicColorMatrixSetup_float_K(
1546e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie             FunctionTab_t *fns,
1550462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie             uint32_t mask, int dt, int st);
1560462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie#endif
1570462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie
158709a0978ae141198018ca9769f8d96292a8928e6Jason Samsclass RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic {
159709a0978ae141198018ca9769f8d96292a8928e6Jason Samspublic:
160c060f1435e7b9405f3be8974417fa6f410f03753Stephen Hines    void populateScript(Script *) override;
161709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
162c060f1435e7b9405f3be8974417fa6f410f03753Stephen Hines    void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
163709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
164c060f1435e7b9405f3be8974417fa6f410f03753Stephen Hines    ~RsdCpuScriptIntrinsicColorMatrix() override;
165c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams    RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
166709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
167c060f1435e7b9405f3be8974417fa6f410f03753Stephen Hines    void preLaunch(uint32_t slot, const Allocation ** ains,
168c060f1435e7b9405f3be8974417fa6f410f03753Stephen Hines                   uint32_t inLen, Allocation * aout, const void * usr,
169c060f1435e7b9405f3be8974417fa6f410f03753Stephen Hines                   uint32_t usrLen, const RsScriptCall *sc) override;
1709b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
171709a0978ae141198018ca9769f8d96292a8928e6Jason Samsprotected:
172709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    float fp[16];
1732b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    float fpa[4];
174a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams
1752b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    // The following four fields are read as constants
1762b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    // by the SIMD assembly code.
177709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    short ip[16];
1780462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie    int ipa[4];
1792b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    float tmpFp[16];
1800462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie    float tmpFpa[4];
18132f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams#if defined(ARCH_ARM64_USE_INTRINSICS)
1820462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie    FunctionTab_t mFnTab;
1830462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie#endif
1849b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
185b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    static void kernel(const RsExpandKernelDriverInfo *info,
1869b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams                       uint32_t xstart, uint32_t xend,
1879ed79105cc6a8dbfaf959875249f36022cc2c798Chris Wailes                       uint32_t outstep);
1889e4a96af136dab5b21a37580d17cbcb89872114eJason Sams    void updateCoeffCache(float fpMul, float addMul);
1899b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
190a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    Key_t mLastKey;
1919b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    unsigned char *mBuf;
1929b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    size_t mBufSize;
1939b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
194a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    Key_t computeKey(const Element *ein, const Element *eout);
1959b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
196a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    bool build(Key_t key);
1979b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
1989b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count);
199709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
200709a0978ae141198018ca9769f8d96292a8928e6Jason Sams};
201709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
202709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
203a65de10aabdee0794d0e9c96db944e990166ef0dJason SamsKey_t RsdCpuScriptIntrinsicColorMatrix::computeKey(
2049b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams        const Element *ein, const Element *eout) {
2059b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
206a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    Key_t key;
207a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    key.key = 0;
2089b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
2099b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    // Compute a unique code key for this operation
2109b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
2119b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    // Add to the key the input and output types
2129b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    bool hasFloat = false;
2139b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    if (ein->getType() == RS_TYPE_FLOAT_32) {
2149b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams        hasFloat = true;
215a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        key.u.inType = RS_TYPE_FLOAT_32;
216a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        rsAssert(key.u.inType == RS_TYPE_FLOAT_32);
2179b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    }
2189b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    if (eout->getType() == RS_TYPE_FLOAT_32) {
2199b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams        hasFloat = true;
220a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        key.u.outType = RS_TYPE_FLOAT_32;
221a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        rsAssert(key.u.outType == RS_TYPE_FLOAT_32);
222709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    }
223709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
2249b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    // Mask in the bits indicating which coefficients in the
2259b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    // color matrix are needed.
2269b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    if (hasFloat) {
2279b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams        for (uint32_t i=0; i < 16; i++) {
2289b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams            if (fabs(fp[i]) != 0.f) {
229a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams                key.u.coeffMask |= 1 << i;
2309b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams            }
2319b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams        }
232a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1;
2332b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams        if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2;
2342b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams        if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4;
2352b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams        if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8;
236a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams
2379b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    } else {
2389b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams        for (uint32_t i=0; i < 16; i++) {
2399b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams            if (ip[i] != 0) {
240a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams                key.u.coeffMask |= 1 << i;
2419b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams            }
2429b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams        }
243a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        if (ipa[0] != 0) key.u.addMask |= 0x1;
2440462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie        if (ipa[1] != 0) key.u.addMask |= 0x2;
2450462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie        if (ipa[2] != 0) key.u.addMask |= 0x4;
2460462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie        if (ipa[3] != 0) key.u.addMask |= 0x8;
2479b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    }
2489b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
2499b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    // Look for a dot product where the r,g,b colums are the same
2509b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    if ((ip[0] == ip[1]) && (ip[0] == ip[2]) &&
2519b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams        (ip[4] == ip[5]) && (ip[4] == ip[6]) &&
2529b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams        (ip[8] == ip[9]) && (ip[8] == ip[10]) &&
2539b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams        (ip[12] == ip[13]) && (ip[12] == ip[14])) {
2549b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
255a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        if (!key.u.addMask) key.u.dot = 1;
2569b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    }
2579b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
2589b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    // Is alpha a simple copy
259a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) {
2609e4a96af136dab5b21a37580d17cbcb89872114eJason Sams        key.u.copyAlpha = !(key.u.inType || key.u.outType);
261a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    }
262a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams
263a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
264a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams
265a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    switch (ein->getVectorSize()) {
266a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    case 4:
267a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        key.u.inVecSize = 3;
268a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        break;
269a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    case 3:
270a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        key.u.inVecSize = 2;
271a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        key.u.coeffMask &= ~0xF000;
272a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        break;
273a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    case 2:
274a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        key.u.inVecSize = 1;
275a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        key.u.coeffMask &= ~0xFF00;
276a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        break;
277a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    default:
278a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        key.u.coeffMask &= ~0xFFF0;
279a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        break;
280a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    }
281a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams
282a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    switch (eout->getVectorSize()) {
283a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    case 4:
284a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        key.u.outVecSize = 3;
285a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        break;
286a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    case 3:
287a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        key.u.outVecSize = 2;
288a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        key.u.coeffMask &= ~0x8888;
2890462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie        key.u.addMask &= 7;
290a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        break;
291a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    case 2:
292a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        key.u.outVecSize = 1;
293a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        key.u.coeffMask &= ~0xCCCC;
2940462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie        key.u.addMask &= 3;
295a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        break;
296a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    default:
297a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        key.u.coeffMask &= ~0xEEEE;
2980462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie        key.u.addMask &= 1;
299a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        break;
3009b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    }
3019b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
3029e4a96af136dab5b21a37580d17cbcb89872114eJason Sams    if (key.u.inType && !key.u.outType) {
3039e4a96af136dab5b21a37580d17cbcb89872114eJason Sams        key.u.addMask |= 1;
3049e4a96af136dab5b21a37580d17cbcb89872114eJason Sams        if (key.u.outVecSize > 0) key.u.addMask |= 2;
3059e4a96af136dab5b21a37580d17cbcb89872114eJason Sams        if (key.u.outVecSize > 1) key.u.addMask |= 4;
3069e4a96af136dab5b21a37580d17cbcb89872114eJason Sams        if (key.u.outVecSize > 2) key.u.addMask |= 8;
3079e4a96af136dab5b21a37580d17cbcb89872114eJason Sams    }
3089e4a96af136dab5b21a37580d17cbcb89872114eJason Sams
309a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key);
3109b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    return key;
3119b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams}
3129b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
313462de21ac2e1773b99aedee012adb374e476ae36Chih-Hung Hsieh} // namespace renderscript
314462de21ac2e1773b99aedee012adb374e476ae36Chih-Hung Hsieh} // namespace android
315462de21ac2e1773b99aedee012adb374e476ae36Chih-Hung Hsieh
316074424a4ac5b093331df2c92e7a5bcbfff136b71Jason Sams#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
3179b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
// Declares the three external uint32_t symbols associated with code fragment
// `x`: the fragment itself, `<x>_end`, and `<x>_len`.  ADD_CHUNK copies
// `<x>_len` bytes starting at the fragment symbol.
// NOTE(review): `<x>_end` presumably marks the end of the fragment - confirm
// against the assembly source that defines these symbols.
#define DEF_SYM(x)                                      \
    extern "C" uint32_t _N_ColorMatrix_##x;             \
    extern "C" uint32_t _N_ColorMatrix_##x##_end;       \
    extern "C" uint32_t _N_ColorMatrix_##x##_len;

// Prefix / postfix fragments.
DEF_SYM(prefix_i)
DEF_SYM(prefix_f)
DEF_SYM(postfix1)
DEF_SYM(postfix2)

// Load fragments.
DEF_SYM(load_u8_4)
DEF_SYM(load_u8_3)
DEF_SYM(load_u8_2)
DEF_SYM(load_u8_1)
DEF_SYM(load_u8f_4)
DEF_SYM(load_u8f_3)
DEF_SYM(load_u8f_2)
DEF_SYM(load_u8f_1)
DEF_SYM(load_f32_4)
DEF_SYM(load_f32_3)
DEF_SYM(load_f32_2)
DEF_SYM(load_f32_1)

// Store fragments.
DEF_SYM(store_u8_4)
DEF_SYM(store_u8_2)
DEF_SYM(store_u8_1)
DEF_SYM(store_f32_4)
DEF_SYM(store_f32_3)
DEF_SYM(store_f32_2)
DEF_SYM(store_f32_1)
DEF_SYM(store_f32u_4)
DEF_SYM(store_f32u_2)
DEF_SYM(store_f32u_1)

// Unpack / pack, dot and add fragments.
DEF_SYM(unpack_u8_4)
DEF_SYM(unpack_u8_3)
DEF_SYM(unpack_u8_2)
DEF_SYM(unpack_u8_1)
DEF_SYM(pack_u8_4)
DEF_SYM(pack_u8_3)
DEF_SYM(pack_u8_2)
DEF_SYM(pack_u8_1)
DEF_SYM(dot)
DEF_SYM(add_0_u8)
DEF_SYM(add_1_u8)
DEF_SYM(add_2_u8)
DEF_SYM(add_3_u8)
3659b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
// Appends code fragment `x` (declared with DEF_SYM) to the output position
// `buf`, which must be a byte pointer in scope at the point of use, and
// advances `buf` past the copied bytes.
//
// Wrapped in do { } while (0) so the macro behaves as one statement: it is
// safe under an unbraced if/else and still requires the trailing semicolon
// at the call site.
#define ADD_CHUNK(x) \
    do { \
        memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \
        buf += _N_ColorMatrix_##x##_len; \
    } while (0)
3699b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
3709b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
3719b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Samsstatic uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) {
3729b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    size_t off = (target - buf - 8) >> 2;
3739b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    rsAssert(((off & 0xff000000) == 0) ||
3749b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams           ((off & 0xff000000) == 0xff000000));
3759b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
3769b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    uint32_t op = (condition << 28);
3779b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    op |= 0xa << 24;  // branch
3789b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    op |= 0xffffff & off;
3799b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    ((uint32_t *)buf)[0] = op;
3809b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    return buf + 4;
3819b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams}
3829b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
3832b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Samsstatic uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) {
3849b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    rsAssert(vd < 32);
3859b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    rsAssert(vm < 32);
3869b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    rsAssert(vn < 32);
3879b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
3889b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22);
3899b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5);
3909b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7);
3919b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    return op;
3929b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams}
393709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
3949b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Samsstatic uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
3959b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    //vmlal.s16 Q#1, D#1, D#2[#]
3962b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
3979b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    ((uint32_t *)buf)[0] = op;
3989b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    return buf + 4;
3999b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams}
4009b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
4019b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Samsstatic uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
4029b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    //vmull.s16 Q#1, D#1, D#2[#]
4032b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    uint32_t op = 0xf2900A40 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3));
4049b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    ((uint32_t *)buf)[0] = op;
4059b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    return buf + 4;
4069b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams}
407a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams
408a65de10aabdee0794d0e9c96db944e990166ef0dJason Samsstatic uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
409c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosie    //vqadd.s32 Q#1, Q#1, Q#2
4102b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
411a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    ((uint32_t *)buf)[0] = op;
412a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    return buf + 4;
413a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams}
414a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams
415a65de10aabdee0794d0e9c96db944e990166ef0dJason Samsstatic uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
416a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    //vmlal.f32 Q#1, D#1, D#2[#]
4172b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
418a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    ((uint32_t *)buf)[0] = op;
419a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    return buf + 4;
420a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams}
421a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams
422a65de10aabdee0794d0e9c96db944e990166ef0dJason Samsstatic uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) {
423a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    //vmull.f32 Q#1, D#1, D#2[#]
4242b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4));
4252b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    ((uint32_t *)buf)[0] = op;
4262b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    return buf + 4;
4272b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams}
4282b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams
4292b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Samsstatic uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
4302b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    //vadd.f32 Q#1, D#1, D#2
4312b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
432a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    ((uint32_t *)buf)[0] = op;
433a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    return buf + 4;
434a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams}
435a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams
436c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosiestatic uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) {
437c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosie    //vmov.32 Q#1, #imm
438c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosie    rsAssert(imm == 0);
439c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosie    uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0);
440c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosie    ((uint32_t *)buf)[0] = op;
441c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosie    return buf + 4;
442c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosie}
443c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosie
444a65de10aabdee0794d0e9c96db944e990166ef0dJason Samsstatic uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) {
445a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    //vadd.f32 Q#1, D#1, D#2
4462b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1);
447a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    ((uint32_t *)buf)[0] = op;
448a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    return buf + 4;
449a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams}
4509b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams#endif
4519b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
4527b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#if defined(ARCH_X86_HAVE_SSSE3)
// Color-matrix kernels used when ARCH_X86_HAVE_SSSE3 is defined.  They are
// only declared here; selectKernel() chooses between them (dot product,
// 3x3, or 4x4).  All three share one signature: destination, source,
// 16-bit coefficient table, and an element count.
// NOTE(review): definitions presumably live in a separate SSSE3 source
// file - confirm.
extern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src,
                                  const short *coef, uint32_t count);
extern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src,
                                  const short *coef, uint32_t count);
extern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src,
                                  const short *coef, uint32_t count);
4597b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
460462de21ac2e1773b99aedee012adb374e476ae36Chih-Hung Hsiehusing android::renderscript::Key_t;
461462de21ac2e1773b99aedee012adb374e476ae36Chih-Hung Hsieh
4627b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesvoid * selectKernel(Key_t key)
4637b7060c61e4182b29186849c5a857ea5f0898e56Rose, James{
46444bef6fba6244292b751387f3d6c31cca96c28adChris Wailes    void * kernel = nullptr;
4657b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4667b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    // inType, outType float if nonzero
4677b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    if (!(key.u.inType || key.u.outType)) {
4687b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        if (key.u.dot)
4697b7060c61e4182b29186849c5a857ea5f0898e56Rose, James            kernel = (void *)rsdIntrinsicColorMatrixDot_K;
4707b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        else if (key.u.copyAlpha)
4717b7060c61e4182b29186849c5a857ea5f0898e56Rose, James            kernel = (void *)rsdIntrinsicColorMatrix3x3_K;
4727b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        else
4737b7060c61e4182b29186849c5a857ea5f0898e56Rose, James            kernel = (void *)rsdIntrinsicColorMatrix4x4_K;
4747b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    }
4757b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
4767b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    return kernel;
4777b7060c61e4182b29186849c5a857ea5f0898e56Rose, James}
4787b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#endif
4799b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
480462de21ac2e1773b99aedee012adb374e476ae36Chih-Hung Hsiehnamespace android {
481462de21ac2e1773b99aedee012adb374e476ae36Chih-Hung Hsiehnamespace renderscript {
482462de21ac2e1773b99aedee012adb374e476ae36Chih-Hung Hsieh
// Assembles a color-matrix routine specialised for `key` into a freshly
// mmap'd buffer (mBuf / mBufSize) and then switches that buffer from
// read+write to read+exec.
//
// Returns true when the code was generated and made executable.  Returns
// false when the mapping or the mprotect call fails, and always returns
// false when not built with ARCH_ARM_USE_INTRINSICS (or when built with
// ARCH_ARM64_USE_INTRINSICS).
//
// `key` is taken by value, so the adjustments made to it below are local to
// this function.
// NOTE(review): ADD_CHUNK is defined outside this part of the file; it
// presumably appends a pre-assembled code fragment at `buf` and advances
// `buf` - confirm.
bool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) {
#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS)
    mBufSize = 4096;
    //StopWatch build_time("rs cm: build time");
    // Writable anonymous mapping that receives the generated instructions.
    mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE,
                                  MAP_PRIVATE | MAP_ANON, -1, 0);
    if (mBuf == MAP_FAILED) {
        mBuf = NULL;
        return false;
    }

    uint8_t *buf = mBuf;       // current write position
    uint8_t *buf2 = nullptr;   // loop head; target of the branch emitted near the end

    // ops[i][j] describes the term "input component i -> output component j":
    //   0 = coefficient not used, nothing emitted
    //   2 = first term for output j (emitted as a multiply)
    //   3 = later term for output j (emitted as a multiply-accumulate)
    // Only rows 0..3 are written or read below.
    int ops[5][4];
    // opInit[j] becomes 1 once output j has received its first term.
    int opInit[4] = {0, 0, 0, 0};

    memset(ops, 0, sizeof(ops));
    for (int i=0; i < 4; i++) {
        // coeffMask bit (j + i*4) marks the coefficient for input i / output j.
        if (key.u.coeffMask & (1 << (i*4))) {
            ops[i][0] = 0x2 | opInit[0];
            opInit[0] = 1;
        }
        // Outputs 1 and 2 are skipped for the dot-product variant.
        if (!key.u.dot) {
            if (key.u.coeffMask & (1 << (1 + i*4))) {
                ops[i][1] = 0x2 | opInit[1];
                opInit[1] = 1;
            }
            if (key.u.coeffMask & (1 << (2 + i*4))) {
                ops[i][2] = 0x2 | opInit[2];
                opInit[2] = 1;
            }
        }
        // Output 3 is skipped when alpha is copied through unchanged.
        if (!key.u.copyAlpha) {
            if (key.u.coeffMask & (1 << (3 + i*4))) {
                ops[i][3] = 0x2 | opInit[3];
                opInit[3] = 1;
            }
        }
    }

    if (key.u.inType || key.u.outType) {
        // Float path: taken when either the input or the output is float.
        key.u.copyAlpha = 0;
        ADD_CHUNK(prefix_f);
        // Remember the address for the loop return.
        buf2 = buf;

        // Load the incoming r,g,b,a as needed
        if (key.u.inType) {
            // Float input; inVecSize 0..3 selects the 1..4 component loader.
            switch(key.u.inVecSize) {
            case 3:
                ADD_CHUNK(load_f32_4);
                break;
            case 2:
                ADD_CHUNK(load_f32_3);
                break;
            case 1:
                ADD_CHUNK(load_f32_2);
                break;
            case 0:
                ADD_CHUNK(load_f32_1);
                break;
            }
        } else {
            // Non-float input on the float path uses the load_u8f_* fragments.
            switch(key.u.inVecSize) {
            case 3:
                ADD_CHUNK(load_u8f_4);
                break;
            case 2:
                ADD_CHUNK(load_u8f_3);
                break;
            case 1:
                ADD_CHUNK(load_u8f_2);
                break;
            case 0:
                ADD_CHUNK(load_u8f_1);
                break;
            }
        }

        // One multiply / multiply-accumulate per used coefficient.  Output j
        // accumulates in Q(12+j); the coefficient comes from lane (j & 1) of
        // D(8 + i*2 + (j >> 1)).
        for (int i=0; i < 4; i++) {
            for (int j=0; j < 4; j++) {
                switch(ops[i][j]) {
                case 0:
                    break;
                case 2:
                    buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
                    break;
                case 3:
                    buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1);
                    break;
                }
            }
        }
        // Produce the final value of each output in Qj:
        //   products and add term -> Q(12+j) + Q(8+j)
        //   products only         -> copy of Q(12+j)
        //   add term only         -> copy of Q(8+j)
        //   neither               -> zero
        for (int j=0; j < 4; j++) {
            if (opInit[j]) {
                if (key.u.addMask & (1 << j)) {
                    buf = addVADD_F32(buf, j, 12+j, 8+j);
                } else {
                    buf = addVORR_32(buf, j, 12+j, 12+j);
                }
            } else {
                if (key.u.addMask & (1 << j)) {
                    buf = addVORR_32(buf, j, 8+j, 8+j);
                } else {
                    buf = addVMOV_32(buf, j, 0);
                }
            }
        }

        if (key.u.outType) {
            // Float output; outVecSize 0..3 selects the 1..4 component store.
            switch(key.u.outVecSize) {
            case 3:
                ADD_CHUNK(store_f32_4);
                break;
            case 2:
                ADD_CHUNK(store_f32_3);
                break;
            case 1:
                ADD_CHUNK(store_f32_2);
                break;
            case 0:
                ADD_CHUNK(store_f32_1);
                break;
            }
        } else {
            // Non-float output on the float path; sizes 3 and 2 share the
            // 4-component store fragment.
            switch(key.u.outVecSize) {
            case 3:
            case 2:
                ADD_CHUNK(store_f32u_4);
                break;
            case 1:
                ADD_CHUNK(store_f32u_2);
                break;
            case 0:
                ADD_CHUNK(store_f32u_1);
                break;
            }
        }


    } else {
        // Integer path: neither input nor output is float.
        // Add the function prefix
        // Store the address for the loop return
        ADD_CHUNK(prefix_i);
        buf2 = buf;

        // Load the incoming r,g,b,a as needed
        switch(key.u.inVecSize) {
        case 3:
            ADD_CHUNK(load_u8_4);
            // With copyAlpha only three channels are unpacked.
            if (key.u.copyAlpha) {
                ADD_CHUNK(unpack_u8_3);
            } else {
                ADD_CHUNK(unpack_u8_4);
            }
            break;
        case 2:
            ADD_CHUNK(load_u8_3);
            ADD_CHUNK(unpack_u8_3);
            break;
        case 1:
            ADD_CHUNK(load_u8_2);
            ADD_CHUNK(unpack_u8_2);
            break;
        case 0:
            ADD_CHUNK(load_u8_1);
            ADD_CHUNK(unpack_u8_1);
            break;
        }

        // Add multiply and accumulate
        // use MULL to init the output register,
        // use MLAL from there
        // Output j accumulates in Q(8+j); the input is D(24 + i*2) and the
        // coefficient is lane j of D(4+i).
        for (int i=0; i < 4; i++) {
            for (int j=0; j < 4; j++) {
                switch(ops[i][j]) {
                case 0:
                    break;
                case 2:
                    buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j);
                    break;
                case 3:
                    buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j);
                    break;
                }
            }
        }
        // Apply the add term from Q(4+j): saturating add onto an existing
        // accumulator, or a plain copy when output j had no products.
        // NOTE(review): unlike the float path, nothing is emitted for an
        // output with neither products nor an add term - presumably the
        // prefix/pack fragments cope with that; confirm.
        for (int j=0; j < 4; j++) {
            if (opInit[j]) {
                if (key.u.addMask & (1 << j)) {
                    buf = addVQADD_S32(buf, 8+j, 8+j, 4+j);
                }
            } else {
                if (key.u.addMask & (1 << j)) {
                    buf = addVORR_32(buf, 8+j, 4+j, 4+j);
                }
            }
        }

        // If we have a dot product, perform the special pack.
        if (key.u.dot) {
            ADD_CHUNK(pack_u8_1);
            ADD_CHUNK(dot);
        } else {
            switch(key.u.outVecSize) {
            case 3:
                if (key.u.copyAlpha) {
                    ADD_CHUNK(pack_u8_3);
                } else {
                    ADD_CHUNK(pack_u8_4);
                }
                break;
            case 2:
                ADD_CHUNK(pack_u8_3);
                break;
            case 1:
                ADD_CHUNK(pack_u8_2);
                break;
            case 0:
                ADD_CHUNK(pack_u8_1);
                break;
            }
        }

        // Write out result
        // (sizes 3 and 2 share the 4-component store fragment)
        switch(key.u.outVecSize) {
        case 3:
        case 2:
            ADD_CHUNK(store_u8_4);
            break;
        case 1:
            ADD_CHUNK(store_u8_2);
            break;
        case 0:
            ADD_CHUNK(store_u8_1);
            break;
        }
    }

    // NOTE(review): `key` is a by-value parameter and no statement below
    // reads these fields, so unless ADD_CHUNK consults `key` these stores
    // have no effect - confirm.
    if (key.u.inType != key.u.outType) {
        key.u.copyAlpha = 0;
        key.u.dot = 0;
    }

    // Loop, branch, and cleanup
    ADD_CHUNK(postfix1);
    buf = addBranch(buf, buf2, 0x01);
    ADD_CHUNK(postfix2);

    // Drop write permission and make the generated code executable.
    int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC);
    if (ret == -1) {
        // NOTE(review): ret is always -1 on this branch, so the message
        // carries no detail; errno would say why.  mBuf also stays mapped
        // here - presumably released elsewhere; confirm.
        ALOGE("mprotect error %i", ret);
        return false;
    }

    // Flush the instruction cache over the buffer before the code is run.
    __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize);
    return true;
#else
    return false;
#endif
}
7449b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
745ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Samsvoid RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) {
7462b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    for(int ct=0; ct < 16; ct++) {
7472b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams        ip[ct] = (short)(fp[ct] * 256.f + 0.5f);
7482b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams        tmpFp[ct] = fp[ct] * fpMul;
7499e4a96af136dab5b21a37580d17cbcb89872114eJason Sams        //ALOGE("mat %i %f  %f", ct, fp[ct], tmpFp[ct]);
7502b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    }
7512b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams
752ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams    float add = 0.f;
753ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams    if (fpMul > 254.f) add = 0.5f;
7542b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    for(int ct=0; ct < 4; ct++) {
7550462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie        tmpFpa[ct] = fpa[ct] * addMul + add;
7569e4a96af136dab5b21a37580d17cbcb89872114eJason Sams        //ALOGE("fpa %i %f  %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]);
7572b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    }
7582b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams
759ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams    for(int ct=0; ct < 4; ct++) {
7600462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie        ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f);
7612b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    }
7622b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams}
7632b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams
7649b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Samsvoid RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data,
7659b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams                                                    size_t dataLength) {
7669b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    switch(slot) {
7679b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    case 0:
7682b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams        memcpy (fp, data, sizeof(fp));
7699b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams        break;
7709b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    case 1:
7712b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams        memcpy (fpa, data, sizeof(fpa));
7729b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams        break;
7739b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    default:
7749b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams        rsAssert(0);
7759b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams        break;
7769b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    }
7779b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    mRootPtr = &kernel;
778709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
779709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
780709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
781b0abb140ac51b93d1a85aadaa63fe057f2d29850David Grossstatic void One(const RsExpandKernelDriverInfo *info, void *out,
78217e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams                const void *py, const float* coeff, const float *add,
783a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams                uint32_t vsin, uint32_t vsout, bool fin, bool fout) {
784a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams
785a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    float4 f = 0.f;
786a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    if (fin) {
787a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        switch(vsin) {
788a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        case 3:
7899e4a96af136dab5b21a37580d17cbcb89872114eJason Sams            f = ((const float4 *)py)[0];
7909e4a96af136dab5b21a37580d17cbcb89872114eJason Sams            break;
791a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        case 2:
792a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            f = ((const float4 *)py)[0];
7939e4a96af136dab5b21a37580d17cbcb89872114eJason Sams            f.w = 0.f;
794a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            break;
795a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        case 1:
796a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            f.xy = ((const float2 *)py)[0];
797a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            break;
798a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        case 0:
799a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            f.x = ((const float *)py)[0];
800a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            break;
801a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        }
802a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    } else {
803a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        switch(vsin) {
804a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        case 3:
8059e4a96af136dab5b21a37580d17cbcb89872114eJason Sams            f = convert_float4(((const uchar4 *)py)[0]);
8069e4a96af136dab5b21a37580d17cbcb89872114eJason Sams            break;
807a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        case 2:
808a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            f = convert_float4(((const uchar4 *)py)[0]);
8099e4a96af136dab5b21a37580d17cbcb89872114eJason Sams            f.w = 0.f;
810a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            break;
811a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        case 1:
81268c817211a6fe87bebed83d38a05fff32cc24a7eJason Sams            f.xy = convert_float2(((const uchar2 *)py)[0]);
813a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            break;
814a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        case 0:
81568c817211a6fe87bebed83d38a05fff32cc24a7eJason Sams            f.x = (float)(((const uchar *)py)[0]);
816a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            break;
817a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        }
818a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    }
8192b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    //ALOGE("f1  %f %f %f %f", f.x, f.y, f.z, f.w);
820709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
821709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    float4 sum;
822a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    sum.x = f.x * coeff[0] +
823a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            f.y * coeff[4] +
824a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            f.z * coeff[8] +
825a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            f.w * coeff[12];
826a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    sum.y = f.x * coeff[1] +
827a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            f.y * coeff[5] +
828a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            f.z * coeff[9] +
829a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            f.w * coeff[13];
830a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    sum.z = f.x * coeff[2] +
831a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            f.y * coeff[6] +
832a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            f.z * coeff[10] +
833a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            f.w * coeff[14];
834a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    sum.w = f.x * coeff[3] +
835a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            f.y * coeff[7] +
836a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            f.z * coeff[11] +
837a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            f.w * coeff[15];
8382b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    //ALOGE("f2  %f %f %f %f", sum.x, sum.y, sum.z, sum.w);
839709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
84017e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams    sum.x += add[0];
8410462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie    sum.y += add[1];
8420462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie    sum.z += add[2];
8430462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie    sum.w += add[3];
84417e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams
845709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
8462b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w);
847a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    if (fout) {
848a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        switch(vsout) {
849a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        case 3:
850a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        case 2:
851a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            ((float4 *)out)[0] = sum;
852a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            break;
853a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        case 1:
854a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            ((float2 *)out)[0] = sum.xy;
855a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            break;
856a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        case 0:
857a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            ((float *)out)[0] = sum.x;
858a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            break;
859a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        }
860a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    } else {
8619e4a96af136dab5b21a37580d17cbcb89872114eJason Sams        sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x);
8629e4a96af136dab5b21a37580d17cbcb89872114eJason Sams        sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y);
8639e4a96af136dab5b21a37580d17cbcb89872114eJason Sams        sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z);
8649e4a96af136dab5b21a37580d17cbcb89872114eJason Sams        sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 255.5 : sum.w);
86517e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams
866a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        switch(vsout) {
867a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        case 3:
868a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        case 2:
869a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            ((uchar4 *)out)[0] = convert_uchar4(sum);
870a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            break;
871a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        case 1:
872a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            ((uchar2 *)out)[0] = convert_uchar2(sum.xy);
873a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            break;
874a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        case 0:
875a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            ((uchar *)out)[0] = sum.x;
876a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams            break;
877a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams        }
878a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    }
8792b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams    //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]);
880709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
881709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
882b0abb140ac51b93d1a85aadaa63fe057f2d29850David Grossvoid RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelDriverInfo *info,
8839b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams                                              uint32_t xstart, uint32_t xend,
8849ed79105cc6a8dbfaf959875249f36022cc2c798Chris Wailes                                              uint32_t outstep) {
885b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)info->usr;
886f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes
887b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uint32_t instep = info->inStride[0];
888f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes
889b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uchar *out = (uchar *)info->outPtr[0];
890b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    uchar *in = (uchar *)info->inPtr[0];
891709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    uint32_t x1 = xstart;
892709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    uint32_t x2 = xend;
893709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
894a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    uint32_t vsin = cp->mLastKey.u.inVecSize;
895a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    uint32_t vsout = cp->mLastKey.u.outVecSize;
896a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    bool floatIn = !!cp->mLastKey.u.inType;
897a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    bool floatOut = !!cp->mLastKey.u.outType;
898a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams
899b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross    //if (!info->current.y) ALOGE("steps %i %i   %i %i", instep, outstep, vsin, vsout);
9009e4a96af136dab5b21a37580d17cbcb89872114eJason Sams
901709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    if(x2 > x1) {
9020462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie        int32_t len = x2 - x1;
9030462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie        if (gArchUseSIMD) {
90444bef6fba6244292b751387f3d6c31cca96c28adChris Wailes            if((cp->mOptKernel != nullptr) && (len >= 4)) {
905858d0352934596aa46fe97a70f30d4d837f6fc7fJason Sams                // The optimized kernel processes 4 pixels at once
906858d0352934596aa46fe97a70f30d4d837f6fc7fJason Sams                // and requires a minimum of 1 chunk of 4
9070462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie                cp->mOptKernel(out, in, cp->ip, len >> 2);
908858d0352934596aa46fe97a70f30d4d837f6fc7fJason Sams                // Update the len and pointers so the generic code can
909858d0352934596aa46fe97a70f30d4d837f6fc7fJason Sams                // finish any leftover pixels
91098dd4bb2b1b08f04dd5034fe0c69daa15f6cc6daJason Sams                len &= ~3;
9110462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie                x1 += len;
9120462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie                out += outstep * len;
9130462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie                in += instep * len;
9140462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie            }
91532f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams#if defined(ARCH_ARM64_USE_INTRINSICS)
9160462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie            else {
9170462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie                if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) {
91832f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams                    // Currently this generates off by one errors.
91932f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams                    //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa);
92032f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams                    //x1 += len;
92132f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams                    //out += outstep * len;
92232f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams                    //in += instep * len;
9230462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie                } else {
9246e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie                    rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa);
92532f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams                    x1 += len;
92632f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams                    out += outstep * len;
92732f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams                    in += instep * len;
9280462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie                }
9290462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie            }
9300462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie#endif
931709a0978ae141198018ca9769f8d96292a8928e6Jason Sams        }
932709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
933709a0978ae141198018ca9769f8d96292a8928e6Jason Sams        while(x1 != x2) {
934b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross            One(info, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut);
93568c817211a6fe87bebed83d38a05fff32cc24a7eJason Sams            out += outstep;
93668c817211a6fe87bebed83d38a05fff32cc24a7eJason Sams            in += instep;
937709a0978ae141198018ca9769f8d96292a8928e6Jason Sams            x1++;
938709a0978ae141198018ca9769f8d96292a8928e6Jason Sams        }
939709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    }
940709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
941709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
942f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailesvoid RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot,
943f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes                                                 const Allocation ** ains,
944f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes                                                 uint32_t inLen,
945f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes                                                 Allocation * aout,
946f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes                                                 const void * usr,
947f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes                                                 uint32_t usrLen,
948f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes                                                 const RsScriptCall *sc) {
9499b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams
950f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes    const Element *ein = ains[0]->mHal.state.type->getElement();
95117e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams    const Element *eout = aout->mHal.state.type->getElement();
95217e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams
95317e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams    if (ein->getType() == eout->getType()) {
954ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams        if (eout->getType() == RS_TYPE_UNSIGNED_8) {
955ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams            updateCoeffCache(1.f, 255.f);
956ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams        } else {
957ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams            updateCoeffCache(1.f, 1.f);
958ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams        }
95917e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams    } else {
96017e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams        if (eout->getType() == RS_TYPE_UNSIGNED_8) {
9619e4a96af136dab5b21a37580d17cbcb89872114eJason Sams            updateCoeffCache(255.f, 255.f);
96217e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams        } else {
9639e4a96af136dab5b21a37580d17cbcb89872114eJason Sams            updateCoeffCache(1.f / 255.f, 1.f);
96417e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams        }
96517e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams    }
96617e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams
967f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes    Key_t key = computeKey(ein, eout);
968f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes
9697b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#if defined(ARCH_X86_HAVE_SSSE3)
97044bef6fba6244292b751387f3d6c31cca96c28adChris Wailes    if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
9717b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        // FIXME: Disable mOptKernel to pass RS color matrix CTS cases
9727b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        // mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) selectKernel(key);
9737b7060c61e4182b29186849c5a857ea5f0898e56Rose, James        mLastKey = key;
9747b7060c61e4182b29186849c5a857ea5f0898e56Rose, James    }
9757b7060c61e4182b29186849c5a857ea5f0898e56Rose, James
9767b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#else //if !defined(ARCH_X86_HAVE_SSSE3)
97744bef6fba6244292b751387f3d6c31cca96c28adChris Wailes    if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) {
9789b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams        if (mBuf) munmap(mBuf, mBufSize);
97944bef6fba6244292b751387f3d6c31cca96c28adChris Wailes        mBuf = nullptr;
98044bef6fba6244292b751387f3d6c31cca96c28adChris Wailes        mOptKernel = nullptr;
9819b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams        if (build(key)) {
9829b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams            mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf;
983709a0978ae141198018ca9769f8d96292a8928e6Jason Sams        }
98432f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams#if defined(ARCH_ARM64_USE_INTRINSICS)
9850462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie        else {
9860462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie            int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 4 : 0);
9870462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie            int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0);
9880462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie            uint32_t mm = 0;
9890462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie            int i;
9900462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie            for (i = 0; i < 4; i++)
9910462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie            {
9920462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie                uint32_t m = (key.u.coeffMask >> i) & 0x1111;
9930462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie                m = ((m * 0x249) >> 9) & 15;
9940462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie                m |= ((key.u.addMask >> i) & 1) << 4;
9950462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie                mm |= m << (i * 5);
9960462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie            }
9970462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie
9980462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie            if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) {
9990462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie                rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st);
10000462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie            } else {
10010462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie                rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st);
10020462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie            }
10030462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie        }
10040462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie#endif
10050462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie        mLastKey = key;
1006709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    }
10077b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#endif //if !defined(ARCH_X86_HAVE_SSSE3)
1008709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
1009709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
1010709a0978ae141198018ca9769f8d96292a8928e6Jason SamsRsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix(
1011c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams            RsdCpuReferenceImpl *ctx, const Script *s, const Element *e)
1012c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) {
1013709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
1014a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams    mLastKey.key = 0;
101544bef6fba6244292b751387f3d6c31cca96c28adChris Wailes    mBuf = nullptr;
10169b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    mBufSize = 0;
101744bef6fba6244292b751387f3d6c31cca96c28adChris Wailes    mOptKernel = nullptr;
1018709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    const static float defaultMatrix[] = {
1019709a0978ae141198018ca9769f8d96292a8928e6Jason Sams        1.f, 0.f, 0.f, 0.f,
1020709a0978ae141198018ca9769f8d96292a8928e6Jason Sams        0.f, 1.f, 0.f, 0.f,
1021709a0978ae141198018ca9769f8d96292a8928e6Jason Sams        0.f, 0.f, 1.f, 0.f,
1022709a0978ae141198018ca9769f8d96292a8928e6Jason Sams        0.f, 0.f, 0.f, 1.f
1023709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    };
10249b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f};
1025709a0978ae141198018ca9769f8d96292a8928e6Jason Sams    setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix));
10269b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    setGlobalVar(1, defaultAdd, sizeof(defaultAdd));
1027709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
1028709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
1029709a0978ae141198018ca9769f8d96292a8928e6Jason SamsRsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() {
10309b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    if (mBuf) munmap(mBuf, mBufSize);
103144bef6fba6244292b751387f3d6c31cca96c28adChris Wailes    mBuf = nullptr;
103244bef6fba6244292b751387f3d6c31cca96c28adChris Wailes    mOptKernel = nullptr;
1033709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
1034709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
1035709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) {
10369b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams    s->mHal.info.exportedVariableCount = 2;
1037709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
1038709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
1039c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason SamsRsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx,
1040c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams                                            const Script *s, const Element *e) {
1041709a0978ae141198018ca9769f8d96292a8928e6Jason Sams
1042c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams    return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e);
1043709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}
1044462de21ac2e1773b99aedee012adb374e476ae36Chih-Hung Hsieh
1045462de21ac2e1773b99aedee012adb374e476ae36Chih-Hung Hsieh} // namespace renderscript
1046462de21ac2e1773b99aedee012adb374e476ae36Chih-Hung Hsieh} // namespace android
1047