/*
 * Copyright (C) 2012 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <sys/mman.h>
#include <unistd.h>

#include "rsCpuIntrinsic.h"
#include "rsCpuIntrinsicInlines.h"

#include <sys/mman.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
//#include <utils/StopWatch.h>


/*  uint kernel
 *  Q0  D0:  Load slot for R
 *      D1:  Load slot for G
 *  Q1  D2:  Load slot for B
 *      D3:  Load slot for A
 *  Q2  D4:  Matrix
 *      D5:  =
 *  Q3  D6:  =
 *      D7:  =
 *  Q4  D8:  Add R
 *      D9:
 *  Q5  D10: Add G
 *      D11:
 *  Q6  D12: Add B
 *      D13:
 *  Q7  D14: Add A
 *      D15:
 *  Q8  D16: I32: R Sum
 *      D17:
 *  Q9  D18: I32: G Sum
 *      D19:
 *  Q10 D20: I32: B Sum
 *      D21:
 *  Q11 D22: I32: A Sum
 *      D23:
 *  Q12 D24: U16: expanded R
 *      D25:
 *  Q13 D26: U16: expanded G
 *      D27:
 *  Q14 D28: U16: expanded B
 *      D29:
 *  Q15 D30: U16: expanded A
 *      D31:
 *
 */

/*  float kernel
 *  Q0  D0:  Load slot for R
 *      D1:  =
 *  Q1  D2:  Load slot for G
 *      D3:  =
 *  Q2  D4:  Load slot for B
 *      D5:  =
 *  Q3  D6:  Load slot for A
 *      D7:  =
 *  Q4  D8:  Matrix
 *      D9:  =
 *  Q5  D10: =
 *      D11: =
 *  Q6  D12: =
 *      D13: =
 *  Q7  D14: =
 *      D15: =
 *  Q8  D16: Add R
 *      D17: =
 *  Q9  D18: Add G
 *      D19: =
 *  Q10 D20: Add B
 *      D21: =
 *  Q11 D22: Add A
 *      D23: =
 *  Q12 D24: Sum R
 *      D25: =
 *  Q13 D26: Sum G
 *      D27: =
 *  Q14 D28: Sum B
 *      D29: =
 *  Q15 D30: Sum A
 *      D31: =
 *
 */



using namespace android;
105709a0978ae141198018ca9769f8d96292a8928e6Jason Samsusing namespace android::renderscript; 106709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 107709a0978ae141198018ca9769f8d96292a8928e6Jason Samsnamespace android { 108709a0978ae141198018ca9769f8d96292a8928e6Jason Samsnamespace renderscript { 109709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 110a65de10aabdee0794d0e9c96db944e990166ef0dJason Samstypedef union { 111a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams uint64_t key; 112a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams struct { 113a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams uint32_t inVecSize :2; // [0 - 1] 114a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams uint32_t outVecSize :2; // [2 - 3] 115a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams uint32_t inType :4; // [4 - 7] 116a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams uint32_t outType :4; // [8 - 11] 117a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams uint32_t dot :1; // [12] 118a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams uint32_t _unused1 :1; // [13] 119a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams uint32_t copyAlpha :1; // [14] 120a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams uint32_t _unused2 :1; // [15] 121a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams uint32_t coeffMask :16; // [16-31] 122a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams uint32_t addMask :4; // [32-35] 123a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams } u; 124a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams} Key_t; 125709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 1266a45ddb32f391060aa05da6ff09c4814d450586eTim Murray//Re-enable when intrinsic is fixed 12732f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams#if defined(ARCH_ARM64_USE_INTRINSICS) 1280462a39371659d1eeed5eb48dd6d507760301c22Simon Hosietypedef struct { 1290462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie void (*column[4])(void); 1300462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie void (*store)(void); 
1310462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie void (*load)(void); 1326e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie void (*store_end)(void); 1336e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie void (*load_end)(void); 1340462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie} FunctionTab_t; 1350462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie 1366e7e258316f72be95039278e88e3bc1daea1668fSimon Hosieextern "C" void rsdIntrinsicColorMatrix_int_K( 1370462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie void *out, void const *in, size_t count, 1380462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie FunctionTab_t const *fns, 1390462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie int16_t const *mult, int32_t const *add); 1400462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie 1416e7e258316f72be95039278e88e3bc1daea1668fSimon Hosieextern "C" void rsdIntrinsicColorMatrix_float_K( 1420462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie void *out, void const *in, size_t count, 1430462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie FunctionTab_t const *fns, 1440462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie float const *mult, float const *add); 1450462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie 1466e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie/* The setup functions fill in function tables to be used by above functions; 1476e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie * this code also eliminates jump-to-another-jump cases by short-circuiting 1486e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie * empty functions. While it's not performance critical, it works out easier 1496e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie * to write the set-up code in assembly than to try to expose the same symbols 1506e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie * and write the code in C. 
1516e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie */ 1526e7e258316f72be95039278e88e3bc1daea1668fSimon Hosieextern "C" void rsdIntrinsicColorMatrixSetup_int_K( 1536e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie FunctionTab_t *fns, 1546e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie uint32_t mask, int dt, int st); 1556e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie 1560462a39371659d1eeed5eb48dd6d507760301c22Simon Hosieextern "C" void rsdIntrinsicColorMatrixSetup_float_K( 1576e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie FunctionTab_t *fns, 1580462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie uint32_t mask, int dt, int st); 1590462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie#endif 1600462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie 161709a0978ae141198018ca9769f8d96292a8928e6Jason Samsclass RsdCpuScriptIntrinsicColorMatrix : public RsdCpuScriptIntrinsic { 162709a0978ae141198018ca9769f8d96292a8928e6Jason Samspublic: 163c060f1435e7b9405f3be8974417fa6f410f03753Stephen Hines void populateScript(Script *) override; 164709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 165c060f1435e7b9405f3be8974417fa6f410f03753Stephen Hines void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override; 166709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 167c060f1435e7b9405f3be8974417fa6f410f03753Stephen Hines ~RsdCpuScriptIntrinsicColorMatrix() override; 168c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams RsdCpuScriptIntrinsicColorMatrix(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e); 169709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 170c060f1435e7b9405f3be8974417fa6f410f03753Stephen Hines void preLaunch(uint32_t slot, const Allocation ** ains, 171c060f1435e7b9405f3be8974417fa6f410f03753Stephen Hines uint32_t inLen, Allocation * aout, const void * usr, 172c060f1435e7b9405f3be8974417fa6f410f03753Stephen Hines uint32_t usrLen, const RsScriptCall *sc) override; 1739b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 
174709a0978ae141198018ca9769f8d96292a8928e6Jason Samsprotected: 175709a0978ae141198018ca9769f8d96292a8928e6Jason Sams float fp[16]; 1762b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams float fpa[4]; 177a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams 1782b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams // The following four fields are read as constants 1792b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams // by the SIMD assembly code. 180709a0978ae141198018ca9769f8d96292a8928e6Jason Sams short ip[16]; 1810462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie int ipa[4]; 1822b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams float tmpFp[16]; 1830462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie float tmpFpa[4]; 18432f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams#if defined(ARCH_ARM64_USE_INTRINSICS) 1850462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie FunctionTab_t mFnTab; 1860462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie#endif 1879b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 188b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross static void kernel(const RsExpandKernelDriverInfo *info, 1899b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams uint32_t xstart, uint32_t xend, 1909ed79105cc6a8dbfaf959875249f36022cc2c798Chris Wailes uint32_t outstep); 1919e4a96af136dab5b21a37580d17cbcb89872114eJason Sams void updateCoeffCache(float fpMul, float addMul); 1929b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 193a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams Key_t mLastKey; 1949b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams unsigned char *mBuf; 1959b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams size_t mBufSize; 1969b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 197a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams Key_t computeKey(const Element *ein, const Element *eout); 1989b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 199a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams bool build(Key_t key); 2009b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason 
Sams 2019b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams void (*mOptKernel)(void *dst, const void *src, const short *coef, uint32_t count); 202709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 203709a0978ae141198018ca9769f8d96292a8928e6Jason Sams}; 204709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 205709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 206709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 207709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 208709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 209a65de10aabdee0794d0e9c96db944e990166ef0dJason SamsKey_t RsdCpuScriptIntrinsicColorMatrix::computeKey( 2109b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams const Element *ein, const Element *eout) { 2119b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 212a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams Key_t key; 213a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams key.key = 0; 2149b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 2159b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams // Compute a unique code key for this operation 2169b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 2179b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams // Add to the key the input and output types 2189b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams bool hasFloat = false; 2199b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams if (ein->getType() == RS_TYPE_FLOAT_32) { 2209b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams hasFloat = true; 221a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams key.u.inType = RS_TYPE_FLOAT_32; 222a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams rsAssert(key.u.inType == RS_TYPE_FLOAT_32); 2239b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 2249b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams if (eout->getType() == RS_TYPE_FLOAT_32) { 2259b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams hasFloat = true; 226a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams key.u.outType = RS_TYPE_FLOAT_32; 
227a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams rsAssert(key.u.outType == RS_TYPE_FLOAT_32); 228709a0978ae141198018ca9769f8d96292a8928e6Jason Sams } 229709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 2309b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams // Mask in the bits indicating which coefficients in the 2319b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams // color matrix are needed. 2329b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams if (hasFloat) { 2339b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams for (uint32_t i=0; i < 16; i++) { 2349b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams if (fabs(fp[i]) != 0.f) { 235a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams key.u.coeffMask |= 1 << i; 2369b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 2379b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 238a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams if (fabs(fpa[0]) != 0.f) key.u.addMask |= 0x1; 2392b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams if (fabs(fpa[1]) != 0.f) key.u.addMask |= 0x2; 2402b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams if (fabs(fpa[2]) != 0.f) key.u.addMask |= 0x4; 2412b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams if (fabs(fpa[3]) != 0.f) key.u.addMask |= 0x8; 242a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams 2439b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } else { 2449b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams for (uint32_t i=0; i < 16; i++) { 2459b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams if (ip[i] != 0) { 246a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams key.u.coeffMask |= 1 << i; 2479b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 2489b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 249a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams if (ipa[0] != 0) key.u.addMask |= 0x1; 2500462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie if (ipa[1] != 0) key.u.addMask |= 0x2; 2510462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie if (ipa[2] != 0) key.u.addMask |= 0x4; 
2520462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie if (ipa[3] != 0) key.u.addMask |= 0x8; 2539b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 2549b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 2559b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams // Look for a dot product where the r,g,b colums are the same 2569b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams if ((ip[0] == ip[1]) && (ip[0] == ip[2]) && 2579b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams (ip[4] == ip[5]) && (ip[4] == ip[6]) && 2589b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams (ip[8] == ip[9]) && (ip[8] == ip[10]) && 2599b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams (ip[12] == ip[13]) && (ip[12] == ip[14])) { 2609b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 261a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams if (!key.u.addMask) key.u.dot = 1; 2629b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 2639b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 2649b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams // Is alpha a simple copy 265a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams if (!(key.u.coeffMask & 0x0888) && (ip[15] == 256) && !(key.u.addMask & 0x8)) { 2669e4a96af136dab5b21a37580d17cbcb89872114eJason Sams key.u.copyAlpha = !(key.u.inType || key.u.outType); 267a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams } 268a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams 269a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key); 270a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams 271a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams switch (ein->getVectorSize()) { 272a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 4: 273a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams key.u.inVecSize = 3; 274a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 275a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 3: 276a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams key.u.inVecSize = 2; 
277a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams key.u.coeffMask &= ~0xF000; 278a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 279a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 2: 280a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams key.u.inVecSize = 1; 281a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams key.u.coeffMask &= ~0xFF00; 282a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 283a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams default: 284a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams key.u.coeffMask &= ~0xFFF0; 285a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 286a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams } 287a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams 288a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams switch (eout->getVectorSize()) { 289a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 4: 290a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams key.u.outVecSize = 3; 291a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 292a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 3: 293a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams key.u.outVecSize = 2; 294a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams key.u.coeffMask &= ~0x8888; 2950462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie key.u.addMask &= 7; 296a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 297a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 2: 298a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams key.u.outVecSize = 1; 299a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams key.u.coeffMask &= ~0xCCCC; 3000462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie key.u.addMask &= 3; 301a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 302a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams default: 303a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams key.u.coeffMask &= ~0xEEEE; 3040462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie key.u.addMask &= 1; 
305a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 3069b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 3079b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 3089e4a96af136dab5b21a37580d17cbcb89872114eJason Sams if (key.u.inType && !key.u.outType) { 3099e4a96af136dab5b21a37580d17cbcb89872114eJason Sams key.u.addMask |= 1; 3109e4a96af136dab5b21a37580d17cbcb89872114eJason Sams if (key.u.outVecSize > 0) key.u.addMask |= 2; 3119e4a96af136dab5b21a37580d17cbcb89872114eJason Sams if (key.u.outVecSize > 1) key.u.addMask |= 4; 3129e4a96af136dab5b21a37580d17cbcb89872114eJason Sams if (key.u.outVecSize > 2) key.u.addMask |= 8; 3139e4a96af136dab5b21a37580d17cbcb89872114eJason Sams } 3149e4a96af136dab5b21a37580d17cbcb89872114eJason Sams 315a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams //ALOGE("build key %08x, %08x", (int32_t)(key.key >> 32), (int32_t)key.key); 3169b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams return key; 3179b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams} 3189b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 319074424a4ac5b093331df2c92e7a5bcbfff136b71Jason Sams#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS) 3209b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 3219b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams#define DEF_SYM(x) \ 3229b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams extern "C" uint32_t _N_ColorMatrix_##x; \ 3239b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams extern "C" uint32_t _N_ColorMatrix_##x##_end; \ 3249b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams extern "C" uint32_t _N_ColorMatrix_##x##_len; 3259b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 326a65de10aabdee0794d0e9c96db944e990166ef0dJason SamsDEF_SYM(prefix_i) 327a65de10aabdee0794d0e9c96db944e990166ef0dJason SamsDEF_SYM(prefix_f) 3289b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason SamsDEF_SYM(postfix1) 3299b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason SamsDEF_SYM(postfix2) 330a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams 
3319b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason SamsDEF_SYM(load_u8_4) 3329e4a96af136dab5b21a37580d17cbcb89872114eJason SamsDEF_SYM(load_u8_3) 3339b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason SamsDEF_SYM(load_u8_2) 3349b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason SamsDEF_SYM(load_u8_1) 335a65de10aabdee0794d0e9c96db944e990166ef0dJason SamsDEF_SYM(load_u8f_4) 3369e4a96af136dab5b21a37580d17cbcb89872114eJason SamsDEF_SYM(load_u8f_3) 337a65de10aabdee0794d0e9c96db944e990166ef0dJason SamsDEF_SYM(load_u8f_2) 338a65de10aabdee0794d0e9c96db944e990166ef0dJason SamsDEF_SYM(load_u8f_1) 339a65de10aabdee0794d0e9c96db944e990166ef0dJason SamsDEF_SYM(load_f32_4) 3409e4a96af136dab5b21a37580d17cbcb89872114eJason SamsDEF_SYM(load_f32_3) 341a65de10aabdee0794d0e9c96db944e990166ef0dJason SamsDEF_SYM(load_f32_2) 342a65de10aabdee0794d0e9c96db944e990166ef0dJason SamsDEF_SYM(load_f32_1) 343a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams 3449b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason SamsDEF_SYM(store_u8_4) 3459b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason SamsDEF_SYM(store_u8_2) 3469b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason SamsDEF_SYM(store_u8_1) 347a65de10aabdee0794d0e9c96db944e990166ef0dJason SamsDEF_SYM(store_f32_4) 3489e4a96af136dab5b21a37580d17cbcb89872114eJason SamsDEF_SYM(store_f32_3) 349a65de10aabdee0794d0e9c96db944e990166ef0dJason SamsDEF_SYM(store_f32_2) 350a65de10aabdee0794d0e9c96db944e990166ef0dJason SamsDEF_SYM(store_f32_1) 3512b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason SamsDEF_SYM(store_f32u_4) 3522b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason SamsDEF_SYM(store_f32u_2) 3532b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason SamsDEF_SYM(store_f32u_1) 3542b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams 3559b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason SamsDEF_SYM(unpack_u8_4) 3569b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason SamsDEF_SYM(unpack_u8_3) 3579b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason SamsDEF_SYM(unpack_u8_2) 3589b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason 
SamsDEF_SYM(unpack_u8_1) 3599b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason SamsDEF_SYM(pack_u8_4) 3609b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason SamsDEF_SYM(pack_u8_3) 3619b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason SamsDEF_SYM(pack_u8_2) 3629b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason SamsDEF_SYM(pack_u8_1) 3639b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason SamsDEF_SYM(dot) 3649b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason SamsDEF_SYM(add_0_u8) 3659b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason SamsDEF_SYM(add_1_u8) 3669b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason SamsDEF_SYM(add_2_u8) 3679b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason SamsDEF_SYM(add_3_u8) 3689b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 3699b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams#define ADD_CHUNK(x) \ 3709b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams memcpy(buf, &_N_ColorMatrix_##x, _N_ColorMatrix_##x##_len); \ 3719b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams buf += _N_ColorMatrix_##x##_len 3729b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 3739b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 3749b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Samsstatic uint8_t * addBranch(uint8_t *buf, const uint8_t *target, uint32_t condition) { 3759b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams size_t off = (target - buf - 8) >> 2; 3769b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams rsAssert(((off & 0xff000000) == 0) || 3779b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams ((off & 0xff000000) == 0xff000000)); 3789b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 3799b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams uint32_t op = (condition << 28); 3809b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams op |= 0xa << 24; // branch 3819b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams op |= 0xffffff & off; 3829b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams ((uint32_t *)buf)[0] = op; 3839b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams return buf + 4; 
3849b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams} 3859b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 3862b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Samsstatic uint32_t encodeSIMDRegs(uint32_t vd, uint32_t vn, uint32_t vm) { 3879b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams rsAssert(vd < 32); 3889b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams rsAssert(vm < 32); 3899b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams rsAssert(vn < 32); 3909b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 3919b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams uint32_t op = ((vd & 0xf) << 12) | (((vd & 0x10) >> 4) << 22); 3929b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams op |= (vm & 0xf) | (((vm & 0x10) >> 4) << 5); 3939b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams op |= ((vn & 0xf) << 16) | (((vn & 0x10) >> 4) << 7); 3949b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams return op; 3959b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams} 396709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 3979b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Samsstatic uint8_t * addVMLAL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { 3989b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams //vmlal.s16 Q#1, D#1, D#2[#] 3992b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams uint32_t op = 0xf2900240 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3)); 4009b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams ((uint32_t *)buf)[0] = op; 4019b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams return buf + 4; 4029b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams} 4039b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 4049b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Samsstatic uint8_t * addVMULL_S16(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { 4059b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams //vmull.s16 Q#1, D#1, D#2[#] 4062b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams uint32_t op = 0xf2900A40 | 
encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 3)); 4079b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams ((uint32_t *)buf)[0] = op; 4089b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams return buf + 4; 4099b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams} 410a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams 411a65de10aabdee0794d0e9c96db944e990166ef0dJason Samsstatic uint8_t * addVQADD_S32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { 412c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosie //vqadd.s32 Q#1, Q#1, Q#2 4132b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams uint32_t op = 0xf2200050 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); 414a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ((uint32_t *)buf)[0] = op; 415a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams return buf + 4; 416a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams} 417a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams 418a65de10aabdee0794d0e9c96db944e990166ef0dJason Samsstatic uint8_t * addVMLAL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { 419a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams //vmlal.f32 Q#1, D#1, D#2[#] 4202b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams uint32_t op = 0xf3a00140 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 4)); 421a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ((uint32_t *)buf)[0] = op; 422a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams return buf + 4; 423a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams} 424a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams 425a65de10aabdee0794d0e9c96db944e990166ef0dJason Samsstatic uint8_t * addVMULL_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_d1, uint32_t src_d2, uint32_t src_d2_s) { 426a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams //vmull.f32 Q#1, D#1, D#2[#] 4272b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams uint32_t op = 0xf3a00940 | encodeSIMDRegs(dest_q << 1, src_d1, src_d2 | (src_d2_s << 
4)); 4282b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams ((uint32_t *)buf)[0] = op; 4292b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams return buf + 4; 4302b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams} 4312b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams 4322b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Samsstatic uint8_t * addVORR_32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { 4332b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams //vadd.f32 Q#1, D#1, D#2 4342b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams uint32_t op = 0xf2200150 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); 435a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ((uint32_t *)buf)[0] = op; 436a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams return buf + 4; 437a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams} 438a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams 439c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosiestatic uint8_t * addVMOV_32(uint8_t *buf, uint32_t dest_q, uint32_t imm) { 440c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosie //vmov.32 Q#1, #imm 441c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosie rsAssert(imm == 0); 442c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosie uint32_t op = 0xf2800050 | encodeSIMDRegs(dest_q << 1, 0, 0); 443c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosie ((uint32_t *)buf)[0] = op; 444c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosie return buf + 4; 445c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosie} 446c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosie 447a65de10aabdee0794d0e9c96db944e990166ef0dJason Samsstatic uint8_t * addVADD_F32(uint8_t *buf, uint32_t dest_q, uint32_t src_q1, uint32_t src_q2) { 448a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams //vadd.f32 Q#1, D#1, D#2 4492b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams uint32_t op = 0xf2000d40 | encodeSIMDRegs(dest_q << 1, src_q1 << 1, src_q2 << 1); 450a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ((uint32_t *)buf)[0] = op; 
451a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams return buf + 4; 452a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams} 4539b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams#endif 4549b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 4557b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#if defined(ARCH_X86_HAVE_SSSE3) 456ebf0eb95cba9579af7cb67205b94b286f221c4edDan Albertextern void rsdIntrinsicColorMatrixDot_K(void *dst, const void *src, 4577b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const short *coef, uint32_t count); 458ebf0eb95cba9579af7cb67205b94b286f221c4edDan Albertextern void rsdIntrinsicColorMatrix3x3_K(void *dst, const void *src, 4597b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const short *coef, uint32_t count); 460ebf0eb95cba9579af7cb67205b94b286f221c4edDan Albertextern void rsdIntrinsicColorMatrix4x4_K(void *dst, const void *src, 4617b7060c61e4182b29186849c5a857ea5f0898e56Rose, James const short *coef, uint32_t count); 4627b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4637b7060c61e4182b29186849c5a857ea5f0898e56Rose, Jamesvoid * selectKernel(Key_t key) 4647b7060c61e4182b29186849c5a857ea5f0898e56Rose, James{ 46544bef6fba6244292b751387f3d6c31cca96c28adChris Wailes void * kernel = nullptr; 4667b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4677b7060c61e4182b29186849c5a857ea5f0898e56Rose, James // inType, outType float if nonzero 4687b7060c61e4182b29186849c5a857ea5f0898e56Rose, James if (!(key.u.inType || key.u.outType)) { 4697b7060c61e4182b29186849c5a857ea5f0898e56Rose, James if (key.u.dot) 4707b7060c61e4182b29186849c5a857ea5f0898e56Rose, James kernel = (void *)rsdIntrinsicColorMatrixDot_K; 4717b7060c61e4182b29186849c5a857ea5f0898e56Rose, James else if (key.u.copyAlpha) 4727b7060c61e4182b29186849c5a857ea5f0898e56Rose, James kernel = (void *)rsdIntrinsicColorMatrix3x3_K; 4737b7060c61e4182b29186849c5a857ea5f0898e56Rose, James else 4747b7060c61e4182b29186849c5a857ea5f0898e56Rose, James kernel = (void *)rsdIntrinsicColorMatrix4x4_K; 
4757b7060c61e4182b29186849c5a857ea5f0898e56Rose, James } 4767b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 4777b7060c61e4182b29186849c5a857ea5f0898e56Rose, James return kernel; 4787b7060c61e4182b29186849c5a857ea5f0898e56Rose, James} 4797b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#endif 4809b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 481a65de10aabdee0794d0e9c96db944e990166ef0dJason Samsbool RsdCpuScriptIntrinsicColorMatrix::build(Key_t key) { 482074424a4ac5b093331df2c92e7a5bcbfff136b71Jason Sams#if defined(ARCH_ARM_USE_INTRINSICS) && !defined(ARCH_ARM64_USE_INTRINSICS) 4839b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams mBufSize = 4096; 4849b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams //StopWatch build_time("rs cm: build time"); 4859b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams mBuf = (uint8_t *)mmap(0, mBufSize, PROT_READ | PROT_WRITE, 4869b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams MAP_PRIVATE | MAP_ANON, -1, 0); 487c214fe59fc48740ed003a3cde4e5a60517c5d5ceJason Sams if (mBuf == MAP_FAILED) { 488c214fe59fc48740ed003a3cde4e5a60517c5d5ceJason Sams mBuf = NULL; 4899b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams return false; 4909b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 4919b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 4929b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams uint8_t *buf = mBuf; 49344bef6fba6244292b751387f3d6c31cca96c28adChris Wailes uint8_t *buf2 = nullptr; 4949b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 495a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams int ops[5][4]; // 0=unused, 1 = set, 2 = accumulate, 3 = final 496a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams int opInit[4] = {0, 0, 0, 0}; 4979b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 498a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams memset(ops, 0, sizeof(ops)); 499a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams for (int i=0; i < 4; i++) { 500a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams if (key.u.coeffMask & (1 << 
(i*4))) { 5012b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams ops[i][0] = 0x2 | opInit[0]; 502a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams opInit[0] = 1; 5039b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 504a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams if (!key.u.dot) { 505a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams if (key.u.coeffMask & (1 << (1 + i*4))) { 5062b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams ops[i][1] = 0x2 | opInit[1]; 507a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams opInit[1] = 1; 508a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams } 509a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams if (key.u.coeffMask & (1 << (2 + i*4))) { 5102b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams ops[i][2] = 0x2 | opInit[2]; 511a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams opInit[2] = 1; 5129b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 5139b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 514a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams if (!key.u.copyAlpha) { 515a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams if (key.u.coeffMask & (1 << (3 + i*4))) { 5162b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams ops[i][3] = 0x2 | opInit[3]; 517a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams opInit[3] = 1; 5189b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 5199b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 5209b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 5219b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 522a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams if (key.u.inType || key.u.outType) { 5239e4a96af136dab5b21a37580d17cbcb89872114eJason Sams key.u.copyAlpha = 0; 524a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(prefix_f); 525a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams buf2 = buf; 526a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams 527a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams // Load the incoming r,g,b,a as needed 
528a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams if (key.u.inType) { 529a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams switch(key.u.inVecSize) { 530a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 3: 531a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(load_f32_4); 532a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 5339e4a96af136dab5b21a37580d17cbcb89872114eJason Sams case 2: 5349e4a96af136dab5b21a37580d17cbcb89872114eJason Sams ADD_CHUNK(load_f32_3); 5359e4a96af136dab5b21a37580d17cbcb89872114eJason Sams break; 536a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 1: 537a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(load_f32_2); 538a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 539a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 0: 540a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(load_f32_1); 541a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 542a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams } 5439b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } else { 544a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams switch(key.u.inVecSize) { 545a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 3: 546a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(load_u8f_4); 547a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 5489e4a96af136dab5b21a37580d17cbcb89872114eJason Sams case 2: 5499e4a96af136dab5b21a37580d17cbcb89872114eJason Sams ADD_CHUNK(load_u8f_3); 5509e4a96af136dab5b21a37580d17cbcb89872114eJason Sams break; 551a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 1: 552a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(load_u8f_2); 553a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 554a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 0: 555a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(load_u8f_1); 556a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 
5579b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 5589b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 559a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams 560a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams for (int i=0; i < 4; i++) { 561a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams for (int j=0; j < 4; j++) { 562a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams switch(ops[i][j]) { 563a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 0: 564a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 565a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 2: 5662b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams buf = addVMULL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1); 5672b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams break; 5682b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams case 3: 5692b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams buf = addVMLAL_F32(buf, 12+j, i*2, 8+i*2 + (j >> 1), j & 1); 570a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 571a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams } 5729b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 5739b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 574a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams for (int j=0; j < 4; j++) { 5752b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams if (opInit[j]) { 5762b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams if (key.u.addMask & (1 << j)) { 5772b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams buf = addVADD_F32(buf, j, 12+j, 8+j); 5782b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams } else { 5792b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams buf = addVORR_32(buf, j, 12+j, 12+j); 5802b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams } 5812b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams } else { 5822b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams if (key.u.addMask & (1 << j)) { 583c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosie buf = addVORR_32(buf, j, 8+j, 8+j); 
584c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosie } else { 585c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosie buf = addVMOV_32(buf, j, 0); 5862b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams } 5872b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams } 5882b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams } 5892b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams 5902b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams if (key.u.outType) { 5912b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams switch(key.u.outVecSize) { 5922b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams case 3: 5932b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams ADD_CHUNK(store_f32_4); 5942b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams break; 5959e4a96af136dab5b21a37580d17cbcb89872114eJason Sams case 2: 5969e4a96af136dab5b21a37580d17cbcb89872114eJason Sams ADD_CHUNK(store_f32_3); 5979e4a96af136dab5b21a37580d17cbcb89872114eJason Sams break; 5982b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams case 1: 5992b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams ADD_CHUNK(store_f32_2); 6002b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams break; 6012b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams case 0: 6022b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams ADD_CHUNK(store_f32_1); 6032b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams break; 6042b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams } 6052b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams } else { 6062b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams switch(key.u.outVecSize) { 6072b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams case 3: 6082b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams case 2: 6092b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams ADD_CHUNK(store_f32u_4); 6102b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams break; 6112b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams case 1: 6122b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams ADD_CHUNK(store_f32u_2); 6132b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams break; 
6142b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams case 0: 6152b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams ADD_CHUNK(store_f32u_1); 6162b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams break; 617a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams } 618709a0978ae141198018ca9769f8d96292a8928e6Jason Sams } 6199b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 620a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams 621a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams } else { 622a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams // Add the function prefix 623a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams // Store the address for the loop return 624a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(prefix_i); 625a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams buf2 = buf; 626a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams 627a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams // Load the incoming r,g,b,a as needed 628a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams switch(key.u.inVecSize) { 629a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 3: 630a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(load_u8_4); 631a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams if (key.u.copyAlpha) { 632a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(unpack_u8_3); 6339b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } else { 634a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(unpack_u8_4); 6359b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 636a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 637a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 2: 6389e4a96af136dab5b21a37580d17cbcb89872114eJason Sams ADD_CHUNK(load_u8_3); 639a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(unpack_u8_3); 640a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 641a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 1: 642a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(load_u8_2); 
643a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(unpack_u8_2); 644a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 645a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 0: 646a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(load_u8_1); 647a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(unpack_u8_1); 648a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 6499b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 650a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams 651a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams // Add multiply and accumulate 652a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams // use MULL to init the output register, 653a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams // use MLAL from there 654a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams for (int i=0; i < 4; i++) { 655a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams for (int j=0; j < 4; j++) { 656a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams switch(ops[i][j]) { 657a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 0: 658a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 6592b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams case 2: 660a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams buf = addVMULL_S16(buf, 8+j, 24+i*2, 4+i, j); 661a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 6622b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams case 3: 663a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams buf = addVMLAL_S16(buf, 8+j, 24+i*2, 4+i, j); 664a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 665a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams } 6669b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 6679b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 668a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams for (int j=0; j < 4; j++) { 669ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams if (opInit[j]) { 670ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams if (key.u.addMask & (1 << j)) { 
671ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams buf = addVQADD_S32(buf, 8+j, 8+j, 4+j); 672ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams } 673ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams } else { 674ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams if (key.u.addMask & (1 << j)) { 675c7c255e86b2cbd36e4da94632c49b3c8b4f74031Simon Hosie buf = addVORR_32(buf, 8+j, 4+j, 4+j); 676ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams } 6779b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 6789b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 6799b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 680a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams // If we have a dot product, perform the special pack. 681a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams if (key.u.dot) { 682a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(pack_u8_1); 683a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(dot); 684a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams } else { 685a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams switch(key.u.outVecSize) { 686a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 3: 68717e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams if (key.u.copyAlpha) { 68817e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams ADD_CHUNK(pack_u8_3); 68917e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams } else { 69017e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams ADD_CHUNK(pack_u8_4); 69117e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams } 692a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 693a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 2: 694a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(pack_u8_3); 695a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 696a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 1: 697a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(pack_u8_2); 698a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 
699a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 0: 700a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(pack_u8_1); 701a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 702a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams } 703a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams } 704a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams 705a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams // Write out result 706a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams switch(key.u.outVecSize) { 7079b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams case 3: 7089b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams case 2: 709a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(store_u8_4); 7109b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams break; 7119b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams case 1: 712a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(store_u8_2); 7139b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams break; 7149b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams case 0: 715a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ADD_CHUNK(store_u8_1); 7169b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams break; 7179b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 7189b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 7199b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 7202b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams if (key.u.inType != key.u.outType) { 7212b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams key.u.copyAlpha = 0; 7222b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams key.u.dot = 0; 7232b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams } 7242b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams 7259b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams // Loop, branch, and cleanup 7269b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams ADD_CHUNK(postfix1); 7279b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams buf = addBranch(buf, buf2, 0x01); 7289b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 
ADD_CHUNK(postfix2); 7299b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 7309b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams int ret = mprotect(mBuf, mBufSize, PROT_READ | PROT_EXEC); 7319b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams if (ret == -1) { 7329b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams ALOGE("mprotect error %i", ret); 7339b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams return false; 7349b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 7359b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 73645e753a46e587c69b3b0d0c5138e88715a24a29aStephen Hines __builtin___clear_cache((char *) mBuf, (char*) mBuf + mBufSize); 7379b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams return true; 7389b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams#else 7399b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams return false; 7409b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams#endif 7419b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams} 7429b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 743ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Samsvoid RsdCpuScriptIntrinsicColorMatrix::updateCoeffCache(float fpMul, float addMul) { 7442b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams for(int ct=0; ct < 16; ct++) { 7452b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams ip[ct] = (short)(fp[ct] * 256.f + 0.5f); 7462b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams tmpFp[ct] = fp[ct] * fpMul; 7479e4a96af136dab5b21a37580d17cbcb89872114eJason Sams //ALOGE("mat %i %f %f", ct, fp[ct], tmpFp[ct]); 7482b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams } 7492b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams 750ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams float add = 0.f; 751ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams if (fpMul > 254.f) add = 0.5f; 7522b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams for(int ct=0; ct < 4; ct++) { 7530462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie tmpFpa[ct] = fpa[ct] * addMul + add; 
7549e4a96af136dab5b21a37580d17cbcb89872114eJason Sams //ALOGE("fpa %i %f %f", ct, fpa[ct], tmpFpa[ct * 4 + 0]); 7552b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams } 7562b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams 757ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams for(int ct=0; ct < 4; ct++) { 7580462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie ipa[ct] = (int)(fpa[ct] * 65536.f + 0.5f); 7592b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams } 7602b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams} 7612b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams 7629b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Samsvoid RsdCpuScriptIntrinsicColorMatrix::setGlobalVar(uint32_t slot, const void *data, 7639b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams size_t dataLength) { 7649b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams switch(slot) { 7659b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams case 0: 7662b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams memcpy (fp, data, sizeof(fp)); 7679b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams break; 7689b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams case 1: 7692b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams memcpy (fpa, data, sizeof(fpa)); 7709b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams break; 7719b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams default: 7729b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams rsAssert(0); 7739b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams break; 7749b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams } 7759b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams mRootPtr = &kernel; 776709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 777709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 778709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 779b0abb140ac51b93d1a85aadaa63fe057f2d29850David Grossstatic void One(const RsExpandKernelDriverInfo *info, void *out, 78017e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams const void *py, const float* coeff, const float *add, 
781a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams uint32_t vsin, uint32_t vsout, bool fin, bool fout) { 782a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams 783a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams float4 f = 0.f; 784a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams if (fin) { 785a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams switch(vsin) { 786a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 3: 7879e4a96af136dab5b21a37580d17cbcb89872114eJason Sams f = ((const float4 *)py)[0]; 7889e4a96af136dab5b21a37580d17cbcb89872114eJason Sams break; 789a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 2: 790a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams f = ((const float4 *)py)[0]; 7919e4a96af136dab5b21a37580d17cbcb89872114eJason Sams f.w = 0.f; 792a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 793a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 1: 794a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams f.xy = ((const float2 *)py)[0]; 795a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 796a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 0: 797a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams f.x = ((const float *)py)[0]; 798a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 799a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams } 800a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams } else { 801a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams switch(vsin) { 802a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 3: 8039e4a96af136dab5b21a37580d17cbcb89872114eJason Sams f = convert_float4(((const uchar4 *)py)[0]); 8049e4a96af136dab5b21a37580d17cbcb89872114eJason Sams break; 805a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 2: 806a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams f = convert_float4(((const uchar4 *)py)[0]); 8079e4a96af136dab5b21a37580d17cbcb89872114eJason Sams f.w = 0.f; 808a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 
809a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 1: 81068c817211a6fe87bebed83d38a05fff32cc24a7eJason Sams f.xy = convert_float2(((const uchar2 *)py)[0]); 811a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 812a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 0: 81368c817211a6fe87bebed83d38a05fff32cc24a7eJason Sams f.x = (float)(((const uchar *)py)[0]); 814a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 815a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams } 816a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams } 8172b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams //ALOGE("f1 %f %f %f %f", f.x, f.y, f.z, f.w); 818709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 819709a0978ae141198018ca9769f8d96292a8928e6Jason Sams float4 sum; 820a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams sum.x = f.x * coeff[0] + 821a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams f.y * coeff[4] + 822a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams f.z * coeff[8] + 823a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams f.w * coeff[12]; 824a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams sum.y = f.x * coeff[1] + 825a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams f.y * coeff[5] + 826a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams f.z * coeff[9] + 827a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams f.w * coeff[13]; 828a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams sum.z = f.x * coeff[2] + 829a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams f.y * coeff[6] + 830a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams f.z * coeff[10] + 831a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams f.w * coeff[14]; 832a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams sum.w = f.x * coeff[3] + 833a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams f.y * coeff[7] + 834a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams f.z * coeff[11] + 835a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams f.w * coeff[15]; 8362b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams 
//ALOGE("f2 %f %f %f %f", sum.x, sum.y, sum.z, sum.w); 837709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 83817e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams sum.x += add[0]; 8390462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie sum.y += add[1]; 8400462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie sum.z += add[2]; 8410462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie sum.w += add[3]; 84217e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams 843709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 8442b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams //ALOGE("fout %i vs %i, sum %f %f %f %f", fout, vsout, sum.x, sum.y, sum.z, sum.w); 845a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams if (fout) { 846a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams switch(vsout) { 847a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 3: 848a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 2: 849a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ((float4 *)out)[0] = sum; 850a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 851a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 1: 852a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ((float2 *)out)[0] = sum.xy; 853a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 854a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 0: 855a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ((float *)out)[0] = sum.x; 856a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 857a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams } 858a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams } else { 8599e4a96af136dab5b21a37580d17cbcb89872114eJason Sams sum.x = sum.x < 0 ? 0 : (sum.x > 255.5 ? 255.5 : sum.x); 8609e4a96af136dab5b21a37580d17cbcb89872114eJason Sams sum.y = sum.y < 0 ? 0 : (sum.y > 255.5 ? 255.5 : sum.y); 8619e4a96af136dab5b21a37580d17cbcb89872114eJason Sams sum.z = sum.z < 0 ? 0 : (sum.z > 255.5 ? 255.5 : sum.z); 8629e4a96af136dab5b21a37580d17cbcb89872114eJason Sams sum.w = sum.w < 0 ? 0 : (sum.w > 255.5 ? 
255.5 : sum.w); 86317e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams 864a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams switch(vsout) { 865a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 3: 866a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 2: 867a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ((uchar4 *)out)[0] = convert_uchar4(sum); 868a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 869a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 1: 870a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ((uchar2 *)out)[0] = convert_uchar2(sum.xy); 871a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 872a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams case 0: 873a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams ((uchar *)out)[0] = sum.x; 874a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams break; 875a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams } 876a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams } 8772b0d8e60d7daeffa7c9a5e11cfbfc9e5e04933b7Jason Sams //ALOGE("out %p %f %f %f %f", out, ((float *)out)[0], ((float *)out)[1], ((float *)out)[2], ((float *)out)[3]); 878709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 879709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 880b0abb140ac51b93d1a85aadaa63fe057f2d29850David Grossvoid RsdCpuScriptIntrinsicColorMatrix::kernel(const RsExpandKernelDriverInfo *info, 8819b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams uint32_t xstart, uint32_t xend, 8829ed79105cc6a8dbfaf959875249f36022cc2c798Chris Wailes uint32_t outstep) { 883b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross RsdCpuScriptIntrinsicColorMatrix *cp = (RsdCpuScriptIntrinsicColorMatrix *)info->usr; 884f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes 885b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross uint32_t instep = info->inStride[0]; 886f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes 887b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross uchar *out = (uchar *)info->outPtr[0]; 
888b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross uchar *in = (uchar *)info->inPtr[0]; 889709a0978ae141198018ca9769f8d96292a8928e6Jason Sams uint32_t x1 = xstart; 890709a0978ae141198018ca9769f8d96292a8928e6Jason Sams uint32_t x2 = xend; 891709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 892a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams uint32_t vsin = cp->mLastKey.u.inVecSize; 893a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams uint32_t vsout = cp->mLastKey.u.outVecSize; 894a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams bool floatIn = !!cp->mLastKey.u.inType; 895a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams bool floatOut = !!cp->mLastKey.u.outType; 896a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams 897b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross //if (!info->current.y) ALOGE("steps %i %i %i %i", instep, outstep, vsin, vsout); 8989e4a96af136dab5b21a37580d17cbcb89872114eJason Sams 899709a0978ae141198018ca9769f8d96292a8928e6Jason Sams if(x2 > x1) { 9000462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie int32_t len = x2 - x1; 9010462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie if (gArchUseSIMD) { 90244bef6fba6244292b751387f3d6c31cca96c28adChris Wailes if((cp->mOptKernel != nullptr) && (len >= 4)) { 903858d0352934596aa46fe97a70f30d4d837f6fc7fJason Sams // The optimized kernel processes 4 pixels at once 904858d0352934596aa46fe97a70f30d4d837f6fc7fJason Sams // and requires a minimum of 1 chunk of 4 9050462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie cp->mOptKernel(out, in, cp->ip, len >> 2); 906858d0352934596aa46fe97a70f30d4d837f6fc7fJason Sams // Update the len and pointers so the generic code can 907858d0352934596aa46fe97a70f30d4d837f6fc7fJason Sams // finish any leftover pixels 90898dd4bb2b1b08f04dd5034fe0c69daa15f6cc6daJason Sams len &= ~3; 9090462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie x1 += len; 9100462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie out += outstep * len; 9110462a39371659d1eeed5eb48dd6d507760301c22Simon 
Hosie in += instep * len; 9120462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie } 91332f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams#if defined(ARCH_ARM64_USE_INTRINSICS) 9140462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie else { 9150462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie if (cp->mLastKey.u.inType == RS_TYPE_FLOAT_32 || cp->mLastKey.u.outType == RS_TYPE_FLOAT_32) { 91632f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams // Currently this generates off by one errors. 91732f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams //rsdIntrinsicColorMatrix_float_K(out, in, len, &cp->mFnTab, cp->tmpFp, cp->tmpFpa); 91832f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams //x1 += len; 91932f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams //out += outstep * len; 92032f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams //in += instep * len; 9210462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie } else { 9226e7e258316f72be95039278e88e3bc1daea1668fSimon Hosie rsdIntrinsicColorMatrix_int_K(out, in, len, &cp->mFnTab, cp->ip, cp->ipa); 92332f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams x1 += len; 92432f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams out += outstep * len; 92532f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams in += instep * len; 9260462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie } 9270462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie } 9280462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie#endif 929709a0978ae141198018ca9769f8d96292a8928e6Jason Sams } 930709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 931709a0978ae141198018ca9769f8d96292a8928e6Jason Sams while(x1 != x2) { 932b0abb140ac51b93d1a85aadaa63fe057f2d29850David Gross One(info, out, in, cp->tmpFp, cp->tmpFpa, vsin, vsout, floatIn, floatOut); 93368c817211a6fe87bebed83d38a05fff32cc24a7eJason Sams out += outstep; 93468c817211a6fe87bebed83d38a05fff32cc24a7eJason Sams in += instep; 935709a0978ae141198018ca9769f8d96292a8928e6Jason Sams x1++; 936709a0978ae141198018ca9769f8d96292a8928e6Jason Sams } 
937709a0978ae141198018ca9769f8d96292a8928e6Jason Sams } 938709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 939709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 940f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailesvoid RsdCpuScriptIntrinsicColorMatrix::preLaunch(uint32_t slot, 941f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes const Allocation ** ains, 942f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes uint32_t inLen, 943f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes Allocation * aout, 944f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes const void * usr, 945f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes uint32_t usrLen, 946f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes const RsScriptCall *sc) { 9479b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams 948f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes const Element *ein = ains[0]->mHal.state.type->getElement(); 94917e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams const Element *eout = aout->mHal.state.type->getElement(); 95017e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams 95117e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams if (ein->getType() == eout->getType()) { 952ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams if (eout->getType() == RS_TYPE_UNSIGNED_8) { 953ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams updateCoeffCache(1.f, 255.f); 954ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams } else { 955ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams updateCoeffCache(1.f, 1.f); 956ec3cd2dff915d476ce0d7cdbb20c0497635a700fJason Sams } 95717e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams } else { 95817e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams if (eout->getType() == RS_TYPE_UNSIGNED_8) { 9599e4a96af136dab5b21a37580d17cbcb89872114eJason Sams updateCoeffCache(255.f, 255.f); 96017e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams } else { 9619e4a96af136dab5b21a37580d17cbcb89872114eJason Sams updateCoeffCache(1.f / 255.f, 1.f); 
96217e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams } 96317e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams } 96417e3cdc24776d8fdbf1ce16287b9b4dcd516708fJason Sams 965f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes Key_t key = computeKey(ein, eout); 966f37121300217d3b39ab66dd9c8881bcbcad932dfChris Wailes 9677b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#if defined(ARCH_X86_HAVE_SSSE3) 96844bef6fba6244292b751387f3d6c31cca96c28adChris Wailes if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) { 9697b7060c61e4182b29186849c5a857ea5f0898e56Rose, James // FIXME: Disable mOptKernel to pass RS color matrix CTS cases 9707b7060c61e4182b29186849c5a857ea5f0898e56Rose, James // mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) selectKernel(key); 9717b7060c61e4182b29186849c5a857ea5f0898e56Rose, James mLastKey = key; 9727b7060c61e4182b29186849c5a857ea5f0898e56Rose, James } 9737b7060c61e4182b29186849c5a857ea5f0898e56Rose, James 9747b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#else //if !defined(ARCH_X86_HAVE_SSSE3) 97544bef6fba6244292b751387f3d6c31cca96c28adChris Wailes if ((mOptKernel == nullptr) || (mLastKey.key != key.key)) { 9769b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams if (mBuf) munmap(mBuf, mBufSize); 97744bef6fba6244292b751387f3d6c31cca96c28adChris Wailes mBuf = nullptr; 97844bef6fba6244292b751387f3d6c31cca96c28adChris Wailes mOptKernel = nullptr; 9799b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams if (build(key)) { 9809b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams mOptKernel = (void (*)(void *, const void *, const short *, uint32_t)) mBuf; 981709a0978ae141198018ca9769f8d96292a8928e6Jason Sams } 98232f9d04ae7b5f680c0921b3f9d4cdbf1665532b3Jason Sams#if defined(ARCH_ARM64_USE_INTRINSICS) 9830462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie else { 9840462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie int dt = key.u.outVecSize + (key.u.outType == RS_TYPE_FLOAT_32 ? 
4 : 0); 9850462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie int st = key.u.inVecSize + (key.u.inType == RS_TYPE_FLOAT_32 ? 4 : 0); 9860462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie uint32_t mm = 0; 9870462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie int i; 9880462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie for (i = 0; i < 4; i++) 9890462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie { 9900462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie uint32_t m = (key.u.coeffMask >> i) & 0x1111; 9910462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie m = ((m * 0x249) >> 9) & 15; 9920462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie m |= ((key.u.addMask >> i) & 1) << 4; 9930462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie mm |= m << (i * 5); 9940462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie } 9950462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie 9960462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie if (key.u.inType == RS_TYPE_FLOAT_32 || key.u.outType == RS_TYPE_FLOAT_32) { 9970462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie rsdIntrinsicColorMatrixSetup_float_K(&mFnTab, mm, dt, st); 9980462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie } else { 9990462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie rsdIntrinsicColorMatrixSetup_int_K(&mFnTab, mm, dt, st); 10000462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie } 10010462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie } 10020462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie#endif 10030462a39371659d1eeed5eb48dd6d507760301c22Simon Hosie mLastKey = key; 1004709a0978ae141198018ca9769f8d96292a8928e6Jason Sams } 10057b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#endif //if !defined(ARCH_X86_HAVE_SSSE3) 1006709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 1007709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 1008709a0978ae141198018ca9769f8d96292a8928e6Jason SamsRsdCpuScriptIntrinsicColorMatrix::RsdCpuScriptIntrinsicColorMatrix( 1009c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams 
RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) 1010c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_COLOR_MATRIX) { 1011709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 1012a65de10aabdee0794d0e9c96db944e990166ef0dJason Sams mLastKey.key = 0; 101344bef6fba6244292b751387f3d6c31cca96c28adChris Wailes mBuf = nullptr; 10149b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams mBufSize = 0; 101544bef6fba6244292b751387f3d6c31cca96c28adChris Wailes mOptKernel = nullptr; 1016709a0978ae141198018ca9769f8d96292a8928e6Jason Sams const static float defaultMatrix[] = { 1017709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 1.f, 0.f, 0.f, 0.f, 1018709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 0.f, 1.f, 0.f, 0.f, 1019709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 0.f, 0.f, 1.f, 0.f, 1020709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 0.f, 0.f, 0.f, 1.f 1021709a0978ae141198018ca9769f8d96292a8928e6Jason Sams }; 10229b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams const static float defaultAdd[] = {0.f, 0.f, 0.f, 0.f}; 1023709a0978ae141198018ca9769f8d96292a8928e6Jason Sams setGlobalVar(0, defaultMatrix, sizeof(defaultMatrix)); 10249b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams setGlobalVar(1, defaultAdd, sizeof(defaultAdd)); 1025709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 1026709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 1027709a0978ae141198018ca9769f8d96292a8928e6Jason SamsRsdCpuScriptIntrinsicColorMatrix::~RsdCpuScriptIntrinsicColorMatrix() { 10289b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams if (mBuf) munmap(mBuf, mBufSize); 102944bef6fba6244292b751387f3d6c31cca96c28adChris Wailes mBuf = nullptr; 103044bef6fba6244292b751387f3d6c31cca96c28adChris Wailes mOptKernel = nullptr; 1031709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 1032709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 1033709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid 
RsdCpuScriptIntrinsicColorMatrix::populateScript(Script *s) { 10349b2b9efa1a1f9b0ec8c20601216f8dc5698c75f5Jason Sams s->mHal.info.exportedVariableCount = 2; 1035709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 1036709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 1037c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason SamsRsdCpuScriptImpl * rsdIntrinsic_ColorMatrix(RsdCpuReferenceImpl *ctx, 1038c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams const Script *s, const Element *e) { 1039709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 1040c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams return new RsdCpuScriptIntrinsicColorMatrix(ctx, s, e); 1041709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 1042