1e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams/* 2e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * Copyright (C) 2012 The Android Open Source Project 3e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * 4e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * Licensed under the Apache License, Version 2.0 (the "License"); 5e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * you may not use this file except in compliance with the License. 6e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * You may obtain a copy of the License at 7e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * 8e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * http://www.apache.org/licenses/LICENSE-2.0 9e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * 10e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * Unless required by applicable law or agreed to in writing, software 11e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * distributed under the License is distributed on an "AS IS" BASIS, 12e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * See the License for the specific language governing permissions and 14e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams * limitations under the License. 15e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams */ 16e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 17e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 18709a0978ae141198018ca9769f8d96292a8928e6Jason Sams#include "rsCpuIntrinsic.h" 19709a0978ae141198018ca9769f8d96292a8928e6Jason Sams#include "rsCpuIntrinsicInlines.h" 20e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 21e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Samsusing namespace android; 22e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Samsusing namespace android::renderscript; 23e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 24709a0978ae141198018ca9769f8d96292a8928e6Jason Samsnamespace android { 25709a0978ae141198018ca9769f8d96292a8928e6Jason Samsnamespace renderscript { 26709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 27709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 28709a0978ae141198018ca9769f8d96292a8928e6Jason Samsclass RsdCpuScriptIntrinsicConvolve3x3 : public RsdCpuScriptIntrinsic { 29709a0978ae141198018ca9769f8d96292a8928e6Jason Samspublic: 30709a0978ae141198018ca9769f8d96292a8928e6Jason Sams virtual void populateScript(Script *); 31709a0978ae141198018ca9769f8d96292a8928e6Jason Sams virtual void invokeFreeChildren(); 32709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 33709a0978ae141198018ca9769f8d96292a8928e6Jason Sams virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength); 34709a0978ae141198018ca9769f8d96292a8928e6Jason Sams virtual void setGlobalObj(uint32_t slot, ObjectBase *data); 35709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 36709a0978ae141198018ca9769f8d96292a8928e6Jason Sams virtual ~RsdCpuScriptIntrinsicConvolve3x3(); 37c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams RsdCpuScriptIntrinsicConvolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *); 38709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 39709a0978ae141198018ca9769f8d96292a8928e6Jason Samsprotected: 40c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams float mFp[16]; 41c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams short mIp[16]; 42c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams ObjectBaseRef<const Allocation> mAlloc; 43c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams ObjectBaseRef<const Element> mElement; 44709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 453b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams static void kernelU1(const RsForEachStubParamStruct *p, 463b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t xstart, uint32_t xend, 473b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t instep, uint32_t outstep); 483b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams static void kernelU2(const RsForEachStubParamStruct *p, 493b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t xstart, uint32_t xend, 503b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t instep, uint32_t outstep); 513b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams static void kernelU4(const RsForEachStubParamStruct *p, 523b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t xstart, uint32_t xend, 533b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t instep, uint32_t outstep); 543b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams static void kernelF1(const RsForEachStubParamStruct *p, 553b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t xstart, uint32_t xend, 563b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t instep, uint32_t outstep); 573b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams static void kernelF2(const RsForEachStubParamStruct *p, 583b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t xstart, uint32_t xend, 593b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t instep, uint32_t outstep); 603b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams static void kernelF4(const RsForEachStubParamStruct *p, 613b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t xstart, uint32_t xend, 623b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t instep, uint32_t outstep); 63e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams}; 64e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 65709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 66e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams} 67e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 68e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 69709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve3x3::setGlobalObj(uint32_t slot, ObjectBase *data) { 70709a0978ae141198018ca9769f8d96292a8928e6Jason Sams rsAssert(slot == 1); 71c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams mAlloc.set(static_cast<Allocation *>(data)); 72709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 73709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 74709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve3x3::setGlobalVar(uint32_t slot, const void *data, 75709a0978ae141198018ca9769f8d96292a8928e6Jason Sams size_t dataLength) { 76e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams rsAssert(slot == 0); 77c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams memcpy (&mFp, data, dataLength); 78e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams for(int ct=0; ct < 9; ct++) { 793b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if (mFp[ct] >= 0) { 803b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f); 813b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } else { 823b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f); 833b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 84e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams } 85e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams} 86e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 87709a0978ae141198018ca9769f8d96292a8928e6Jason Samsextern "C" void rsdIntrinsicConvolve3x3_K(void *dst, const void *y0, const void *y1, 88709a0978ae141198018ca9769f8d96292a8928e6Jason Sams const void *y2, const short *coef, uint32_t count); 89e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 90e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 913b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsstatic void ConvolveOneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out, 923b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, 933b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const float* coeff) { 94e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 95d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams uint32_t x1 = rsMax((int32_t)x-1, 0); 964cca49b13db92b13ca07c1d330ad450d1b10f507Tim Murray uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1); 97e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 98e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams float4 px = convert_float4(py0[x1]) * coeff[0] + 99e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams convert_float4(py0[x]) * coeff[1] + 100e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams convert_float4(py0[x2]) * coeff[2] + 101e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams convert_float4(py1[x1]) * coeff[3] + 102e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams convert_float4(py1[x]) * coeff[4] + 103e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams convert_float4(py1[x2]) * coeff[5] + 104e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams convert_float4(py2[x1]) * coeff[6] + 105e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams convert_float4(py2[x]) * coeff[7] + 106e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams convert_float4(py2[x2]) * coeff[8]; 107e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 1084283f579c424f07bc07c7f075398053eed3f8281Miao Wang px = clamp(px + 0.5f, 0.f, 255.f); 109e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams uchar4 o = {(uchar)px.x, (uchar)px.y, (uchar)px.z, (uchar)px.w}; 110e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams *out = o; 111e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams} 112e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 1133b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsstatic void ConvolveOneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out, 1143b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, 1153b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const float* coeff) { 1163b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 1173b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t x1 = rsMax((int32_t)x-1, 0); 1183b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1); 1193b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 1203b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams float2 px = convert_float2(py0[x1]) * coeff[0] + 1213b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams convert_float2(py0[x]) * coeff[1] + 1223b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams convert_float2(py0[x2]) * coeff[2] + 1233b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams convert_float2(py1[x1]) * coeff[3] + 1243b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams convert_float2(py1[x]) * coeff[4] + 1253b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams convert_float2(py1[x2]) * coeff[5] + 1263b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams convert_float2(py2[x1]) * coeff[6] + 1273b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams convert_float2(py2[x]) * coeff[7] + 1283b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams convert_float2(py2[x2]) * coeff[8]; 1293b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 1304283f579c424f07bc07c7f075398053eed3f8281Miao Wang px = clamp(px + 0.5f, 0.f, 255.f); 1313b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams *out = convert_uchar2(px); 1323b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams} 1333b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 1343b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsstatic void ConvolveOneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out, 1353b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const uchar *py0, const uchar *py1, const uchar *py2, 1363b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const float* coeff) { 1373b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 1383b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t x1 = rsMax((int32_t)x-1, 0); 1393b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1); 1403b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 1413b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams float px = ((float)py0[x1]) * coeff[0] + 1423b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ((float)py0[x]) * coeff[1] + 1433b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ((float)py0[x2]) * coeff[2] + 1443b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ((float)py1[x1]) * coeff[3] + 1453b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ((float)py1[x]) * coeff[4] + 1463b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ((float)py1[x2]) * coeff[5] + 1473b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ((float)py2[x1]) * coeff[6] + 1483b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ((float)py2[x]) * coeff[7] + 1493b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ((float)py2[x2]) * coeff[8]; 1504283f579c424f07bc07c7f075398053eed3f8281Miao Wang *out = clamp(px + 0.5f, 0.f, 255.f); 1513b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams} 1523b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 1533b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsstatic void ConvolveOneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out, 1543b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const float4 *py0, const float4 *py1, const float4 *py2, 1553b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const float* coeff) { 1563b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 1573b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t x1 = rsMax((int32_t)x-1, 0); 1583b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1); 1593b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) + 1603b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) + 1613b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]); 1623b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams} 1633b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 1643b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsstatic void ConvolveOneF2(const RsForEachStubParamStruct *p, uint32_t x, float2 *out, 1653b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const float2 *py0, const float2 *py1, const float2 *py2, 1663b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const float* coeff) { 1673b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 1683b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t x1 = rsMax((int32_t)x-1, 0); 1693b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1); 1703b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) + 1713b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) + 1723b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]); 1733b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams} 1743b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 1753b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsstatic void ConvolveOneF1(const RsForEachStubParamStruct *p, uint32_t x, float *out, 1763b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const float *py0, const float *py1, const float *py2, 1773b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const float* coeff) { 1783b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 1793b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t x1 = rsMax((int32_t)x-1, 0); 1803b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t x2 = rsMin((int32_t)x+1, (int32_t)p->dimX-1); 1813b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams *out = (py0[x1] * coeff[0]) + (py0[x] * coeff[1]) + (py0[x2] * coeff[2]) + 1823b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams (py1[x1] * coeff[3]) + (py1[x] * coeff[4]) + (py1[x2] * coeff[5]) + 1833b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams (py2[x1] * coeff[6]) + (py2[x] * coeff[7]) + (py2[x2] * coeff[8]); 1843b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams} 1853b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 1863b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsvoid RsdCpuScriptIntrinsicConvolve3x3::kernelU4(const RsForEachStubParamStruct *p, 1873b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t xstart, uint32_t xend, 1883b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t instep, uint32_t outstep) { 189709a0978ae141198018ca9769f8d96292a8928e6Jason Sams RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr; 190b801b949e286275b5d19a33135235ba68d3a19a9Jason Sams 191c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams if (!cp->mAlloc.get()) { 192b801b949e286275b5d19a33135235ba68d3a19a9Jason Sams ALOGE("Convolve3x3 executed without input, skipping"); 193b801b949e286275b5d19a33135235ba68d3a19a9Jason Sams return; 194b801b949e286275b5d19a33135235ba68d3a19a9Jason Sams } 195c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 196c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 197e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 198e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1)); 199e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams uint32_t y2 = rsMax((int32_t)p->y - 1, 0); 200709a0978ae141198018ca9769f8d96292a8928e6Jason Sams const uchar4 *py0 = (const uchar4 *)(pin + stride * y2); 201709a0978ae141198018ca9769f8d96292a8928e6Jason Sams const uchar4 *py1 = (const uchar4 *)(pin + stride * p->y); 202709a0978ae141198018ca9769f8d96292a8928e6Jason Sams const uchar4 *py2 = (const uchar4 *)(pin + stride * y1); 203e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 204e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams uchar4 *out = (uchar4 *)p->out; 205e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams uint32_t x1 = xstart; 206e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams uint32_t x2 = xend; 207e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams if(x1 == 0) { 2083b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ConvolveOneU4(p, 0, out, py0, py1, py2, cp->mFp); 209e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams x1 ++; 210e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams out++; 211e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams } 212e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 213e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams if(x2 > x1) { 214074424a4ac5b093331df2c92e7a5bcbfff136b71Jason Sams#if defined(ARCH_ARM_USE_INTRINSICS) || defined(ARCH_X86_HAVE_SSSE3) 215f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams if (gArchUseSIMD) { 216f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams int32_t len = (x2 - x1 - 1) >> 1; 217f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams if(len > 0) { 218f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 219f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams x1 += len << 1; 220f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams out += len << 1; 221f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams } 222e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams } 223e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams#endif 224e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 225e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams while(x1 != x2) { 2263b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ConvolveOneU4(p, x1, out, py0, py1, py2, cp->mFp); 2273b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams out++; 2283b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams x1++; 2293b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 2303b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 2313b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams} 2323b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 2333b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsvoid RsdCpuScriptIntrinsicConvolve3x3::kernelU2(const RsForEachStubParamStruct *p, 2343b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t xstart, uint32_t xend, 2353b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t instep, uint32_t outstep) { 2363b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr; 2373b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 2383b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if (!cp->mAlloc.get()) { 2393b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ALOGE("Convolve3x3 executed without input, skipping"); 2403b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams return; 2413b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 2423b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 2433b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 2443b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 2453b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1)); 2463b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t y2 = rsMax((int32_t)p->y - 1, 0); 2473b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const uchar2 *py0 = (const uchar2 *)(pin + stride * y2); 2483b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const uchar2 *py1 = (const uchar2 *)(pin + stride * p->y); 2493b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const uchar2 *py2 = (const uchar2 *)(pin + stride * y1); 2503b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 2513b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uchar2 *out = (uchar2 *)p->out; 2523b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t x1 = xstart; 2533b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t x2 = xend; 2543b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if(x1 == 0) { 2553b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ConvolveOneU2(p, 0, out, py0, py1, py2, cp->mFp); 2563b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams x1 ++; 2573b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams out++; 2583b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 2593b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 2603b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if(x2 > x1) { 2613b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams#if 0//defined(ARCH_ARM_HAVE_NEON) 2623b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams int32_t len = (x2 - x1 - 1) >> 1; 2633b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if(len > 0) { 2643b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 2653b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams x1 += len << 1; 2663b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams out += len << 1; 2673b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 2683b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams#endif 2693b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 2703b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams while(x1 != x2) { 2713b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ConvolveOneU2(p, x1, out, py0, py1, py2, cp->mFp); 2723b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams out++; 2733b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams x1++; 2743b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 2753b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 2763b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams} 2773b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 2783b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsvoid RsdCpuScriptIntrinsicConvolve3x3::kernelU1(const RsForEachStubParamStruct *p, 2793b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t xstart, uint32_t xend, 2803b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t instep, uint32_t outstep) { 2813b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr; 2823b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 2833b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if (!cp->mAlloc.get()) { 2843b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ALOGE("Convolve3x3 executed without input, skipping"); 2853b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams return; 2863b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 2873b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 2883b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 2893b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 2903b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1)); 2913b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t y2 = rsMax((int32_t)p->y - 1, 0); 2923b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const uchar *py0 = (const uchar *)(pin + stride * y2); 2933b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const uchar *py1 = (const uchar *)(pin + stride * p->y); 2943b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const uchar *py2 = (const uchar *)(pin + stride * y1); 2953b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 2963b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uchar *out = (uchar *)p->out; 2973b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t x1 = xstart; 2983b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t x2 = xend; 2993b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if(x1 == 0) { 3003b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ConvolveOneU1(p, 0, out, py0, py1, py2, cp->mFp); 3013b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams x1 ++; 3023b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams out++; 3033b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 3043b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 3053b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if(x2 > x1) { 3063b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams#if 0//defined(ARCH_ARM_HAVE_NEON) 3073b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams int32_t len = (x2 - x1 - 1) >> 1; 3083b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if(len > 0) { 3093b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 3103b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams x1 += len << 1; 3113b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams out += len << 1; 3123b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 3133b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams#endif 3143b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 3153b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams while(x1 != x2) { 3163b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ConvolveOneU1(p, x1, out, py0, py1, py2, cp->mFp); 3173b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams out++; 3183b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams x1++; 3193b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 3203b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 3213b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams} 3223b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 3233b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsvoid RsdCpuScriptIntrinsicConvolve3x3::kernelF4(const RsForEachStubParamStruct *p, 3243b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t xstart, uint32_t xend, 3253b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t instep, uint32_t outstep) { 3263b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr; 3273b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 3283b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if (!cp->mAlloc.get()) { 3293b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ALOGE("Convolve3x3 executed without input, skipping"); 3303b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams return; 3313b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 3323b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 3333b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 3343b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 3353b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1)); 3363b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t y2 = rsMax((int32_t)p->y - 1, 0); 3373b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const float4 *py0 = (const float4 *)(pin + stride * y2); 3383b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const float4 *py1 = (const float4 *)(pin + stride * p->y); 3393b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const float4 *py2 = (const float4 *)(pin + stride * y1); 3403b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 3413b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams float4 *out = (float4 *)p->out; 3423b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t x1 = xstart; 3433b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t x2 = xend; 3443b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if(x1 == 0) { 3453b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ConvolveOneF4(p, 0, out, py0, py1, py2, cp->mFp); 3463b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams x1 ++; 3473b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams out++; 3483b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 3493b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 3503b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if(x2 > x1) { 3513b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams#if 0//defined(ARCH_ARM_HAVE_NEON) 3523b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams int32_t len = (x2 - x1 - 1) >> 1; 3533b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if(len > 0) { 3543b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 3553b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams x1 += len << 1; 3563b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams out += len << 1; 3573b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 3583b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams#endif 3593b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 3603b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams while(x1 != x2) { 3613b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ConvolveOneF4(p, x1, out, py0, py1, py2, cp->mFp); 3623b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams out++; 3633b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams x1++; 3643b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 3653b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 3663b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams} 3673b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 3683b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsvoid RsdCpuScriptIntrinsicConvolve3x3::kernelF2(const RsForEachStubParamStruct *p, 3693b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t xstart, uint32_t xend, 3703b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t instep, uint32_t outstep) { 3713b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr; 3723b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 3733b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if (!cp->mAlloc.get()) { 3743b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ALOGE("Convolve3x3 executed without input, skipping"); 3753b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams return; 3763b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 3773b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 3783b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 3793b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 3803b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1)); 3813b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t y2 = rsMax((int32_t)p->y - 1, 0); 3823b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const float2 *py0 = (const float2 *)(pin + stride * y2); 3833b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const float2 *py1 = (const float2 *)(pin + stride * p->y); 3843b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const float2 *py2 = (const float2 *)(pin + stride * y1); 3853b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 3863b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams float2 *out = (float2 *)p->out; 3873b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t x1 = xstart; 3883b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t x2 = xend; 3893b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if(x1 == 0) { 3903b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ConvolveOneF2(p, 0, out, py0, py1, py2, cp->mFp); 3913b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams x1 ++; 3923b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams out++; 3933b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 3943b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 3953b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if(x2 > x1) { 3963b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams#if 0//defined(ARCH_ARM_HAVE_NEON) 3973b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams int32_t len = (x2 - x1 - 1) >> 1; 3983b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if(len > 0) { 3993b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 4003b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams x1 += len << 1; 4013b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams out += len << 1; 4023b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 4033b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams#endif 4043b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 4053b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams while(x1 != x2) { 4063b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ConvolveOneF2(p, x1, out, py0, py1, py2, cp->mFp); 4073b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams out++; 4083b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams x1++; 4093b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 4103b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 4113b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams} 4123b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Samsvoid RsdCpuScriptIntrinsicConvolve3x3::kernelF1(const RsForEachStubParamStruct *p, 4133b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t xstart, uint32_t xend, 4143b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t instep, uint32_t outstep) { 4153b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams RsdCpuScriptIntrinsicConvolve3x3 *cp = (RsdCpuScriptIntrinsicConvolve3x3 *)p->usr; 4163b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 4173b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if (!cp->mAlloc.get()) { 4183b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ALOGE("Convolve3x3 executed without input, skipping"); 4193b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams return; 4203b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 4213b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const uchar *pin = (const uchar *)cp->mAlloc->mHal.drvState.lod[0].mallocPtr; 4223b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const size_t stride = cp->mAlloc->mHal.drvState.lod[0].stride; 4233b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 4243b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t y1 = rsMin((int32_t)p->y + 1, (int32_t)(p->dimY-1)); 4253b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t y2 = rsMax((int32_t)p->y - 1, 0); 4263b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const float *py0 = (const float *)(pin + stride * y2); 4273b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const float *py1 = (const float *)(pin + stride * p->y); 4283b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams const float *py2 = (const float *)(pin + stride * y1); 4293b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 4303b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams float *out = (float *)p->out; 4313b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t x1 = xstart; 4323b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams uint32_t x2 = xend; 4333b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if(x1 == 0) { 4343b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ConvolveOneF1(p, 0, out, py0, py1, py2, cp->mFp); 4353b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams x1 ++; 4363b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams out++; 4373b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 4383b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 4393b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if(x2 > x1) { 4403b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams#if 0//defined(ARCH_ARM_HAVE_NEON) 4413b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams int32_t len = (x2 - x1 - 1) >> 1; 4423b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if(len > 0) { 4433b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams rsdIntrinsicConvolve3x3_K(out, &py0[x1-1], &py1[x1-1], &py2[x1-1], cp->mIp, len); 4443b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams x1 += len << 1; 4453b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams out += len << 1; 4463b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 4473b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams#endif 4483b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams 4493b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams while(x1 != x2) { 4503b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams ConvolveOneF1(p, x1, out, py0, py1, py2, cp->mFp); 451e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams out++; 452e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams x1++; 453e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams } 454e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams } 455e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams} 456e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 457709a0978ae141198018ca9769f8d96292a8928e6Jason SamsRsdCpuScriptIntrinsicConvolve3x3::RsdCpuScriptIntrinsicConvolve3x3( 458c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) 459c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_3x3) { 460e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 4613b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams if (e->getType() == RS_TYPE_FLOAT_32) { 4623b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams switch(e->getVectorSize()) { 4633b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams case 1: 4643b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams mRootPtr = &kernelF1; 4653b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams break; 4663b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams case 2: 4673b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams mRootPtr = &kernelF2; 4683b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams break; 4693b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams case 3: 4703b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams case 4: 4713b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams mRootPtr = &kernelF4; 4723b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams break; 4733b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 4743b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } else { 4753b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams switch(e->getVectorSize()) { 4763b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams case 1: 4773b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams mRootPtr = &kernelU1; 4783b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams break; 4793b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams case 2: 4803b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams mRootPtr = &kernelU2; 4813b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams break; 4823b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams case 3: 4833b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams case 4: 4843b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams mRootPtr = &kernelU4; 4853b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams break; 4863b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 4873b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams } 488e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams for(int ct=0; ct < 9; ct++) { 489c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams mFp[ct] = 1.f / 9.f; 4903b35d775a777c36a178ce3fc97ff1e169aab3f1eJason Sams mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f); 491e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams } 492709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 493709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 494709a0978ae141198018ca9769f8d96292a8928e6Jason SamsRsdCpuScriptIntrinsicConvolve3x3::~RsdCpuScriptIntrinsicConvolve3x3() { 495709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 496709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 497709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve3x3::populateScript(Script *s) { 498709a0978ae141198018ca9769f8d96292a8928e6Jason Sams s->mHal.info.exportedVariableCount = 2; 499709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 500709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 501709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve3x3::invokeFreeChildren() { 502c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams mAlloc.clear(); 503709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 504709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 505709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 506c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason SamsRsdCpuScriptImpl * rsdIntrinsic_Convolve3x3(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) { 507709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 508c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams return new RsdCpuScriptIntrinsicConvolve3x3(ctx, s, e); 509e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams} 510e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 511e1e08b4c9cc80c51224fdaf3aeab0804daf073e6Jason Sams 512