1d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams/* 2d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams * Copyright (C) 2012 The Android Open Source Project 3d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams * 4d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams * Licensed under the Apache License, Version 2.0 (the "License"); 5d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams * you may not use this file except in compliance with the License. 6d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams * You may obtain a copy of the License at 7d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams * 8d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams * http://www.apache.org/licenses/LICENSE-2.0 9d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams * 10d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams * Unless required by applicable law or agreed to in writing, software 11d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams * distributed under the License is distributed on an "AS IS" BASIS, 12d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams * See the License for the specific language governing permissions and 14d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams * limitations under the License. 15d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams */ 16d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 17d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 18709a0978ae141198018ca9769f8d96292a8928e6Jason Sams#include "rsCpuIntrinsic.h" 19709a0978ae141198018ca9769f8d96292a8928e6Jason Sams#include "rsCpuIntrinsicInlines.h" 20d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 21d85e283087ecd00760a0d8d0c9d8482cda845efcJason Samsusing namespace android; 22d85e283087ecd00760a0d8d0c9d8482cda845efcJason Samsusing namespace android::renderscript; 23d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 24709a0978ae141198018ca9769f8d96292a8928e6Jason Samsnamespace android { 25709a0978ae141198018ca9769f8d96292a8928e6Jason Samsnamespace renderscript { 26709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 27709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 28709a0978ae141198018ca9769f8d96292a8928e6Jason Samsclass RsdCpuScriptIntrinsicConvolve5x5 : public RsdCpuScriptIntrinsic { 29709a0978ae141198018ca9769f8d96292a8928e6Jason Samspublic: 30709a0978ae141198018ca9769f8d96292a8928e6Jason Sams virtual void populateScript(Script *); 31709a0978ae141198018ca9769f8d96292a8928e6Jason Sams virtual void invokeFreeChildren(); 32709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 33709a0978ae141198018ca9769f8d96292a8928e6Jason Sams virtual void setGlobalVar(uint32_t slot, const void *data, size_t dataLength); 34709a0978ae141198018ca9769f8d96292a8928e6Jason Sams virtual void setGlobalObj(uint32_t slot, ObjectBase *data); 35709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 36709a0978ae141198018ca9769f8d96292a8928e6Jason Sams virtual ~RsdCpuScriptIntrinsicConvolve5x5(); 37c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams RsdCpuScriptIntrinsicConvolve5x5(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e); 38709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 39709a0978ae141198018ca9769f8d96292a8928e6Jason Samsprotected: 4034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams float mFp[28]; 4134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams short mIp[28]; 42d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams ObjectBaseRef<Allocation> alloc; 43709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 44709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 4534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams static void kernelU1(const RsForEachStubParamStruct *p, 4634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t xstart, uint32_t xend, 4734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t instep, uint32_t outstep); 4834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams static void kernelU2(const RsForEachStubParamStruct *p, 4934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t xstart, uint32_t xend, 5034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t instep, uint32_t outstep); 5134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams static void kernelU4(const RsForEachStubParamStruct *p, 5234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t xstart, uint32_t xend, 5334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t instep, uint32_t outstep); 5434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams static void kernelF1(const RsForEachStubParamStruct *p, 5534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t xstart, uint32_t xend, 5634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t instep, uint32_t outstep); 5734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams static void kernelF2(const RsForEachStubParamStruct *p, 5834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t xstart, uint32_t xend, 5934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t instep, uint32_t outstep); 6034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams static void kernelF4(const RsForEachStubParamStruct *p, 6134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t xstart, uint32_t xend, 6234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t instep, uint32_t outstep); 63709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 64709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 65d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams}; 66d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 67709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 68d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams} 69d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 70709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve5x5::setGlobalObj(uint32_t slot, ObjectBase *data) { 71709a0978ae141198018ca9769f8d96292a8928e6Jason Sams rsAssert(slot == 1); 72709a0978ae141198018ca9769f8d96292a8928e6Jason Sams alloc.set(static_cast<Allocation *>(data)); 73709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 74d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 75709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve5x5::setGlobalVar(uint32_t slot, 76709a0978ae141198018ca9769f8d96292a8928e6Jason Sams const void *data, size_t dataLength) { 77d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams rsAssert(slot == 0); 7834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams memcpy (&mFp, data, dataLength); 79d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams for(int ct=0; ct < 25; ct++) { 8034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams if (mFp[ct] >= 0) { 8134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams mIp[ct] = (short)(mFp[ct] * 256.f + 0.5f); 8234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } else { 8334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams mIp[ct] = (short)(mFp[ct] * 256.f - 0.5f); 8434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 85d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams } 86d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams} 87d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 88d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 8934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Samsstatic void OneU4(const RsForEachStubParamStruct *p, uint32_t x, uchar4 *out, 9034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const uchar4 *py0, const uchar4 *py1, const uchar4 *py2, const uchar4 *py3, const uchar4 *py4, 9134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float* coeff) { 92d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 93d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams uint32_t x0 = rsMax((int32_t)x-2, 0); 94d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams uint32_t x1 = rsMax((int32_t)x-1, 0); 95d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams uint32_t x2 = x; 96d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1)); 97d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1)); 98d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 99d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams float4 px = convert_float4(py0[x0]) * coeff[0] + 100d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py0[x1]) * coeff[1] + 101d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py0[x2]) * coeff[2] + 102d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py0[x3]) * coeff[3] + 103d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py0[x4]) * coeff[4] + 104d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 105d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py1[x0]) * coeff[5] + 106d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py1[x1]) * coeff[6] + 107d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py1[x2]) * coeff[7] + 108d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py1[x3]) * coeff[8] + 109d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py1[x4]) * coeff[9] + 110d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 111d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py2[x0]) * coeff[10] + 112d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py2[x1]) * coeff[11] + 113d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py2[x2]) * coeff[12] + 114d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py2[x3]) * coeff[13] + 115d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py2[x4]) * coeff[14] + 116d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 117d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py3[x0]) * coeff[15] + 118d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py3[x1]) * coeff[16] + 119d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py3[x2]) * coeff[17] + 120d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py3[x3]) * coeff[18] + 121d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py3[x4]) * coeff[19] + 122d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 123d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py4[x0]) * coeff[20] + 124d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py4[x1]) * coeff[21] + 125d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py4[x2]) * coeff[22] + 126d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py4[x3]) * coeff[23] + 127d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams convert_float4(py4[x4]) * coeff[24]; 1284283f579c424f07bc07c7f075398053eed3f8281Miao Wang px = clamp(px + 0.5f, 0.f, 255.f); 12934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams *out = convert_uchar4(px); 13034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams} 131d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 13234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Samsstatic void OneU2(const RsForEachStubParamStruct *p, uint32_t x, uchar2 *out, 13334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const uchar2 *py0, const uchar2 *py1, const uchar2 *py2, const uchar2 *py3, const uchar2 *py4, 13434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float* coeff) { 13534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 13634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x0 = rsMax((int32_t)x-2, 0); 13734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x1 = rsMax((int32_t)x-1, 0); 13834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x2 = x; 13934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1)); 14034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1)); 14134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 14234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams float2 px = convert_float2(py0[x0]) * coeff[0] + 14334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py0[x1]) * coeff[1] + 14434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py0[x2]) * coeff[2] + 14534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py0[x3]) * coeff[3] + 14634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py0[x4]) * coeff[4] + 14734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 14834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py1[x0]) * coeff[5] + 14934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py1[x1]) * coeff[6] + 15034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py1[x2]) * coeff[7] + 15134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py1[x3]) * coeff[8] + 15234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py1[x4]) * coeff[9] + 15334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 15434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py2[x0]) * coeff[10] + 15534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py2[x1]) * coeff[11] + 15634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py2[x2]) * coeff[12] + 15734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py2[x3]) * coeff[13] + 15834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py2[x4]) * coeff[14] + 15934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 16034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py3[x0]) * coeff[15] + 16134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py3[x1]) * coeff[16] + 16234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py3[x2]) * coeff[17] + 16334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py3[x3]) * coeff[18] + 16434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py3[x4]) * coeff[19] + 16534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 16634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py4[x0]) * coeff[20] + 16734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py4[x1]) * coeff[21] + 16834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py4[x2]) * coeff[22] + 16934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py4[x3]) * coeff[23] + 17034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams convert_float2(py4[x4]) * coeff[24]; 1714283f579c424f07bc07c7f075398053eed3f8281Miao Wang px = clamp(px + 0.5f, 0.f, 255.f); 17234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams *out = convert_uchar2(px); 17334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams} 17434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 17534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Samsstatic void OneU1(const RsForEachStubParamStruct *p, uint32_t x, uchar *out, 17634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const uchar *py0, const uchar *py1, const uchar *py2, const uchar *py3, const uchar *py4, 17734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float* coeff) { 17834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 17934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x0 = rsMax((int32_t)x-2, 0); 18034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x1 = rsMax((int32_t)x-1, 0); 18134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x2 = x; 18234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1)); 18334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1)); 18434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 18534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams float px = (float)(py0[x0]) * coeff[0] + 18634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py0[x1]) * coeff[1] + 18734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py0[x2]) * coeff[2] + 18834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py0[x3]) * coeff[3] + 18934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py0[x4]) * coeff[4] + 19034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 19134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py1[x0]) * coeff[5] + 19234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py1[x1]) * coeff[6] + 19334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py1[x2]) * coeff[7] + 19434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py1[x3]) * coeff[8] + 19534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py1[x4]) * coeff[9] + 19634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 19734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py2[x0]) * coeff[10] + 19834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py2[x1]) * coeff[11] + 19934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py2[x2]) * coeff[12] + 20034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py2[x3]) * coeff[13] + 20134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py2[x4]) * coeff[14] + 20234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 20334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py3[x0]) * coeff[15] + 20434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py3[x1]) * coeff[16] + 20534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py3[x2]) * coeff[17] + 20634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py3[x3]) * coeff[18] + 20734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py3[x4]) * coeff[19] + 20834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 20934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py4[x0]) * coeff[20] + 21034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py4[x1]) * coeff[21] + 21134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py4[x2]) * coeff[22] + 21234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py4[x3]) * coeff[23] + 21334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams (float)(py4[x4]) * coeff[24]; 2144283f579c424f07bc07c7f075398053eed3f8281Miao Wang px = clamp(px + 0.5f, 0.f, 255.f); 21534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams *out = px; 21634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams} 21734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 21834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Samsstatic void OneF4(const RsForEachStubParamStruct *p, uint32_t x, float4 *out, 21934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float4 *py0, const float4 *py1, const float4 *py2, const float4 *py3, const float4 *py4, 22034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float* coeff) { 22134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 22234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x0 = rsMax((int32_t)x-2, 0); 22334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x1 = rsMax((int32_t)x-1, 0); 22434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x2 = x; 22534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1)); 22634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1)); 22734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 22834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams float4 px = py0[x0] * coeff[0] + 22934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py0[x1] * coeff[1] + 23034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py0[x2] * coeff[2] + 23134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py0[x3] * coeff[3] + 23234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py0[x4] * coeff[4] + 23334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 23434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py1[x0] * coeff[5] + 23534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py1[x1] * coeff[6] + 23634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py1[x2] * coeff[7] + 23734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py1[x3] * coeff[8] + 23834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py1[x4] * coeff[9] + 23934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 24034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py2[x0] * coeff[10] + 24134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py2[x1] * coeff[11] + 24234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py2[x2] * coeff[12] + 24334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py2[x3] * coeff[13] + 24434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py2[x4] * coeff[14] + 24534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 24634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py3[x0] * coeff[15] + 24734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py3[x1] * coeff[16] + 24834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py3[x2] * coeff[17] + 24934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py3[x3] * coeff[18] + 25034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py3[x4] * coeff[19] + 25134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 25234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py4[x0] * coeff[20] + 25334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py4[x1] * coeff[21] + 25434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py4[x2] * coeff[22] + 25534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py4[x3] * coeff[23] + 25634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py4[x4] * coeff[24]; 25734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams *out = px; 25834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams} 25934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 26034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Samsstatic void OneF2(const RsForEachStubParamStruct *p, uint32_t x, float2 *out, 26134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float2 *py0, const float2 *py1, const float2 *py2, const float2 *py3, const float2 *py4, 26234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float* coeff) { 26334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 26434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x0 = rsMax((int32_t)x-2, 0); 26534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x1 = rsMax((int32_t)x-1, 0); 26634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x2 = x; 26734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1)); 26834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1)); 26934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 27034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams float2 px = py0[x0] * coeff[0] + 27134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py0[x1] * coeff[1] + 27234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py0[x2] * coeff[2] + 27334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py0[x3] * coeff[3] + 27434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py0[x4] * coeff[4] + 27534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 27634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py1[x0] * coeff[5] + 27734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py1[x1] * coeff[6] + 27834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py1[x2] * coeff[7] + 27934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py1[x3] * coeff[8] + 28034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py1[x4] * coeff[9] + 28134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 28234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py2[x0] * coeff[10] + 28334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py2[x1] * coeff[11] + 28434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py2[x2] * coeff[12] + 28534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py2[x3] * coeff[13] + 28634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py2[x4] * coeff[14] + 28734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 28834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py3[x0] * coeff[15] + 28934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py3[x1] * coeff[16] + 29034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py3[x2] * coeff[17] + 29134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py3[x3] * coeff[18] + 29234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py3[x4] * coeff[19] + 29334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 29434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py4[x0] * coeff[20] + 29534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py4[x1] * coeff[21] + 29634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py4[x2] * coeff[22] + 29734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py4[x3] * coeff[23] + 29834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py4[x4] * coeff[24]; 29934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams *out = px; 300d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams} 301d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 30234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Samsstatic void OneF1(const RsForEachStubParamStruct *p, uint32_t x, float *out, 30334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float *py0, const float *py1, const float *py2, const float *py3, const float *py4, 30434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float* coeff) { 30534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 30634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x0 = rsMax((int32_t)x-2, 0); 30734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x1 = rsMax((int32_t)x-1, 0); 30834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x2 = x; 30934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x3 = rsMin((int32_t)x+1, (int32_t)(p->dimX-1)); 31034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x4 = rsMin((int32_t)x+2, (int32_t)(p->dimX-1)); 31134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 31234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams float px = py0[x0] * coeff[0] + 31334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py0[x1] * coeff[1] + 31434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py0[x2] * coeff[2] + 31534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py0[x3] * coeff[3] + 31634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py0[x4] * coeff[4] + 31734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 31834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py1[x0] * coeff[5] + 31934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py1[x1] * coeff[6] + 32034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py1[x2] * coeff[7] + 32134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py1[x3] * coeff[8] + 32234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py1[x4] * coeff[9] + 32334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 32434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py2[x0] * coeff[10] + 32534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py2[x1] * coeff[11] + 32634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py2[x2] * coeff[12] + 32734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py2[x3] * coeff[13] + 32834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py2[x4] * coeff[14] + 32934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 33034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py3[x0] * coeff[15] + 33134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py3[x1] * coeff[16] + 33234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py3[x2] * coeff[17] + 33334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py3[x3] * coeff[18] + 33434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py3[x4] * coeff[19] + 33534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 33634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py4[x0] * coeff[20] + 33734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py4[x1] * coeff[21] + 33834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py4[x2] * coeff[22] + 33934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py4[x3] * coeff[23] + 34034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams py4[x4] * coeff[24]; 34134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams *out = px; 34234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams} 34334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 34434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 345a1b08e2cacf3891fcd6895422c6124887b75975eJason Samsextern "C" void rsdIntrinsicConvolve5x5_K(void *dst, const void *y0, const void *y1, 346a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams const void *y2, const void *y3, const void *y4, 347a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams const short *coef, uint32_t count); 348a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams 34934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve5x5::kernelU4(const RsForEachStubParamStruct *p, 35034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t xstart, uint32_t xend, 35134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t instep, uint32_t outstep) { 352709a0978ae141198018ca9769f8d96292a8928e6Jason Sams RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr; 353b801b949e286275b5d19a33135235ba68d3a19a9Jason Sams if (!cp->alloc.get()) { 354b801b949e286275b5d19a33135235ba68d3a19a9Jason Sams ALOGE("Convolve5x5 executed without input, skipping"); 355b801b949e286275b5d19a33135235ba68d3a19a9Jason Sams return; 356b801b949e286275b5d19a33135235ba68d3a19a9Jason Sams } 357709a0978ae141198018ca9769f8d96292a8928e6Jason Sams const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr; 358709a0978ae141198018ca9769f8d96292a8928e6Jason Sams const size_t stride = cp->alloc->mHal.drvState.lod[0].stride; 359d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 360d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams uint32_t y0 = rsMax((int32_t)p->y-2, 0); 361d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams uint32_t y1 = rsMax((int32_t)p->y-1, 0); 362d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams uint32_t y2 = p->y; 363d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1)); 364d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1)); 365d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 366709a0978ae141198018ca9769f8d96292a8928e6Jason Sams const uchar4 *py0 = (const uchar4 *)(pin + stride * y0); 367709a0978ae141198018ca9769f8d96292a8928e6Jason Sams const uchar4 *py1 = (const uchar4 *)(pin + stride * y1); 368709a0978ae141198018ca9769f8d96292a8928e6Jason Sams const uchar4 *py2 = (const uchar4 *)(pin + stride * y2); 369709a0978ae141198018ca9769f8d96292a8928e6Jason Sams const uchar4 *py3 = (const uchar4 *)(pin + stride * y3); 370709a0978ae141198018ca9769f8d96292a8928e6Jason Sams const uchar4 *py4 = (const uchar4 *)(pin + stride * y4); 371d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 372d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams uchar4 *out = (uchar4 *)p->out; 373d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams uint32_t x1 = xstart; 374d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams uint32_t x2 = xend; 375d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 376a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams while((x1 < x2) && (x1 < 2)) { 37734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams OneU4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 378a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams out++; 379a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams x1++; 380a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams } 3817b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#if defined(ARCH_X86_HAVE_SSSE3) 3827b7060c61e4182b29186849c5a857ea5f0898e56Rose, James // for x86 SIMD, require minimum of 7 elements (4 for SIMD, 3837b7060c61e4182b29186849c5a857ea5f0898e56Rose, James // 3 for end boundary where x may hit the end boundary) 3847b7060c61e4182b29186849c5a857ea5f0898e56Rose, James if (gArchUseSIMD &&((x1 + 6) < x2)) { 3857b7060c61e4182b29186849c5a857ea5f0898e56Rose, James // subtract 3 for end boundary 3867b7060c61e4182b29186849c5a857ea5f0898e56Rose, James uint32_t len = (x2 - x1 - 3) >> 2; 3877688714916905f29362071ce2eb9e296ca469838Yong Chen rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len); 3887b7060c61e4182b29186849c5a857ea5f0898e56Rose, James out += len << 2; 3897b7060c61e4182b29186849c5a857ea5f0898e56Rose, James x1 += len << 2; 3907b7060c61e4182b29186849c5a857ea5f0898e56Rose, James } 3917b7060c61e4182b29186849c5a857ea5f0898e56Rose, James#endif 392a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams 393074424a4ac5b093331df2c92e7a5bcbfff136b71Jason Sams#if defined(ARCH_ARM_USE_INTRINSICS) 394f5ef8df639ba6363aa5d546e57ce872d04144cb6Jason Sams if(gArchUseSIMD && ((x1 + 3) < x2)) { 395a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams uint32_t len = (x2 - x1 - 3) >> 1; 396de52a834dbcb2a3196948e7b9f67d395493ea9a4Jason Sams rsdIntrinsicConvolve5x5_K(out, py0 + x1 - 2, py1 + x1 - 2, py2 + x1 - 2, py3 + x1 - 2, py4 + x1 - 2, cp->mIp, len); 39734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams out += len << 1; 39834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams x1 += len << 1; 39934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 40034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams#endif 40134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 40234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams while(x1 < x2) { 40334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams OneU4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 40434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams out++; 40534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams x1++; 40634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 40734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams} 40834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 40934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve5x5::kernelU2(const RsForEachStubParamStruct *p, 41034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t xstart, uint32_t xend, 41134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t instep, uint32_t outstep) { 41234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr; 41334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams if (!cp->alloc.get()) { 41434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams ALOGE("Convolve5x5 executed without input, skipping"); 41534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams return; 41634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 41734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr; 41834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const size_t stride = cp->alloc->mHal.drvState.lod[0].stride; 41934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 42034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y0 = rsMax((int32_t)p->y-2, 0); 42134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y1 = rsMax((int32_t)p->y-1, 0); 42234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y2 = p->y; 42334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1)); 42434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1)); 42534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 42634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const uchar2 *py0 = (const uchar2 *)(pin + stride * y0); 42734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const uchar2 *py1 = (const uchar2 *)(pin + stride * y1); 42834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const uchar2 *py2 = (const uchar2 *)(pin + stride * y2); 42934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const uchar2 *py3 = (const uchar2 *)(pin + stride * y3); 43034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const uchar2 *py4 = (const uchar2 *)(pin + stride * y4); 43134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 43234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uchar2 *out = (uchar2 *)p->out; 43334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x1 = xstart; 43434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x2 = xend; 43534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 43634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams while((x1 < x2) && (x1 < 2)) { 43734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams OneU2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 43834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams out++; 43934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams x1++; 44034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 44134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 44234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams#if 0//defined(ARCH_ARM_HAVE_NEON) 44334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams if((x1 + 3) < x2) { 44434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t len = (x2 - x1 - 3) >> 1; 445a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len); 446a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams out += len << 1; 447a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams x1 += len << 1; 448a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams } 449a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams#endif 450a1b08e2cacf3891fcd6895422c6124887b75975eJason Sams 451d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams while(x1 < x2) { 45234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams OneU2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 453d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams out++; 454d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams x1++; 455d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams } 456d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams} 457d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 45834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve5x5::kernelU1(const RsForEachStubParamStruct *p, 45934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t xstart, uint32_t xend, 46034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t instep, uint32_t outstep) { 46134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr; 46234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams if (!cp->alloc.get()) { 46334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams ALOGE("Convolve5x5 executed without input, skipping"); 46434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams return; 46534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 46634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr; 46734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const size_t stride = cp->alloc->mHal.drvState.lod[0].stride; 46834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 46934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y0 = rsMax((int32_t)p->y-2, 0); 47034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y1 = rsMax((int32_t)p->y-1, 0); 47134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y2 = p->y; 47234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1)); 47334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1)); 47434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 47534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const uchar *py0 = (const uchar *)(pin + stride * y0); 47634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const uchar *py1 = (const uchar *)(pin + stride * y1); 47734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const uchar *py2 = (const uchar *)(pin + stride * y2); 47834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const uchar *py3 = (const uchar *)(pin + stride * y3); 47934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const uchar *py4 = (const uchar *)(pin + stride * y4); 48034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 48134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uchar *out = (uchar *)p->out; 48234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x1 = xstart; 48334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x2 = xend; 48434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 48534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams while((x1 < x2) && (x1 < 2)) { 48634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams OneU1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 48734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams out++; 48834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams x1++; 48934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 49034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 49134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams#if 0//defined(ARCH_ARM_HAVE_NEON) 49234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams if((x1 + 3) < x2) { 49334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t len = (x2 - x1 - 3) >> 1; 49434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len); 49534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams out += len << 1; 49634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams x1 += len << 1; 49734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 49834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams#endif 49934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 50034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams while(x1 < x2) { 50134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams OneU1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 50234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams out++; 50334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams x1++; 50434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 50534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams} 50634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 50734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve5x5::kernelF4(const RsForEachStubParamStruct *p, 50834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t xstart, uint32_t xend, 50934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t instep, uint32_t outstep) { 51034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr; 51134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams if (!cp->alloc.get()) { 51234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams ALOGE("Convolve5x5 executed without input, skipping"); 51334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams return; 51434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 51534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr; 51634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const size_t stride = cp->alloc->mHal.drvState.lod[0].stride; 51734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 51834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y0 = rsMax((int32_t)p->y-2, 0); 51934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y1 = rsMax((int32_t)p->y-1, 0); 52034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y2 = p->y; 52134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1)); 52234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1)); 52334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 52434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float4 *py0 = (const float4 *)(pin + stride * y0); 52534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float4 *py1 = (const float4 *)(pin + stride * y1); 52634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float4 *py2 = (const float4 *)(pin + stride * y2); 52734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float4 *py3 = (const float4 *)(pin + stride * y3); 52834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float4 *py4 = (const float4 *)(pin + stride * y4); 52934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 53034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams float4 *out = (float4 *)p->out; 53134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x1 = xstart; 53234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x2 = xend; 53334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 53434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams while((x1 < x2) && (x1 < 2)) { 53534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams OneF4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 53634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams out++; 53734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams x1++; 53834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 53934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 54034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams#if 0//defined(ARCH_ARM_HAVE_NEON) 54134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams if((x1 + 3) < x2) { 54234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t len = (x2 - x1 - 3) >> 1; 54334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len); 54434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams out += len << 1; 54534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams x1 += len << 1; 54634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 54734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams#endif 54834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 54934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams while(x1 < x2) { 55034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams OneF4(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 55134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams out++; 55234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams x1++; 55334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 55434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams} 55534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 55634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve5x5::kernelF2(const RsForEachStubParamStruct *p, 55734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t xstart, uint32_t xend, 55834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t instep, uint32_t outstep) { 55934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr; 56034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams if (!cp->alloc.get()) { 56134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams ALOGE("Convolve5x5 executed without input, skipping"); 56234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams return; 56334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 56434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr; 56534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const size_t stride = cp->alloc->mHal.drvState.lod[0].stride; 56634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 56734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y0 = rsMax((int32_t)p->y-2, 0); 56834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y1 = rsMax((int32_t)p->y-1, 0); 56934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y2 = p->y; 57034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1)); 57134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1)); 57234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 57334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float2 *py0 = (const float2 *)(pin + stride * y0); 57434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float2 *py1 = (const float2 *)(pin + stride * y1); 57534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float2 *py2 = (const float2 *)(pin + stride * y2); 57634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float2 *py3 = (const float2 *)(pin + stride * y3); 57734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float2 *py4 = (const float2 *)(pin + stride * y4); 57834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 57934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams float2 *out = (float2 *)p->out; 58034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x1 = xstart; 58134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x2 = xend; 58234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 58334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams while((x1 < x2) && (x1 < 2)) { 58434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams OneF2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 58534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams out++; 58634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams x1++; 58734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 58834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 58934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams#if 0//defined(ARCH_ARM_HAVE_NEON) 59034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams if((x1 + 3) < x2) { 59134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t len = (x2 - x1 - 3) >> 1; 59234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len); 59334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams out += len << 1; 59434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams x1 += len << 1; 59534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 59634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams#endif 59734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 59834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams while(x1 < x2) { 59934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams OneF2(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 60034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams out++; 60134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams x1++; 60234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 60334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams} 60434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 60534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve5x5::kernelF1(const RsForEachStubParamStruct *p, 60634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t xstart, uint32_t xend, 60734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t instep, uint32_t outstep) { 60834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams RsdCpuScriptIntrinsicConvolve5x5 *cp = (RsdCpuScriptIntrinsicConvolve5x5 *)p->usr; 60934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams if (!cp->alloc.get()) { 61034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams ALOGE("Convolve5x5 executed without input, skipping"); 61134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams return; 61234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 61334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const uchar *pin = (const uchar *)cp->alloc->mHal.drvState.lod[0].mallocPtr; 61434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const size_t stride = cp->alloc->mHal.drvState.lod[0].stride; 61534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 61634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y0 = rsMax((int32_t)p->y-2, 0); 61734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y1 = rsMax((int32_t)p->y-1, 0); 61834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y2 = p->y; 61934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y3 = rsMin((int32_t)p->y+1, (int32_t)(p->dimY-1)); 62034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t y4 = rsMin((int32_t)p->y+2, (int32_t)(p->dimY-1)); 62134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 62234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float *py0 = (const float *)(pin + stride * y0); 62334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float *py1 = (const float *)(pin + stride * y1); 62434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float *py2 = (const float *)(pin + stride * y2); 62534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float *py3 = (const float *)(pin + stride * y3); 62634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams const float *py4 = (const float *)(pin + stride * y4); 62734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 62834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams float *out = (float *)p->out; 62934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x1 = xstart; 63034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t x2 = xend; 63134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 63234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams while((x1 < x2) && (x1 < 2)) { 63334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams OneF1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 63434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams out++; 63534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams x1++; 63634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 63734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 63834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams#if 0//defined(ARCH_ARM_HAVE_NEON) 63934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams if((x1 + 3) < x2) { 64034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams uint32_t len = (x2 - x1 - 3) >> 1; 64134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams rsdIntrinsicConvolve5x5_K(out, py0, py1, py2, py3, py4, cp->ip, len); 64234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams out += len << 1; 64334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams x1 += len << 1; 64434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 64534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams#endif 64634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams 64734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams while(x1 < x2) { 64834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams OneF1(p, x1, out, py0, py1, py2, py3, py4, cp->mFp); 64934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams out++; 65034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams x1++; 65134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 65234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams} 653d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 654709a0978ae141198018ca9769f8d96292a8928e6Jason SamsRsdCpuScriptIntrinsicConvolve5x5::RsdCpuScriptIntrinsicConvolve5x5( 655c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) 656c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_CONVOLVE_5x5) { 657d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 65834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams if (e->getType() == RS_TYPE_FLOAT_32) { 65934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams switch(e->getVectorSize()) { 66034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams case 1: 66134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams mRootPtr = &kernelF1; 66234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams break; 66334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams case 2: 66434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams mRootPtr = &kernelF2; 66534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams break; 66634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams case 3: 66734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams case 4: 66834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams mRootPtr = &kernelF4; 66934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams break; 67034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 67134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } else { 67234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams switch(e->getVectorSize()) { 67334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams case 1: 67434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams mRootPtr = &kernelU1; 67534b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams break; 67634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams case 2: 67734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams mRootPtr = &kernelU2; 67834b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams break; 67934b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams case 3: 68034b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams case 4: 68134b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams mRootPtr = &kernelU4; 68234b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams break; 68334b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 68434b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams } 685ce0351debba8dadd1a7af2b3e926de6d787b49afJason Sams for(int ct=0; ct < 25; ct++) { 68634b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams mFp[ct] = 1.f / 25.f; 68734b0d3119567b992f0f876a2dffc0161bdcef3e6Jason Sams mIp[ct] = (short)(mFp[ct] * 256.f); 688d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams } 689d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams} 690d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 691709a0978ae141198018ca9769f8d96292a8928e6Jason SamsRsdCpuScriptIntrinsicConvolve5x5::~RsdCpuScriptIntrinsicConvolve5x5() { 692709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 693709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 694709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve5x5::populateScript(Script *s) { 695709a0978ae141198018ca9769f8d96292a8928e6Jason Sams s->mHal.info.exportedVariableCount = 2; 696709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 697709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 698709a0978ae141198018ca9769f8d96292a8928e6Jason Samsvoid RsdCpuScriptIntrinsicConvolve5x5::invokeFreeChildren() { 699709a0978ae141198018ca9769f8d96292a8928e6Jason Sams alloc.clear(); 700709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 701709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 702709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 703c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason SamsRsdCpuScriptImpl * rsdIntrinsic_Convolve5x5(RsdCpuReferenceImpl *ctx, 704c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams const Script *s, const Element *e) { 705709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 706c905efd76fdcc1b8846b229bf7d991d185a7b4b7Jason Sams return new RsdCpuScriptIntrinsicConvolve5x5(ctx, s, e); 707709a0978ae141198018ca9769f8d96292a8928e6Jason Sams} 708709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 709709a0978ae141198018ca9769f8d96292a8928e6Jason Sams 710d85e283087ecd00760a0d8d0c9d8482cda845efcJason Sams 711