1/*
2 * Copyright (C) 2013 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#include "rsCpuIntrinsic.h"
18#include "rsCpuIntrinsicInlines.h"
19
20using namespace android;
21using namespace android::renderscript;
22
23namespace android {
24namespace renderscript {
25
26
27class RsdCpuScriptIntrinsicHistogram : public RsdCpuScriptIntrinsic {
28public:
29    void populateScript(Script *) override;
30    void invokeFreeChildren() override;
31
32    void setGlobalVar(uint32_t slot, const void *data, size_t dataLength) override;
33    void setGlobalObj(uint32_t slot, ObjectBase *data) override;
34
35    ~RsdCpuScriptIntrinsicHistogram() override;
36    RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e);
37
38protected:
39    void preLaunch(uint32_t slot, const Allocation ** ains, uint32_t inLen,
40                   Allocation * aout, const void * usr,
41                   uint32_t usrLen, const RsScriptCall *sc);
42    void postLaunch(uint32_t slot, const Allocation ** ains, uint32_t inLen,
43                    Allocation * aout, const void * usr,
44                    uint32_t usrLen, const RsScriptCall *sc);
45
46
47    float mDot[4];
48    int mDotI[4];
49    int *mSums;
50    ObjectBaseRef<Allocation> mAllocOut;
51
52    static void kernelP1U4(const RsExpandKernelDriverInfo *info,
53                           uint32_t xstart, uint32_t xend,
54                           uint32_t outstep);
55    static void kernelP1U3(const RsExpandKernelDriverInfo *info,
56                           uint32_t xstart, uint32_t xend,
57                           uint32_t outstep);
58    static void kernelP1U2(const RsExpandKernelDriverInfo *info,
59                           uint32_t xstart, uint32_t xend,
60                           uint32_t outstep);
61    static void kernelP1U1(const RsExpandKernelDriverInfo *info,
62                           uint32_t xstart, uint32_t xend,
63                           uint32_t outstep);
64
65    static void kernelP1L4(const RsExpandKernelDriverInfo *info,
66                           uint32_t xstart, uint32_t xend,
67                           uint32_t outstep);
68    static void kernelP1L3(const RsExpandKernelDriverInfo *info,
69                           uint32_t xstart, uint32_t xend,
70                           uint32_t outstep);
71    static void kernelP1L2(const RsExpandKernelDriverInfo *info,
72                           uint32_t xstart, uint32_t xend,
73                           uint32_t outstep);
74    static void kernelP1L1(const RsExpandKernelDriverInfo *info,
75                           uint32_t xstart, uint32_t xend,
76                           uint32_t outstep);
77
78};
79
80}
81}
82
83void RsdCpuScriptIntrinsicHistogram::setGlobalObj(uint32_t slot, ObjectBase *data) {
84    rsAssert(slot == 1);
85    mAllocOut.set(static_cast<Allocation *>(data));
86}
87
88void RsdCpuScriptIntrinsicHistogram::setGlobalVar(uint32_t slot, const void *data, size_t dataLength) {
89    rsAssert(slot == 0);
90    rsAssert(dataLength == 16);
91    memcpy(mDot, data, 16);
92    mDotI[0] = (int)((mDot[0] * 256.f) + 0.5f);
93    mDotI[1] = (int)((mDot[1] * 256.f) + 0.5f);
94    mDotI[2] = (int)((mDot[2] * 256.f) + 0.5f);
95    mDotI[3] = (int)((mDot[3] * 256.f) + 0.5f);
96}
97
98
99
100void
101RsdCpuScriptIntrinsicHistogram::preLaunch(uint32_t slot,
102                                          const Allocation ** ains,
103                                          uint32_t inLen, Allocation * aout,
104                                          const void * usr, uint32_t usrLen,
105                                          const RsScriptCall *sc) {
106
107    const uint32_t threads = mCtx->getThreadCount();
108    uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
109
110    switch (slot) {
111    case 0:
112        switch(vSize) {
113        case 1:
114            mRootPtr = &kernelP1U1;
115            break;
116        case 2:
117            mRootPtr = &kernelP1U2;
118            break;
119        case 3:
120            mRootPtr = &kernelP1U3;
121            vSize = 4;
122            break;
123        case 4:
124            mRootPtr = &kernelP1U4;
125            break;
126        }
127        break;
128    case 1:
129        switch(ains[0]->getType()->getElement()->getVectorSize()) {
130        case 1:
131            mRootPtr = &kernelP1L1;
132            break;
133        case 2:
134            mRootPtr = &kernelP1L2;
135            break;
136        case 3:
137            mRootPtr = &kernelP1L3;
138            break;
139        case 4:
140            mRootPtr = &kernelP1L4;
141            break;
142        }
143        break;
144    }
145    memset(mSums, 0, 256 * sizeof(int32_t) * threads * vSize);
146}
147
148void
149RsdCpuScriptIntrinsicHistogram::postLaunch(uint32_t slot,
150                                           const Allocation ** ains,
151                                           uint32_t inLen,  Allocation * aout,
152                                           const void * usr, uint32_t usrLen,
153                                           const RsScriptCall *sc) {
154
155    unsigned int *o = (unsigned int *)mAllocOut->mHal.drvState.lod[0].mallocPtr;
156    uint32_t threads = mCtx->getThreadCount();
157    uint32_t vSize = mAllocOut->getType()->getElement()->getVectorSize();
158
159    if (vSize == 3) vSize = 4;
160
161    for (uint32_t ct=0; ct < (256 * vSize); ct++) {
162        o[ct] = mSums[ct];
163        for (uint32_t t=1; t < threads; t++) {
164            o[ct] += mSums[ct + (256 * vSize * t)];
165        }
166    }
167}
168
169void RsdCpuScriptIntrinsicHistogram::kernelP1U4(const RsExpandKernelDriverInfo *info,
170                                                uint32_t xstart, uint32_t xend,
171                                                uint32_t outstep) {
172
173    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
174    uchar *in = (uchar *)info->inPtr[0];
175    int * sums = &cp->mSums[256 * 4 * info->lid];
176
177    for (uint32_t x = xstart; x < xend; x++) {
178        sums[(in[0] << 2)    ] ++;
179        sums[(in[1] << 2) + 1] ++;
180        sums[(in[2] << 2) + 2] ++;
181        sums[(in[3] << 2) + 3] ++;
182        in += info->inStride[0];
183    }
184}
185
186void RsdCpuScriptIntrinsicHistogram::kernelP1U3(const RsExpandKernelDriverInfo *info,
187                                                uint32_t xstart, uint32_t xend,
188                                                uint32_t outstep) {
189
190    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
191    uchar *in = (uchar *)info->inPtr[0];
192    int * sums = &cp->mSums[256 * 4 * info->lid];
193
194    for (uint32_t x = xstart; x < xend; x++) {
195        sums[(in[0] << 2)    ] ++;
196        sums[(in[1] << 2) + 1] ++;
197        sums[(in[2] << 2) + 2] ++;
198        in += info->inStride[0];
199    }
200}
201
202void RsdCpuScriptIntrinsicHistogram::kernelP1U2(const RsExpandKernelDriverInfo *info,
203                                                uint32_t xstart, uint32_t xend,
204                                                uint32_t outstep) {
205
206    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
207    uchar *in = (uchar *)info->inPtr[0];
208    int * sums = &cp->mSums[256 * 2 * info->lid];
209
210    for (uint32_t x = xstart; x < xend; x++) {
211        sums[(in[0] << 1)    ] ++;
212        sums[(in[1] << 1) + 1] ++;
213        in += info->inStride[0];
214    }
215}
216
217void RsdCpuScriptIntrinsicHistogram::kernelP1L4(const RsExpandKernelDriverInfo *info,
218                                                uint32_t xstart, uint32_t xend,
219                                                uint32_t outstep) {
220
221    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
222    uchar *in = (uchar *)info->inPtr[0];
223    int * sums = &cp->mSums[256 * info->lid];
224
225    for (uint32_t x = xstart; x < xend; x++) {
226        int t = (cp->mDotI[0] * in[0]) +
227                (cp->mDotI[1] * in[1]) +
228                (cp->mDotI[2] * in[2]) +
229                (cp->mDotI[3] * in[3]);
230        sums[(t + 0x7f) >> 8] ++;
231        in += info->inStride[0];
232    }
233}
234
235void RsdCpuScriptIntrinsicHistogram::kernelP1L3(const RsExpandKernelDriverInfo *info,
236                                                uint32_t xstart, uint32_t xend,
237                                                uint32_t outstep) {
238
239    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
240    uchar *in = (uchar *)info->inPtr[0];
241    int * sums = &cp->mSums[256 * info->lid];
242
243    for (uint32_t x = xstart; x < xend; x++) {
244        int t = (cp->mDotI[0] * in[0]) +
245                (cp->mDotI[1] * in[1]) +
246                (cp->mDotI[2] * in[2]);
247        sums[(t + 0x7f) >> 8] ++;
248        in += info->inStride[0];
249    }
250}
251
252void RsdCpuScriptIntrinsicHistogram::kernelP1L2(const RsExpandKernelDriverInfo *info,
253                                                uint32_t xstart, uint32_t xend,
254                                                uint32_t outstep) {
255
256    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
257    uchar *in = (uchar *)info->inPtr[0];
258    int * sums = &cp->mSums[256 * info->lid];
259
260    for (uint32_t x = xstart; x < xend; x++) {
261        int t = (cp->mDotI[0] * in[0]) +
262                (cp->mDotI[1] * in[1]);
263        sums[(t + 0x7f) >> 8] ++;
264        in += info->inStride[0];
265    }
266}
267
268void RsdCpuScriptIntrinsicHistogram::kernelP1L1(const RsExpandKernelDriverInfo *info,
269                                                uint32_t xstart, uint32_t xend,
270                                                uint32_t outstep) {
271
272    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
273    uchar *in = (uchar *)info->inPtr[0];
274    int * sums = &cp->mSums[256 * info->lid];
275
276    for (uint32_t x = xstart; x < xend; x++) {
277        int t = (cp->mDotI[0] * in[0]);
278        sums[(t + 0x7f) >> 8] ++;
279        in += info->inStride[0];
280    }
281}
282
283void RsdCpuScriptIntrinsicHistogram::kernelP1U1(const RsExpandKernelDriverInfo *info,
284                                                uint32_t xstart, uint32_t xend,
285                                                uint32_t outstep) {
286
287    RsdCpuScriptIntrinsicHistogram *cp = (RsdCpuScriptIntrinsicHistogram *)info->usr;
288    uchar *in = (uchar *)info->inPtr[0];
289    int * sums = &cp->mSums[256 * info->lid];
290
291    for (uint32_t x = xstart; x < xend; x++) {
292        sums[in[0]] ++;
293        in += info->inStride[0];
294    }
295}
296
297
298RsdCpuScriptIntrinsicHistogram::RsdCpuScriptIntrinsicHistogram(RsdCpuReferenceImpl *ctx,
299                                                     const Script *s, const Element *e)
300            : RsdCpuScriptIntrinsic(ctx, s, e, RS_SCRIPT_INTRINSIC_ID_HISTOGRAM) {
301
302    mRootPtr = nullptr;
303    mSums = new int[256 * 4 * mCtx->getThreadCount()];
304    mDot[0] = 0.299f;
305    mDot[1] = 0.587f;
306    mDot[2] = 0.114f;
307    mDot[3] = 0;
308    mDotI[0] = (int)((mDot[0] * 256.f) + 0.5f);
309    mDotI[1] = (int)((mDot[1] * 256.f) + 0.5f);
310    mDotI[2] = (int)((mDot[2] * 256.f) + 0.5f);
311    mDotI[3] = (int)((mDot[3] * 256.f) + 0.5f);
312}
313
314RsdCpuScriptIntrinsicHistogram::~RsdCpuScriptIntrinsicHistogram() {
315    if (mSums) {
316        delete []mSums;
317    }
318}
319
320void RsdCpuScriptIntrinsicHistogram::populateScript(Script *s) {
321    s->mHal.info.exportedVariableCount = 2;
322}
323
324void RsdCpuScriptIntrinsicHistogram::invokeFreeChildren() {
325}
326
327
328RsdCpuScriptImpl * rsdIntrinsic_Histogram(RsdCpuReferenceImpl *ctx, const Script *s, const Element *e) {
329
330    return new RsdCpuScriptIntrinsicHistogram(ctx, s, e);
331}
332