1/*
2 * Copyright (C) 2012 Gabor Rapcsanyi (rgabor@inf.u-szeged.hu), University of Szeged
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 *    notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 *    notice, this list of conditions and the following disclaimer in the
11 *    documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF SZEGED ``AS IS'' AND ANY
14 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
16 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL UNIVERSITY OF SZEGED OR
17 * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
18 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
19 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
20 * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
21 * OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
22 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
23 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
24 */
25
26#ifndef WebGLImageConversionNEON_h
27#define WebGLImageConversionNEON_h
28
29#if HAVE(ARM_NEON_INTRINSICS)
30
31#include <arm_neon.h>
32
33namespace blink {
34
35namespace SIMD {
36
// Converts the leading part of one row of RGBA pixels with 16-bit components
// to RGBA8 by keeping, for every component, the second of its two bytes (the
// most significant byte in little-endian layout). Components are consumed 16
// at a time (4 pixels).
// On return |source| and |destination| have been advanced past the converted
// components and |pixelsPerRow| holds the number of pixels left unconverted.
ALWAYS_INLINE void unpackOneRowOfRGBA16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
{
    const unsigned totalComponents = pixelsPerRow * 4;
    const unsigned leftoverComponents = totalComponents % 16;
    const unsigned vectorizedComponents = totalComponents - leftoverComponents;

    const uint8_t* in = reinterpret_cast<const uint8_t*>(source);
    uint8_t* out = destination;
    for (unsigned remaining = vectorizedComponents; remaining; remaining -= 16) {
        // The de-interleaving load puts the even-indexed bytes in val[0] and
        // the odd-indexed bytes in val[1]; only the latter are stored.
        const uint8x16x2_t splitBytes = vld2q_u8(in);
        vst1q_u8(out, splitBytes.val[1]);
        in += 32;
        out += 16;
    }

    source += vectorizedComponents;
    destination = out;
    pixelsPerRow = leftoverComponents / 4;
}
53
// Converts the leading part of one row of RGB pixels with 16-bit components
// to RGBA8. Pixels are handled eight at a time (24 source components in, 32
// destination bytes out): every component is reduced to its upper 8 bits and
// an alpha of 0xFF is appended to each pixel.
// On return |source| and |destination| have been advanced past the converted
// pixels and |pixelsPerRow| holds the number of pixels left unconverted.
ALWAYS_INLINE void unpackOneRowOfRGB16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
{
    const unsigned componentsPerRow = pixelsPerRow * 3;
    const unsigned tailComponents = componentsPerRow % 24;
    const unsigned componentsSize = componentsPerRow - tailComponents;

    const uint8x8_t componentA = vdup_n_u8(0xFF);
    for (unsigned i = 0; i < componentsSize; i += 24) {
        const uint16x8x3_t RGB16 = vld3q_u16(source + i);

        // vshrn_n_u16(x, 8) shifts right and narrows in one operation. After a
        // shift by 8 the value always fits in a byte, so a separate saturating
        // narrow is unnecessary and the result is the same.
        uint8x8x4_t RGBA8;
        RGBA8.val[0] = vshrn_n_u16(RGB16.val[0], 8);
        RGBA8.val[1] = vshrn_n_u16(RGB16.val[1], 8);
        RGBA8.val[2] = vshrn_n_u16(RGB16.val[2], 8);
        RGBA8.val[3] = componentA;

        // The interleaving store writes 8 RGBA8 pixels (32 bytes).
        vst4_u8(destination, RGBA8);
        destination += 32;
    }

    source += componentsSize;
    pixelsPerRow = tailComponents / 3;
}
74
// Converts the leading part of one row of ARGB pixels with 16-bit components
// to RGBA8. Pixels are handled eight at a time (32 components): every
// component is reduced to its upper 8 bits and the channels are reordered
// from A,R,G,B to R,G,B,A.
// On return |source| and |destination| have been advanced past the converted
// components and |pixelsPerRow| holds the number of pixels left unconverted.
ALWAYS_INLINE void unpackOneRowOfARGB16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
{
    const unsigned componentsPerRow = pixelsPerRow * 4;
    const unsigned tailComponents = componentsPerRow % 32;
    const unsigned componentsSize = componentsPerRow - tailComponents;

    for (unsigned i = 0; i < componentsSize; i += 32) {
        // De-interleaving load: val[0] = A, val[1] = R, val[2] = G, val[3] = B.
        const uint16x8x4_t ARGB16 = vld4q_u16(source + i);

        // vshrn_n_u16(x, 8) shifts right and narrows in one operation. After a
        // shift by 8 the value always fits in a byte, so a separate saturating
        // narrow is unnecessary and the result is the same.
        uint8x8x4_t RGBA8;
        RGBA8.val[0] = vshrn_n_u16(ARGB16.val[1], 8); // R
        RGBA8.val[1] = vshrn_n_u16(ARGB16.val[2], 8); // G
        RGBA8.val[2] = vshrn_n_u16(ARGB16.val[3], 8); // B
        RGBA8.val[3] = vshrn_n_u16(ARGB16.val[0], 8); // A

        // One output byte per input component, so |i| indexes both buffers.
        vst4_u8(destination + i, RGBA8);
    }

    source += componentsSize;
    destination += componentsSize;
    pixelsPerRow = tailComponents / 4;
}
95
// Converts the leading part of one row of BGRA pixels with 16-bit components
// to RGBA8. Pixels are handled eight at a time (32 components): every
// component is reduced to its upper 8 bits and the channels are reordered
// from B,G,R,A to R,G,B,A.
// On return |source| and |destination| have been advanced past the converted
// components and |pixelsPerRow| holds the number of pixels left unconverted.
ALWAYS_INLINE void unpackOneRowOfBGRA16LittleToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
{
    const unsigned componentsPerRow = pixelsPerRow * 4;
    const unsigned tailComponents = componentsPerRow % 32;
    const unsigned componentsSize = componentsPerRow - tailComponents;

    for (unsigned i = 0; i < componentsSize; i += 32) {
        // De-interleaving load: val[0] = B, val[1] = G, val[2] = R, val[3] = A.
        const uint16x8x4_t BGRA16 = vld4q_u16(source + i);

        // vshrn_n_u16(x, 8) shifts right and narrows in one operation. After a
        // shift by 8 the value always fits in a byte, so a separate saturating
        // narrow is unnecessary and the result is the same.
        uint8x8x4_t RGBA8;
        RGBA8.val[0] = vshrn_n_u16(BGRA16.val[2], 8); // R
        RGBA8.val[1] = vshrn_n_u16(BGRA16.val[1], 8); // G
        RGBA8.val[2] = vshrn_n_u16(BGRA16.val[0], 8); // B
        RGBA8.val[3] = vshrn_n_u16(BGRA16.val[3], 8); // A

        // One output byte per input component, so |i| indexes both buffers.
        vst4_u8(destination + i, RGBA8);
    }

    source += componentsSize;
    destination += componentsSize;
    pixelsPerRow = tailComponents / 4;
}
116
// Expands the leading part of one row of packed 4-4-4-4 pixels (R in bits
// 15..12, G in 11..8, B in 7..4, A in 3..0 of each 16-bit value) to RGBA8,
// eight pixels per iteration. Each 4-bit value v becomes (v << 4) | v.
// On return |source| and |destination| have been advanced past the converted
// pixels and |pixelsPerRow| holds the number of pixels left unconverted.
ALWAYS_INLINE void unpackOneRowOfRGBA4444ToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
{
    const unsigned leftoverPixels = pixelsPerRow % 8;
    const unsigned vectorizedPixels = pixelsPerRow - leftoverPixels;

    const uint16x8_t nibbleMask = vdupq_n_u16(0x0F);
    const uint16_t* in = source;
    const uint16_t* const inEnd = source + vectorizedPixels;
    while (in != inEnd) {
        const uint16x8_t packed = vld1q_u16(in);
        in += 8;

        // Isolate each 4-bit channel and narrow it to one byte per pixel.
        const uint8x8_t red4 = vqmovn_u16(vshrq_n_u16(packed, 12));
        const uint8x8_t green4 = vqmovn_u16(vandq_u16(vshrq_n_u16(packed, 8), nibbleMask));
        const uint8x8_t blue4 = vqmovn_u16(vandq_u16(vshrq_n_u16(packed, 4), nibbleMask));
        const uint8x8_t alpha4 = vqmovn_u16(vandq_u16(packed, nibbleMask));

        // Replicate the nibble into both halves of the output byte.
        uint8x8x4_t rgba8;
        rgba8.val[0] = vorr_u8(vshl_n_u8(red4, 4), red4);
        rgba8.val[1] = vorr_u8(vshl_n_u8(green4, 4), green4);
        rgba8.val[2] = vorr_u8(vshl_n_u8(blue4, 4), blue4);
        rgba8.val[3] = vorr_u8(vshl_n_u8(alpha4, 4), alpha4);

        vst4_u8(destination, rgba8);
        destination += 32;
    }

    source = inEnd;
    pixelsPerRow = leftoverPixels;
}
144
// Packs the leading part of one row of RGBA8 pixels into 4-4-4-4 form, eight
// pixels (32 source bytes) per iteration. For every pixel two bytes are
// written: first (B & 0xF0) | (A >> 4), then (R & 0xF0) | (G >> 4).
// On return |source| has been advanced past the consumed components,
// |destination| past the written 16-bit pixels, and |pixelsPerRow| holds the
// number of pixels left unconverted.
ALWAYS_INLINE void packOneRowOfRGBA8ToUnsignedShort4444(const uint8_t*& source, uint16_t*& destination, unsigned& pixelsPerRow)
{
    const unsigned totalComponents = pixelsPerRow * 4;
    const unsigned leftoverComponents = totalComponents % 32;
    const unsigned vectorizedComponents = totalComponents - leftoverComponents;

    const uint8x8_t highNibbleMask = vdup_n_u8(0xF0);
    const uint8_t* in = source;
    const uint8_t* const inEnd = source + vectorizedComponents;
    uint8_t* outBytes = reinterpret_cast<uint8_t*>(destination);
    for (; in != inEnd; in += 32, outBytes += 16) {
        const uint8x8x4_t rgba8 = vld4_u8(in);

        // R and B keep their top nibble in place. G and A move their top
        // nibble down; the 8-bit right shift by 4 already discards the low
        // nibble.
        const uint8x8_t redHigh = vand_u8(rgba8.val[0], highNibbleMask);
        const uint8x8_t greenLow = vshr_n_u8(rgba8.val[1], 4);
        const uint8x8_t blueHigh = vand_u8(rgba8.val[2], highNibbleMask);
        const uint8x8_t alphaLow = vshr_n_u8(rgba8.val[3], 4);

        // Interleaved store: the B|A byte precedes the R|G byte of each pixel.
        const uint8x8x2_t packedBytes = {{vorr_u8(blueHigh, alphaLow), vorr_u8(redHigh, greenLow)}};
        vst2_u8(outBytes, packedBytes);
    }

    source = inEnd;
    destination += vectorizedComponents / 4;
    pixelsPerRow = leftoverComponents / 4;
}
172
// Expands the leading part of one row of packed 5-5-5-1 pixels (R in bits
// 15..11, G in 10..6, B in 5..1, A in bit 0 of each 16-bit value) to RGBA8,
// eight pixels per iteration. Each 5-bit value v becomes (v << 3) | (v & 0x7)
// and the alpha bit becomes 0x00 or 0xFF.
// On return |source| and |destination| have been advanced past the converted
// pixels and |pixelsPerRow| holds the number of pixels left unconverted.
ALWAYS_INLINE void unpackOneRowOfRGBA5551ToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
{
    const unsigned leftoverPixels = pixelsPerRow % 8;
    const unsigned vectorizedPixels = pixelsPerRow - leftoverPixels;

    const uint16x8_t fiveBitMask = vdupq_n_u16(0x1F);
    const uint16x8_t oneBitMask = vdupq_n_u16(0x1);
    const uint8x8_t lowThreeBits = vdup_n_u8(0x7);
    const uint8x8_t allOnes = vdup_n_u8(0xFF);

    const uint16_t* in = source;
    const uint16_t* const inEnd = source + vectorizedPixels;
    while (in != inEnd) {
        const uint16x8_t packed = vld1q_u16(in);
        in += 8;

        // Isolate each channel and narrow it to one byte per pixel.
        const uint8x8_t red5 = vqmovn_u16(vshrq_n_u16(packed, 11));
        const uint8x8_t green5 = vqmovn_u16(vandq_u16(vshrq_n_u16(packed, 6), fiveBitMask));
        const uint8x8_t blue5 = vqmovn_u16(vandq_u16(vshrq_n_u16(packed, 1), fiveBitMask));
        const uint8x8_t alpha1 = vqmovn_u16(vandq_u16(packed, oneBitMask));

        uint8x8x4_t rgba8;
        rgba8.val[0] = vorr_u8(vshl_n_u8(red5, 3), vand_u8(red5, lowThreeBits));
        rgba8.val[1] = vorr_u8(vshl_n_u8(green5, 3), vand_u8(green5, lowThreeBits));
        rgba8.val[2] = vorr_u8(vshl_n_u8(blue5, 3), vand_u8(blue5, lowThreeBits));
        // Multiplying the 0/1 alpha bit by 0xFF yields 0x00 or 0xFF.
        rgba8.val[3] = vmul_u8(alpha1, allOnes);

        vst4_u8(destination, rgba8);
        destination += 32;
    }

    source = inEnd;
    pixelsPerRow = leftoverPixels;
}
204
// Packs the leading part of one row of RGBA8 pixels into 5-5-5-1 form, eight
// pixels (32 source bytes) per iteration. For every pixel two bytes are
// written: first ((G & 0x18) << 3) | ((B & 0xF8) >> 2) | (A >> 7), then
// (R & 0xF8) | (G >> 5).
// On return |source| has been advanced past the consumed components,
// |destination| past the written 16-bit pixels, and |pixelsPerRow| holds the
// number of pixels left unconverted.
ALWAYS_INLINE void packOneRowOfRGBA8ToUnsignedShort5551(const uint8_t*& source, uint16_t*& destination, unsigned& pixelsPerRow)
{
    const unsigned totalComponents = pixelsPerRow * 4;
    const unsigned leftoverComponents = totalComponents % 32;
    const unsigned vectorizedComponents = totalComponents - leftoverComponents;

    const uint8x8_t topFiveBits = vdup_n_u8(0xF8);
    const uint8x8_t greenBits4And3 = vdup_n_u8(0x18);

    const uint8_t* in = source;
    const uint8_t* const inEnd = source + vectorizedComponents;
    uint8_t* outBytes = reinterpret_cast<uint8_t*>(destination);
    for (; in != inEnd; in += 32, outBytes += 16) {
        const uint8x8x4_t rgba8 = vld4_u8(in);

        // Second byte of each pixel: 5 bits of R plus the top 3 bits of G.
        const uint8x8_t red = vand_u8(rgba8.val[0], topFiveBits);
        const uint8x8_t greenTop3 = vshr_n_u8(rgba8.val[1], 5);

        // First byte of each pixel: the next 2 bits of G, 5 bits of B and the
        // top bit of A.
        const uint8x8_t greenNext2 = vshl_n_u8(vand_u8(rgba8.val[1], greenBits4And3), 3);
        const uint8x8_t blue = vshr_n_u8(vand_u8(rgba8.val[2], topFiveBits), 2);
        const uint8x8_t alpha = vshr_n_u8(rgba8.val[3], 7);

        const uint8x8x2_t packedBytes = {{vorr_u8(vorr_u8(greenNext2, blue), alpha), vorr_u8(red, greenTop3)}};
        vst2_u8(outBytes, packedBytes);
    }

    source = inEnd;
    destination += vectorizedComponents / 4;
    pixelsPerRow = leftoverComponents / 4;
}
236
// Expands the leading part of one row of packed 5-6-5 pixels (R in bits
// 15..11, G in 10..5, B in 4..0 of each 16-bit value) to RGBA8, eight pixels
// per iteration. R and B become (v << 3) | (v & 0x7), G becomes
// (v << 2) | (v & 0x3), and alpha is set to 0xFF.
// On return |source| and |destination| have been advanced past the converted
// pixels and |pixelsPerRow| holds the number of pixels left unconverted.
ALWAYS_INLINE void unpackOneRowOfRGB565ToRGBA8(const uint16_t*& source, uint8_t*& destination, unsigned& pixelsPerRow)
{
    const unsigned leftoverPixels = pixelsPerRow % 8;
    const unsigned vectorizedPixels = pixelsPerRow - leftoverPixels;

    const uint16x8_t sixBitMask = vdupq_n_u16(0x3F);
    const uint16x8_t fiveBitMask = vdupq_n_u16(0x1F);
    const uint8x8_t lowTwoBits = vdup_n_u8(0x3);
    const uint8x8_t lowThreeBits = vdup_n_u8(0x7);

    uint8x8x4_t rgba8;
    // Alpha is constant, so it is filled in once outside the loop.
    rgba8.val[3] = vdup_n_u8(0xFF);

    const uint16_t* in = source;
    const uint16_t* const inEnd = source + vectorizedPixels;
    while (in != inEnd) {
        const uint16x8_t packed = vld1q_u16(in);
        in += 8;

        // Isolate each channel and narrow it to one byte per pixel.
        const uint8x8_t red5 = vqmovn_u16(vshrq_n_u16(packed, 11));
        const uint8x8_t green6 = vqmovn_u16(vandq_u16(vshrq_n_u16(packed, 5), sixBitMask));
        const uint8x8_t blue5 = vqmovn_u16(vandq_u16(packed, fiveBitMask));

        rgba8.val[0] = vorr_u8(vshl_n_u8(red5, 3), vand_u8(red5, lowThreeBits));
        rgba8.val[1] = vorr_u8(vshl_n_u8(green6, 2), vand_u8(green6, lowTwoBits));
        rgba8.val[2] = vorr_u8(vshl_n_u8(blue5, 3), vand_u8(blue5, lowThreeBits));

        vst4_u8(destination, rgba8);
        destination += 32;
    }

    source = inEnd;
    pixelsPerRow = leftoverPixels;
}
268
// Packs the leading part of one row of RGBA8 pixels into 5-6-5 form, eight
// pixels (32 source bytes) per iteration; alpha is ignored. For every pixel
// two bytes are written: first ((G & 0x1C) << 3) | (B >> 3), then
// (R & 0xF8) | (G >> 5).
// On return |source| has been advanced past the consumed components,
// |destination| past the written 16-bit pixels, and |pixelsPerRow| holds the
// number of pixels left unconverted.
ALWAYS_INLINE void packOneRowOfRGBA8ToUnsignedShort565(const uint8_t*& source, uint16_t*& destination, unsigned& pixelsPerRow)
{
    const unsigned totalComponents = pixelsPerRow * 4;
    const unsigned leftoverComponents = totalComponents % 32;
    const unsigned vectorizedComponents = totalComponents - leftoverComponents;

    const uint8x8_t topFiveBits = vdup_n_u8(0xF8);
    const uint8x8_t greenBits4To2 = vdup_n_u8(0x1C);

    const uint8_t* in = source;
    const uint8_t* const inEnd = source + vectorizedComponents;
    uint8_t* outBytes = reinterpret_cast<uint8_t*>(destination);
    for (; in != inEnd; in += 32, outBytes += 16) {
        const uint8x8x4_t rgba8 = vld4_u8(in);

        // Second byte of each pixel: 5 bits of R plus the top 3 bits of G.
        const uint8x8_t red = vand_u8(rgba8.val[0], topFiveBits);
        const uint8x8_t greenTop3 = vshr_n_u8(rgba8.val[1], 5);

        // First byte of each pixel: the next 3 bits of G plus 5 bits of B.
        // The 8-bit right shift by 3 already discards the low bits of B.
        const uint8x8_t greenNext3 = vshl_n_u8(vand_u8(rgba8.val[1], greenBits4To2), 3);
        const uint8x8_t blue = vshr_n_u8(rgba8.val[2], 3);

        const uint8x8x2_t packedBytes = {{vorr_u8(greenNext3, blue), vorr_u8(red, greenTop3)}};
        vst2_u8(outBytes, packedBytes);
    }

    source = inEnd;
    destination += vectorizedComponents / 4;
    pixelsPerRow = leftoverComponents / 4;
}
297
298} // namespace SIMD
299
300} // namespace blink
301
302#endif // HAVE(ARM_NEON_INTRINSICS)
303
304#endif // WebGLImageConversionNEON_h
305