/*
 * Layout of a 32-bit a8r8g8b8 pixel: four 8-bit unsigned-normalized
 * ("UN8") channels packed as AARRGGBB.
 */
#define COMPONENT_SIZE 8          /* bits per channel */
#define MASK 0xff                 /* mask for one 8-bit channel */
#define ONE_HALF 0x80             /* 0.5 in UN8; used for rounding */

/* Per-channel shifts and masks.  The shift amounts are parenthesized so
 * the macros expand safely inside any larger expression. */
#define A_SHIFT (8 * 3)
#define R_SHIFT (8 * 2)
#define G_SHIFT 8
#define A_MASK 0xff000000
#define R_MASK 0xff0000
#define G_MASK 0xff00

/* Masks selecting the red+blue and alpha+green channel pairs; these let
 * the macros below process two channels with one 32-bit operation. */
#define RB_MASK 0xff00ff
#define AG_MASK 0xff00ff00
#define RB_ONE_HALF 0x800080      /* ONE_HALF replicated into both rb lanes */
#define RB_MASK_PLUS_ONE 0x10000100

/* Extract a single 8-bit channel from an a8r8g8b8 pixel. */
#define ALPHA_8(x) ((x) >> A_SHIFT)
#define RED_8(x) (((x) >> R_SHIFT) & MASK)
#define GREEN_8(x) (((x) >> G_SHIFT) & MASK)
#define BLUE_8(x) ((x) & MASK)
21
22/*
23 * ARMv6 has UQADD8 instruction, which implements unsigned saturated
24 * addition for 8-bit values packed in 32-bit registers. It is very useful
25 * for UN8x4_ADD_UN8x4, UN8_rb_ADD_UN8_rb and ADD_UN8 macros (which would
26 * otherwise need a lot of arithmetic operations to simulate this operation).
 * Since most of the major ARM Linux distros are built for ARMv7, we are
28 * much less dependent on runtime CPU detection and can get practical
29 * benefits from conditional compilation here for a lot of users.
30 */
31
#if defined(USE_GCC_INLINE_ASM) && defined(__arm__) && \
    !defined(__aarch64__) && (!defined(__thumb__) || defined(__thumb2__))
#if defined(__ARM_ARCH_6__)   || defined(__ARM_ARCH_6J__)  || \
    defined(__ARM_ARCH_6K__)  || defined(__ARM_ARCH_6Z__)  || \
    defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) || \
    defined(__ARM_ARCH_6M__)  || defined(__ARM_ARCH_7__)   || \
    defined(__ARM_ARCH_7A__)  || defined(__ARM_ARCH_7R__)  || \
    defined(__ARM_ARCH_7M__)  || defined(__ARM_ARCH_7EM__)

/*
 * Per-byte unsigned saturated add via the ARMv6+ UQADD8 instruction:
 * each of the four bytes of x is added to the corresponding byte of y,
 * clamping to 255 instead of wrapping.  The "%" in the "%r" constraint
 * tells GCC that x is commutable with the following operand (y).
 */
static force_inline uint32_t
un8x4_add_un8x4 (uint32_t x, uint32_t y)
{
    uint32_t t;
    asm ("uqadd8 %0, %1, %2" : "=r" (t) : "%r" (x), "r" (y));
    return t;
}

/* Saturated add of all four channels in place: x_c = min (x_c + y_c, 255). */
#define UN8x4_ADD_UN8x4(x, y) \
    ((x) = un8x4_add_un8x4 ((x), (y)))

/* Saturated add of the red/blue lane pair; also leaves the sum in t.
 * Like the generic fallback below, the macro's expression value is the
 * value assigned to x. */
#define UN8_rb_ADD_UN8_rb(x, y, t) \
    ((t) = un8x4_add_un8x4 ((x), (y)), (x) = (t))

/* Saturated add of two single 8-bit values; the macro's expression value
 * is the clamped sum.  x itself is not modified (t is the scratch copy). */
#define ADD_UN8(x, y, t) \
    ((t) = (x), un8x4_add_un8x4 ((t), (y)))

#endif
#endif
60
61/*****************************************************************************/
62
/*
 * Helper macros.
 */

/*
 * result = (a * b) / 255 with rounding (exact for all 8-bit a and b):
 * t = a * b + 0x80, then (t + (t >> 8)) >> 8.  t is a caller-supplied
 * scratch lvalue at least 16 bits wide; it is clobbered and evaluated
 * several times, so it must have no side effects.
 */
#define MUL_UN8(a, b, t)						\
    ((t) = (a) * (uint16_t)(b) + ONE_HALF, ((((t) >> G_SHIFT ) + (t) ) >> G_SHIFT ))

/*
 * result = (a / b) rescaled to 0..255, rounded to nearest.
 * b must be non-zero.
 */
#define DIV_UN8(a, b)							\
    (((uint16_t) (a) * MASK + ((b) / 2)) / (b))

/*
 * result = min (x + y, 255).  Generic C fallback; an ARMv6 UQADD8
 * version may already be defined above, hence the guard.  If the
 * 16-bit sum overflows 8 bits, (0 - (t >> 8)) becomes all-ones and
 * the OR forces every low bit on before truncating back to 8 bits.
 */
#ifndef ADD_UN8
#define ADD_UN8(x, y, t)				     \
    ((t) = (x) + (y),					     \
     (uint32_t) (uint8_t) ((t) | (0 - ((t) >> G_SHIFT))))
#endif

/*
 * result = x / 255 with rounding, using the same shift/add trick as
 * MUL_UN8 (x must already be a 16-bit product-range value).
 */
#define DIV_ONE_UN8(x)							\
    (((x) + ONE_HALF + (((x) + ONE_HALF) >> G_SHIFT)) >> G_SHIFT)
81
/*
 * The methods below use some tricks to be able to do two color
 * components at the same time: with RB_MASK applied, the red and blue
 * channels sit in bits 16-23 and 0-7 of one word, leaving 8 free bits
 * above each channel for carries and 16-bit partial products.  The
 * alpha/green pair is handled by first shifting it down into the same
 * lane positions.
 */

/*
 * x_rb = (x_rb * a) / 255
 *
 * Multiplies both rb lanes by the 8-bit factor a with a single 32-bit
 * multiply, then divides each lane by 255 with rounding using the same
 * add-ONE_HALF / shift-add trick as MUL_UN8.  t is a clobbered
 * uint32_t scratch lvalue; the result is masked back to the rb lanes.
 */
#define UN8_rb_MUL_UN8(x, a, t)						\
    do									\
    {									\
	t  = ((x) & RB_MASK) * (a);					\
	t += RB_ONE_HALF;						\
	x = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT;		\
	x &= RB_MASK;							\
    } while (0)
98
/*
 * x_rb = min (x_rb + y_rb, 255)
 *
 * Saturated add of both rb lanes (generic fallback; an ARMv6 UQADD8
 * version may already be defined above).  After the add, each lane's
 * carry sits one bit above the lane; subtracting the shifted carries
 * from RB_MASK_PLUS_ONE yields, for an overflowed lane, a value with
 * all lane bits set (so the OR saturates it to 255), and for a
 * non-overflowed lane only bits outside RB_MASK, which the final mask
 * strips off.  t is a clobbered uint32_t scratch lvalue.
 */
#ifndef UN8_rb_ADD_UN8_rb
#define UN8_rb_ADD_UN8_rb(x, y, t)					\
    do									\
    {									\
	t = ((x) + (y));						\
	t |= RB_MASK_PLUS_ONE - ((t >> G_SHIFT) & RB_MASK);		\
	x = (t & RB_MASK);						\
    } while (0)
#endif
111
/*
 * x_rb = (x_rb * a_rb) / 255
 *
 * Each lane of x is multiplied by the matching lane of a.  The blue
 * product occupies bits 0-15 and the red product bits 16-31, so the
 * two 16-bit partial products can be combined with a single OR before
 * the rounded division by 255 (same trick as UN8_rb_MUL_UN8).
 * t is a clobbered uint32_t scratch lvalue.
 */
#define UN8_rb_MUL_UN8_rb(x, a, t)					\
    do									\
    {									\
	t  = (x & MASK) * (a & MASK);					\
	t |= (x & R_MASK) * ((a >> R_SHIFT) & MASK);			\
	t += RB_ONE_HALF;						\
	t = (t + ((t >> G_SHIFT) & RB_MASK)) >> G_SHIFT;		\
	x = t & RB_MASK;						\
    } while (0)
124
/*
 * x_c = (x_c * a) / 255
 *
 * Scales all four channels of x by the single 8-bit value a: the
 * red/blue pair is processed in place, the alpha/green pair is shifted
 * down into the rb lanes first, then the two halves are recombined.
 */
#define UN8x4_MUL_UN8(x, a)						\
    do									\
    {									\
	uint32_t r1__, r2__, t__;					\
									\
	r1__ = (x);							\
	UN8_rb_MUL_UN8 (r1__, (a), t__);				\
									\
	r2__ = (x) >> G_SHIFT;						\
	UN8_rb_MUL_UN8 (r2__, (a), t__);				\
									\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)
141
/*
 * x_c = (x_c * a) / 255 + y_c
 *
 * Multiplies every channel of x by the 8-bit value a, then does a
 * saturated per-channel add of y, as two rb-lane-pair passes
 * (red/blue in place, alpha/green shifted down and back up).
 */
#define UN8x4_MUL_UN8_ADD_UN8x4(x, a, y)				\
    do									\
    {									\
	uint32_t r1__, r2__, r3__, t__;					\
									\
	r1__ = (x);							\
	r2__ = (y) & RB_MASK;						\
	UN8_rb_MUL_UN8 (r1__, (a), t__);				\
	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\
									\
	r2__ = (x) >> G_SHIFT;						\
	r3__ = ((y) >> G_SHIFT) & RB_MASK;				\
	UN8_rb_MUL_UN8 (r2__, (a), t__);				\
	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\
									\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)
162
/*
 * x_c = (x_c * a + y_c * b) / 255
 *
 * Weighted blend of two pixels: each channel of x scaled by a is added
 * (with saturation) to the matching channel of y scaled by b.  The
 * UN8_rb_MUL_UN8 results are already masked to the rb lanes, so no
 * extra masking is needed before the saturated add.
 */
#define UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8(x, a, y, b)			\
    do									\
    {									\
	uint32_t r1__, r2__, r3__, t__;					\
									\
	r1__ = (x);							\
	r2__ = (y);							\
	UN8_rb_MUL_UN8 (r1__, (a), t__);				\
	UN8_rb_MUL_UN8 (r2__, (b), t__);				\
	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\
									\
	r2__ = ((x) >> G_SHIFT);					\
	r3__ = ((y) >> G_SHIFT);					\
	UN8_rb_MUL_UN8 (r2__, (a), t__);				\
	UN8_rb_MUL_UN8 (r3__, (b), t__);				\
	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\
									\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)
185
/*
 * x_c = (x_c * a_c) / 255
 *
 * Component-wise multiply (component alpha): each channel of x is
 * scaled by the corresponding channel of a, using the two-lane
 * UN8_rb_MUL_UN8_rb helper on the rb and (shifted) ag pairs.
 */
#define UN8x4_MUL_UN8x4(x, a)						\
    do									\
    {									\
	uint32_t r1__, r2__, r3__, t__;					\
									\
	r1__ = (x);							\
	r2__ = (a);							\
	UN8_rb_MUL_UN8_rb (r1__, r2__, t__);				\
									\
	r2__ = (x) >> G_SHIFT;						\
	r3__ = (a) >> G_SHIFT;						\
	UN8_rb_MUL_UN8_rb (r2__, r3__, t__);				\
									\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)
204
/*
 * x_c = (x_c * a_c) / 255 + y_c
 *
 * Component-wise multiply of x by a, followed by a saturated
 * per-channel add of y, again split into the rb and shifted ag
 * lane-pair passes.
 */
#define UN8x4_MUL_UN8x4_ADD_UN8x4(x, a, y)				\
    do									\
    {									\
	uint32_t r1__, r2__, r3__, t__;					\
									\
	r1__ = (x);							\
	r2__ = (a);							\
	UN8_rb_MUL_UN8_rb (r1__, r2__, t__);				\
	r2__ = (y) & RB_MASK;						\
	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\
									\
	r2__ = ((x) >> G_SHIFT);					\
	r3__ = ((a) >> G_SHIFT);					\
	UN8_rb_MUL_UN8_rb (r2__, r3__, t__);				\
	r3__ = ((y) >> G_SHIFT) & RB_MASK;				\
	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\
									\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)
227
/*
 * x_c = (x_c * a_c + y_c * b) / 255
 *
 * Component-wise multiply of x by a, plus y scaled by the single
 * factor b, with a saturated per-channel add, processed as red/blue
 * and (shifted) alpha/green lane pairs.
 */
#define UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8(x, a, y, b)			\
    do									\
    {									\
	uint32_t r1__, r2__, r3__, t__;					\
									\
	/* red/blue pair: x_rb * a_rb / 255 + y_rb * b / 255 */		\
	r1__ = (x);							\
	r2__ = (a);							\
	UN8_rb_MUL_UN8_rb (r1__, r2__, t__);				\
	r2__ = (y);							\
	UN8_rb_MUL_UN8 (r2__, (b), t__);				\
	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\
									\
	/* alpha/green pair, shifted down into the rb lanes */		\
	r2__ = (x) >> G_SHIFT;						\
	r3__ = (a) >> G_SHIFT;						\
	UN8_rb_MUL_UN8_rb (r2__, r3__, t__);				\
	r3__ = (y) >> G_SHIFT;						\
	UN8_rb_MUL_UN8 (r3__, (b), t__);				\
	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\
									\
	/* recombine; (x) parenthesized like the sibling macros */	\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)
252
/*
 * x_c = min (x_c + y_c, 255)
 *
 * Saturated per-channel add.  Generic C fallback; on ARMv6+ a UQADD8
 * based UN8x4_ADD_UN8x4 may already be defined above, hence the guard.
 */
#ifndef UN8x4_ADD_UN8x4
#define UN8x4_ADD_UN8x4(x, y)						\
    do									\
    {									\
	uint32_t r1__, r2__, r3__, t__;					\
									\
	/* red/blue pair */						\
	r1__ = (x) & RB_MASK;						\
	r2__ = (y) & RB_MASK;						\
	UN8_rb_ADD_UN8_rb (r1__, r2__, t__);				\
									\
	/* alpha/green pair, shifted down into the rb lanes */		\
	r2__ = ((x) >> G_SHIFT) & RB_MASK;				\
	r3__ = ((y) >> G_SHIFT) & RB_MASK;				\
	UN8_rb_ADD_UN8_rb (r2__, r3__, t__);				\
									\
	/* recombine; (x) parenthesized like the sibling macros */	\
	(x) = r1__ | (r2__ << G_SHIFT);					\
    } while (0)
#endif
273