1
2/*
3 * Copyright 2006 The Android Open Source Project
4 *
5 * Use of this source code is governed by a BSD-style license that can be
6 * found in the LICENSE file.
7 */
8
9
10#ifndef SkColorPriv_DEFINED
11#define SkColorPriv_DEFINED
12
// Turn this on for extra debug checking when blending onto 565.
// It is switched on automatically whenever SK_DEBUG is defined.
#if defined(SK_DEBUG)
    #define CHECK_FOR_565_OVERFLOW
#endif
17
18#include "SkColor.h"
19#include "SkMath.h"
20
///@{
/** Per-channel luminance weights taken from ITU-R Recommendation BT.709
    (http://www.itu.int/rec/R-REC-BT.709/). */
#define SK_ITU_BT709_LUM_COEFF_R (0.2126f)
#define SK_ITU_BT709_LUM_COEFF_G (0.7152f)
#define SK_ITU_BT709_LUM_COEFF_B (0.0722f)
///@}

///@{
/** The float weight each channel contributes to luminance. These currently
    alias the BT.709 values above. */
#define SK_LUM_COEFF_R SK_ITU_BT709_LUM_COEFF_R
#define SK_LUM_COEFF_G SK_ITU_BT709_LUM_COEFF_G
#define SK_LUM_COEFF_B SK_ITU_BT709_LUM_COEFF_B
///@}
34
35/** Computes the luminance from the given r, g, and b in accordance with
36    SK_LUM_COEFF_X. For correct results, r, g, and b should be in linear space.
37*/
38static inline U8CPU SkComputeLuminance(U8CPU r, U8CPU g, U8CPU b) {
39    //The following is
40    //r * SK_LUM_COEFF_R + g * SK_LUM_COEFF_G + b * SK_LUM_COEFF_B
41    //with SK_LUM_COEFF_X in 1.8 fixed point (rounding adjusted to sum to 256).
42    return (r * 54 + g * 183 + b * 19) >> 8;
43}
44
45/** Turn 0..255 into 0..256 by adding 1 at the half-way point. Used to turn a
46    byte into a scale value, so that we can say scale * value >> 8 instead of
47    alpha * value / 255.
48
49    In debugging, asserts that alpha is 0..255
50*/
51static inline unsigned SkAlpha255To256(U8CPU alpha) {
52    SkASSERT(SkToU8(alpha) == alpha);
53    // this one assues that blending on top of an opaque dst keeps it that way
54    // even though it is less accurate than a+(a>>7) for non-opaque dsts
55    return alpha + 1;
56}
57
/** Multiply value by a 0..256 scale and shift the product down by 8
    (i.e. return (value * alpha256) >> 8).
 */
#define SkAlphaMul(value, alpha256)     (SkMulS16((value), (alpha256)) >> 8)
62
63//  The caller may want negative values, so keep all params signed (int)
64//  so we don't accidentally slip into unsigned math and lose the sign
65//  extension when we shift (in SkAlphaMul)
66static inline int SkAlphaBlend(int src, int dst, int scale256) {
67    SkASSERT((unsigned)scale256 <= 256);
68    return dst + SkAlphaMul(src - dst, scale256);
69}
70
71/**
72 *  Returns (src * alpha + dst * (255 - alpha)) / 255
73 *
74 *  This is more accurate than SkAlphaBlend, but slightly slower
75 */
76static inline int SkAlphaBlend255(S16CPU src, S16CPU dst, U8CPU alpha) {
77    SkASSERT((int16_t)src == src);
78    SkASSERT((int16_t)dst == dst);
79    SkASSERT((uint8_t)alpha == alpha);
80
81    int prod = SkMulS16(src - dst, alpha) + 128;
82    prod = (prod + (prod >> 8)) >> 8;
83    return dst + prod;
84}
85
// RGB565 pixel layout, one group per channel: the bit width, the position of
// the channel's lowest bit, the right-aligned mask, the accessor that extracts
// the channel from a packed pixel, and a debug range check.

// Red: 5 bits, stored above green and blue.
#define SK_R16_BITS     5
#define SK_R16_SHIFT    (SK_B16_BITS + SK_G16_BITS)
#define SK_R16_MASK     ((1 << SK_R16_BITS) - 1)
#define SkGetPackedR16(color)   (((unsigned)(color) >> SK_R16_SHIFT) & SK_R16_MASK)
#define SkR16Assert(r)  SkASSERT((unsigned)(r) <= SK_R16_MASK)

// Green: 6 bits, stored above blue.
#define SK_G16_BITS     6
#define SK_G16_SHIFT    (SK_B16_BITS)
#define SK_G16_MASK     ((1 << SK_G16_BITS) - 1)
#define SkGetPackedG16(color)   (((unsigned)(color) >> SK_G16_SHIFT) & SK_G16_MASK)
#define SkG16Assert(g)  SkASSERT((unsigned)(g) <= SK_G16_MASK)

// Blue: 5 bits, stored in the lowest bits.
#define SK_B16_BITS     5
#define SK_B16_SHIFT    0
#define SK_B16_MASK     ((1 << SK_B16_BITS) - 1)
#define SkGetPackedB16(color)   (((unsigned)(color) >> SK_B16_SHIFT) & SK_B16_MASK)
#define SkB16Assert(b)  SkASSERT((unsigned)(b) <= SK_B16_MASK)
105
106static inline uint16_t SkPackRGB16(unsigned r, unsigned g, unsigned b) {
107    SkASSERT(r <= SK_R16_MASK);
108    SkASSERT(g <= SK_G16_MASK);
109    SkASSERT(b <= SK_B16_MASK);
110
111    return SkToU16((r << SK_R16_SHIFT) | (g << SK_G16_SHIFT) | (b << SK_B16_SHIFT));
112}
113
// Each channel mask shifted to the position the channel occupies inside a
// packed 565 pixel.
#define SK_R16_MASK_IN_PLACE        (SK_R16_MASK << SK_R16_SHIFT)
#define SK_G16_MASK_IN_PLACE        (SK_G16_MASK << SK_G16_SHIFT)
#define SK_B16_MASK_IN_PLACE        (SK_B16_MASK << SK_B16_SHIFT)
117
118/** Expand the 16bit color into a 32bit value that can be scaled all at once
119    by a value up to 32. Used in conjunction with SkCompact_rgb_16.
120*/
121static inline uint32_t SkExpand_rgb_16(U16CPU c) {
122    SkASSERT(c == (uint16_t)c);
123
124    return ((c & SK_G16_MASK_IN_PLACE) << 16) | (c & ~SK_G16_MASK_IN_PLACE);
125}
126
127/** Compress an expanded value (from SkExpand_rgb_16) back down to a 16bit
128    color value. The computation yields only 16bits of valid data, but we claim
129    to return 32bits, so that the compiler won't generate extra instructions to
130    "clean" the top 16bits. However, the top 16 can contain garbage, so it is
131    up to the caller to safely ignore them.
132*/
133static inline U16CPU SkCompact_rgb_16(uint32_t c) {
134    return ((c >> 16) & SK_G16_MASK_IN_PLACE) | (c & ~SK_G16_MASK_IN_PLACE);
135}
136
137/** Scale the 16bit color value by the 0..256 scale parameter.
138    The computation yields only 16bits of valid data, but we claim
139    to return 32bits, so that the compiler won't generate extra instructions to
140    "clean" the top 16bits.
141*/
142static inline U16CPU SkAlphaMulRGB16(U16CPU c, unsigned scale) {
143    return SkCompact_rgb_16(SkExpand_rgb_16(c) * (scale >> 3) >> 5);
144}
145
// This helper explicitly returns a clean 16bit value (but slower).
// The whole expansion is parenthesized so that the cast always applies to the
// complete SkAlphaMulRGB16 call, whatever expression the macro is used in.
#define SkAlphaMulRGB16_ToU16(c, s)  ((uint16_t)SkAlphaMulRGB16(c, s))
148
149/** Blend src and dst 16bit colors by the 0..256 scale parameter.
150    The computation yields only 16bits of valid data, but we claim
151    to return 32bits, so that the compiler won't generate extra instructions to
152    "clean" the top 16bits.
153*/
154static inline U16CPU SkBlendRGB16(U16CPU src, U16CPU dst, int srcScale) {
155    SkASSERT((unsigned)srcScale <= 256);
156
157    srcScale >>= 3;
158
159    uint32_t src32 = SkExpand_rgb_16(src);
160    uint32_t dst32 = SkExpand_rgb_16(dst);
161    return SkCompact_rgb_16(dst32 + ((src32 - dst32) * srcScale >> 5));
162}
163
164static inline void SkBlendRGB16(const uint16_t src[], uint16_t dst[],
165                                int srcScale, int count) {
166    SkASSERT(count > 0);
167    SkASSERT((unsigned)srcScale <= 256);
168
169    srcScale >>= 3;
170
171    do {
172        uint32_t src32 = SkExpand_rgb_16(*src++);
173        uint32_t dst32 = SkExpand_rgb_16(*dst);
174        *dst++ = SkCompact_rgb_16(dst32 + ((src32 - dst32) * srcScale >> 5));
175    } while (--count > 0);
176}
177
// Adds two packed 565 values. The debug build first checks that no channel
// sum exceeds its mask (which would carry into the neighboring channel); the
// release build is a plain addition.
#if defined(SK_DEBUG)
    static inline U16CPU SkRGB16Add(U16CPU a, U16CPU b) {
        SkASSERT(SkGetPackedR16(a) + SkGetPackedR16(b) <= SK_R16_MASK);
        SkASSERT(SkGetPackedG16(a) + SkGetPackedG16(b) <= SK_G16_MASK);
        SkASSERT(SkGetPackedB16(a) + SkGetPackedB16(b) <= SK_B16_MASK);

        const U16CPU sum = a + b;
        return sum;
    }
#else
    #define SkRGB16Add(a, b)  ((a) + (b))
#endif
189
190///////////////////////////////////////////////////////////////////////////////
191
// 32bit pixel channels: each one is 8 bits wide, with a right-aligned mask
// derived from that width.

// Alpha
#define SK_A32_BITS     8
#define SK_A32_MASK     ((1 << SK_A32_BITS) - 1)

// Red
#define SK_R32_BITS     8
#define SK_R32_MASK     ((1 << SK_R32_BITS) - 1)

// Green
#define SK_G32_BITS     8
#define SK_G32_MASK     ((1 << SK_G32_BITS) - 1)

// Blue
#define SK_B32_BITS     8
#define SK_B32_MASK     ((1 << SK_B32_BITS) - 1)
201
/** Extract one 8-bit channel from a packed 32bit pixel.
    The left shift pushes every bit above the channel out of the 32bit value,
    and the right shift by 24 then brings the channel down to bits 0..7.

    The argument is converted to uint32_t *before* it is shifted, so that a
    signed argument with its top bit set is never left-shifted as a signed
    value (which is undefined behavior in C and in C++ prior to C++20). The
    outer cast keeps the intermediate truncated to 32 bits before the right
    shift.
*/
#define SkGetPackedA32(packed)      ((uint32_t)((uint32_t)(packed) << (24 - SK_A32_SHIFT)) >> 24)
#define SkGetPackedR32(packed)      ((uint32_t)((uint32_t)(packed) << (24 - SK_R32_SHIFT)) >> 24)
#define SkGetPackedG32(packed)      ((uint32_t)((uint32_t)(packed) << (24 - SK_G32_SHIFT)) >> 24)
#define SkGetPackedB32(packed)      ((uint32_t)((uint32_t)(packed) << (24 - SK_B32_SHIFT)) >> 24)

// Debug-only range checks: assert that a component fits in its 8-bit channel.
#define SkA32Assert(a)  SkASSERT((unsigned)(a) <= SK_A32_MASK)
#define SkR32Assert(r)  SkASSERT((unsigned)(r) <= SK_R32_MASK)
#define SkG32Assert(g)  SkASSERT((unsigned)(g) <= SK_G32_MASK)
#define SkB32Assert(b)  SkASSERT((unsigned)(b) <= SK_B32_MASK)
211
// Debug-only check that c is a legal premultiplied color: alpha fits in 8
// bits and no color channel exceeds alpha. Expands to nothing when SK_DEBUG
// is not defined.
#if defined(SK_DEBUG)
    static inline void SkPMColorAssert(SkPMColor c) {
        const unsigned alpha = SkGetPackedA32(c);
        const unsigned red = SkGetPackedR32(c);
        const unsigned green = SkGetPackedG32(c);
        const unsigned blue = SkGetPackedB32(c);

        SkA32Assert(alpha);
        SkASSERT(red <= alpha);
        SkASSERT(green <= alpha);
        SkASSERT(blue <= alpha);
    }
#else
    #define SkPMColorAssert(c)
#endif
227
228/**
229 *  Pack the components into a SkPMColor, checking (in the debug version) that
230 *  the components are 0..255, and are already premultiplied (i.e. alpha >= color)
231 */
232static inline SkPMColor SkPackARGB32(U8CPU a, U8CPU r, U8CPU g, U8CPU b) {
233    SkA32Assert(a);
234    SkASSERT(r <= a);
235    SkASSERT(g <= a);
236    SkASSERT(b <= a);
237
238    return (a << SK_A32_SHIFT) | (r << SK_R32_SHIFT) |
239           (g << SK_G32_SHIFT) | (b << SK_B32_SHIFT);
240}
241
242/**
243 * Abstract 4-byte interpolation, implemented on top of SkPMColor
244 * utility functions. Third parameter controls blending of the first two:
245 *   (src, dst, 0) returns dst
246 *   (src, dst, 0xFF) returns src
247 *   srcWeight is [0..256], unlike SkFourByteInterp which takes [0..255]
248 */
249static inline SkPMColor SkFourByteInterp256(SkPMColor src, SkPMColor dst,
250                                         unsigned scale) {
251    unsigned a = SkAlphaBlend(SkGetPackedA32(src), SkGetPackedA32(dst), scale);
252    unsigned r = SkAlphaBlend(SkGetPackedR32(src), SkGetPackedR32(dst), scale);
253    unsigned g = SkAlphaBlend(SkGetPackedG32(src), SkGetPackedG32(dst), scale);
254    unsigned b = SkAlphaBlend(SkGetPackedB32(src), SkGetPackedB32(dst), scale);
255
256    return SkPackARGB32(a, r, g, b);
257}
258
259/**
260 * Abstract 4-byte interpolation, implemented on top of SkPMColor
261 * utility functions. Third parameter controls blending of the first two:
262 *   (src, dst, 0) returns dst
263 *   (src, dst, 0xFF) returns src
264 */
265static inline SkPMColor SkFourByteInterp(SkPMColor src, SkPMColor dst,
266                                         U8CPU srcWeight) {
267    unsigned scale = SkAlpha255To256(srcWeight);
268    return SkFourByteInterp256(src, dst, scale);
269}
270
/**
 * Splits a color into two words with a spare byte above each channel:
 * 0xAARRGGBB -> *ag = 0x00AA00GG, *rb = 0x00RR00BB
 */
static inline void SkSplay(uint32_t color, uint32_t* ag, uint32_t* rb) {
    const uint32_t kLowBytes = 0x00FF00FF;
    const uint32_t alphaGreen = (color >> 8) & kLowBytes;
    const uint32_t redBlue = color & kLowBytes;
    *ag = alphaGreen;
    *rb = redBlue;
}
279
/**
 * Spreads a color over 64 bits with a spare byte above each channel:
 * 0xAARRGGBB -> 0x00AA00GG00RR00BB
 * (note, ARGB -> AGRB)
 */
static inline uint64_t SkSplay(uint32_t color) {
    const uint32_t kLowBytes = 0x00FF00FF;
    const uint64_t alphaGreen = (color >> 8) & kLowBytes;  // 0x0000000000AA00GG
    const uint64_t redBlue = color & kLowBytes;            // 0x0000000000RR00BB
    return (alphaGreen << 32) | redBlue;                   // 0x00AA00GG00RR00BB
}
291
/**
 * Recombines two splayed words, keeping the high byte of each 16-bit lane:
 * 0xAAxxGGxx, 0xRRxxBBxx -> 0xAARRGGBB
 */
static inline uint32_t SkUnsplay(uint32_t ag, uint32_t rb) {
    const uint32_t kHighBytes = 0xFF00FF00;
    const uint32_t alphaGreen = ag & kHighBytes;
    const uint32_t redBlue = (rb & kHighBytes) >> 8;
    return alphaGreen | redBlue;
}
299
300/**
301 * 0xAAxxGGxxRRxxBBxx -> 0xAARRGGBB
302 * (note, AGRB -> ARGB)
303 */
304static inline uint32_t SkUnsplay(uint64_t agrb) {
305    const uint32_t mask = 0xFF00FF00;
306    return SkPMColor(
307        ((agrb & mask) >> 8) |   // 0x00RR00BB
308        ((agrb >> 32) & mask));  // 0xAARRGGBB
309}
310
311static inline SkPMColor SkFastFourByteInterp256_32(SkPMColor src, SkPMColor dst, unsigned scale) {
312    SkASSERT(scale <= 256);
313
314    // Two 8-bit blends per two 32-bit registers, with space to make sure the math doesn't collide.
315    uint32_t src_ag, src_rb, dst_ag, dst_rb;
316    SkSplay(src, &src_ag, &src_rb);
317    SkSplay(dst, &dst_ag, &dst_rb);
318
319    const uint32_t ret_ag = src_ag * scale + (256 - scale) * dst_ag;
320    const uint32_t ret_rb = src_rb * scale + (256 - scale) * dst_rb;
321
322    return SkUnsplay(ret_ag, ret_rb);
323}
324
325static inline SkPMColor SkFastFourByteInterp256_64(SkPMColor src, SkPMColor dst, unsigned scale) {
326    SkASSERT(scale <= 256);
327    // Four 8-bit blends in one 64-bit register, with space to make sure the math doesn't collide.
328    return SkUnsplay(SkSplay(src) * scale + (256-scale) * SkSplay(dst));
329}
330
331// TODO(mtklein): Replace slow versions with fast versions, using scale + (scale>>7) everywhere.
332
333/**
334 * Same as SkFourByteInterp256, but faster.
335 */
336static inline SkPMColor SkFastFourByteInterp256(SkPMColor src, SkPMColor dst, unsigned scale) {
337    // On a 64-bit machine, _64 is about 10% faster than _32, but ~40% slower on a 32-bit machine.
338    if (sizeof(void*) == 4) {
339        return SkFastFourByteInterp256_32(src, dst, scale);
340    } else {
341        return SkFastFourByteInterp256_64(src, dst, scale);
342    }
343}
344
345/**
346 * Nearly the same as SkFourByteInterp, but faster and a touch more accurate, due to better
347 * srcWeight scaling to [0, 256].
348 */
349static inline SkPMColor SkFastFourByteInterp(SkPMColor src,
350                                             SkPMColor dst,
351                                             U8CPU srcWeight) {
352    SkASSERT(srcWeight <= 255);
353    // scale = srcWeight + (srcWeight >> 7) is more accurate than
354    // scale = srcWeight + 1, but 7% slower
355    return SkFastFourByteInterp256(src, dst, srcWeight + (srcWeight >> 7));
356}
357
358/**
359 *  Same as SkPackARGB32, but this version guarantees to not check that the
360 *  values are premultiplied in the debug version.
361 */
362static inline SkPMColor SkPackARGB32NoCheck(U8CPU a, U8CPU r, U8CPU g, U8CPU b) {
363    return (a << SK_A32_SHIFT) | (r << SK_R32_SHIFT) |
364           (g << SK_G32_SHIFT) | (b << SK_B32_SHIFT);
365}
366
367static inline
368SkPMColor SkPremultiplyARGBInline(U8CPU a, U8CPU r, U8CPU g, U8CPU b) {
369    SkA32Assert(a);
370    SkR32Assert(r);
371    SkG32Assert(g);
372    SkB32Assert(b);
373
374    if (a != 255) {
375        r = SkMulDiv255Round(r, a);
376        g = SkMulDiv255Round(g, a);
377        b = SkMulDiv255Round(b, a);
378    }
379    return SkPackARGB32(a, r, g, b);
380}
381
382SK_API extern const uint32_t gMask_00FF00FF;
383
384static inline uint32_t SkAlphaMulQ(uint32_t c, unsigned scale) {
385    uint32_t mask = gMask_00FF00FF;
386
387    uint32_t rb = ((c & mask) * scale) >> 8;
388    uint32_t ag = ((c >> 8) & mask) * scale;
389    return (rb & mask) | (ag & ~mask);
390}
391
392static inline SkPMColor SkPMSrcOver(SkPMColor src, SkPMColor dst) {
393    return src + SkAlphaMulQ(dst, SkAlpha255To256(255 - SkGetPackedA32(src)));
394}
395
396static inline SkPMColor SkBlendARGB32(SkPMColor src, SkPMColor dst, U8CPU aa) {
397    SkASSERT((unsigned)aa <= 255);
398
399    unsigned src_scale = SkAlpha255To256(aa);
400    unsigned dst_scale = SkAlpha255To256(255 - SkAlphaMul(SkGetPackedA32(src), src_scale));
401
402    return SkAlphaMulQ(src, src_scale) + SkAlphaMulQ(dst, dst_scale);
403}
404
405////////////////////////////////////////////////////////////////////////////////////////////
// Convert a 32bit pixel to a 16bit pixel (no dither)

// Reduce an 8-bit component to its 565 width by dropping the low bits.
#define SkR32ToR16_MACRO(r)   ((unsigned)(r) >> (SK_R32_BITS - SK_R16_BITS))
#define SkG32ToG16_MACRO(g)   ((unsigned)(g) >> (SK_G32_BITS - SK_G16_BITS))
#define SkB32ToB16_MACRO(b)   ((unsigned)(b) >> (SK_B32_BITS - SK_B16_BITS))
411
// Component converters from 8 bits to 565 width. Debug builds go through
// functions so the input range can be asserted; release builds use the
// macros directly.
#if defined(SK_DEBUG)
    static inline unsigned SkR32ToR16(unsigned r) {
        SkR32Assert(r);
        const unsigned r16 = SkR32ToR16_MACRO(r);
        return r16;
    }
    static inline unsigned SkG32ToG16(unsigned g) {
        SkG32Assert(g);
        const unsigned g16 = SkG32ToG16_MACRO(g);
        return g16;
    }
    static inline unsigned SkB32ToB16(unsigned b) {
        SkB32Assert(b);
        const unsigned b16 = SkB32ToB16_MACRO(b);
        return b16;
    }
#else
    #define SkR32ToR16(r)   SkR32ToR16_MACRO(r)
    #define SkG32ToG16(g)   SkG32ToG16_MACRO(g)
    #define SkB32ToB16(b)   SkB32ToB16_MACRO(b)
#endif
430
// Pull a channel straight out of a packed 32bit pixel at 565 width: shift the
// channel's top bits down to bit 0, then mask to the 565 channel width.
#define SkPacked32ToR16(c)  (((unsigned)(c) >> (SK_R32_SHIFT + SK_R32_BITS - SK_R16_BITS)) & SK_R16_MASK)
#define SkPacked32ToG16(c)  (((unsigned)(c) >> (SK_G32_SHIFT + SK_G32_BITS - SK_G16_BITS)) & SK_G16_MASK)
#define SkPacked32ToB16(c)  (((unsigned)(c) >> (SK_B32_SHIFT + SK_B32_BITS - SK_B16_BITS)) & SK_B16_MASK)
434
435static inline U16CPU SkPixel32ToPixel16(SkPMColor c) {
436    unsigned r = ((c >> (SK_R32_SHIFT + (8 - SK_R16_BITS))) & SK_R16_MASK) << SK_R16_SHIFT;
437    unsigned g = ((c >> (SK_G32_SHIFT + (8 - SK_G16_BITS))) & SK_G16_MASK) << SK_G16_SHIFT;
438    unsigned b = ((c >> (SK_B32_SHIFT + (8 - SK_B16_BITS))) & SK_B16_MASK) << SK_B16_SHIFT;
439    return r | g | b;
440}
441
442static inline U16CPU SkPack888ToRGB16(U8CPU r, U8CPU g, U8CPU b) {
443    return  (SkR32ToR16(r) << SK_R16_SHIFT) |
444            (SkG32ToG16(g) << SK_G16_SHIFT) |
445            (SkB32ToB16(b) << SK_B16_SHIFT);
446}
447
// SkPixel32ToPixel16 with the result passed through SkToU16.
#define SkPixel32ToPixel16_ToU16(src)   SkToU16(SkPixel32ToPixel16(src))
449
450/////////////////////////////////////////////////////////////////////////////////////////
451// Fast dither from 32->16
452
// Checkerboard selector: evaluates to 1 when x and y differ in their lowest
// bit, and to 0 otherwise.
#define SkShouldDitherXY(x, y)  (((x) ^ (y)) & 1)
454
455static inline uint16_t SkDitherPack888ToRGB16(U8CPU r, U8CPU g, U8CPU b) {
456    r = ((r << 1) - ((r >> (8 - SK_R16_BITS) << (8 - SK_R16_BITS)) | (r >> SK_R16_BITS))) >> (8 - SK_R16_BITS);
457    g = ((g << 1) - ((g >> (8 - SK_G16_BITS) << (8 - SK_G16_BITS)) | (g >> SK_G16_BITS))) >> (8 - SK_G16_BITS);
458    b = ((b << 1) - ((b >> (8 - SK_B16_BITS) << (8 - SK_B16_BITS)) | (b >> SK_B16_BITS))) >> (8 - SK_B16_BITS);
459
460    return SkPackRGB16(r, g, b);
461}
462
463static inline uint16_t SkDitherPixel32ToPixel16(SkPMColor c) {
464    return SkDitherPack888ToRGB16(SkGetPackedR32(c), SkGetPackedG32(c), SkGetPackedB32(c));
465}
466
467/*  Return c in expanded_rgb_16 format, but also scaled up by 32 (5 bits)
468    It is now suitable for combining with a scaled expanded_rgb_16 color
469    as in SkSrcOver32To16().
470    We must do this 565 high-bit replication, in order for the subsequent add
471    to saturate properly (and not overflow). If we take the 8 bits as is, it is
472    possible to overflow.
473*/
474static inline uint32_t SkPMColorToExpanded16x5(SkPMColor c) {
475    unsigned sr = SkPacked32ToR16(c);
476    unsigned sg = SkPacked32ToG16(c);
477    unsigned sb = SkPacked32ToB16(c);
478
479    sr = (sr << 5) | sr;
480    sg = (sg << 5) | (sg >> 1);
481    sb = (sb << 5) | sb;
482    return (sr << 11) | (sg << 21) | (sb << 0);
483}
484
485/*  SrcOver the 32bit src color with the 16bit dst, returning a 16bit value
486    (with dirt in the high 16bits, so caller beware).
487*/
488static inline U16CPU SkSrcOver32To16(SkPMColor src, uint16_t dst) {
489    unsigned sr = SkGetPackedR32(src);
490    unsigned sg = SkGetPackedG32(src);
491    unsigned sb = SkGetPackedB32(src);
492
493    unsigned dr = SkGetPackedR16(dst);
494    unsigned dg = SkGetPackedG16(dst);
495    unsigned db = SkGetPackedB16(dst);
496
497    unsigned isa = 255 - SkGetPackedA32(src);
498
499    dr = (sr + SkMul16ShiftRound(dr, isa, SK_R16_BITS)) >> (8 - SK_R16_BITS);
500    dg = (sg + SkMul16ShiftRound(dg, isa, SK_G16_BITS)) >> (8 - SK_G16_BITS);
501    db = (sb + SkMul16ShiftRound(db, isa, SK_B16_BITS)) >> (8 - SK_B16_BITS);
502
503    return SkPackRGB16(dr, dg, db);
504}
505
506////////////////////////////////////////////////////////////////////////////////////////////
507// Convert a 16bit pixel to a 32bit pixel
508
509static inline unsigned SkR16ToR32(unsigned r) {
510    return (r << (8 - SK_R16_BITS)) | (r >> (2 * SK_R16_BITS - 8));
511}
512
513static inline unsigned SkG16ToG32(unsigned g) {
514    return (g << (8 - SK_G16_BITS)) | (g >> (2 * SK_G16_BITS - 8));
515}
516
517static inline unsigned SkB16ToB32(unsigned b) {
518    return (b << (8 - SK_B16_BITS)) | (b >> (2 * SK_B16_BITS - 8));
519}
520
// Extract a channel from a packed 565 pixel and widen it to 8 bits.
#define SkPacked16ToR32(c)      SkR16ToR32(SkGetPackedR16(c))
#define SkPacked16ToG32(c)      SkG16ToG32(SkGetPackedG16(c))
#define SkPacked16ToB32(c)      SkB16ToB32(SkGetPackedB16(c))
524
525static inline SkPMColor SkPixel16ToPixel32(U16CPU src) {
526    SkASSERT(src == SkToU16(src));
527
528    unsigned    r = SkPacked16ToR32(src);
529    unsigned    g = SkPacked16ToG32(src);
530    unsigned    b = SkPacked16ToB32(src);
531
532    SkASSERT((r >> (8 - SK_R16_BITS)) == SkGetPackedR16(src));
533    SkASSERT((g >> (8 - SK_G16_BITS)) == SkGetPackedG16(src));
534    SkASSERT((b >> (8 - SK_B16_BITS)) == SkGetPackedB16(src));
535
536    return SkPackARGB32(0xFF, r, g, b);
537}
538
539// similar to SkPixel16ToPixel32, but returns SkColor instead of SkPMColor
540static inline SkColor SkPixel16ToColor(U16CPU src) {
541    SkASSERT(src == SkToU16(src));
542
543    unsigned    r = SkPacked16ToR32(src);
544    unsigned    g = SkPacked16ToG32(src);
545    unsigned    b = SkPacked16ToB32(src);
546
547    SkASSERT((r >> (8 - SK_R16_BITS)) == SkGetPackedR16(src));
548    SkASSERT((g >> (8 - SK_G16_BITS)) == SkGetPackedG16(src));
549    SkASSERT((b >> (8 - SK_B16_BITS)) == SkGetPackedB16(src));
550
551    return SkColorSetRGB(r, g, b);
552}
553
554///////////////////////////////////////////////////////////////////////////////
555
// A 16bit pixel holding four 4-bit channels.
typedef uint16_t SkPMColor16;

// Position of each 4-bit channel's lowest bit.
// Put in OpenGL order (r g b a)
#define SK_A4444_SHIFT    0
#define SK_R4444_SHIFT    12
#define SK_G4444_SHIFT    8
#define SK_B4444_SHIFT    4

// Reduce an 8-bit component to 4 bits by keeping its high nibble.
#define SkA32To4444(a)  ((unsigned)(a) >> 4)
#define SkR32To4444(r)  ((unsigned)(r) >> 4)
#define SkG32To4444(g)  ((unsigned)(g) >> 4)
#define SkB32To4444(b)  ((unsigned)(b) >> 4)
568
569static inline U8CPU SkReplicateNibble(unsigned nib) {
570    SkASSERT(nib <= 0xF);
571    return (nib << 4) | nib;
572}
573
// Widen a 4-bit component to 8 bits.
#define SkA4444ToA32(a)     SkReplicateNibble(a)
#define SkR4444ToR32(r)     SkReplicateNibble(r)
#define SkG4444ToG32(g)     SkReplicateNibble(g)
#define SkB4444ToB32(b)     SkReplicateNibble(b)

// Extract a 4-bit channel from a packed 4444 pixel.
#define SkGetPackedA4444(c)     (((unsigned)(c) >> SK_A4444_SHIFT) & 0xF)
#define SkGetPackedR4444(c)     (((unsigned)(c) >> SK_R4444_SHIFT) & 0xF)
#define SkGetPackedG4444(c)     (((unsigned)(c) >> SK_G4444_SHIFT) & 0xF)
#define SkGetPackedB4444(c)     (((unsigned)(c) >> SK_B4444_SHIFT) & 0xF)

// Extract a channel from a packed 4444 pixel and widen it to 8 bits.
#define SkPacked4444ToA32(c)    SkA4444ToA32(SkGetPackedA4444(c))
#define SkPacked4444ToR32(c)    SkR4444ToR32(SkGetPackedR4444(c))
#define SkPacked4444ToG32(c)    SkG4444ToG32(SkGetPackedG4444(c))
#define SkPacked4444ToB32(c)    SkB4444ToB32(SkGetPackedB4444(c))
588
// Debug-only check that c is a legal premultiplied 4444 color: alpha fits in
// 4 bits and no color channel exceeds alpha. Expands to nothing when SK_DEBUG
// is not defined.
#if defined(SK_DEBUG)
static inline void SkPMColor16Assert(U16CPU c) {
    const unsigned alpha = SkGetPackedA4444(c);
    const unsigned red = SkGetPackedR4444(c);
    const unsigned green = SkGetPackedG4444(c);
    const unsigned blue = SkGetPackedB4444(c);

    SkASSERT(alpha <= 0xF);
    SkASSERT(red <= alpha);
    SkASSERT(green <= alpha);
    SkASSERT(blue <= alpha);
}
#else
#define SkPMColor16Assert(c)
#endif
604
605static inline unsigned SkAlpha15To16(unsigned a) {
606    SkASSERT(a <= 0xF);
607    return a + (a >> 3);
608}
609
// Multiplies value by a 0..16 scale and shifts the product down by 4.
// The debug build is a function so the scale range can be asserted.
#if defined(SK_DEBUG)
    static inline int SkAlphaMul4(int value, int scale) {
        SkASSERT((unsigned)scale <= 0x10);
        const int product = value * scale;
        return product >> 4;
    }
#else
    #define SkAlphaMul4(value, scale)   ((value) * (scale) >> 4)
#endif
618
619static inline unsigned SkR4444ToR565(unsigned r) {
620    SkASSERT(r <= 0xF);
621    return (r << (SK_R16_BITS - 4)) | (r >> (8 - SK_R16_BITS));
622}
623
624static inline unsigned SkG4444ToG565(unsigned g) {
625    SkASSERT(g <= 0xF);
626    return (g << (SK_G16_BITS - 4)) | (g >> (8 - SK_G16_BITS));
627}
628
629static inline unsigned SkB4444ToB565(unsigned b) {
630    SkASSERT(b <= 0xF);
631    return (b << (SK_B16_BITS - 4)) | (b >> (8 - SK_B16_BITS));
632}
633
634static inline SkPMColor16 SkPackARGB4444(unsigned a, unsigned r,
635                                         unsigned g, unsigned b) {
636    SkASSERT(a <= 0xF);
637    SkASSERT(r <= a);
638    SkASSERT(g <= a);
639    SkASSERT(b <= a);
640
641    return (SkPMColor16)((a << SK_A4444_SHIFT) | (r << SK_R4444_SHIFT) |
642                         (g << SK_G4444_SHIFT) | (b << SK_B4444_SHIFT));
643}
644
645extern const uint16_t gMask_0F0F;
646
647static inline U16CPU SkAlphaMulQ4(U16CPU c, unsigned scale) {
648    SkASSERT(scale <= 16);
649
650    const unsigned mask = 0xF0F;    //gMask_0F0F;
651
652#if 0
653    unsigned rb = ((c & mask) * scale) >> 4;
654    unsigned ag = ((c >> 4) & mask) * scale;
655    return (rb & mask) | (ag & ~mask);
656#else
657    c = (c & mask) | ((c & (mask << 4)) << 12);
658    c = c * scale >> 4;
659    return (c & mask) | ((c >> 12) & (mask << 4));
660#endif
661}
662
663/** Expand the SkPMColor16 color into a 32bit value that can be scaled all at
664    once by a value up to 16. Used in conjunction with SkCompact_4444.
665*/
666static inline uint32_t SkExpand_4444(U16CPU c) {
667    SkASSERT(c == (uint16_t)c);
668
669    const unsigned mask = 0xF0F;    //gMask_0F0F;
670    return (c & mask) | ((c & ~mask) << 12);
671}
672
673/** Compress an expanded value (from SkExpand_4444) back down to a SkPMColor16.
674    NOTE: this explicitly does not clean the top 16 bits (which may be garbage).
675    It does this for speed, since if it is being written directly to 16bits of
676    memory, the top 16bits will be ignored. Casting the result to uint16_t here
677    would add 2 more instructions, slow us down. It is up to the caller to
678    perform the cast if needed.
679*/
680static inline U16CPU SkCompact_4444(uint32_t c) {
681    const unsigned mask = 0xF0F;    //gMask_0F0F;
682    return (c & mask) | ((c >> 12) & ~mask);
683}
684
685static inline uint16_t SkSrcOver4444To16(SkPMColor16 s, uint16_t d) {
686    unsigned sa = SkGetPackedA4444(s);
687    unsigned sr = SkR4444ToR565(SkGetPackedR4444(s));
688    unsigned sg = SkG4444ToG565(SkGetPackedG4444(s));
689    unsigned sb = SkB4444ToB565(SkGetPackedB4444(s));
690
691    // To avoid overflow, we have to clear the low bit of the synthetic sg
692    // if the src alpha is <= 7.
693    // to see why, try blending 0x4444 on top of 565-white and watch green
694    // overflow (sum == 64)
695    sg &= ~(~(sa >> 3) & 1);
696
697    unsigned scale = SkAlpha15To16(15 - sa);
698    unsigned dr = SkAlphaMul4(SkGetPackedR16(d), scale);
699    unsigned dg = SkAlphaMul4(SkGetPackedG16(d), scale);
700    unsigned db = SkAlphaMul4(SkGetPackedB16(d), scale);
701
702#if 0
703    if (sg + dg > 63) {
704        SkDebugf("---- SkSrcOver4444To16 src=%x dst=%x scale=%d, sg=%d dg=%d\n", s, d, scale, sg, dg);
705    }
706#endif
707    return SkPackRGB16(sr + dr, sg + dg, sb + db);
708}
709
710static inline uint16_t SkBlend4444To16(SkPMColor16 src, uint16_t dst, int scale16) {
711    SkASSERT((unsigned)scale16 <= 16);
712
713    return SkSrcOver4444To16(SkAlphaMulQ4(src, scale16), dst);
714}
715
716static inline uint16_t SkBlend4444(SkPMColor16 src, SkPMColor16 dst, int scale16) {
717    SkASSERT((unsigned)scale16 <= 16);
718
719    uint32_t src32 = SkExpand_4444(src) * scale16;
720    // the scaled srcAlpha is the bottom byte
721#ifdef SK_DEBUG
722    {
723        unsigned srcA = SkGetPackedA4444(src) * scale16;
724        SkASSERT(srcA == (src32 & 0xFF));
725    }
726#endif
727    unsigned dstScale = SkAlpha255To256(255 - (src32 & 0xFF)) >> 4;
728    uint32_t dst32 = SkExpand_4444(dst) * dstScale;
729    return SkCompact_4444((src32 + dst32) >> 4);
730}
731
732static inline SkPMColor SkPixel4444ToPixel32(U16CPU c) {
733    uint32_t d = (SkGetPackedA4444(c) << SK_A32_SHIFT) |
734                 (SkGetPackedR4444(c) << SK_R32_SHIFT) |
735                 (SkGetPackedG4444(c) << SK_G32_SHIFT) |
736                 (SkGetPackedB4444(c) << SK_B32_SHIFT);
737    return d | (d << 4);
738}
739
740static inline SkPMColor16 SkPixel32ToPixel4444(SkPMColor c) {
741    return  (((c >> (SK_A32_SHIFT + 4)) & 0xF) << SK_A4444_SHIFT) |
742    (((c >> (SK_R32_SHIFT + 4)) & 0xF) << SK_R4444_SHIFT) |
743    (((c >> (SK_G32_SHIFT + 4)) & 0xF) << SK_G4444_SHIFT) |
744    (((c >> (SK_B32_SHIFT + 4)) & 0xF) << SK_B4444_SHIFT);
745}
746
747// cheap 2x2 dither
748static inline SkPMColor16 SkDitherARGB32To4444(U8CPU a, U8CPU r,
749                                               U8CPU g, U8CPU b) {
750    // to ensure that we stay a legal premultiplied color, we take the max()
751    // of the truncated and dithered alpha values. If we didn't, cases like
752    // SkDitherARGB32To4444(0x31, 0x2E, ...) would generate SkPackARGB4444(2, 3, ...)
753    // which is not legal premultiplied, since a < color
754    unsigned dithered_a = ((a << 1) - ((a >> 4 << 4) | (a >> 4))) >> 4;
755    a = SkMax32(a >> 4, dithered_a);
756    // these we just dither in place
757    r = ((r << 1) - ((r >> 4 << 4) | (r >> 4))) >> 4;
758    g = ((g << 1) - ((g >> 4 << 4) | (g >> 4))) >> 4;
759    b = ((b << 1) - ((b >> 4 << 4) | (b >> 4))) >> 4;
760
761    return SkPackARGB4444(a, r, g, b);
762}
763
764static inline SkPMColor16 SkDitherPixel32To4444(SkPMColor c) {
765    return SkDitherARGB32To4444(SkGetPackedA32(c), SkGetPackedR32(c),
766                                SkGetPackedG32(c), SkGetPackedB32(c));
767}
768
769/*  Assumes 16bit is in standard RGBA order.
770    Transforms a normal ARGB_8888 into the same byte order as
771    expanded ARGB_4444, but keeps each component 8bits
772*/
773static inline uint32_t SkExpand_8888(SkPMColor c) {
774    return  (((c >> SK_R32_SHIFT) & 0xFF) << 24) |
775            (((c >> SK_G32_SHIFT) & 0xFF) <<  8) |
776            (((c >> SK_B32_SHIFT) & 0xFF) << 16) |
777            (((c >> SK_A32_SHIFT) & 0xFF) <<  0);
778}
779
780/*  Undo the operation of SkExpand_8888, turning the argument back into
781    a SkPMColor.
782*/
783static inline SkPMColor SkCompact_8888(uint32_t c) {
784    return  (((c >> 24) & 0xFF) << SK_R32_SHIFT) |
785            (((c >>  8) & 0xFF) << SK_G32_SHIFT) |
786            (((c >> 16) & 0xFF) << SK_B32_SHIFT) |
787            (((c >>  0) & 0xFF) << SK_A32_SHIFT);
788}
789
790/*  Like SkExpand_8888, this transforms a pmcolor into the expanded 4444 format,
791    but this routine just keeps the high 4bits of each component in the low
792    4bits of the result (just like a newly expanded PMColor16).
793*/
794static inline uint32_t SkExpand32_4444(SkPMColor c) {
795    return  (((c >> (SK_R32_SHIFT + 4)) & 0xF) << 24) |
796            (((c >> (SK_G32_SHIFT + 4)) & 0xF) <<  8) |
797            (((c >> (SK_B32_SHIFT + 4)) & 0xF) << 16) |
798            (((c >> (SK_A32_SHIFT + 4)) & 0xF) <<  0);
799}
800
// takes two values and alternates them as part of a memset16
802// used for cheap 2x2 dithering when the colors are opaque
803void sk_dither_memset16(uint16_t dst[], uint16_t value, uint16_t other, int n);
804
805///////////////////////////////////////////////////////////////////////////////
806
807static inline int SkUpscale31To32(int value) {
808    SkASSERT((unsigned)value <= 31);
809    return value + (value >> 4);
810}
811
812static inline int SkBlend32(int src, int dst, int scale) {
813    SkASSERT((unsigned)src <= 0xFF);
814    SkASSERT((unsigned)dst <= 0xFF);
815    SkASSERT((unsigned)scale <= 32);
816    return dst + ((src - dst) * scale >> 5);
817}
818
819static inline SkPMColor SkBlendLCD16(int srcA, int srcR, int srcG, int srcB,
820                                     SkPMColor dst, uint16_t mask) {
821    if (mask == 0) {
822        return dst;
823    }
824
825    /*  We want all of these in 5bits, hence the shifts in case one of them
826     *  (green) is 6bits.
827     */
828    int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
829    int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
830    int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
831
832    // Now upscale them to 0..32, so we can use blend32
833    maskR = SkUpscale31To32(maskR);
834    maskG = SkUpscale31To32(maskG);
835    maskB = SkUpscale31To32(maskB);
836
837    // srcA has been upscaled to 256 before passed into this function
838    maskR = maskR * srcA >> 8;
839    maskG = maskG * srcA >> 8;
840    maskB = maskB * srcA >> 8;
841
842    int dstR = SkGetPackedR32(dst);
843    int dstG = SkGetPackedG32(dst);
844    int dstB = SkGetPackedB32(dst);
845
846    // LCD blitting is only supported if the dst is known/required
847    // to be opaque
848    return SkPackARGB32(0xFF,
849                        SkBlend32(srcR, dstR, maskR),
850                        SkBlend32(srcG, dstG, maskG),
851                        SkBlend32(srcB, dstB, maskB));
852}
853
854static inline SkPMColor SkBlendLCD16Opaque(int srcR, int srcG, int srcB,
855                                           SkPMColor dst, uint16_t mask,
856                                           SkPMColor opaqueDst) {
857    if (mask == 0) {
858        return dst;
859    }
860
861    if (0xFFFF == mask) {
862        return opaqueDst;
863    }
864
865    /*  We want all of these in 5bits, hence the shifts in case one of them
866     *  (green) is 6bits.
867     */
868    int maskR = SkGetPackedR16(mask) >> (SK_R16_BITS - 5);
869    int maskG = SkGetPackedG16(mask) >> (SK_G16_BITS - 5);
870    int maskB = SkGetPackedB16(mask) >> (SK_B16_BITS - 5);
871
872    // Now upscale them to 0..32, so we can use blend32
873    maskR = SkUpscale31To32(maskR);
874    maskG = SkUpscale31To32(maskG);
875    maskB = SkUpscale31To32(maskB);
876
877    int dstR = SkGetPackedR32(dst);
878    int dstG = SkGetPackedG32(dst);
879    int dstB = SkGetPackedB32(dst);
880
881    // LCD blitting is only supported if the dst is known/required
882    // to be opaque
883    return SkPackARGB32(0xFF,
884                        SkBlend32(srcR, dstR, maskR),
885                        SkBlend32(srcG, dstG, maskG),
886                        SkBlend32(srcB, dstB, maskB));
887}
888
889static inline void SkBlitLCD16Row(SkPMColor dst[], const uint16_t mask[],
890                                  SkColor src, int width, SkPMColor) {
891    int srcA = SkColorGetA(src);
892    int srcR = SkColorGetR(src);
893    int srcG = SkColorGetG(src);
894    int srcB = SkColorGetB(src);
895
896    srcA = SkAlpha255To256(srcA);
897
898    for (int i = 0; i < width; i++) {
899        dst[i] = SkBlendLCD16(srcA, srcR, srcG, srcB, dst[i], mask[i]);
900    }
901}
902
903static inline void SkBlitLCD16OpaqueRow(SkPMColor dst[], const uint16_t mask[],
904                                        SkColor src, int width,
905                                        SkPMColor opaqueDst) {
906    int srcR = SkColorGetR(src);
907    int srcG = SkColorGetG(src);
908    int srcB = SkColorGetB(src);
909
910    for (int i = 0; i < width; i++) {
911        dst[i] = SkBlendLCD16Opaque(srcR, srcG, srcB, dst[i], mask[i],
912                                    opaqueDst);
913    }
914}
915
916#endif
917