pixman-inlines.h revision 1176bdada62cabc6ec4b0308a930e83b679d5d36
1/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
2/*
3 * Copyright © 2000 SuSE, Inc.
4 * Copyright © 2007 Red Hat, Inc.
5 *
6 * Permission to use, copy, modify, distribute, and sell this software and its
7 * documentation for any purpose is hereby granted without fee, provided that
8 * the above copyright notice appear in all copies and that both that
9 * copyright notice and this permission notice appear in supporting
10 * documentation, and that the name of SuSE not be used in advertising or
11 * publicity pertaining to distribution of the software without specific,
12 * written prior permission.  SuSE makes no representations about the
13 * suitability of this software for any purpose.  It is provided "as is"
14 * without express or implied warranty.
15 *
16 * SuSE DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE, INCLUDING ALL
17 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT SHALL SuSE
18 * BE LIABLE FOR ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
20 * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
21 * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
22 *
23 * Author:  Keith Packard, SuSE, Inc.
24 */
25
26#ifndef PIXMAN_FAST_PATH_H__
27#define PIXMAN_FAST_PATH_H__
28
29#include "pixman-private.h"
30
/* Repeat-mode selector for the "cover" fast path variants, i.e. the ones
 * generated without any repeat handling.  The negative literal is wrapped in
 * parentheses so that the macro expands to a single primary expression
 * wherever it is used.
 */
#define PIXMAN_REPEAT_COVER (-1)
32
/* Flags passed to the fast path macro templates to describe their inputs.
 * A set flag means either "property X is available, the template may rely
 * on it" or "property X has to be handled by the template".
 *
 * FLAG_HAVE_SOLID_MASK
 *  The input mask is solid; the template has to handle it.
 *
 * FLAG_HAVE_NON_SOLID_MASK
 *  The input mask is a bits mask; the template has to handle it.
 *
 * The two mask flags are mutually exclusive: never turn both on at once.
 */
#define FLAG_NONE			(0)
#define FLAG_HAVE_SOLID_MASK		(1 << 1)
#define FLAG_HAVE_NON_SOLID_MASK	(1 << 2)
50
/* Source scanlines narrower than this many pixels are extended, so that the
 * scanline function is not called over and over for very short runs.
 */
#define REPEAT_NORMAL_MIN_WIDTH		64
55
56static force_inline pixman_bool_t
57repeat (pixman_repeat_t repeat, int *c, int size)
58{
59    if (repeat == PIXMAN_REPEAT_NONE)
60    {
61	if (*c < 0 || *c >= size)
62	    return FALSE;
63    }
64    else if (repeat == PIXMAN_REPEAT_NORMAL)
65    {
66	while (*c >= size)
67	    *c -= size;
68	while (*c < 0)
69	    *c += size;
70    }
71    else if (repeat == PIXMAN_REPEAT_PAD)
72    {
73	*c = CLIP (*c, 0, size - 1);
74    }
75    else /* REFLECT */
76    {
77	*c = MOD (*c, size * 2);
78	if (*c >= size)
79	    *c = size * 2 - *c - 1;
80    }
81    return TRUE;
82}
83
84static force_inline int
85pixman_fixed_to_bilinear_weight (pixman_fixed_t x)
86{
87    return (x >> (16 - BILINEAR_INTERPOLATION_BITS)) &
88	   ((1 << BILINEAR_INTERPOLATION_BITS) - 1);
89}
90
91#if BILINEAR_INTERPOLATION_BITS <= 4
92/* Inspired by Filter_32_opaque from Skia */
93static force_inline uint32_t
94bilinear_interpolation (uint32_t tl, uint32_t tr,
95			uint32_t bl, uint32_t br,
96			int distx, int disty)
97{
98    int distxy, distxiy, distixy, distixiy;
99    uint32_t lo, hi;
100
101    distx <<= (4 - BILINEAR_INTERPOLATION_BITS);
102    disty <<= (4 - BILINEAR_INTERPOLATION_BITS);
103
104    distxy = distx * disty;
105    distxiy = (distx << 4) - distxy;	/* distx * (16 - disty) */
106    distixy = (disty << 4) - distxy;	/* disty * (16 - distx) */
107    distixiy =
108	16 * 16 - (disty << 4) -
109	(distx << 4) + distxy; /* (16 - distx) * (16 - disty) */
110
111    lo = (tl & 0xff00ff) * distixiy;
112    hi = ((tl >> 8) & 0xff00ff) * distixiy;
113
114    lo += (tr & 0xff00ff) * distxiy;
115    hi += ((tr >> 8) & 0xff00ff) * distxiy;
116
117    lo += (bl & 0xff00ff) * distixy;
118    hi += ((bl >> 8) & 0xff00ff) * distixy;
119
120    lo += (br & 0xff00ff) * distxy;
121    hi += ((br >> 8) & 0xff00ff) * distxy;
122
123    return ((lo >> 8) & 0xff00ff) | (hi & ~0xff00ff);
124}
125
126#else
127#if SIZEOF_LONG > 4
128
129static force_inline uint32_t
130bilinear_interpolation (uint32_t tl, uint32_t tr,
131			uint32_t bl, uint32_t br,
132			int distx, int disty)
133{
134    uint64_t distxy, distxiy, distixy, distixiy;
135    uint64_t tl64, tr64, bl64, br64;
136    uint64_t f, r;
137
138    distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
139    disty <<= (8 - BILINEAR_INTERPOLATION_BITS);
140
141    distxy = distx * disty;
142    distxiy = distx * (256 - disty);
143    distixy = (256 - distx) * disty;
144    distixiy = (256 - distx) * (256 - disty);
145
146    /* Alpha and Blue */
147    tl64 = tl & 0xff0000ff;
148    tr64 = tr & 0xff0000ff;
149    bl64 = bl & 0xff0000ff;
150    br64 = br & 0xff0000ff;
151
152    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
153    r = f & 0x0000ff0000ff0000ull;
154
155    /* Red and Green */
156    tl64 = tl;
157    tl64 = ((tl64 << 16) & 0x000000ff00000000ull) | (tl64 & 0x0000ff00ull);
158
159    tr64 = tr;
160    tr64 = ((tr64 << 16) & 0x000000ff00000000ull) | (tr64 & 0x0000ff00ull);
161
162    bl64 = bl;
163    bl64 = ((bl64 << 16) & 0x000000ff00000000ull) | (bl64 & 0x0000ff00ull);
164
165    br64 = br;
166    br64 = ((br64 << 16) & 0x000000ff00000000ull) | (br64 & 0x0000ff00ull);
167
168    f = tl64 * distixiy + tr64 * distxiy + bl64 * distixy + br64 * distxy;
169    r |= ((f >> 16) & 0x000000ff00000000ull) | (f & 0xff000000ull);
170
171    return (uint32_t)(r >> 16);
172}
173
174#else
175
176static force_inline uint32_t
177bilinear_interpolation (uint32_t tl, uint32_t tr,
178			uint32_t bl, uint32_t br,
179			int distx, int disty)
180{
181    int distxy, distxiy, distixy, distixiy;
182    uint32_t f, r;
183
184    distx <<= (8 - BILINEAR_INTERPOLATION_BITS);
185    disty <<= (8 - BILINEAR_INTERPOLATION_BITS);
186
187    distxy = distx * disty;
188    distxiy = (distx << 8) - distxy;	/* distx * (256 - disty) */
189    distixy = (disty << 8) - distxy;	/* disty * (256 - distx) */
190    distixiy =
191	256 * 256 - (disty << 8) -
192	(distx << 8) + distxy;		/* (256 - distx) * (256 - disty) */
193
194    /* Blue */
195    r = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
196      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
197
198    /* Green */
199    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
200      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
201    r |= f & 0xff000000;
202
203    tl >>= 16;
204    tr >>= 16;
205    bl >>= 16;
206    br >>= 16;
207    r >>= 16;
208
209    /* Red */
210    f = (tl & 0x000000ff) * distixiy + (tr & 0x000000ff) * distxiy
211      + (bl & 0x000000ff) * distixy  + (br & 0x000000ff) * distxy;
212    r |= f & 0x00ff0000;
213
214    /* Alpha */
215    f = (tl & 0x0000ff00) * distixiy + (tr & 0x0000ff00) * distxiy
216      + (bl & 0x0000ff00) * distixy  + (br & 0x0000ff00) * distxy;
217    r |= f & 0xff000000;
218
219    return r;
220}
221
222#endif
223#endif // BILINEAR_INTERPOLATION_BITS <= 4
224
225/*
226 * For each scanline fetched from source image with PAD repeat:
227 * - calculate how many pixels need to be padded on the left side
228 * - calculate how many pixels need to be padded on the right side
229 * - update width to only count pixels which are fetched from the image
230 * All this information is returned via 'width', 'left_pad', 'right_pad'
231 * arguments. The code is assuming that 'unit_x' is positive.
232 *
233 * Note: 64-bit math is used in order to avoid potential overflows, which
234 *       is probably excessive in many cases. This particular function
235 *       may need its own correctness test and performance tuning.
236 */
237static force_inline void
238pad_repeat_get_scanline_bounds (int32_t         source_image_width,
239				pixman_fixed_t  vx,
240				pixman_fixed_t  unit_x,
241				int32_t *       width,
242				int32_t *       left_pad,
243				int32_t *       right_pad)
244{
245    int64_t max_vx = (int64_t) source_image_width << 16;
246    int64_t tmp;
247    if (vx < 0)
248    {
249	tmp = ((int64_t) unit_x - 1 - vx) / unit_x;
250	if (tmp > *width)
251	{
252	    *left_pad = *width;
253	    *width = 0;
254	}
255	else
256	{
257	    *left_pad = (int32_t) tmp;
258	    *width -= (int32_t) tmp;
259	}
260    }
261    else
262    {
263	*left_pad = 0;
264    }
265    tmp = ((int64_t) unit_x - 1 - vx + max_vx) / unit_x - *left_pad;
266    if (tmp < 0)
267    {
268	*right_pad = *width;
269	*width = 0;
270    }
271    else if (tmp >= *width)
272    {
273	*right_pad = 0;
274    }
275    else
276    {
277	*right_pad = *width - (int32_t) tmp;
278	*width = (int32_t) tmp;
279    }
280}
281
282/* A macroified version of specialized nearest scalers for some
283 * common 8888 and 565 formats. It supports SRC and OVER ops.
284 *
285 * There are two repeat versions, one that handles repeat normal,
286 * and one without repeat handling that only works if the src region
287 * used is completely covered by the pre-repeated source samples.
288 *
289 * The loops are unrolled to process two pixels per iteration for better
290 * performance on most CPU architectures (superscalar processors
291 * can issue several operations simultaneously, other processors can hide
292 * instructions latencies by pipelining operations). Unrolling more
293 * does not make much sense because the compiler will start running out
294 * of spare registers soon.
295 */
296
/* Alpha of a source pixel, one macro per source format name */
#define GET_8888_ALPHA(s)	((s) >> 24)
/* There is no OVER with a 565 source, so this one is never really used;
 * it only has to exist for the templates to build.
 */
#define GET_0565_ALPHA(s)	(0xff)
#define GET_x888_ALPHA(s)	(0xff)
302
/* Generate 'scanline_func_name', a nearest scaling worker for one scanline.
 *
 * The generated function writes 'w' destination pixels.  For each of them
 * the source pixel at src[pixman_fixed_to_int (vx)] is fetched and vx is
 * advanced by unit_x; for NORMAL repeat vx is then brought back below zero
 * by subtracting src_width_fixed.  OP has to be SRC or OVER, anything else
 * calls abort().  With OVER, a TRUE 'fully_transparent_src' makes the
 * function return without touching the destination.
 *
 * Pixels are handled in pairs, followed by at most one leftover pixel.
 */
#define FAST_NEAREST_SCANLINE(scanline_func_name, SRC_FORMAT, DST_FORMAT,			\
			      src_type_t, dst_type_t, OP, repeat_mode)				\
static force_inline void									\
scanline_func_name (dst_type_t       *dst,							\
		    const src_type_t *src,							\
		    int32_t           w,							\
		    pixman_fixed_t    vx,							\
		    pixman_fixed_t    unit_x,							\
		    pixman_fixed_t    src_width_fixed,						\
		    pixman_bool_t     fully_transparent_src)					\
{												\
	uint32_t   dest_px;									\
	src_type_t px_a, px_b;									\
	uint8_t    alpha_a, alpha_b;								\
	int        pos_a, pos_b;								\
												\
	if (PIXMAN_OP_ ## OP == PIXMAN_OP_OVER && fully_transparent_src)			\
	    return;										\
												\
	if (!(PIXMAN_OP_ ## OP == PIXMAN_OP_SRC || PIXMAN_OP_ ## OP == PIXMAN_OP_OVER))		\
	    abort();										\
												\
	for (; w >= 2; w -= 2)									\
	{											\
	    pos_a = pixman_fixed_to_int (vx);							\
	    vx += unit_x;									\
	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
	    {											\
		/* This works because we know that unit_x is positive */			\
		while (vx >= 0)									\
		    vx -= src_width_fixed;							\
	    }											\
	    px_a = src[pos_a];									\
												\
	    pos_b = pixman_fixed_to_int (vx);							\
	    vx += unit_x;									\
	    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
	    {											\
		/* This works because we know that unit_x is positive */			\
		while (vx >= 0)									\
		    vx -= src_width_fixed;							\
	    }											\
	    px_b = src[pos_b];									\
												\
	    if (PIXMAN_OP_ ## OP != PIXMAN_OP_OVER)						\
	    {											\
		/* PIXMAN_OP_SRC */								\
		dst[0] = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (px_a);			\
		dst[1] = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (px_b);			\
		dst += 2;									\
		continue;									\
	    }											\
												\
	    alpha_a = GET_ ## SRC_FORMAT ## _ALPHA(px_a);					\
	    alpha_b = GET_ ## SRC_FORMAT ## _ALPHA(px_b);					\
												\
	    if (alpha_a == 0xff)								\
	    {											\
		dst[0] = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (px_a);			\
	    }											\
	    else if (px_a)									\
	    {											\
		dest_px = convert_ ## DST_FORMAT ## _to_8888 (dst[0]);				\
		px_a = convert_ ## SRC_FORMAT ## _to_8888 (px_a);				\
		alpha_a ^= 0xff;								\
		UN8x4_MUL_UN8_ADD_UN8x4 (dest_px, alpha_a, px_a);				\
		dst[0] = convert_8888_to_ ## DST_FORMAT (dest_px);				\
	    }											\
												\
	    if (alpha_b == 0xff)								\
	    {											\
		dst[1] = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (px_b);			\
	    }											\
	    else if (px_b)									\
	    {											\
		dest_px = convert_ ## DST_FORMAT ## _to_8888 (dst[1]);				\
		px_b = convert_ ## SRC_FORMAT ## _to_8888 (px_b);				\
		alpha_b ^= 0xff;								\
		UN8x4_MUL_UN8_ADD_UN8x4 (dest_px, alpha_b, px_b);				\
		dst[1] = convert_8888_to_ ## DST_FORMAT (dest_px);				\
	    }											\
	    dst += 2;										\
	}											\
												\
	/* Leftover pixel of an odd width */							\
	if (w & 1)										\
	{											\
	    pos_a = pixman_fixed_to_int (vx);							\
	    px_a = src[pos_a];									\
												\
	    if (PIXMAN_OP_ ## OP != PIXMAN_OP_OVER)						\
	    {											\
		/* PIXMAN_OP_SRC */								\
		dst[0] = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (px_a);			\
	    }											\
	    else										\
	    {											\
		alpha_a = GET_ ## SRC_FORMAT ## _ALPHA(px_a);					\
												\
		if (alpha_a == 0xff)								\
		{										\
		    dst[0] = convert_ ## SRC_FORMAT ## _to_ ## DST_FORMAT (px_a);		\
		}										\
		else if (px_a)									\
		{										\
		    dest_px = convert_ ## DST_FORMAT ## _to_8888 (dst[0]);			\
		    px_a = convert_ ## SRC_FORMAT ## _to_8888 (px_a);				\
		    alpha_a ^= 0xff;								\
		    UN8x4_MUL_UN8_ADD_UN8x4 (dest_px, alpha_a, px_a);				\
		    dst[0] = convert_8888_to_ ## DST_FORMAT (dest_px);				\
		}										\
	    }											\
	    dst++;										\
	}											\
}
416
/* Generate the compositing entry point
 * fast_composite_scaled_nearest<scale_func_name> (imp, info), which walks
 * the destination rectangle one scanline at a time and hands every scanline
 * (or piece of it) to 'scanline_func', called as
 *
 *	scanline_func (mask, dst, src, w, vx, unit_x, src_width_fixed,
 *		       fully_transparent_src)
 *
 * - have_mask / mask_is_solid: with a solid mask its value is fetched once
 *   through _pixman_image_get_solid() and 'mask' keeps pointing at it; with
 *   a non-solid mask 'mask' follows the mask image line by line.
 * - The start position is the centre of pixel (src_x, src_y) pushed through
 *   the source transform, minus pixman_fixed_e; unit_x and unit_y are taken
 *   from matrix[0][0] and matrix[1][1] of that transform.
 * - NORMAL repeat: vx and vy are wrapped with repeat().
 * - PAD and NONE repeat: the scanline is split by
 *   pad_repeat_get_scanline_bounds() into left padding, image pixels and
 *   right padding, each drawn by its own scanline_func call.  For NONE the
 *   padding (and every row outside the image) is read from a static zero
 *   pixel with fully_transparent_src set to TRUE.
 * - Whenever real image pixels are drawn, the source pointer is moved
 *   forward by one image width and vx backward by src_width_fixed, so the
 *   vx seen by scanline_func starts out negative.
 */
#define FAST_NEAREST_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
				  dst_type_t, repeat_mode, have_mask, mask_is_solid)		\
static void											\
fast_composite_scaled_nearest  ## scale_func_name (pixman_implementation_t *imp,		\
						   pixman_composite_info_t *info)               \
{												\
    PIXMAN_COMPOSITE_ARGS (info);					                        \
    dst_type_t *dst_line;						                        \
    mask_type_t *mask_line;									\
    src_type_t *src_first_line;									\
    int       y;										\
    pixman_fixed_t src_width_fixed = pixman_int_to_fixed (src_image->bits.width);		\
    pixman_fixed_t max_vy;									\
    pixman_vector_t v;										\
    pixman_fixed_t vx, vy;									\
    pixman_fixed_t unit_x, unit_y;								\
    int32_t left_pad, right_pad;								\
												\
    src_type_t *src;										\
    dst_type_t *dst;										\
    mask_type_t solid_mask;									\
    const mask_type_t *mask = &solid_mask;							\
    int src_stride, mask_stride, dst_stride;							\
												\
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\
    if (have_mask)										\
    {												\
	if (mask_is_solid)									\
	    solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\
	else											\
	    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,			\
				   mask_stride, mask_line, 1);					\
    }												\
    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
     * transformed from destination space to source space */					\
    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
												\
    /* reference point is the center of the pixel */						\
    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
    v.vector[2] = pixman_fixed_1;								\
												\
    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
	return;											\
												\
    unit_x = src_image->common.transform->matrix[0][0];						\
    unit_y = src_image->common.transform->matrix[1][1];						\
												\
    /* Round down to closest integer, ensuring that 0.5 rounds to 0, not 1 */			\
    v.vector[0] -= pixman_fixed_e;								\
    v.vector[1] -= pixman_fixed_e;								\
												\
    vx = v.vector[0];										\
    vy = v.vector[1];										\
												\
    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
    {												\
	max_vy = pixman_int_to_fixed (src_image->bits.height);					\
												\
	/* Clamp repeating positions inside the actual samples */				\
	repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);					\
	repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
    }												\
												\
    /* PAD and NONE: split the scanline into left padding, image pixels and			\
     * right padding, then move vx past the left padding */					\
    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
    {												\
	pad_repeat_get_scanline_bounds (src_image->bits.width, vx, unit_x,			\
					&width, &left_pad, &right_pad);				\
	vx += left_pad * unit_x;								\
    }												\
												\
    while (--height >= 0)									\
    {												\
	dst = dst_line;										\
	dst_line += dst_stride;									\
	if (have_mask && !mask_is_solid)							\
	{											\
	    mask = mask_line;									\
	    mask_line += mask_stride;								\
	}											\
												\
	y = pixman_fixed_to_int (vy);								\
	vy += unit_y;										\
	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
	    repeat (PIXMAN_REPEAT_NORMAL, &vy, max_vy);						\
	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
	{											\
	    repeat (PIXMAN_REPEAT_PAD, &y, src_image->bits.height);				\
	    src = src_first_line + src_stride * y;						\
	    if (left_pad > 0)									\
	    {											\
		/* The two width terms cancel, so the pointer is src + 1; the step		\
		 * is 0 and the start position is -pixman_fixed_e */				\
		scanline_func (mask, dst,							\
			       src + src_image->bits.width - src_image->bits.width + 1,		\
			       left_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE);		\
	    }											\
	    if (width > 0)									\
	    {											\
		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
			       dst + left_pad, src + src_image->bits.width, width,		\
			       vx - src_width_fixed, unit_x, src_width_fixed, FALSE);		\
	    }											\
	    if (right_pad > 0)									\
	    {											\
		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
			       dst + left_pad + width, src + src_image->bits.width,		\
			       right_pad, -pixman_fixed_e, 0, src_width_fixed, FALSE);		\
	    }											\
	}											\
	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
	{											\
	    static const src_type_t zero[1] = { 0 };						\
	    /* Row outside of the image: the whole scanline uses the zero pixel */		\
	    if (y < 0 || y >= src_image->bits.height)						\
	    {											\
		scanline_func (mask, dst, zero + 1, left_pad + width + right_pad,		\
			       -pixman_fixed_e, 0, src_width_fixed, TRUE);			\
		continue;									\
	    }											\
	    src = src_first_line + src_stride * y;						\
	    if (left_pad > 0)									\
	    {											\
		scanline_func (mask, dst, zero + 1, left_pad,					\
			       -pixman_fixed_e, 0, src_width_fixed, TRUE);			\
	    }											\
	    if (width > 0)									\
	    {											\
		scanline_func (mask + (mask_is_solid ? 0 : left_pad),				\
			       dst + left_pad, src + src_image->bits.width, width,		\
			       vx - src_width_fixed, unit_x, src_width_fixed, FALSE);		\
	    }											\
	    if (right_pad > 0)									\
	    {											\
		scanline_func (mask + (mask_is_solid ? 0 : left_pad + width),			\
			       dst + left_pad + width, zero + 1, right_pad,			\
			       -pixman_fixed_e, 0, src_width_fixed, TRUE);			\
	    }											\
	}											\
	else											\
	{											\
	    /* Any other repeat mode: one call covers the whole scanline */			\
	    src = src_first_line + src_stride * y;						\
	    scanline_func (mask, dst, src + src_image->bits.width, width, vx - src_width_fixed,	\
			   unit_x, src_width_fixed, FALSE);					\
	}											\
    }												\
}
562
/* Forwards to FAST_NEAREST_MAINLOOP_INT with '_' pasted in front of
 * scale_func_name.  The extra macro level is a workaround for old sun
 * studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764
 */
#define FAST_NEAREST_MAINLOOP_COMMON(scale_func_name, scanline_func,		\
				     src_type_t, mask_type_t, dst_type_t,	\
				     repeat_mode, have_mask, mask_is_solid)	\
    FAST_NEAREST_MAINLOOP_INT (_ ## scale_func_name, scanline_func,		\
			       src_type_t, mask_type_t, dst_type_t,		\
			       repeat_mode, have_mask, mask_is_solid)
568
/* Main loop for scanline functions that take no mask argument.
 *
 * A wrapper named <scanline_func><scale_func_name>_wrapper is generated
 * first; it accepts the mask pointer that FAST_NEAREST_MAINLOOP_INT always
 * passes, ignores it and forwards the remaining arguments to scanline_func.
 * The main loop is then instantiated on top of that wrapper with a uint8_t
 * mask type and both have_mask and mask_is_solid set to FALSE.
 */
#define FAST_NEAREST_MAINLOOP_NOMASK(scale_func_name, scanline_func,		\
				     src_type_t, dst_type_t, repeat_mode)	\
    static force_inline void							\
    scanline_func##scale_func_name##_wrapper (const uint8_t    *unused_mask,	\
					      dst_type_t       *dst_row,	\
					      const src_type_t *src_row,	\
					      int32_t           count,		\
					      pixman_fixed_t    start_vx,	\
					      pixman_fixed_t    step_x,		\
					      pixman_fixed_t    src_width,	\
					      pixman_bool_t     transparent_src)\
    {										\
	scanline_func (dst_row, src_row, count, start_vx, step_x,		\
		       src_width, transparent_src);				\
    }										\
    FAST_NEAREST_MAINLOOP_INT (scale_func_name,					\
			       scanline_func##scale_func_name##_wrapper,	\
			       src_type_t, uint8_t, dst_type_t, repeat_mode,	\
			       FALSE, FALSE)
586
/* Mask-less main loop: forwards to FAST_NEAREST_MAINLOOP_NOMASK with '_'
 * pasted in front of scale_func_name.
 */
#define FAST_NEAREST_MAINLOOP(scale_func_name, scanline_func,			\
			      src_type_t, dst_type_t, repeat_mode)		\
    FAST_NEAREST_MAINLOOP_NOMASK (_ ## scale_func_name, scanline_func,		\
				  src_type_t, dst_type_t, repeat_mode)
591
/* Instantiate a complete mask-less nearest scaler: the scanline function
 * scaled_nearest_scanline_<scale_func_name>_<OP> followed by the main loop
 * fast_composite_scaled_nearest_<scale_func_name>_<OP> built around it.
 */
#define FAST_NEAREST(scale_func_name, SRC_FORMAT, DST_FORMAT,			\
		     src_type_t, dst_type_t, OP, repeat_mode)			\
    FAST_NEAREST_SCANLINE (							\
	scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,			\
	SRC_FORMAT, DST_FORMAT, src_type_t, dst_type_t, OP, repeat_mode)	\
    FAST_NEAREST_MAINLOOP_NOMASK (						\
	_ ## scale_func_name ## _ ## OP,					\
	scaled_nearest_scanline_ ## scale_func_name ## _ ## OP,			\
	src_type_t, dst_type_t, repeat_mode)
600
601
/* Source flags shared by all the SIMPLE_NEAREST_* initializers below */
#define SCALED_NEAREST_FLAGS							\
    (FAST_PATH_SCALE_TRANSFORM | FAST_PATH_NO_ALPHA_MAP |			\
     FAST_PATH_NEAREST_FILTER | FAST_PATH_NO_ACCESSORS |			\
     FAST_PATH_NARROW_FORMAT)
608
/* Initializer for the mask-less nearest fast path with NORMAL repeat,
 * bound to fast_composite_scaled_nearest_<func>_normal_<op>.
 */
#define SIMPLE_NEAREST_FAST_PATH_NORMAL(op,s,d,func)				\
    {										\
	PIXMAN_OP_ ## op,							\
	PIXMAN_ ## s,								\
	(SCALED_NEAREST_FLAGS | FAST_PATH_NORMAL_REPEAT |			\
	 FAST_PATH_X_UNIT_POSITIVE),						\
	PIXMAN_null, 0,								\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,		\
    }
619
/* Initializer for the mask-less nearest fast path with PAD repeat,
 * bound to fast_composite_scaled_nearest_<func>_pad_<op>.
 */
#define SIMPLE_NEAREST_FAST_PATH_PAD(op,s,d,func)				\
    {										\
	PIXMAN_OP_ ## op,							\
	PIXMAN_ ## s,								\
	(SCALED_NEAREST_FLAGS | FAST_PATH_PAD_REPEAT |				\
	 FAST_PATH_X_UNIT_POSITIVE),						\
	PIXMAN_null, 0,								\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,		\
    }
630
/* Initializer for the mask-less nearest fast path with NONE repeat,
 * bound to fast_composite_scaled_nearest_<func>_none_<op>.
 */
#define SIMPLE_NEAREST_FAST_PATH_NONE(op,s,d,func)				\
    {										\
	PIXMAN_OP_ ## op,							\
	PIXMAN_ ## s,								\
	(SCALED_NEAREST_FLAGS | FAST_PATH_NONE_REPEAT |				\
	 FAST_PATH_X_UNIT_POSITIVE),						\
	PIXMAN_null, 0,								\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,		\
    }
641
/* Initializer for the mask-less nearest fast path that requires
 * FAST_PATH_SAMPLES_COVER_CLIP_NEAREST, bound to
 * fast_composite_scaled_nearest_<func>_cover_<op>.
 */
#define SIMPLE_NEAREST_FAST_PATH_COVER(op,s,d,func)				\
    {										\
	PIXMAN_OP_ ## op,							\
	PIXMAN_ ## s,								\
	(SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST),		\
	PIXMAN_null, 0,								\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,		\
    }
650
/* Initializer for the a8-masked nearest fast path with NORMAL repeat,
 * bound to fast_composite_scaled_nearest_<func>_normal_<op>.
 */
#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NORMAL(op,s,d,func)			\
    {										\
	PIXMAN_OP_ ## op,							\
	PIXMAN_ ## s,								\
	(SCALED_NEAREST_FLAGS | FAST_PATH_NORMAL_REPEAT |			\
	 FAST_PATH_X_UNIT_POSITIVE),						\
	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),			\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,		\
    }
661
/* Initializer for the a8-masked nearest fast path with PAD repeat,
 * bound to fast_composite_scaled_nearest_<func>_pad_<op>.
 */
#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD(op,s,d,func)			\
    {										\
	PIXMAN_OP_ ## op,							\
	PIXMAN_ ## s,								\
	(SCALED_NEAREST_FLAGS | FAST_PATH_PAD_REPEAT |				\
	 FAST_PATH_X_UNIT_POSITIVE),						\
	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),			\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,		\
    }
672
/* Initializer for the a8-masked nearest fast path with NONE repeat,
 * bound to fast_composite_scaled_nearest_<func>_none_<op>.
 */
#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE(op,s,d,func)			\
    {										\
	PIXMAN_OP_ ## op,							\
	PIXMAN_ ## s,								\
	(SCALED_NEAREST_FLAGS | FAST_PATH_NONE_REPEAT |				\
	 FAST_PATH_X_UNIT_POSITIVE),						\
	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),			\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,		\
    }
683
/* Initializer for the a8-masked nearest fast path that requires
 * FAST_PATH_SAMPLES_COVER_CLIP_NEAREST, bound to
 * fast_composite_scaled_nearest_<func>_cover_<op>.
 */
#define SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER(op,s,d,func)			\
    {										\
	PIXMAN_OP_ ## op,							\
	PIXMAN_ ## s,								\
	(SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST),		\
	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),			\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,		\
    }
692
/* Initializer for the solid-masked nearest fast path with NORMAL repeat,
 * bound to fast_composite_scaled_nearest_<func>_normal_<op>.
 */
#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func)			\
    {										\
	PIXMAN_OP_ ## op,							\
	PIXMAN_ ## s,								\
	(SCALED_NEAREST_FLAGS | FAST_PATH_NORMAL_REPEAT |			\
	 FAST_PATH_X_UNIT_POSITIVE),						\
	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),		\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_nearest_ ## func ## _normal ## _ ## op,		\
    }
703
/* Initializer for the solid-masked nearest fast path with PAD repeat,
 * bound to fast_composite_scaled_nearest_<func>_pad_<op>.
 */
#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD(op,s,d,func)			\
    {										\
	PIXMAN_OP_ ## op,							\
	PIXMAN_ ## s,								\
	(SCALED_NEAREST_FLAGS | FAST_PATH_PAD_REPEAT |				\
	 FAST_PATH_X_UNIT_POSITIVE),						\
	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),		\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_nearest_ ## func ## _pad ## _ ## op,		\
    }
714
/* Initializer for the solid-masked nearest fast path with NONE repeat,
 * bound to fast_composite_scaled_nearest_<func>_none_<op>.
 */
#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE(op,s,d,func)			\
    {										\
	PIXMAN_OP_ ## op,							\
	PIXMAN_ ## s,								\
	(SCALED_NEAREST_FLAGS | FAST_PATH_NONE_REPEAT |				\
	 FAST_PATH_X_UNIT_POSITIVE),						\
	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),		\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_nearest_ ## func ## _none ## _ ## op,		\
    }
725
/* Initializer for the solid-masked nearest fast path that requires
 * FAST_PATH_SAMPLES_COVER_CLIP_NEAREST, bound to
 * fast_composite_scaled_nearest_<func>_cover_<op>.
 */
#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER(op,s,d,func)			\
    {										\
	PIXMAN_OP_ ## op,							\
	PIXMAN_ ## s,								\
	(SCALED_NEAREST_FLAGS | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST),		\
	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),		\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,					\
	fast_composite_scaled_nearest_ ## func ## _cover ## _ ## op,		\
    }
734
/* All four mask-less initializers as one comma separated list.  The 'cover'
 * variant comes first because it is the preferred, faster one.
 */
#define SIMPLE_NEAREST_FAST_PATH(op,s,d,func)					\
    SIMPLE_NEAREST_FAST_PATH_COVER (op,s,d,func),				\
    SIMPLE_NEAREST_FAST_PATH_NONE (op,s,d,func),				\
    SIMPLE_NEAREST_FAST_PATH_PAD (op,s,d,func),					\
    SIMPLE_NEAREST_FAST_PATH_NORMAL (op,s,d,func)
741
/* The COVER, NONE and PAD a8-masked initializers as one comma separated
 * list; the NORMAL variant is not part of it.
 */
#define SIMPLE_NEAREST_A8_MASK_FAST_PATH(op,s,d,func)				\
    SIMPLE_NEAREST_A8_MASK_FAST_PATH_COVER (op,s,d,func),			\
    SIMPLE_NEAREST_A8_MASK_FAST_PATH_NONE (op,s,d,func),			\
    SIMPLE_NEAREST_A8_MASK_FAST_PATH_PAD (op,s,d,func)
746
/* The COVER, NONE and PAD solid-masked initializers as one comma separated
 * list; the NORMAL variant is not part of it.
 */
#define SIMPLE_NEAREST_SOLID_MASK_FAST_PATH(op,s,d,func)			\
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),			\
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),			\
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_PAD (op,s,d,func)
751
752/*****************************************************************************/
753
754/*
755 * Identify 5 zones in each scanline for bilinear scaling. Depending on
756 * whether 2 pixels to be interpolated are fetched from the image itself,
757 * from the padding area around it or from both image and padding area.
758 */
759static force_inline void
760bilinear_pad_repeat_get_scanline_bounds (int32_t         source_image_width,
761					 pixman_fixed_t  vx,
762					 pixman_fixed_t  unit_x,
763					 int32_t *       left_pad,
764					 int32_t *       left_tz,
765					 int32_t *       width,
766					 int32_t *       right_tz,
767					 int32_t *       right_pad)
768{
769	int width1 = *width, left_pad1, right_pad1;
770	int width2 = *width, left_pad2, right_pad2;
771
772	pad_repeat_get_scanline_bounds (source_image_width, vx, unit_x,
773					&width1, &left_pad1, &right_pad1);
774	pad_repeat_get_scanline_bounds (source_image_width, vx + pixman_fixed_1,
775					unit_x, &width2, &left_pad2, &right_pad2);
776
777	*left_pad = left_pad2;
778	*left_tz = left_pad1 - left_pad2;
779	*right_tz = right_pad2 - right_pad1;
780	*right_pad = right_pad1;
781	*width -= *left_pad + *left_tz + *right_tz + *right_pad;
782}
783
784/*
785 * Main loop template for single pass bilinear scaling. It needs to be
786 * provided with 'scanline_func' which should do the compositing operation.
787 * The needed function has the following prototype:
788 *
789 *	scanline_func (dst_type_t *       dst,
 *		       const mask_type_t * mask,
791 *		       const src_type_t * src_top,
792 *		       const src_type_t * src_bottom,
793 *		       int32_t            width,
794 *		       int                weight_top,
795 *		       int                weight_bottom,
796 *		       pixman_fixed_t     vx,
797 *		       pixman_fixed_t     unit_x,
798 *		       pixman_fixed_t     max_vx,
799 *		       pixman_bool_t      zero_src)
800 *
801 * Where:
802 *  dst                 - destination scanline buffer for storing results
803 *  mask                - mask buffer (or single value for solid mask)
804 *  src_top, src_bottom - two source scanlines
805 *  width               - number of pixels to process
806 *  weight_top          - weight of the top row for interpolation
807 *  weight_bottom       - weight of the bottom row for interpolation
808 *  vx                  - initial position for fetching the first pair of
809 *                        pixels from the source buffer
810 *  unit_x              - position increment needed to move to the next pair
811 *                        of pixels
812 *  max_vx              - image size as a fixed point value, can be used for
813 *                        implementing NORMAL repeat (when it is supported)
814 *  zero_src            - boolean hint variable, which is set to TRUE when
815 *                        all source pixels are fetched from zero padding
816 *                        zone for NONE repeat
817 *
818 * Note: normally the sum of 'weight_top' and 'weight_bottom' is equal to
819 *       BILINEAR_INTERPOLATION_RANGE, but sometimes it may be less than that
820 *       for NONE repeat when handling fuzzy antialiased top or bottom image
821 *       edges. Also both top and bottom weight variables are guaranteed to
822 *       have value, which is less than BILINEAR_INTERPOLATION_RANGE.
823 *       For example, the weights can fit into unsigned byte or be used
824 *       with 8-bit SIMD multiplication instructions for 8-bit interpolation
825 *       precision.
826 */
/*
 * FAST_BILINEAR_MAINLOOP_INT generates the static composite function
 * 'fast_composite_scaled_bilinear<scale_func_name> (imp, info)'.
 *
 * Template parameters:
 *  scale_func_name - suffix pasted onto 'fast_composite_scaled_bilinear' to
 *                    form the name of the generated function
 *  scanline_func   - scanline worker with the prototype described above; it
 *                    is called once per horizontal run of pixels
 *  src_type_t      - pixel type of the source scanlines
 *  mask_type_t     - pixel type of the mask (or of the solid mask value)
 *  dst_type_t      - pixel type of the destination scanlines
 *  repeat_mode     - token pasted onto 'PIXMAN_REPEAT_'. PAD, NONE and NORMAL
 *                    get dedicated handling; any other value (COVER) results
 *                    in a single scanline_func call per destination row
 *  flags           - FLAG_HAVE_SOLID_MASK, FLAG_HAVE_NON_SOLID_MASK or neither
 *
 * The generated function maps the centre of the first destination pixel into
 * source space (returning without drawing if the transform fails), takes the
 * x/y steps from the diagonal of the source transform matrix and then walks
 * the destination rows. Each row is split into runs according to the repeat
 * mode and every run is handed to scanline_func.
 */
#define FAST_BILINEAR_MAINLOOP_INT(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
				  dst_type_t, repeat_mode, flags)				\
static void											\
fast_composite_scaled_bilinear ## scale_func_name (pixman_implementation_t *imp,		\
						   pixman_composite_info_t *info)		\
{												\
    PIXMAN_COMPOSITE_ARGS (info);								\
    dst_type_t *dst_line;									\
    mask_type_t *mask_line;									\
    src_type_t *src_first_line;									\
    int       y1, y2;										\
    pixman_fixed_t max_vx = INT32_MAX; /* suppress uninitialized variable warning */		\
    pixman_vector_t v;										\
    pixman_fixed_t vx, vy;									\
    pixman_fixed_t unit_x, unit_y;								\
    int32_t left_pad, left_tz, right_tz, right_pad;						\
												\
    dst_type_t *dst;										\
    mask_type_t solid_mask;									\
    const mask_type_t *mask = &solid_mask;							\
    int src_stride, mask_stride, dst_stride;							\
												\
    int src_width;										\
    pixman_fixed_t src_width_fixed;								\
    int max_x;											\
    pixman_bool_t need_src_extension;								\
												\
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, dst_type_t, dst_stride, dst_line, 1);	\
    /* a solid mask is fetched once and 'mask' keeps pointing at it for every row */		\
    if (flags & FLAG_HAVE_SOLID_MASK)								\
    {												\
	solid_mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);	\
	mask_stride = 0;									\
    }												\
    else if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
    {												\
	PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, mask_type_t,				\
			       mask_stride, mask_line, 1);					\
    }												\
												\
    /* pass in 0 instead of src_x and src_y because src_x and src_y need to be			\
     * transformed from destination space to source space */					\
    PIXMAN_IMAGE_GET_LINE (src_image, 0, 0, src_type_t, src_stride, src_first_line, 1);		\
												\
    /* reference point is the center of the pixel */						\
    v.vector[0] = pixman_int_to_fixed (src_x) + pixman_fixed_1 / 2;				\
    v.vector[1] = pixman_int_to_fixed (src_y) + pixman_fixed_1 / 2;				\
    v.vector[2] = pixman_fixed_1;								\
												\
    if (!pixman_transform_point_3d (src_image->common.transform, &v))				\
	return;											\
												\
    unit_x = src_image->common.transform->matrix[0][0];						\
    unit_y = src_image->common.transform->matrix[1][1];						\
												\
    /* undo the half pixel offset that was added before the transformation */			\
    v.vector[0] -= pixman_fixed_1 / 2;								\
    v.vector[1] -= pixman_fixed_1 / 2;								\
												\
    vy = v.vector[1];										\
												\
    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD ||					\
	PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)					\
    {												\
	/* split the row into padding, transition and in-image parts; 'width' is */		\
	/* updated in place by the helper */							\
	bilinear_pad_repeat_get_scanline_bounds (src_image->bits.width, v.vector[0], unit_x,	\
					&left_pad, &left_tz, &width, &right_tz, &right_pad);	\
	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
	{											\
	    /* PAD repeat does not need special handling for 'transition zones' and */		\
	    /* they can be combined with 'padding zones' safely */				\
	    left_pad += left_tz;								\
	    right_pad += right_tz;								\
	    left_tz = right_tz = 0;								\
	}											\
	v.vector[0] += left_pad * unit_x;							\
    }												\
												\
    if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)					\
    {												\
	vx = v.vector[0];									\
	repeat (PIXMAN_REPEAT_NORMAL, &vx, pixman_int_to_fixed(src_image->bits.width));		\
	max_x = pixman_fixed_to_int (vx + (width - 1) * (int64_t)unit_x) + 1;			\
												\
	/* sources narrower than REPEAT_NORMAL_MIN_WIDTH are replicated into a */		\
	/* temporary row; src_width becomes the width of that replicated row */			\
	if (src_image->bits.width < REPEAT_NORMAL_MIN_WIDTH)					\
	{											\
	    src_width = 0;									\
												\
	    while (src_width < REPEAT_NORMAL_MIN_WIDTH && src_width <= max_x)			\
		src_width += src_image->bits.width;						\
												\
	    need_src_extension = TRUE;								\
	}											\
	else											\
	{											\
	    src_width = src_image->bits.width;							\
	    need_src_extension = FALSE;								\
	}											\
												\
	src_width_fixed = pixman_int_to_fixed (src_width);					\
    }												\
												\
    while (--height >= 0)									\
    {												\
	int weight1, weight2;									\
	dst = dst_line;										\
	dst_line += dst_stride;									\
	vx = v.vector[0];									\
	if (flags & FLAG_HAVE_NON_SOLID_MASK)							\
	{											\
	    mask = mask_line;									\
	    mask_line += mask_stride;								\
	}											\
												\
	/* y1 is the top source row, weight2 the weight of the row below it */			\
	y1 = pixman_fixed_to_int (vy);								\
	weight2 = pixman_fixed_to_bilinear_weight (vy);						\
	if (weight2)										\
	{											\
	    /* both weight1 and weight2 are smaller than BILINEAR_INTERPOLATION_RANGE */	\
	    y2 = y1 + 1;									\
	    weight1 = BILINEAR_INTERPOLATION_RANGE - weight2;					\
	}											\
	else											\
	{											\
	    /* set both top and bottom row to the same scanline and tweak weights */		\
	    y2 = y1;										\
	    weight1 = weight2 = BILINEAR_INTERPOLATION_RANGE / 2;				\
	}											\
	vy += unit_y;										\
	if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_PAD)					\
	{											\
	    src_type_t *src1, *src2;								\
	    src_type_t buf1[2];									\
	    src_type_t buf2[2];									\
	    repeat (PIXMAN_REPEAT_PAD, &y1, src_image->bits.height);				\
	    repeat (PIXMAN_REPEAT_PAD, &y2, src_image->bits.height);				\
	    src1 = src_first_line + src_stride * y1;						\
	    src2 = src_first_line + src_stride * y2;						\
												\
	    if (left_pad > 0)									\
	    {											\
		/* left padding: both fetched pixels are the leftmost source pixel */		\
		buf1[0] = buf1[1] = src1[0];							\
		buf2[0] = buf2[1] = src2[0];							\
		scanline_func (dst, mask,							\
			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, FALSE);		\
		dst += left_pad;								\
		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
		    mask += left_pad;								\
	    }											\
	    if (width > 0)									\
	    {											\
		scanline_func (dst, mask,							\
			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
		dst += width;									\
		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
		    mask += width;								\
	    }											\
	    if (right_pad > 0)									\
	    {											\
		/* right padding: both fetched pixels are the rightmost source pixel */		\
		buf1[0] = buf1[1] = src1[src_image->bits.width - 1];				\
		buf2[0] = buf2[1] = src2[src_image->bits.width - 1];				\
		scanline_func (dst, mask,							\
			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, FALSE);	\
	    }											\
	}											\
	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NONE)				\
	{											\
	    src_type_t *src1, *src2;								\
	    src_type_t buf1[2];									\
	    src_type_t buf2[2];									\
	    /* handle top/bottom zero padding by just setting weights to 0 if needed */		\
	    if (y1 < 0)										\
	    {											\
		weight1 = 0;									\
		y1 = 0;										\
	    }											\
	    if (y1 >= src_image->bits.height)							\
	    {											\
		weight1 = 0;									\
		y1 = src_image->bits.height - 1;						\
	    }											\
	    if (y2 < 0)										\
	    {											\
		weight2 = 0;									\
		y2 = 0;										\
	    }											\
	    if (y2 >= src_image->bits.height)							\
	    {											\
		weight2 = 0;									\
		y2 = src_image->bits.height - 1;						\
	    }											\
	    src1 = src_first_line + src_stride * y1;						\
	    src2 = src_first_line + src_stride * y2;						\
												\
	    if (left_pad > 0)									\
	    {											\
		/* left padding: all zero source, flagged through zero_src = TRUE */		\
		buf1[0] = buf1[1] = 0;								\
		buf2[0] = buf2[1] = 0;								\
		scanline_func (dst, mask,							\
			       buf1, buf2, left_pad, weight1, weight2, 0, 0, 0, TRUE);		\
		dst += left_pad;								\
		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
		    mask += left_pad;								\
	    }											\
	    if (left_tz > 0)									\
	    {											\
		/* left transition zone: blend from zero to the first source pixel */		\
		buf1[0] = 0;									\
		buf1[1] = src1[0];								\
		buf2[0] = 0;									\
		buf2[1] = src2[0];								\
		scanline_func (dst, mask,							\
			       buf1, buf2, left_tz, weight1, weight2,				\
			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
		dst += left_tz;									\
		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
		    mask += left_tz;								\
		vx += left_tz * unit_x;								\
	    }											\
	    if (width > 0)									\
	    {											\
		scanline_func (dst, mask,							\
			       src1, src2, width, weight1, weight2, vx, unit_x, 0, FALSE);	\
		dst += width;									\
		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
		    mask += width;								\
		vx += width * unit_x;								\
	    }											\
	    if (right_tz > 0)									\
	    {											\
		/* right transition zone: blend from the last source pixel to zero */		\
		buf1[0] = src1[src_image->bits.width - 1];					\
		buf1[1] = 0;									\
		buf2[0] = src2[src_image->bits.width - 1];					\
		buf2[1] = 0;									\
		scanline_func (dst, mask,							\
			       buf1, buf2, right_tz, weight1, weight2,				\
			       pixman_fixed_frac (vx), unit_x, 0, FALSE);			\
		dst += right_tz;								\
		if (flags & FLAG_HAVE_NON_SOLID_MASK)						\
		    mask += right_tz;								\
	    }											\
	    if (right_pad > 0)									\
	    {											\
		buf1[0] = buf1[1] = 0;								\
		buf2[0] = buf2[1] = 0;								\
		scanline_func (dst, mask,							\
			       buf1, buf2, right_pad, weight1, weight2, 0, 0, 0, TRUE);		\
	    }											\
	}											\
	else if (PIXMAN_REPEAT_ ## repeat_mode == PIXMAN_REPEAT_NORMAL)				\
	{											\
	    int32_t	    num_pixels;								\
	    int32_t	    width_remain;							\
	    src_type_t *    src_line_top;							\
	    src_type_t *    src_line_bottom;							\
	    src_type_t	    buf1[2];								\
	    src_type_t	    buf2[2];								\
	    src_type_t	    extended_src_line0[REPEAT_NORMAL_MIN_WIDTH*2];			\
	    src_type_t	    extended_src_line1[REPEAT_NORMAL_MIN_WIDTH*2];			\
	    int		    i, j;								\
												\
	    repeat (PIXMAN_REPEAT_NORMAL, &y1, src_image->bits.height);				\
	    repeat (PIXMAN_REPEAT_NORMAL, &y2, src_image->bits.height);				\
	    src_line_top = src_first_line + src_stride * y1;					\
	    src_line_bottom = src_first_line + src_stride * y2;					\
												\
	    if (need_src_extension)								\
	    {											\
		/* tile the narrow source rows until src_width pixels are filled */		\
		for (i=0; i<src_width;)								\
		{										\
		    for (j=0; j<src_image->bits.width; j++, i++)				\
		    {										\
			extended_src_line0[i] = src_line_top[j];				\
			extended_src_line1[i] = src_line_bottom[j];				\
		    }										\
		}										\
												\
		src_line_top = &extended_src_line0[0];						\
		src_line_bottom = &extended_src_line1[0];					\
	    }											\
												\
	    /* Top & Bottom wrap around buffer */						\
	    buf1[0] = src_line_top[src_width - 1];						\
	    buf1[1] = src_line_top[0];								\
	    buf2[0] = src_line_bottom[src_width - 1];						\
	    buf2[1] = src_line_bottom[0];							\
												\
	    width_remain = width;								\
												\
	    while (width_remain > 0)								\
	    {											\
		/* We use src_width_fixed because it can make vx in original source range */	\
		repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);				\
												\
		/* Wrap around part */								\
		if (pixman_fixed_to_int (vx) == src_width - 1)					\
		{										\
		    /* for positive unit_x							\
		     * num_pixels = max(n) + 1, where vx + n*unit_x < src_width_fixed		\
		     *										\
		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\
		     * So we are safe from overflow.						\
		     */										\
		    num_pixels = ((src_width_fixed - vx - pixman_fixed_e) / unit_x) + 1;	\
												\
		    if (num_pixels > width_remain)						\
			num_pixels = width_remain;						\
												\
		    scanline_func (dst, mask, buf1, buf2, num_pixels,				\
				   weight1, weight2, pixman_fixed_frac(vx),			\
				   unit_x, src_width_fixed, FALSE);				\
												\
		    width_remain -= num_pixels;							\
		    vx += num_pixels * unit_x;							\
		    dst += num_pixels;								\
												\
		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
			mask += num_pixels;							\
												\
		    repeat (PIXMAN_REPEAT_NORMAL, &vx, src_width_fixed);			\
		}										\
												\
		/* Normal scanline composite */							\
		if (pixman_fixed_to_int (vx) != src_width - 1 && width_remain > 0)		\
		{										\
		    /* for positive unit_x							\
		     * num_pixels = max(n) + 1, where vx + n*unit_x < (src_width_fixed - 1)	\
		     *										\
		     * vx is in range [0, src_width_fixed - pixman_fixed_e]			\
		     * So we are safe from overflow here.					\
		     */										\
		    num_pixels = ((src_width_fixed - pixman_fixed_1 - vx - pixman_fixed_e)	\
				  / unit_x) + 1;						\
												\
		    if (num_pixels > width_remain)						\
			num_pixels = width_remain;						\
												\
		    scanline_func (dst, mask, src_line_top, src_line_bottom, num_pixels,	\
				   weight1, weight2, vx, unit_x, src_width_fixed, FALSE);	\
												\
		    width_remain -= num_pixels;							\
		    vx += num_pixels * unit_x;							\
		    dst += num_pixels;								\
												\
		    if (flags & FLAG_HAVE_NON_SOLID_MASK)					\
		        mask += num_pixels;							\
		}										\
	    }											\
	}											\
	else											\
	{											\
	    /* no repeat handling needed: the whole row is one run */				\
	    scanline_func (dst, mask, src_first_line + src_stride * y1,				\
			   src_first_line + src_stride * y2, width,				\
			   weight1, weight2, vx, unit_x, max_vx, FALSE);			\
	}											\
    }												\
}
1180
/*
 * A workaround for old sun studio, see: https://bugs.freedesktop.org/show_bug.cgi?id=32764
 *
 * Forwards all arguments to FAST_BILINEAR_MAINLOOP_INT unchanged, except that
 * an underscore is prepended to scale_func_name, so the generated function is
 * named 'fast_composite_scaled_bilinear_<scale_func_name>'.
 */
#define FAST_BILINEAR_MAINLOOP_COMMON(scale_func_name, scanline_func, src_type_t, mask_type_t,	\
				  dst_type_t, repeat_mode, flags)				\
	FAST_BILINEAR_MAINLOOP_INT(_ ## scale_func_name, scanline_func, src_type_t, mask_type_t,\
				  dst_type_t, repeat_mode, flags)
1186
/*
 * Flag set shared by every scaled bilinear fast path table entry in this
 * file; each entry macro ORs its own repeat/coverage flags onto it and places
 * the result next to the source format.
 */
#define SCALED_BILINEAR_FLAGS						\
    (FAST_PATH_SCALE_TRANSFORM	|					\
     FAST_PATH_NO_ALPHA_MAP	|					\
     FAST_PATH_BILINEAR_FILTER	|					\
     FAST_PATH_NO_ACCESSORS	|					\
     FAST_PATH_NARROW_FORMAT)
1193
/*
 * Fast path table entry: scaled bilinear composite without a mask, PAD repeat.
 *  op   - operator suffix, pasted onto 'PIXMAN_OP_'
 *  s, d - source and destination format suffixes, pasted onto 'PIXMAN_'
 *  func - name fragment selecting the composite function
 *         'fast_composite_scaled_bilinear_<func>_pad_<op>'
 */
#define SIMPLE_BILINEAR_FAST_PATH_PAD(op,s,d,func)			\
    {   PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS		|				\
	 FAST_PATH_PAD_REPEAT		|				\
	 FAST_PATH_X_UNIT_POSITIVE),					\
	PIXMAN_null, 0,							\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
	fast_composite_scaled_bilinear_ ## func ## _pad_ ## op,		\
    }
1204
/*
 * Fast path table entry: scaled bilinear composite without a mask, NONE repeat.
 *  op   - operator suffix, pasted onto 'PIXMAN_OP_'
 *  s, d - source and destination format suffixes, pasted onto 'PIXMAN_'
 *  func - name fragment selecting the composite function
 *         'fast_composite_scaled_bilinear_<func>_none_<op>'
 */
#define SIMPLE_BILINEAR_FAST_PATH_NONE(op,s,d,func)			\
    {   PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS		|				\
	 FAST_PATH_NONE_REPEAT		|				\
	 FAST_PATH_X_UNIT_POSITIVE),					\
	PIXMAN_null, 0,							\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
	fast_composite_scaled_bilinear_ ## func ## _none_ ## op,	\
    }
1215
/*
 * Fast path table entry: scaled bilinear composite without a mask, for
 * sources flagged FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR.
 *  op   - operator suffix, pasted onto 'PIXMAN_OP_'
 *  s, d - source and destination format suffixes, pasted onto 'PIXMAN_'
 *  func - name fragment selecting the composite function
 *         'fast_composite_scaled_bilinear_<func>_cover_<op>'
 */
#define SIMPLE_BILINEAR_FAST_PATH_COVER(op,s,d,func)			\
    {   PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS		|				\
	 FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),			\
	PIXMAN_null, 0,							\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
	fast_composite_scaled_bilinear_ ## func ## _cover_ ## op,	\
    }
1224
/*
 * Fast path table entry: scaled bilinear composite without a mask, NORMAL
 * repeat.
 *  op   - operator suffix, pasted onto 'PIXMAN_OP_'
 *  s, d - source and destination format suffixes, pasted onto 'PIXMAN_'
 *  func - name fragment selecting the composite function
 *         'fast_composite_scaled_bilinear_<func>_normal_<op>'
 */
#define SIMPLE_BILINEAR_FAST_PATH_NORMAL(op,s,d,func)			\
    {   PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS		|				\
	 FAST_PATH_NORMAL_REPEAT	|				\
	 FAST_PATH_X_UNIT_POSITIVE),					\
	PIXMAN_null, 0,							\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
	fast_composite_scaled_bilinear_ ## func ## _normal_ ## op,	\
    }
1235
/*
 * Fast path table entry: scaled bilinear composite with an a8 mask, PAD repeat.
 *  op   - operator suffix, pasted onto 'PIXMAN_OP_'
 *  s, d - source and destination format suffixes, pasted onto 'PIXMAN_'
 *  func - name fragment selecting the composite function
 *         'fast_composite_scaled_bilinear_<func>_pad_<op>'
 */
#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD(op,s,d,func)		\
    {   PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS		|				\
	 FAST_PATH_PAD_REPEAT		|				\
	 FAST_PATH_X_UNIT_POSITIVE),					\
	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
	fast_composite_scaled_bilinear_ ## func ## _pad_ ## op,		\
    }
1246
/*
 * Fast path table entry: scaled bilinear composite with an a8 mask, NONE
 * repeat.
 *  op   - operator suffix, pasted onto 'PIXMAN_OP_'
 *  s, d - source and destination format suffixes, pasted onto 'PIXMAN_'
 *  func - name fragment selecting the composite function
 *         'fast_composite_scaled_bilinear_<func>_none_<op>'
 */
#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE(op,s,d,func)		\
    {   PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS		|				\
	 FAST_PATH_NONE_REPEAT		|				\
	 FAST_PATH_X_UNIT_POSITIVE),					\
	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
	fast_composite_scaled_bilinear_ ## func ## _none_ ## op,	\
    }
1257
/*
 * Fast path table entry: scaled bilinear composite with an a8 mask, for
 * sources flagged FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR.
 *  op   - operator suffix, pasted onto 'PIXMAN_OP_'
 *  s, d - source and destination format suffixes, pasted onto 'PIXMAN_'
 *  func - name fragment selecting the composite function
 *         'fast_composite_scaled_bilinear_<func>_cover_<op>'
 */
#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER(op,s,d,func)		\
    {   PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS		|				\
	 FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),			\
	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
	fast_composite_scaled_bilinear_ ## func ## _cover_ ## op,	\
    }
1266
/*
 * Fast path table entry: scaled bilinear composite with an a8 mask, NORMAL
 * repeat.
 *  op   - operator suffix, pasted onto 'PIXMAN_OP_'
 *  s, d - source and destination format suffixes, pasted onto 'PIXMAN_'
 *  func - name fragment selecting the composite function
 *         'fast_composite_scaled_bilinear_<func>_normal_<op>'
 */
#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL(op,s,d,func)		\
    {   PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS		|				\
	 FAST_PATH_NORMAL_REPEAT	|				\
	 FAST_PATH_X_UNIT_POSITIVE),					\
	PIXMAN_a8, MASK_FLAGS (a8, FAST_PATH_UNIFIED_ALPHA),		\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
	fast_composite_scaled_bilinear_ ## func ## _normal_ ## op,	\
    }
1277
/*
 * Fast path table entry: scaled bilinear composite with a solid mask, PAD
 * repeat.
 *  op   - operator suffix, pasted onto 'PIXMAN_OP_'
 *  s, d - source and destination format suffixes, pasted onto 'PIXMAN_'
 *  func - name fragment selecting the composite function
 *         'fast_composite_scaled_bilinear_<func>_pad_<op>'
 */
#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD(op,s,d,func)		\
    {   PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS		|				\
	 FAST_PATH_PAD_REPEAT		|				\
	 FAST_PATH_X_UNIT_POSITIVE),					\
	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
	fast_composite_scaled_bilinear_ ## func ## _pad_ ## op,		\
    }
1288
/*
 * Fast path table entry: scaled bilinear composite with a solid mask, NONE
 * repeat.
 *  op   - operator suffix, pasted onto 'PIXMAN_OP_'
 *  s, d - source and destination format suffixes, pasted onto 'PIXMAN_'
 *  func - name fragment selecting the composite function
 *         'fast_composite_scaled_bilinear_<func>_none_<op>'
 */
#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE(op,s,d,func)		\
    {   PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS		|				\
	 FAST_PATH_NONE_REPEAT		|				\
	 FAST_PATH_X_UNIT_POSITIVE),					\
	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
	fast_composite_scaled_bilinear_ ## func ## _none_ ## op,	\
    }
1299
/*
 * Fast path table entry: scaled bilinear composite with a solid mask, for
 * sources flagged FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR.
 *  op   - operator suffix, pasted onto 'PIXMAN_OP_'
 *  s, d - source and destination format suffixes, pasted onto 'PIXMAN_'
 *  func - name fragment selecting the composite function
 *         'fast_composite_scaled_bilinear_<func>_cover_<op>'
 */
#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER(op,s,d,func)		\
    {   PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS		|				\
	 FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR),			\
	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
	fast_composite_scaled_bilinear_ ## func ## _cover_ ## op,	\
    }
1308
/*
 * Fast path table entry: scaled bilinear composite with a solid mask, NORMAL
 * repeat.
 *  op   - operator suffix, pasted onto 'PIXMAN_OP_'
 *  s, d - source and destination format suffixes, pasted onto 'PIXMAN_'
 *  func - name fragment selecting the composite function
 *         'fast_composite_scaled_bilinear_<func>_normal_<op>'
 */
#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL(op,s,d,func)	\
    {   PIXMAN_OP_ ## op,						\
	PIXMAN_ ## s,							\
	(SCALED_BILINEAR_FLAGS		|				\
	 FAST_PATH_NORMAL_REPEAT	|				\
	 FAST_PATH_X_UNIT_POSITIVE),					\
	PIXMAN_solid, MASK_FLAGS (solid, FAST_PATH_UNIFIED_ALPHA),	\
	PIXMAN_ ## d, FAST_PATH_STD_DEST_FLAGS,				\
	fast_composite_scaled_bilinear_ ## func ## _normal_ ## op,	\
    }
1319
/*
 * Expands to the four unmasked table entries (COVER, NONE, PAD, NORMAL) for
 * one op/source/destination/func combination, as a comma separated list.
 * Prefer the use of 'cover' variant, because it is faster: that is why the
 * COVER entry is listed first.
 */
#define SIMPLE_BILINEAR_FAST_PATH(op,s,d,func)				\
    SIMPLE_BILINEAR_FAST_PATH_COVER (op,s,d,func),			\
    SIMPLE_BILINEAR_FAST_PATH_NONE (op,s,d,func),			\
    SIMPLE_BILINEAR_FAST_PATH_PAD (op,s,d,func),			\
    SIMPLE_BILINEAR_FAST_PATH_NORMAL (op,s,d,func)
1326
/*
 * Expands to the four a8-mask table entries (COVER, NONE, PAD, NORMAL) for
 * one op/source/destination/func combination, as a comma separated list with
 * the COVER entry first.
 */
#define SIMPLE_BILINEAR_A8_MASK_FAST_PATH(op,s,d,func)			\
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_COVER (op,s,d,func),		\
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NONE (op,s,d,func),		\
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_PAD (op,s,d,func),		\
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH_NORMAL (op,s,d,func)
1332
/*
 * Expands to the four solid-mask table entries (COVER, NONE, PAD, NORMAL) for
 * one op/source/destination/func combination, as a comma separated list with
 * the COVER entry first.
 */
#define SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH(op,s,d,func)		\
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_COVER (op,s,d,func),		\
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NONE (op,s,d,func),		\
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_PAD (op,s,d,func),		\
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH_NORMAL (op,s,d,func)
1338
1339#endif
1340