/**************************************************************************
 *
 * Copyright 2007-2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/
27f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
/*
 * Rasterization for binned triangles within a tile
 */
31f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
32f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include <limits.h>
33f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include "util/u_math.h"
34f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include "lp_debug.h"
35f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include "lp_perf.h"
36f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include "lp_rast_priv.h"
37f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include "lp_tile_soa.h"
38f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
39f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
40f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
41f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
42f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/**
43f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Shade all pixels in a 4x4 block.
44f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */
45f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgstatic void
46f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgblock_full_4(struct lp_rasterizer_task *task,
47f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org             const struct lp_rast_triangle *tri,
48f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org             int x, int y)
49f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{
50f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   lp_rast_shade_quads_all(task, &tri->inputs, x, y);
51f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org}
52f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
53f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
/**
 * Shade every pixel of the 16x16 block at (x, y) by visiting its sixteen
 * 4x4 sub-blocks, left to right within a row and rows from top to bottom.
 */
static void
block_full_16(struct lp_rasterizer_task *task,
              const struct lp_rast_triangle *tri,
              int x, int y)
{
   unsigned sub;

   /* Both coordinates are expected to be multiples of 16. */
   assert(x % 16 == 0);
   assert(y % 16 == 0);

   /* The low two bits of sub select the column of the sub-block, the next
    * two bits select its row.
    */
   for (sub = 0; sub < 16; sub++) {
      const unsigned ix = 4 * (sub & 3);
      const unsigned iy = 4 * (sub >> 2);

      block_full_4(task, tri, x + ix, y + iy);
   }
}
69f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
70f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#if !defined(PIPE_ARCH_SSE)
71f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
/**
 * Build a 16-bit mask for a 4x4 block from a linear edge function.
 *
 * The function starts at c for the first pixel and changes by dcdx per
 * column and by dcdy per row.  Bit (4 * row + column) of the result is set
 * when the value at that pixel is negative.
 *
 * The sign is tested with an explicit "< 0" comparison instead of an
 * arithmetic right shift by 31, so the result depends neither on int being
 * exactly 32 bits wide nor on the implementation-defined result of
 * right-shifting a negative value.  The same sums as before are evaluated;
 * no additional arithmetic (and thus no additional overflow risk) is
 * introduced.
 *
 * \param c     value of the edge function at the first pixel of the block
 * \param dcdx  change of the edge function per column
 * \param dcdy  change of the edge function per row
 * \return      mask with one bit per pixel, set where the value is negative
 */
static INLINE unsigned
build_mask_linear(int c, int dcdx, int dcdy)
{
   unsigned mask = 0;
   unsigned row, col;
   int crow[4];

   /* Value at the first pixel of each of the four rows. */
   crow[0] = c;
   crow[1] = crow[0] + dcdy;
   crow[2] = crow[1] + dcdy;
   crow[3] = crow[2] + dcdy;

   for (row = 0; row < 4; row++) {
      for (col = 0; col < 4; col++) {
         const int value = crow[row] + (int)col * dcdx;

         mask |= (unsigned)(value < 0) << (4 * row + col);
      }
   }

   return mask;
}
101f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
102f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
/**
 * Accumulate two 4x4 masks for one edge function.
 *
 * The mask of the function starting at c is OR-ed into *outmask; the mask
 * of the same function offset by cdiff is OR-ed into *partmask.  Existing
 * bits in both outputs are kept.
 */
static INLINE void
build_masks(int c,
            int cdiff,
            int dcdx,
            int dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   const unsigned out_bits = build_mask_linear(c, dcdx, dcdy);
   const unsigned part_bits = build_mask_linear(c + cdiff, dcdx, dcdy);

   *outmask |= out_bits;
   *partmask |= part_bits;
}
114f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
115f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgvoid
116f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_rast_triangle_3_16(struct lp_rasterizer_task *task,
117f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org                      const union lp_rast_cmd_arg arg)
118f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{
119f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   union lp_rast_cmd_arg arg2;
120f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   arg2.triangle.tri = arg.triangle.tri;
121f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   arg2.triangle.plane_mask = (1<<3)-1;
122f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   lp_rast_triangle_3(task, arg2);
123f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org}
124f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
125f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgvoid
126f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_rast_triangle_4_16(struct lp_rasterizer_task *task,
127f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org                      const union lp_rast_cmd_arg arg)
128f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{
129f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   union lp_rast_cmd_arg arg2;
130f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   arg2.triangle.tri = arg.triangle.tri;
131f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   arg2.triangle.plane_mask = (1<<4)-1;
132f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   lp_rast_triangle_4(task, arg2);
133f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org}
134f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
135f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgvoid
136f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_rast_triangle_3_4(struct lp_rasterizer_task *task,
137f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org                      const union lp_rast_cmd_arg arg)
138f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{
139f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   lp_rast_triangle_3_16(task, arg);
140f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org}
141f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
142f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#else
143f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include <emmintrin.h>
144f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include "util/u_sse.h"
145f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
146f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
/**
 * SSE2 version: accumulate two 4x4 masks for one edge function.
 *
 * The function starts at c and changes by dcdx per column and dcdy per
 * row.  The sign bits of its sixteen values are OR-ed into *outmask; the
 * sign bits of the same values offset by cdiff are OR-ed into *partmask.
 * Bit (4 * row + column) corresponds to one pixel of the block.
 */
static INLINE void
build_masks(int c,
            int cdiff,
            int dcdx,
            int dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   const __m128i row_step = _mm_set1_epi32(dcdy);
   const __m128i offset = _mm_set1_epi32(cdiff);

   /* One vector per row of the block, four columns each. */
   const __m128i row0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   const __m128i row1 = _mm_add_epi32(row0, row_step);
   const __m128i row2 = _mm_add_epi32(row1, row_step);
   const __m128i row3 = _mm_add_epi32(row2, row_step);

   /* The same rows shifted by cdiff. */
   const __m128i off0 = _mm_add_epi32(row0, offset);
   const __m128i off1 = _mm_add_epi32(row1, offset);
   const __m128i off2 = _mm_add_epi32(row2, offset);
   const __m128i off3 = _mm_add_epi32(row3, offset);

   /* Saturating packs keep the sign of every element, so after narrowing
    * to bytes the byte sign bits are the mask.
    */
   const __m128i out_bytes = _mm_packs_epi16(_mm_packs_epi32(row0, row1),
                                             _mm_packs_epi32(row2, row3));
   const __m128i part_bytes = _mm_packs_epi16(_mm_packs_epi32(off0, off1),
                                              _mm_packs_epi32(off2, off3));

   *outmask |= _mm_movemask_epi8(out_bytes);
   *partmask |= _mm_movemask_epi8(part_bytes);
}
191f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
192f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
/**
 * SSE2 version: build a 16-bit mask for a 4x4 block from a linear edge
 * function that starts at c and changes by dcdx per column and dcdy per
 * row.  Bit (4 * row + column) is set where the value is negative.
 */
static INLINE unsigned
build_mask_linear(int c, int dcdx, int dcdy)
{
   const __m128i row_step = _mm_set1_epi32(dcdy);

   /* One vector per row of the block, four columns each. */
   const __m128i row0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   const __m128i row1 = _mm_add_epi32(row0, row_step);
   const __m128i row2 = _mm_add_epi32(row1, row_step);
   const __m128i row3 = _mm_add_epi32(row2, row_step);

   /* Narrow 32 -> 16 -> 8 bits with saturation, which preserves the sign
    * of every element.
    */
   const __m128i rows01 = _mm_packs_epi32(row0, row1);
   const __m128i rows23 = _mm_packs_epi32(row2, row3);
   const __m128i bytes = _mm_packs_epi16(rows01, rows23);

   /* Collect the sixteen byte sign bits into the mask. */
   return _mm_movemask_epi8(bytes);
}
218f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
/**
 * Add cdiff to every element of the four vectors cstep[0..3] and return the
 * sixteen resulting sign bits as a mask.  Bit (4 * n + k) comes from
 * element k of cstep[n].
 */
static INLINE unsigned
sign_bits4(const __m128i *cstep, int cdiff)
{
   const __m128i bias = _mm_set1_epi32(cdiff);

   /* Offset each pair of vectors and narrow it to 16 bits; the saturating
    * pack keeps the sign of every element.
    */
   const __m128i lo = _mm_packs_epi32(_mm_add_epi32(cstep[0], bias),
                                      _mm_add_epi32(cstep[1], bias));
   const __m128i hi = _mm_packs_epi32(_mm_add_epi32(cstep[2], bias),
                                      _mm_add_epi32(cstep[3], bias));

   /* Narrow to bytes and gather the byte sign bits. */
   return _mm_movemask_epi8(_mm_packs_epi16(lo, hi));
}
241f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
242f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
243f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#define NR_PLANES 3
244f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
245f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
246f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
247f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
248f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
249f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
250f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
251f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgvoid
252f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_rast_triangle_3_16(struct lp_rasterizer_task *task,
253f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org                      const union lp_rast_cmd_arg arg)
254f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{
255f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   const struct lp_rast_triangle *tri = arg.triangle.tri;
256f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   const struct lp_rast_plane *plane = GET_PLANES(tri);
257f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   int x = (arg.triangle.plane_mask & 0xff) + task->x;
258f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   int y = (arg.triangle.plane_mask >> 8) + task->y;
259f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   unsigned i, j;
260f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
261f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
262f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   unsigned nr = 0;
263f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
264f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
265f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
266f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
267f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i zero = _mm_setzero_si128();
268f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
269f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i c;
270f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i dcdx;
271f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i dcdy;
272f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i rej4;
273f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
274f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i dcdx2;
275f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i dcdx3;
276f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
277f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
278f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
279f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
280f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i unused;
281f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
282f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   transpose4_epi32(&p0, &p1, &p2, &zero,
283f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org                    &c, &dcdx, &dcdy, &rej4);
284f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
285f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   /* Adjust dcdx;
286f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org    */
287f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   dcdx = _mm_sub_epi32(zero, dcdx);
288f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
289f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
290f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
291f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   rej4 = _mm_slli_epi32(rej4, 2);
292f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
293f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */
294f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
295f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1));
296f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
297f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   dcdx2 = _mm_add_epi32(dcdx, dcdx);
298f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   dcdx3 = _mm_add_epi32(dcdx2, dcdx);
299f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
300f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
301f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org                    &span_0, &span_1, &span_2, &unused);
302f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
303f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   for (i = 0; i < 4; i++) {
304f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      __m128i cx = c;
305f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
306f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      for (j = 0; j < 4; j++) {
307f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org         __m128i c4rej = _mm_add_epi32(cx, rej4);
308f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);
309f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
310f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org         /* if (is_zero(rej_masks)) */
311f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org         if (_mm_movemask_epi8(rej_masks) == 0) {
312f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
313f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
314f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);
315f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
316f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);
317f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
318f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
319f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
320f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));
321f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
322f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
323f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            __m128i c_01 = _mm_packs_epi32(c_0, c_1);
324f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
325f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
326f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
327f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));
328f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
329f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);
330f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
331f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
332f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
333f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));
334f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
335f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
336f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
337f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);
338f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
339f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            unsigned mask = _mm_movemask_epi8(c_0123);
340f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
341f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            out[nr].i = i;
342f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            out[nr].j = j;
343f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            out[nr].mask = mask;
344f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org            if (mask != 0xffff)
345f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org               nr++;
346f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org         }
347f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
348f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      }
349f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
350f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
351f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   }
352f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
353f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   for (i = 0; i < nr; i++)
354f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      lp_rast_shade_quads_mask(task,
355f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org                               &tri->inputs,
356f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org                               x + 4 * out[i].j,
357f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org                               y + 4 * out[i].i,
358f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org                               0xffff & ~out[i].mask);
359f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org}
360f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
361f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
362f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
363f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
364f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
365f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgvoid
366f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_rast_triangle_3_4(struct lp_rasterizer_task *task,
367f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org                     const union lp_rast_cmd_arg arg)
368f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{
369f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   const struct lp_rast_triangle *tri = arg.triangle.tri;
370f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   const struct lp_rast_plane *plane = GET_PLANES(tri);
371f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
372f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   unsigned y = (arg.triangle.plane_mask >> 8) + task->y;
373f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
374f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
375f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
376f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
377f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i zero = _mm_setzero_si128();
378f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
379f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i c;
380f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i dcdx;
381f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i dcdy;
382f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
383f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i dcdx2;
384f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i dcdx3;
385f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
386f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
387f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
388f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
389f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   __m128i unused;
390f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
391f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   transpose4_epi32(&p0, &p1, &p2, &zero,
392f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org                    &c, &dcdx, &dcdy, &unused);
393f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
394f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   /* Adjust dcdx;
395f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org    */
396f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   dcdx = _mm_sub_epi32(zero, dcdx);
397f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
398f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
399f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
400f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
401f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */
402f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
403f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
404f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   dcdx2 = _mm_add_epi32(dcdx, dcdx);
405f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   dcdx3 = _mm_add_epi32(dcdx2, dcdx);
406f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
407f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
408f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org                    &span_0, &span_1, &span_2, &unused);
409f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
410f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
411f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   {
412f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
413f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
414f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);
415f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
416f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);
417f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
418f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
419f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
420f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));
421f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
422f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
423f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      __m128i c_01 = _mm_packs_epi32(c_0, c_1);
424f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
425f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
426f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
427f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));
428f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
429f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);
430f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
431f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
432f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
433f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));
434f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
435f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
436f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
437f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);
438f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
439f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      unsigned mask = _mm_movemask_epi8(c_0123);
440f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
441f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org      if (mask != 0xffff)
442f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org         lp_rast_shade_quads_mask(task,
443f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org                                  &tri->inputs,
444f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org                                  x,
445f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org                                  y,
446f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org                                  0xffff & ~mask);
447f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org   }
448f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org}
449f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
450f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#undef NR_PLANES
451f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#endif
452f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
453f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
454f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
455f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
/*
 * Include lp_rast_tri_tmp.h once per plane count, 1 through 8.  Before each
 * inclusion TAG() is defined to append a "_<n>" suffix to an identifier and
 * NR_PLANES is set to the matching count.
 *
 * NOTE(review): TAG and NR_PLANES are redefined here without an intervening
 * #undef, so lp_rast_tri_tmp.h presumably #undefs them (and TRI_4 / TRI_16)
 * at its end -- confirm against the header.
 */
#define TAG(x) x##_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_3
#define NR_PLANES 3
/* NOTE(review): the TRI_4 / TRI_16 overrides for the three-plane variant are
 * commented out, so this inclusion is not given lp_rast_triangle_3_4 or
 * lp_rast_triangle_3_16.  The reason is not recorded here -- confirm before
 * re-enabling.
 */
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_4
#define NR_PLANES 4
/* Only the four-plane inclusion defines TRI_16. */
#define TRI_16 lp_rast_triangle_4_16
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"
490f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org
491