/**************************************************************************
 *
 * Copyright 2007-2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/*
 * Rasterization for binned triangles within a tile
 */

#include <limits.h>
#include "util/u_math.h"
#include "lp_debug.h"
#include "lp_perf.h"
#include "lp_rast_priv.h"
#include "lp_tile_soa.h"


/**
 * Shade all pixels in a 4x4 block.
 */
static void
block_full_4(struct lp_rasterizer_task *task,
             const struct lp_rast_triangle *tri,
             int x, int y)
{
   lp_rast_shade_quads_all(task, &tri->inputs, x, y);
}

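/*
 * The block_full_* helpers are the fully-covered fast path: they are
 * invoked once the block tests have shown that every pixel of a 4x4 or
 * 16x16 block lies inside all of the triangle's planes, so the pixels
 * can be shaded directly without computing a per-pixel coverage mask.
 */
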
/**
 * Shade all pixels in a 16x16 block.
 */
static void
block_full_16(struct lp_rasterizer_task *task,
              const struct lp_rast_triangle *tri,
              int x, int y)
{
   unsigned ix, iy;

   assert(x % 16 == 0);
   assert(y % 16 == 0);

   for (iy = 0; iy < 16; iy += 4)
      for (ix = 0; ix < 16; ix += 4)
         block_full_4(task, tri, x + ix, y + iy);
}


#if !defined(PIPE_ARCH_SSE)

static INLINE unsigned
build_mask_linear(int c, int dcdx, int dcdy)
{
   int mask = 0;

   int c0 = c;
   int c1 = c0 + dcdy;
   int c2 = c1 + dcdy;
   int c3 = c2 + dcdy;

   mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0);
   mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1);
   mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2);
   mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3);
   mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4);
   mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5);
   mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6);
   mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7);
   mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8);
   mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9);
   mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10);
   mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11);
   mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12);
   mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13);
   mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14);
   mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15);

   return mask;
}


static INLINE void
build_masks(int c,
            int cdiff,
            int dcdx,
            int dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   *outmask |= build_mask_linear(c, dcdx, dcdy);
   *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy);
}


void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<3)-1;
   lp_rast_triangle_3(task, arg2);
}


void
lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   union lp_rast_cmd_arg arg2;
   arg2.triangle.tri = arg.triangle.tri;
   arg2.triangle.plane_mask = (1<<4)-1;
   lp_rast_triangle_4(task, arg2);
}


void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   lp_rast_triangle_3_16(task, arg);
}


#else
#include <emmintrin.h>
#include "util/u_sse.h"


static INLINE void
build_masks(int c,
            int cdiff,
            int dcdx,
            int dcdy,
            unsigned *outmask,
            unsigned *partmask)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *outmask |= _mm_movemask_epi8(result);
   }

   {
      __m128i cio4 = _mm_set1_epi32(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = _mm_add_epi32(cstep0, cio4);
      cstep1 = _mm_add_epi32(cstep1, cio4);
      cstep2 = _mm_add_epi32(cstep2, cio4);
      cstep3 = _mm_add_epi32(cstep3, cio4);

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *partmask |= _mm_movemask_epi8(result);
   }
}

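/*
 * How the SIMD mask construction works: _mm_packs_epi32 and
 * _mm_packs_epi16 narrow with signed saturation, so each byte of the
 * packed result keeps the sign of the corresponding 32-bit plane value.
 * _mm_movemask_epi8 then gathers those sign bits into a 16-bit mask in
 * which bit i corresponds to pixel (i % 4, i / 4) of the 4x4 block, and
 * a set bit means the value went negative (pixel or block outside the
 * plane).  This is the same layout the scalar build_mask_linear in the
 * non-SSE path builds with shifts and ors.
 */
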
static INLINE unsigned
build_mask_linear(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return _mm_movemask_epi8(result);
}


static INLINE unsigned
sign_bits4(const __m128i *cstep, int cdiff)
{
   /* Adjust the step values
    */
   __m128i cio4 = _mm_set1_epi32(cdiff);
   __m128i cstep0 = _mm_add_epi32(cstep[0], cio4);
   __m128i cstep1 = _mm_add_epi32(cstep[1], cio4);
   __m128i cstep2 = _mm_add_epi32(cstep[2], cio4);
   __m128i cstep3 = _mm_add_epi32(cstep[3], cio4);

   /* Pack down to epi8
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* Extract the sign bits
    */
   return _mm_movemask_epi8(result);
}


#define NR_PLANES 3

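/*
 * Rasterize a 16x16 block against a triangle with three active planes.
 * For this fast path the block's position within the tile is packed
 * into arg.triangle.plane_mask (low byte = x offset, next byte = y
 * offset) rather than an actual plane mask.  The plane data is
 * transposed to SoA form so that c, dcdx, dcdy and eo of all three
 * planes each occupy one register, letting the three edges be stepped
 * and tested together.  dcdx is negated up front so the x stepping
 * below can use additions, c is biased by -1 so coverage reduces to a
 * sign-bit test, and rej4 holds the trivial-reject (eo) offsets scaled
 * to a 4x4 step.
 */
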
void
lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                      const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   int x = (arg.triangle.plane_mask & 0xff) + task->x;
   int y = (arg.triangle.plane_mask >> 8) + task->y;
   unsigned i, j;

   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
   unsigned nr = 0;

   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;
   __m128i rej4;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &rej4);

   /* Adjust dcdx;
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
   rej4 = _mm_slli_epi32(rej4, 2);

   /* Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));
   rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   for (i = 0; i < 4; i++) {
      __m128i cx = c;

      for (j = 0; j < 4; j++) {
         __m128i c4rej = _mm_add_epi32(cx, rej4);
         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);

         /* if (is_zero(rej_masks)) */
         if (_mm_movemask_epi8(rej_masks) == 0) {
            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);

            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
            __m128i c_01 = _mm_packs_epi32(c_0, c_1);

            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

            unsigned mask = _mm_movemask_epi8(c_0123);

            out[nr].i = i;
            out[nr].j = j;
            out[nr].mask = mask;
            if (mask != 0xffff)
               nr++;
         }
         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
      }

      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
   }

   for (i = 0; i < nr; i++)
      lp_rast_shade_quads_mask(task,
                               &tri->inputs,
                               x + 4 * out[i].j,
                               y + 4 * out[i].i,
                               0xffff & ~out[i].mask);
}

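/*
 * Rasterize a single 4x4 block against a triangle with three active
 * planes.  Same setup as the 16x16 path above (block position packed
 * into plane_mask, SoA transpose, negated dcdx, -1 bias), but with no
 * trivial-reject test: the coverage mask is computed unconditionally
 * and shading is skipped only if it comes back empty (mask == 0xffff,
 * i.e. all pixels outside).
 */
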
void
lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                     const union lp_rast_cmd_arg arg)
{
   const struct lp_rast_triangle *tri = arg.triangle.tri;
   const struct lp_rast_plane *plane = GET_PLANES(tri);
   unsigned x = (arg.triangle.plane_mask & 0xff) + task->x;
   unsigned y = (arg.triangle.plane_mask >> 8) + task->y;

   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
   __m128i zero = _mm_setzero_si128();

   __m128i c;
   __m128i dcdx;
   __m128i dcdy;

   __m128i dcdx2;
   __m128i dcdx3;

   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
   __m128i unused;

   transpose4_epi32(&p0, &p1, &p2, &zero,
                    &c, &dcdx, &dcdy, &unused);

   /* Adjust dcdx;
    */
   dcdx = _mm_sub_epi32(zero, dcdx);

   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));

   /* Adjust so we can just check the sign bit (< 0 comparison),
    * instead of having to do a less efficient <= 0 comparison
    */
   c = _mm_sub_epi32(c, _mm_set1_epi32(1));

   dcdx2 = _mm_add_epi32(dcdx, dcdx);
   dcdx3 = _mm_add_epi32(dcdx2, dcdx);

   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
                    &span_0, &span_1, &span_2, &unused);

   {
      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);

      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);

      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));

      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
      __m128i c_01 = _mm_packs_epi32(c_0, c_1);

      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));

      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);

      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));

      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);

      unsigned mask = _mm_movemask_epi8(c_0123);

      if (mask != 0xffff)
         lp_rast_shade_quads_mask(task,
                                  &tri->inputs,
                                  x,
                                  y,
                                  0xffff & ~mask);
   }
}

#undef NR_PLANES
#endif


#define TAG(x) x##_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_4
#define NR_PLANES 4
#define TRI_16 lp_rast_triangle_4_16
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"

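/*
 * The repeated includes above stamp out one copy of the generic
 * rasterization code in lp_rast_tri_tmp.h per plane count: TAG() adds
 * the _N suffix to each generated function name and NR_PLANES fixes
 * how many edge planes it iterates over.  The TRI_16 define (and the
 * commented-out TRI_4/TRI_16 for the three-plane case) presumably lets
 * the template dispatch to the hand-coded SSE block paths defined
 * earlier in this file.
 */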