/**************************************************************************
 *
 * Copyright 2007-2009 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

/*
 * Rasterization for binned triangles within a tile
 */

#include <limits.h>
#include "util/u_math.h"
#include "lp_debug.h"
#include "lp_perf.h"
#include "lp_rast_priv.h"
#include "lp_tile_soa.h"

38f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 39f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 40f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 41f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 42f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 43f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Shade all pixels in a 4x4 block. 44f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 45f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgstatic void 46f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgblock_full_4(struct lp_rasterizer_task *task, 47f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_rast_triangle *tri, 48f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org int x, int y) 49f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 50f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_rast_shade_quads_all(task, &tri->inputs, x, y); 51f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 52f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 53f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 54f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org/** 55f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org * Shade all pixels in a 16x16 block. 
56f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 57f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgstatic void 58f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgblock_full_16(struct lp_rasterizer_task *task, 59f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_rast_triangle *tri, 60f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org int x, int y) 61f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 62f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned ix, iy; 63f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(x % 16 == 0); 64f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org assert(y % 16 == 0); 65f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org for (iy = 0; iy < 16; iy += 4) 66f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org for (ix = 0; ix < 16; ix += 4) 67f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org block_full_4(task, tri, x + ix, y + iy); 68f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 69f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 70f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#if !defined(PIPE_ARCH_SSE) 71f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 72f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgstatic INLINE unsigned 73f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgbuild_mask_linear(int c, int dcdx, int dcdy) 74f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 75f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org int mask = 0; 76f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 77f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org int c0 = c; 78f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org int c1 = c0 + dcdy; 79f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org int c2 = c1 + dcdy; 80f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org int c3 = c2 + dcdy; 81f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
82f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org mask |= ((c0 + 0 * dcdx) >> 31) & (1 << 0); 83f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org mask |= ((c0 + 1 * dcdx) >> 31) & (1 << 1); 84f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org mask |= ((c0 + 2 * dcdx) >> 31) & (1 << 2); 85f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org mask |= ((c0 + 3 * dcdx) >> 31) & (1 << 3); 86f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org mask |= ((c1 + 0 * dcdx) >> 31) & (1 << 4); 87f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org mask |= ((c1 + 1 * dcdx) >> 31) & (1 << 5); 88f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org mask |= ((c1 + 2 * dcdx) >> 31) & (1 << 6); 89f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org mask |= ((c1 + 3 * dcdx) >> 31) & (1 << 7); 90f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org mask |= ((c2 + 0 * dcdx) >> 31) & (1 << 8); 91f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org mask |= ((c2 + 1 * dcdx) >> 31) & (1 << 9); 92f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org mask |= ((c2 + 2 * dcdx) >> 31) & (1 << 10); 93f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org mask |= ((c2 + 3 * dcdx) >> 31) & (1 << 11); 94f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org mask |= ((c3 + 0 * dcdx) >> 31) & (1 << 12); 95f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org mask |= ((c3 + 1 * dcdx) >> 31) & (1 << 13); 96f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org mask |= ((c3 + 2 * dcdx) >> 31) & (1 << 14); 97f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org mask |= ((c3 + 3 * dcdx) >> 31) & (1 << 15); 98f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 99f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return mask; 100f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 101f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 102f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
103f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgstatic INLINE void 104f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgbuild_masks(int c, 105f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org int cdiff, 106f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org int dcdx, 107f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org int dcdy, 108f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned *outmask, 109f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned *partmask) 110f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 111f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org *outmask |= build_mask_linear(c, dcdx, dcdy); 112f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org *partmask |= build_mask_linear(c + cdiff, dcdx, dcdy); 113f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 114f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 115f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgvoid 116f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_rast_triangle_3_16(struct lp_rasterizer_task *task, 117f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const union lp_rast_cmd_arg arg) 118f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 119f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org union lp_rast_cmd_arg arg2; 120f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org arg2.triangle.tri = arg.triangle.tri; 121f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org arg2.triangle.plane_mask = (1<<3)-1; 122f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_rast_triangle_3(task, arg2); 123f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 124f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 125f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgvoid 126f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_rast_triangle_4_16(struct lp_rasterizer_task *task, 127f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
const union lp_rast_cmd_arg arg) 128f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 129f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org union lp_rast_cmd_arg arg2; 130f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org arg2.triangle.tri = arg.triangle.tri; 131f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org arg2.triangle.plane_mask = (1<<4)-1; 132f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_rast_triangle_4(task, arg2); 133f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 134f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 135f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgvoid 136f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_rast_triangle_3_4(struct lp_rasterizer_task *task, 137f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const union lp_rast_cmd_arg arg) 138f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 139f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_rast_triangle_3_16(task, arg); 140f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 141f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 142f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#else 143f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include <emmintrin.h> 144f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#include "util/u_sse.h" 145f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 146f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 147f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgstatic INLINE void 148f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgbuild_masks(int c, 149f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org int cdiff, 150f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org int dcdx, 151f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org int dcdy, 152f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned *outmask, 153f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
unsigned *partmask) 154f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 155f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); 156f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i xdcdy = _mm_set1_epi32(dcdy); 157f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 158f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* Get values across the quad 159f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 160f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy); 161f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy); 162f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy); 163f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 164f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org { 165f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i cstep01, cstep23, result; 166f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 167f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org cstep01 = _mm_packs_epi32(cstep0, cstep1); 168f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org cstep23 = _mm_packs_epi32(cstep2, cstep3); 169f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org result = _mm_packs_epi16(cstep01, cstep23); 170f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 171f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org *outmask |= _mm_movemask_epi8(result); 172f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 173f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 174f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 175f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org { 176f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i cio4 = _mm_set1_epi32(cdiff); 177f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i 
cstep01, cstep23, result; 178f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 179f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org cstep0 = _mm_add_epi32(cstep0, cio4); 180f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org cstep1 = _mm_add_epi32(cstep1, cio4); 181f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org cstep2 = _mm_add_epi32(cstep2, cio4); 182f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org cstep3 = _mm_add_epi32(cstep3, cio4); 183f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 184f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org cstep01 = _mm_packs_epi32(cstep0, cstep1); 185f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org cstep23 = _mm_packs_epi32(cstep2, cstep3); 186f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org result = _mm_packs_epi16(cstep01, cstep23); 187f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 188f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org *partmask |= _mm_movemask_epi8(result); 189f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 190f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 191f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 192f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 193f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgstatic INLINE unsigned 194f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgbuild_mask_linear(int c, int dcdx, int dcdy) 195f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 196f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3); 197f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i xdcdy = _mm_set1_epi32(dcdy); 198f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 199f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* Get values across the quad 200f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 201f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
__m128i cstep1 = _mm_add_epi32(cstep0, xdcdy); 202f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy); 203f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy); 204f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 205f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* pack pairs of results into epi16 206f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 207f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1); 208f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3); 209f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 210f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* pack into epi8, preserving sign bits 211f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 212f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i result = _mm_packs_epi16(cstep01, cstep23); 213f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 214f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* extract sign bits to create mask 215f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 216f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return _mm_movemask_epi8(result); 217f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 218f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 219f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgstatic INLINE unsigned 220f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgsign_bits4(const __m128i *cstep, int cdiff) 221f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 222f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 223f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* Adjust the step values 224f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 225f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i cio4 
= _mm_set1_epi32(cdiff); 226f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i cstep0 = _mm_add_epi32(cstep[0], cio4); 227f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i cstep1 = _mm_add_epi32(cstep[1], cio4); 228f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i cstep2 = _mm_add_epi32(cstep[2], cio4); 229f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i cstep3 = _mm_add_epi32(cstep[3], cio4); 230f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 231f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* Pack down to epi8 232f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 233f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1); 234f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3); 235f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i result = _mm_packs_epi16(cstep01, cstep23); 236f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 237f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* Extract the sign bits 238f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 239f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org return _mm_movemask_epi8(result); 240f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 241f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 242f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 243f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org#define NR_PLANES 3 244f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 245f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 246f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 247f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 248f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 249f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 250f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
251f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgvoid 252f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_rast_triangle_3_16(struct lp_rasterizer_task *task, 253f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const union lp_rast_cmd_arg arg) 254f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 255f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_rast_triangle *tri = arg.triangle.tri; 256f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_rast_plane *plane = GET_PLANES(tri); 257f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org int x = (arg.triangle.plane_mask & 0xff) + task->x; 258f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org int y = (arg.triangle.plane_mask >> 8) + task->y; 259f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned i, j; 260f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 261f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16]; 262f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned nr = 0; 263f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 264f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */ 265f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */ 266f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */ 267f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i zero = _mm_setzero_si128(); 268f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 269f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c; 270f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i dcdx; 271f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i dcdy; 
272f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i rej4; 273f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 274f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i dcdx2; 275f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i dcdx3; 276f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 277f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ 278f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ 279f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ 280f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i unused; 281f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 282f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org transpose4_epi32(&p0, &p1, &p2, &zero, 283f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org &c, &dcdx, &dcdy, &rej4); 284f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 285f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* Adjust dcdx; 286f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 287f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org dcdx = _mm_sub_epi32(zero, dcdx); 288f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 289f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x))); 290f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y))); 291f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org rej4 = _mm_slli_epi32(rej4, 2); 292f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 293f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */ 294f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org c 
= _mm_sub_epi32(c, _mm_set1_epi32(1)); 295f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1)); 296f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 297f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org dcdx2 = _mm_add_epi32(dcdx, dcdx); 298f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org dcdx3 = _mm_add_epi32(dcdx2, dcdx); 299f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 300f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3, 301f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org &span_0, &span_1, &span_2, &unused); 302f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 303f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org for (i = 0; i < 4; i++) { 304f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i cx = c; 305f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 306f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org for (j = 0; j < 4; j++) { 307f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c4rej = _mm_add_epi32(cx, rej4); 308f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i rej_masks = _mm_srai_epi32(c4rej, 31); 309f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 310f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* if (is_zero(rej_masks)) */ 311f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (_mm_movemask_epi8(rej_masks) == 0) { 312f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0); 313f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1); 314f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2); 315f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 316f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c_0 = 
_mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0); 317f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 318f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0)); 319f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1)); 320f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2)); 321f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 322f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1); 323f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c_01 = _mm_packs_epi32(c_0, c_1); 324f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 325f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0)); 326f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1)); 327f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2)); 328f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 329f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2); 330f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 331f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0)); 332f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1)); 333f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2)); 334f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 335f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3); 336f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
__m128i c_23 = _mm_packs_epi32(c_2, c_3); 337f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c_0123 = _mm_packs_epi16(c_01, c_23); 338f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 339f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned mask = _mm_movemask_epi8(c_0123); 340f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 341f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org out[nr].i = i; 342f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org out[nr].j = j; 343f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org out[nr].mask = mask; 344f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (mask != 0xffff) 345f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org nr++; 346f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 347f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2)); 348f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 349f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 350f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2)); 351f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 352f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 353f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org for (i = 0; i < nr; i++) 354f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_rast_shade_quads_mask(task, 355f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org &tri->inputs, 356f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org x + 4 * out[i].j, 357f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org y + 4 * out[i].i, 358f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0xffff & ~out[i].mask); 359f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 360f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 361f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
362f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 363f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 364f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 365f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orgvoid 366f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.orglp_rast_triangle_3_4(struct lp_rasterizer_task *task, 367f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const union lp_rast_cmd_arg arg) 368f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org{ 369f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_rast_triangle *tri = arg.triangle.tri; 370f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org const struct lp_rast_plane *plane = GET_PLANES(tri); 371f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned x = (arg.triangle.plane_mask & 0xff) + task->x; 372f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned y = (arg.triangle.plane_mask >> 8) + task->y; 373f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 374f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */ 375f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */ 376f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */ 377f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i zero = _mm_setzero_si128(); 378f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 379f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c; 380f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i dcdx; 381f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i dcdy; 382f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 383f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i dcdx2; 
384f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i dcdx3; 385f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 386f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ 387f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ 388f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ 389f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i unused; 390f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 391f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org transpose4_epi32(&p0, &p1, &p2, &zero, 392f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org &c, &dcdx, &dcdy, &unused); 393f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 394f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* Adjust dcdx; 395f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org */ 396f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org dcdx = _mm_sub_epi32(zero, dcdx); 397f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 398f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x))); 399f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y))); 400f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 401f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */ 402f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org c = _mm_sub_epi32(c, _mm_set1_epi32(1)); 403f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 404f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org dcdx2 = _mm_add_epi32(dcdx, dcdx); 405f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org dcdx3 = _mm_add_epi32(dcdx2, dcdx); 
406f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 407f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3, 408f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org &span_0, &span_1, &span_2, &unused); 409f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 410f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 411f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org { 412f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0); 413f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1); 414f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2); 415f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 416f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0); 417f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 418f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0)); 419f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1)); 420f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2)); 421f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 422f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1); 423f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c_01 = _mm_packs_epi32(c_0, c_1); 424f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 425f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0)); 426f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1)); 
427f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2)); 428f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 429f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2); 430f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 431f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0)); 432f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1)); 433f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2)); 434f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 435f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3); 436f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c_23 = _mm_packs_epi32(c_2, c_3); 437f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org __m128i c_0123 = _mm_packs_epi16(c_01, c_23); 438f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 439f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org unsigned mask = _mm_movemask_epi8(c_0123); 440f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 441f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org if (mask != 0xffff) 442f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org lp_rast_shade_quads_mask(task, 443f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org &tri->inputs, 444f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org x, 445f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org y, 446f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 0xffff & ~mask); 447f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org } 448f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org} 449f2ba7591b1407a7ee9209f842c50696914dc2dedkbr@chromium.org 
/* End of the section that used NR_PLANES above.  The #endif closes a
 * conditional opened earlier in the file (not visible from here;
 * presumably the SSE-only code path -- confirm against the matching #if).
 */
#undef NR_PLANES
#endif



/*
 * Instantiate lp_rast_tri_tmp.h once per plane count, 1 through 8.
 *
 * Each stanza defines TAG(x) to append a "_<n>" suffix and NR_PLANES to
 * the matching count before including the header.  The macros are defined
 * again for the next stanza without an intervening #undef here, so the
 * header is presumably responsible for undefining them (along with
 * TRI_4 / TRI_16 when set) -- verify in lp_rast_tri_tmp.h.
 */
#define TAG(x) x##_1
#define NR_PLANES 1
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_2
#define NR_PLANES 2
#include "lp_rast_tri_tmp.h"

/* Three planes: the TRI_4 / TRI_16 overrides below are deliberately left
 * commented out, so this instantiation does not name
 * lp_rast_triangle_3_4 / lp_rast_triangle_3_16 as its 4x4 / 16x16
 * handlers.  NOTE(review): the reason for disabling them is not recorded
 * here.
 */
#define TAG(x) x##_3
#define NR_PLANES 3
/*#define TRI_4 lp_rast_triangle_3_4*/
/*#define TRI_16 lp_rast_triangle_3_16*/
#include "lp_rast_tri_tmp.h"

/* Four planes: TRI_16 is set for this instantiation only; how the header
 * uses it is not visible from this file section.
 */
#define TAG(x) x##_4
#define NR_PLANES 4
#define TRI_16 lp_rast_triangle_4_16
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_5
#define NR_PLANES 5
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_6
#define NR_PLANES 6
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_7
#define NR_PLANES 7
#include "lp_rast_tri_tmp.h"

#define TAG(x) x##_8
#define NR_PLANES 8
#include "lp_rast_tri_tmp.h"
