1/************************************************************************** 2 * 3 * Copyright 2010 VMware, Inc. 4 * All Rights Reserved. 5 * 6 * Permission is hereby granted, free of charge, to any person obtaining a 7 * copy of this software and associated documentation files (the 8 * "Software"), to deal in the Software without restriction, including 9 * without limitation the rights to use, copy, modify, merge, publish, 10 * distribute, sub license, and/or sell copies of the Software, and to 11 * permit persons to whom the Software is furnished to do so, subject to 12 * the following conditions: 13 * 14 * The above copyright notice and this permission notice (including the 15 * next paragraph) shall be included in all copies or substantial portions 16 * of the Software. 17 * 18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 19 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. 21 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR 22 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 23 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 24 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 25 * 26 **************************************************************************/ 27 28/** 29 * @file 30 * Texture sampling -- AoS. 
 *
 * @author Jose Fonseca <jfonseca@vmware.com>
 * @author Brian Paul <brianp@vmware.com>
 */

#include "pipe/p_defines.h"
#include "pipe/p_state.h"
#include "util/u_debug.h"
#include "util/u_dump.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/u_format.h"
#include "util/u_cpu_detect.h"
#include "lp_bld_debug.h"
#include "lp_bld_type.h"
#include "lp_bld_const.h"
#include "lp_bld_conv.h"
#include "lp_bld_arit.h"
#include "lp_bld_bitarit.h"
#include "lp_bld_logic.h"
#include "lp_bld_swizzle.h"
#include "lp_bld_pack.h"
#include "lp_bld_flow.h"
#include "lp_bld_gather.h"
#include "lp_bld_format.h"
#include "lp_bld_init.h"
#include "lp_bld_sample.h"
#include "lp_bld_sample_aos.h"
#include "lp_bld_quad.h"


/**
 * Build LLVM code for texture coord wrapping, for nearest filtering,
 * for scaled integer texcoords.
 *
 * Only PIPE_TEX_WRAP_REPEAT and PIPE_TEX_WRAP_CLAMP_TO_EDGE are handled
 * here; the remaining wrap modes hit the assert in the default case.
 *
 * \param block_length  is the length of the pixel block along the
 *                      coordinate axis
 * \param coord  the incoming texcoord (s,t or r) scaled to the texture size
 * \param coord_f  the incoming texcoord (s,t or r) as float vec
 * \param length  the texture size along one dimension
 * \param stride  pixel stride along the coordinate axis (in bytes)
 * \param offset  the texel offset along the coord axis
 * \param is_pot  if TRUE, length is a power of two
 * \param wrap_mode  one of PIPE_TEX_WRAP_x
 * \param out_offset  byte offset for the wrapped coordinate
 * \param out_i  resulting sub-block pixel coordinate for coord0
 */
static void
lp_build_sample_wrap_nearest_int(struct lp_build_sample_context *bld,
                                 unsigned block_length,
                                 LLVMValueRef coord,
                                 LLVMValueRef coord_f,
                                 LLVMValueRef length,
                                 LLVMValueRef stride,
                                 LLVMValueRef offset,
                                 boolean is_pot,
                                 unsigned wrap_mode,
                                 LLVMValueRef *out_offset,
                                 LLVMValueRef *out_i)
{
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef length_minus_one;

   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if(is_pot)
         /* POT size: repeat wrap is just a bitwise AND with size-1 */
         coord = LLVMBuildAnd(builder, coord, length_minus_one, "");
      else {
         /* NPOT size: redo the wrap in float space (fract, then scale up
          * by the size).  The incoming scaled integer coord can't be used
          * since a modulo would be needed.
          */
         struct lp_build_context *coord_bld = &bld->coord_bld;
         LLVMValueRef length_f = lp_build_int_to_float(coord_bld, length);
         if (offset) {
            /* apply the texel offset in normalized texcoord space */
            offset = lp_build_int_to_float(coord_bld, offset);
            offset = lp_build_div(coord_bld, offset, length_f);
            coord_f = lp_build_add(coord_bld, coord_f, offset);
         }
         /* fract_safe so the result stays strictly below length after
          * the multiply (plain fract could yield 1.0 for tiny negatives) */
         coord = lp_build_fract_safe(coord_bld, coord_f);
         coord = lp_build_mul(coord_bld, coord, length_f);
         coord = lp_build_itrunc(coord_bld, coord);
      }
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      /* clamp to [0, length - 1] */
      coord = lp_build_max(int_coord_bld, coord, int_coord_bld->zero);
      coord = lp_build_min(int_coord_bld, coord, length_minus_one);
      break;

   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
   default:
      /* these modes are never used on the AoS nearest-int path */
      assert(0);
   }

   /* turn the wrapped texel coord into a byte offset plus the
    * sub-block pixel coordinate */
   lp_build_sample_partial_offset(int_coord_bld, block_length, coord, stride,
                                  out_offset, out_i);
}


/**
 * Build LLVM code for texture coord wrapping, for nearest filtering,
 * for float texcoords.
137 * \param coord the incoming texcoord (s,t or r) 138 * \param length the texture size along one dimension 139 * \param offset the texel offset along the coord axis 140 * \param is_pot if TRUE, length is a power of two 141 * \param wrap_mode one of PIPE_TEX_WRAP_x 142 * \param icoord the texcoord after wrapping, as int 143 */ 144static void 145lp_build_sample_wrap_nearest_float(struct lp_build_sample_context *bld, 146 LLVMValueRef coord, 147 LLVMValueRef length, 148 LLVMValueRef offset, 149 boolean is_pot, 150 unsigned wrap_mode, 151 LLVMValueRef *icoord) 152{ 153 struct lp_build_context *coord_bld = &bld->coord_bld; 154 LLVMValueRef length_minus_one; 155 156 switch(wrap_mode) { 157 case PIPE_TEX_WRAP_REPEAT: 158 if (offset) { 159 /* this is definitely not ideal for POT case */ 160 offset = lp_build_int_to_float(coord_bld, offset); 161 offset = lp_build_div(coord_bld, offset, length); 162 coord = lp_build_add(coord_bld, coord, offset); 163 } 164 /* take fraction, unnormalize */ 165 coord = lp_build_fract_safe(coord_bld, coord); 166 coord = lp_build_mul(coord_bld, coord, length); 167 *icoord = lp_build_itrunc(coord_bld, coord); 168 break; 169 case PIPE_TEX_WRAP_CLAMP_TO_EDGE: 170 length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one); 171 if (bld->static_sampler_state->normalized_coords) { 172 /* scale coord to length */ 173 coord = lp_build_mul(coord_bld, coord, length); 174 } 175 if (offset) { 176 offset = lp_build_int_to_float(coord_bld, offset); 177 coord = lp_build_add(coord_bld, coord, offset); 178 } 179 coord = lp_build_clamp(coord_bld, coord, coord_bld->zero, 180 length_minus_one); 181 *icoord = lp_build_itrunc(coord_bld, coord); 182 break; 183 184 case PIPE_TEX_WRAP_CLAMP: 185 case PIPE_TEX_WRAP_CLAMP_TO_BORDER: 186 case PIPE_TEX_WRAP_MIRROR_REPEAT: 187 case PIPE_TEX_WRAP_MIRROR_CLAMP: 188 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE: 189 case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER: 190 default: 191 assert(0); 192 } 193} 194 195 196/** 197 * Helper 
 * to compute the first coord and the weight for
 * linear wrap repeat npot textures.
 *
 * Works in 8.8 fixed point: *coord0_i receives the floor texel index,
 * already repeat-wrapped into [0, length-1], and *weight_i receives the
 * 8-bit lerp weight between coord0 and coord0 + 1.
 */
static void
lp_build_coord_repeat_npot_linear_int(struct lp_build_sample_context *bld,
                                      LLVMValueRef coord_f,
                                      LLVMValueRef length_i,
                                      LLVMValueRef length_f,
                                      LLVMValueRef *coord0_i,
                                      LLVMValueRef *weight_i)
{
   struct lp_build_context *coord_bld = &bld->coord_bld;
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   struct lp_build_context abs_coord_bld;
   struct lp_type abs_type;
   LLVMValueRef length_minus_one = lp_build_sub(int_coord_bld, length_i,
                                                int_coord_bld->one);
   LLVMValueRef mask, i32_c8, i32_c128, i32_c255;

   /* wrap with normalized floats is just fract */
   coord_f = lp_build_fract(coord_bld, coord_f);
   /* mul by size */
   coord_f = lp_build_mul(coord_bld, coord_f, length_f);
   /* scale by 256 so the round below yields 8.8 fixed point,
    * then convert to int and compute the lerp weight */
   coord_f = lp_build_mul_imm(&bld->coord_bld, coord_f, 256);

   /* At this point we don't have any negative numbers so use non-signed
    * build context which might help on some archs.
    */
   abs_type = coord_bld->type;
   abs_type.sign = 0;
   lp_build_context_init(&abs_coord_bld, bld->gallivm, abs_type);
   *coord0_i = lp_build_iround(&abs_coord_bld, coord_f);

   /* subtract 0.5 (add -128 in 8.8 fixed point) */
   i32_c128 = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, -128);
   *coord0_i = LLVMBuildAdd(bld->gallivm->builder, *coord0_i, i32_c128, "");

   /* compute fractional part (AND with 0xff) */
   i32_c255 = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 255);
   *weight_i = LLVMBuildAnd(bld->gallivm->builder, *coord0_i, i32_c255, "");

   /* compute floor (shift right 8) */
   i32_c8 = lp_build_const_int_vec(bld->gallivm, bld->int_coord_type, 8);
   *coord0_i = LLVMBuildAShr(bld->gallivm->builder, *coord0_i, i32_c8, "");
   /*
    * we avoided the 0.5/length division before the repeat wrap,
    * now need to fix up edge cases with selects:
    * a negative floor means the coord wrapped below 0, so it maps to the
    * last texel
    */
   mask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                           PIPE_FUNC_LESS, *coord0_i, int_coord_bld->zero);
   *coord0_i = lp_build_select(int_coord_bld, mask, length_minus_one, *coord0_i);
   /*
    * We should never get values too large - except if coord was nan or inf,
    * in which case things go terribly wrong...
    * Alternatively, could use fract_safe above...
    */
   *coord0_i = lp_build_min(int_coord_bld, *coord0_i, length_minus_one);
}


/**
 * Build LLVM code for texture coord wrapping, for linear filtering,
 * for scaled integer texcoords.
 * \param block_length is the length of the pixel block along the
 *                     coordinate axis
 * \param coord0 the incoming texcoord (s,t or r) scaled to the texture size
 * \param weight_i the lerp weight as int (overwritten in the NPOT repeat case)
 * \param coord_f the incoming texcoord (s,t or r) as float vec
 * \param length the texture size along one dimension
 * \param stride pixel stride along the coordinate axis (in bytes)
 * \param offset the texel offset along the coord axis
 * \param is_pot if TRUE, length is a power of two
 * \param wrap_mode one of PIPE_TEX_WRAP_x
 * \param offset0 resulting relative offset for coord0
 * \param offset1 resulting relative offset for coord0 + 1
 * \param i0 resulting sub-block pixel coordinate for coord0
 * \param i1 resulting sub-block pixel coordinate for coord0 + 1
 */
static void
lp_build_sample_wrap_linear_int(struct lp_build_sample_context *bld,
                                unsigned block_length,
                                LLVMValueRef coord0,
                                LLVMValueRef *weight_i,
                                LLVMValueRef coord_f,
                                LLVMValueRef length,
                                LLVMValueRef stride,
                                LLVMValueRef offset,
                                boolean is_pot,
                                unsigned wrap_mode,
                                LLVMValueRef *offset0,
                                LLVMValueRef *offset1,
                                LLVMValueRef *i0,
                                LLVMValueRef *i1)
{
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef length_minus_one;
   LLVMValueRef lmask, umask, mask;

   /*
    * If the pixel block covers more than one pixel then there is no easy
    * way to calculate offset1 relative to offset0. Instead, compute them
    * independently. Otherwise, try to compute offset0 and offset1 with
    * a single stride multiplication.
    */

   length_minus_one = lp_build_sub(int_coord_bld, length, int_coord_bld->one);

   if (block_length != 1) {
      /* Multi-pixel block: wrap both texel coords, then compute each
       * offset/sub-coord pair independently. */
      LLVMValueRef coord1;
      switch(wrap_mode) {
      case PIPE_TEX_WRAP_REPEAT:
         if (is_pot) {
            /* POT: wrap both coords with a simple AND against size-1 */
            coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
            coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
            coord1 = LLVMBuildAnd(builder, coord1, length_minus_one, "");
         }
         else {
            LLVMValueRef mask;
            LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
            if (offset) {
               /* apply the texel offset in normalized texcoord space */
               offset = lp_build_int_to_float(&bld->coord_bld, offset);
               offset = lp_build_div(&bld->coord_bld, offset, length_f);
               coord_f = lp_build_add(&bld->coord_bld, coord_f, offset);
            }
            /* NPOT: recompute coord0 and the lerp weight from the float
             * coord */
            lp_build_coord_repeat_npot_linear_int(bld, coord_f,
                                                  length, length_f,
                                                  &coord0, weight_i);
            /* coord1 = coord0 + 1, except it wraps back to 0 when coord0
             * is the last texel - the AND with the notequal mask zeroes
             * it in that case */
            mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
                                    PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);
            coord1 = LLVMBuildAnd(builder,
                                  lp_build_add(int_coord_bld, coord0,
                                               int_coord_bld->one),
                                  mask, "");
         }
         break;

      case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
         /* clamp both texel coords to [0, length - 1] */
         coord1 = lp_build_add(int_coord_bld, coord0, int_coord_bld->one);
         coord0 = lp_build_clamp(int_coord_bld, coord0, int_coord_bld->zero,
                                 length_minus_one);
         coord1 = lp_build_clamp(int_coord_bld, coord1, int_coord_bld->zero,
                                 length_minus_one);
         break;

      case PIPE_TEX_WRAP_CLAMP:
      case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
      case PIPE_TEX_WRAP_MIRROR_REPEAT:
      case PIPE_TEX_WRAP_MIRROR_CLAMP:
      case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
      case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
      default:
         /* these modes are never used on the AoS linear-int path */
         assert(0);
         coord0 = int_coord_bld->zero;
         coord1 = int_coord_bld->zero;
         break;
      }
      lp_build_sample_partial_offset(int_coord_bld, block_length, coord0, stride,
                                     offset0, i0);
      lp_build_sample_partial_offset(int_coord_bld, block_length, coord1, stride,
                                     offset1, i1);
      return;
   }

   /* Single-pixel block: the sub-block coords are always zero and
    * offset1 can be derived from offset0 with one stride multiply. */
   *i0 = int_coord_bld->zero;
   *i1 = int_coord_bld->zero;

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (is_pot) {
         coord0 = LLVMBuildAnd(builder, coord0, length_minus_one, "");
      }
      else {
         LLVMValueRef length_f = lp_build_int_to_float(&bld->coord_bld, length);
         if (offset) {
            /* apply the texel offset in normalized texcoord space */
            offset = lp_build_int_to_float(&bld->coord_bld, offset);
            offset = lp_build_div(&bld->coord_bld, offset, length_f);
            coord_f = lp_build_add(&bld->coord_bld, coord_f, offset);
         }
         lp_build_coord_repeat_npot_linear_int(bld, coord_f,
                                               length, length_f,
                                               &coord0, weight_i);
      }

      /* offset1 = offset0 + stride, wrapping to 0 (via the AND with the
       * notequal mask) when coord0 is the last texel */
      mask = lp_build_compare(bld->gallivm, int_coord_bld->type,
                              PIPE_FUNC_NOTEQUAL, coord0, length_minus_one);

      *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
      *offset1 = LLVMBuildAnd(builder,
                              lp_build_add(int_coord_bld, *offset0, stride),
                              mask, "");
      break;

   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      /* XXX this might be slower than the separate path
       * on some newer cpus. With sse41 this is 8 instructions vs. 7
       * - at least on SNB this is almost certainly slower since
       * min/max are cheaper than selects, and the muls aren't bad.
       */
      lmask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                               PIPE_FUNC_GEQUAL, coord0, int_coord_bld->zero);
      umask = lp_build_compare(int_coord_bld->gallivm, int_coord_bld->type,
                               PIPE_FUNC_LESS, coord0, length_minus_one);

      /* clamp coord0 to [0, length - 1] with selects */
      coord0 = lp_build_select(int_coord_bld, lmask, coord0, int_coord_bld->zero);
      coord0 = lp_build_select(int_coord_bld, umask, coord0, length_minus_one);

      /* mask is true only when coord0 was strictly inside the texture, so
       * offset1 advances by stride only in that case (otherwise coord1
       * clamps to the same texel as coord0) */
      mask = LLVMBuildAnd(builder, lmask, umask, "");

      *offset0 = lp_build_mul(int_coord_bld, coord0, stride);
      *offset1 = lp_build_add(int_coord_bld,
                              *offset0,
                              LLVMBuildAnd(builder, stride, mask, ""));
      break;

   case PIPE_TEX_WRAP_CLAMP:
   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
   case PIPE_TEX_WRAP_MIRROR_REPEAT:
   case PIPE_TEX_WRAP_MIRROR_CLAMP:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
   default:
      /* these modes are never used on the AoS linear-int path */
      assert(0);
      *offset0 = int_coord_bld->zero;
      *offset1 = int_coord_bld->zero;
      break;
   }
}


/**
 * Build LLVM code for texture coord wrapping, for linear filtering,
 * for float texcoords.
 * \param block_length is the length of the pixel block along the
 *                     coordinate axis
 * \param coord the incoming texcoord (s,t or r)
 * \param length the texture size along one dimension
 * \param offset the texel offset along the coord axis
 * \param is_pot if TRUE, length is a power of two
 * \param wrap_mode one of PIPE_TEX_WRAP_x
 * \param coord0 the first texcoord after wrapping, as int
 * \param coord1 the second texcoord after wrapping, as int
 * \param weight the filter weight as int (0-255)
 * \param force_nearest if this coord actually uses nearest filtering
 */
static void
lp_build_sample_wrap_linear_float(struct lp_build_sample_context *bld,
                                  unsigned block_length,
                                  LLVMValueRef coord,
                                  LLVMValueRef length,
                                  LLVMValueRef offset,
                                  boolean is_pot,
                                  unsigned wrap_mode,
                                  LLVMValueRef *coord0,
                                  LLVMValueRef *coord1,
                                  LLVMValueRef *weight,
                                  unsigned force_nearest)
{
   struct lp_build_context *int_coord_bld = &bld->int_coord_bld;
   struct lp_build_context *coord_bld = &bld->coord_bld;
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef half = lp_build_const_vec(bld->gallivm, coord_bld->type, 0.5);
   LLVMValueRef length_minus_one = lp_build_sub(coord_bld, length, coord_bld->one);

   switch(wrap_mode) {
   case PIPE_TEX_WRAP_REPEAT:
      if (is_pot) {
         /* mul by size and subtract 0.5 */
         coord = lp_build_mul(coord_bld, coord, length);
         if (offset) {
            offset = lp_build_int_to_float(coord_bld, offset);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         /* with force_nearest the 0.5 texel-center shift is skipped */
         if (!force_nearest)
            coord = lp_build_sub(coord_bld, coord, half);
         *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
         /* convert to int, compute lerp weight */
         lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
         *coord1 = lp_build_ifloor(coord_bld, *coord1);
         /* repeat wrap: AND with size-1 works since size is POT */
         length_minus_one = lp_build_itrunc(coord_bld, length_minus_one);
         *coord0 = LLVMBuildAnd(builder, *coord0, length_minus_one, "");
         *coord1 = LLVMBuildAnd(builder, *coord1, length_minus_one, "");
      }
      else {
         LLVMValueRef mask;
         if (offset) {
            /* apply the texel offset in normalized texcoord space */
            offset = lp_build_int_to_float(coord_bld, offset);
            offset = lp_build_div(coord_bld, offset, length);
            coord = lp_build_add(coord_bld, coord, offset);
         }
         /* wrap with normalized floats is just fract */
         coord = lp_build_fract(coord_bld, coord);
         /* unnormalize */
         coord = lp_build_mul(coord_bld, coord, length);
         /*
          * we avoided the 0.5/length division, have to fix up wrong
          * edge cases with selects
          */
         *coord1 = lp_build_add(coord_bld, coord, half);
         coord = lp_build_sub(coord_bld, coord, half);
         *weight = lp_build_fract(coord_bld, coord);
         /*
          * It is important for this comparison to be unordered
          * (or need fract_safe above).
          */
         mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
                                 PIPE_FUNC_LESS, coord, coord_bld->zero);
         /* coord0 below zero wraps to the last texel */
         *coord0 = lp_build_select(coord_bld, mask, length_minus_one, coord);
         *coord0 = lp_build_itrunc(coord_bld, *coord0);
         mask = lp_build_compare(coord_bld->gallivm, coord_bld->type,
                                 PIPE_FUNC_LESS, *coord1, length);
         /* coord1 at or past length wraps to texel zero */
         *coord1 = lp_build_select(coord_bld, mask, *coord1, coord_bld->zero);
         *coord1 = lp_build_itrunc(coord_bld, *coord1);
      }
      break;
   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
      if (bld->static_sampler_state->normalized_coords) {
         /* mul by tex size */
         coord = lp_build_mul(coord_bld, coord, length);
      }
      if (offset) {
         offset = lp_build_int_to_float(coord_bld, offset);
         coord = lp_build_add(coord_bld, coord, offset);
      }
      /* subtract 0.5 (texel-center shift, skipped for force_nearest) */
      if (!force_nearest) {
         coord = lp_build_sub(coord_bld, coord, half);
      }
      /* clamp to [0, length - 1]; min_ext handles a NaN coord by
       * returning the non-NaN operand */
      coord = lp_build_min_ext(coord_bld, coord, length_minus_one,
                               GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
      coord = lp_build_max(coord_bld, coord, coord_bld->zero);
      *coord1 = lp_build_add(coord_bld, coord, coord_bld->one);
      /* convert to int, compute lerp weight */
      lp_build_ifloor_fract(coord_bld, coord, coord0, weight);
      /* coord1 = min(coord1, length-1) */
      *coord1 = lp_build_min(coord_bld, *coord1, length_minus_one);
      *coord1 = lp_build_itrunc(coord_bld, *coord1);
      break;
   default:
      /* other wrap modes are never used on this path */
      assert(0);
      *coord0 = int_coord_bld->zero;
      *coord1 = int_coord_bld->zero;
      *weight = coord_bld->zero;
      break;
   }
   /* scale the weight to 8-bit fixed point (0-255) */
   *weight = lp_build_mul_imm(coord_bld, *weight, 256);
   *weight = lp_build_itrunc(coord_bld, *weight);
   return;
}


/**
 * Fetch texels for image with nearest sampling.
 * Return filtered color as two vectors of 16-bit fixed point values.
 */
static void
lp_build_sample_fetch_image_nearest(struct lp_build_sample_context *bld,
                                    LLVMValueRef data_ptr,
                                    LLVMValueRef offset,
                                    LLVMValueRef x_subcoord,
                                    LLVMValueRef y_subcoord,
                                    LLVMValueRef *colors)
{
   /*
    * Fetch the pixels as 4 x 32bit (rgba order might differ):
    *
    *   rgba0 rgba1 rgba2 rgba3
    *
    * bit cast them into 16 x u8
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
    *
    * unpack them into two 8 x i16:
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1
    *   r2 g2 b2 a2 r3 g3 b3 a3
    *
    * The higher 8 bits of the resulting elements will be zero.
    */
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef rgba8;
   struct lp_build_context u8n;
   LLVMTypeRef u8n_vec_type;
   struct lp_type fetch_type;

   lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
   u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);

   fetch_type = lp_type_uint(bld->texel_type.width);
   if (util_format_is_rgba8_variant(bld->format_desc)) {
      /*
       * Given the format is a rgba8, just read the pixels as is,
       * without any swizzling. Swizzling will be done later.
       */
      rgba8 = lp_build_gather(bld->gallivm,
                              bld->texel_type.length,
                              bld->format_desc->block.bits,
                              fetch_type,
                              TRUE,
                              data_ptr, offset, TRUE);

      rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
   }
   else {
      /* generic path: fetch and convert to 8-bit unorm AoS */
      rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
                                      bld->format_desc,
                                      u8n.type,
                                      TRUE,
                                      data_ptr, offset,
                                      x_subcoord,
                                      y_subcoord,
                                      bld->cache);
   }

   *colors = rgba8;
}


/**
 * Sample a single texture image with nearest sampling.
 * If sampling a cube texture, r = cube face in [0,5].
 * Return filtered color as two vectors of 16-bit fixed point values.
 */
static void
lp_build_sample_image_nearest(struct lp_build_sample_context *bld,
                              LLVMValueRef int_size,
                              LLVMValueRef row_stride_vec,
                              LLVMValueRef img_stride_vec,
                              LLVMValueRef data_ptr,
                              LLVMValueRef mipoffsets,
                              LLVMValueRef s,
                              LLVMValueRef t,
                              LLVMValueRef r,
                              const LLVMValueRef *offsets,
                              LLVMValueRef *colors)
{
   const unsigned dims = bld->dims;
   struct lp_build_context i32;
   LLVMValueRef width_vec, height_vec, depth_vec;
   LLVMValueRef s_ipart, t_ipart = NULL, r_ipart = NULL;
   LLVMValueRef s_float, t_float = NULL, r_float = NULL;
   LLVMValueRef x_stride;
   LLVMValueRef x_offset, offset;
   LLVMValueRef x_subcoord, y_subcoord, z_subcoord;

   lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));

   lp_build_extract_image_sizes(bld,
                                &bld->int_size_bld,
                                bld->int_coord_type,
                                int_size,
                                &width_vec,
                                &height_vec,
                                &depth_vec);

   /* keep the original float coords - the NPOT wrap code needs them */
   s_float = s; t_float = t; r_float = r;

   if (bld->static_sampler_state->normalized_coords) {
      LLVMValueRef flt_size;

      flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);

      lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r);
   }

   /* convert float to int */
   /* For correct rounding, need floor, not truncation here.
    * Note that in some cases (clamp to edge, no texel offsets) we
    * could use a non-signed build context which would help archs
    * greatly which don't have arch rounding.
    */
   s_ipart = lp_build_ifloor(&bld->coord_bld, s);
   if (dims >= 2)
      t_ipart = lp_build_ifloor(&bld->coord_bld, t);
   if (dims >= 3)
      r_ipart = lp_build_ifloor(&bld->coord_bld, r);

   /* add texel offsets */
   if (offsets[0]) {
      s_ipart = lp_build_add(&i32, s_ipart, offsets[0]);
      if (dims >= 2) {
         t_ipart = lp_build_add(&i32, t_ipart, offsets[1]);
         if (dims >= 3) {
            r_ipart = lp_build_add(&i32, r_ipart, offsets[2]);
         }
      }
   }

   /* get pixel, row, image strides */
   x_stride = lp_build_const_vec(bld->gallivm,
                                 bld->int_coord_bld.type,
                                 bld->format_desc->block.bits/8);

   /* Do texcoord wrapping, compute texel offset;
    * the per-axis byte offsets are accumulated into one offset below */
   lp_build_sample_wrap_nearest_int(bld,
                                    bld->format_desc->block.width,
                                    s_ipart, s_float,
                                    width_vec, x_stride, offsets[0],
                                    bld->static_texture_state->pot_width,
                                    bld->static_sampler_state->wrap_s,
                                    &x_offset, &x_subcoord);
   offset = x_offset;
   if (dims >= 2) {
      LLVMValueRef y_offset;
      lp_build_sample_wrap_nearest_int(bld,
                                       bld->format_desc->block.height,
                                       t_ipart, t_float,
                                       height_vec, row_stride_vec, offsets[1],
                                       bld->static_texture_state->pot_height,
                                       bld->static_sampler_state->wrap_t,
                                       &y_offset, &y_subcoord);
      offset = lp_build_add(&bld->int_coord_bld, offset, y_offset);
      if (dims >= 3) {
         LLVMValueRef z_offset;
         lp_build_sample_wrap_nearest_int(bld,
                                          1, /* block length (depth) */
                                          r_ipart, r_float,
                                          depth_vec, img_stride_vec, offsets[2],
                                          bld->static_texture_state->pot_depth,
                                          bld->static_sampler_state->wrap_r,
                                          &z_offset, &z_subcoord);
         offset = lp_build_add(&bld->int_coord_bld, offset, z_offset);
      }
   }
   if (has_layer_coord(bld->static_texture_state->target)) {
      LLVMValueRef z_offset;
      /* The r coord is
         the cube face in [0,5] or array layer */
      z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
      offset = lp_build_add(&bld->int_coord_bld, offset, z_offset);
   }
   if (mipoffsets) {
      offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
   }

   lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
                                       x_subcoord, y_subcoord,
                                       colors);
}


/**
 * Sample a single texture image with nearest sampling.
 * If sampling a cube texture, r = cube face in [0,5].
 * Return filtered color as two vectors of 16-bit fixed point values.
 * Does address calcs (except offsets) with floats.
 * Useful for AVX which has support for 8x32 floats but not 8x32 ints.
 */
static void
lp_build_sample_image_nearest_afloat(struct lp_build_sample_context *bld,
                                     LLVMValueRef int_size,
                                     LLVMValueRef row_stride_vec,
                                     LLVMValueRef img_stride_vec,
                                     LLVMValueRef data_ptr,
                                     LLVMValueRef mipoffsets,
                                     LLVMValueRef s,
                                     LLVMValueRef t,
                                     LLVMValueRef r,
                                     const LLVMValueRef *offsets,
                                     LLVMValueRef *colors)
{
   const unsigned dims = bld->dims;
   LLVMValueRef width_vec, height_vec, depth_vec;
   LLVMValueRef offset;
   LLVMValueRef x_subcoord, y_subcoord;
   LLVMValueRef x_icoord = NULL, y_icoord = NULL, z_icoord = NULL;
   LLVMValueRef flt_size;

   flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);

   lp_build_extract_image_sizes(bld,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &width_vec,
                                &height_vec,
                                &depth_vec);

   /* Do texcoord wrapping (per axis, in float) */
   lp_build_sample_wrap_nearest_float(bld,
                                      s, width_vec, offsets[0],
                                      bld->static_texture_state->pot_width,
                                      bld->static_sampler_state->wrap_s,
                                      &x_icoord);

   if (dims >= 2) {
      lp_build_sample_wrap_nearest_float(bld,
                                         t, height_vec, offsets[1],
                                         bld->static_texture_state->pot_height,
                                         bld->static_sampler_state->wrap_t,
                                         &y_icoord);

      if (dims >= 3) {
         lp_build_sample_wrap_nearest_float(bld,
                                            r, depth_vec, offsets[2],
                                            bld->static_texture_state->pot_depth,
                                            bld->static_sampler_state->wrap_r,
                                            &z_icoord);
      }
   }
   if (has_layer_coord(bld->static_texture_state->target)) {
      /* cube face / array layer index is already an integer coord */
      z_icoord = r;
   }

   /*
    * From here on we deal with ints, and we should split up the 256bit
    * vectors manually for better generated code.
    */

   /*
    * compute texel offsets -
    * cannot do offset calc with floats, difficult for block-based formats,
    * and not enough precision anyway.
    */
   lp_build_sample_offset(&bld->int_coord_bld,
                          bld->format_desc,
                          x_icoord, y_icoord,
                          z_icoord,
                          row_stride_vec, img_stride_vec,
                          &offset,
                          &x_subcoord, &y_subcoord);
   if (mipoffsets) {
      offset = lp_build_add(&bld->int_coord_bld, offset, mipoffsets);
   }

   lp_build_sample_fetch_image_nearest(bld, data_ptr, offset,
                                       x_subcoord, y_subcoord,
                                       colors);
}


/**
 * Fetch texels for image with linear sampling.
 * Return filtered color as two vectors of 16-bit fixed point values.
 */
static void
lp_build_sample_fetch_image_linear(struct lp_build_sample_context *bld,
                                   LLVMValueRef data_ptr,
                                   LLVMValueRef offset[2][2][2],
                                   LLVMValueRef x_subcoord[2],
                                   LLVMValueRef y_subcoord[2],
                                   LLVMValueRef s_fpart,
                                   LLVMValueRef t_fpart,
                                   LLVMValueRef r_fpart,
                                   LLVMValueRef *colors)
{
   const unsigned dims = bld->dims;
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_build_context u8n;
   LLVMTypeRef u8n_vec_type;
   LLVMTypeRef elem_type = LLVMInt32TypeInContext(bld->gallivm->context);
   LLVMValueRef shuffles[LP_MAX_VECTOR_LENGTH];
   LLVMValueRef shuffle;
   LLVMValueRef neighbors[2][2][2];    /* [z][y][x] */
   LLVMValueRef packed;
   unsigned i, j, k;
   unsigned numj, numk;

   lp_build_context_init(&u8n, bld->gallivm, lp_type_unorm(8, bld->vector_width));
   u8n_vec_type = lp_build_vec_type(bld->gallivm, u8n.type);

   /*
    * Transform 4 x i32 in
    *
    *   s_fpart = {s0, s1, s2, s3}
    *
    * where each value is between 0 and 0xff,
    *
    * into one 16 x u8
    *
    *   s_fpart = {s0, s0, s0, s0, s1, s1, s1, s1, s2, s2, s2, s2, s3, s3, s3, s3}
    *
    * and likewise for t_fpart. There is no risk of losing precision here
    * since the fractional parts only use the lower 8bits.
    */
   s_fpart = LLVMBuildBitCast(builder, s_fpart, u8n_vec_type, "");
   if (dims >= 2)
      t_fpart = LLVMBuildBitCast(builder, t_fpart, u8n_vec_type, "");
   if (dims >= 3)
      r_fpart = LLVMBuildBitCast(builder, r_fpart, u8n_vec_type, "");

   /* build a shuffle that replicates the low byte of each 32-bit lane
    * (byte 0 on little endian, byte 3 on big endian) across 4 u8 lanes */
   for (j = 0; j < u8n.type.length; j += 4) {
#ifdef PIPE_ARCH_LITTLE_ENDIAN
      unsigned subindex = 0;
#else
      unsigned subindex = 3;
#endif
      LLVMValueRef index;

      index = LLVMConstInt(elem_type, j + subindex, 0);
      for (i = 0; i < 4; ++i)
         shuffles[j + i] = index;
   }

   shuffle = LLVMConstVector(shuffles, u8n.type.length);

   s_fpart = LLVMBuildShuffleVector(builder, s_fpart, u8n.undef,
                                    shuffle, "");
   if (dims >= 2) {
      t_fpart = LLVMBuildShuffleVector(builder, t_fpart, u8n.undef,
                                       shuffle, "");
   }
   if (dims >= 3) {
      r_fpart = LLVMBuildShuffleVector(builder, r_fpart, u8n.undef,
                                       shuffle, "");
   }

   /*
    * Fetch the pixels as 4 x 32bit (rgba order might differ):
    *
    *   rgba0 rgba1 rgba2 rgba3
    *
    * bit cast them into 16 x u8
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1 r2 g2 b2 a2 r3 g3 b3 a3
    *
    * unpack them into two 8 x i16:
    *
    *   r0 g0 b0 a0 r1 g1 b1 a1
    *   r2 g2 b2 a2 r3 g3 b3 a3
    *
    * The higher 8 bits of the resulting elements will be zero.
    */
   numj = 1 + (dims >= 2);
   numk = 1 + (dims >= 3);

   /* fetch the 2/4/8 neighboring texels for the 1/2/3-D lerp */
   for (k = 0; k < numk; k++) {
      for (j = 0; j < numj; j++) {
         for (i = 0; i < 2; i++) {
            LLVMValueRef rgba8;

            if (util_format_is_rgba8_variant(bld->format_desc)) {
               struct lp_type fetch_type;
               /*
                * Given the format is a rgba8, just read the pixels as is,
                * without any swizzling. Swizzling will be done later.
                */
               fetch_type = lp_type_uint(bld->texel_type.width);
               rgba8 = lp_build_gather(bld->gallivm,
                                       bld->texel_type.length,
                                       bld->format_desc->block.bits,
                                       fetch_type,
                                       TRUE,
                                       data_ptr, offset[k][j][i], TRUE);

               rgba8 = LLVMBuildBitCast(builder, rgba8, u8n_vec_type, "");
            }
            else {
               /* generic path: fetch and convert to 8-bit unorm AoS */
               rgba8 = lp_build_fetch_rgba_aos(bld->gallivm,
                                               bld->format_desc,
                                               u8n.type,
                                               TRUE,
                                               data_ptr, offset[k][j][i],
                                               x_subcoord[i],
                                               y_subcoord[j],
                                               bld->cache);
            }

            neighbors[k][j][i] = rgba8;
         }
      }
   }

   /*
    * Linear interpolation with 8.8 fixed point.
    */
   if (bld->static_sampler_state->force_nearest_s) {
      /* special case 1-D lerp (only the t axis is filtered) */
      packed = lp_build_lerp(&u8n,
                             t_fpart,
                             neighbors[0][0][0],
                             neighbors[0][0][1],
                             LP_BLD_LERP_PRESCALED_WEIGHTS);
   }
   else if (bld->static_sampler_state->force_nearest_t) {
      /* special case 1-D lerp (only the s axis is filtered) */
      packed = lp_build_lerp(&u8n,
                             s_fpart,
                             neighbors[0][0][0],
                             neighbors[0][0][1],
                             LP_BLD_LERP_PRESCALED_WEIGHTS);
   }
   else {
      /* general 1/2/3-D lerping */
      if (dims == 1) {
         packed = lp_build_lerp(&u8n,
                                s_fpart,
                                neighbors[0][0][0],
                                neighbors[0][0][1],
                                LP_BLD_LERP_PRESCALED_WEIGHTS);
      } else if (dims == 2) {
         /* 2-D lerp */
         packed = lp_build_lerp_2d(&u8n,
                                   s_fpart, t_fpart,
                                   neighbors[0][0][0],
                                   neighbors[0][0][1],
                                   neighbors[0][1][0],
                                   neighbors[0][1][1],
                                   LP_BLD_LERP_PRESCALED_WEIGHTS);
      } else {
         /* 3-D lerp */
         assert(dims == 3);
         packed = lp_build_lerp_3d(&u8n,
                                   s_fpart, t_fpart, r_fpart,
                                   neighbors[0][0][0],
                                   neighbors[0][0][1],
                                   neighbors[0][1][0],
                                   neighbors[0][1][1],
                                   neighbors[1][0][0],
                                   neighbors[1][0][1],
                                   neighbors[1][1][0],
                                   neighbors[1][1][1],
                                   LP_BLD_LERP_PRESCALED_WEIGHTS);
      }
   }

   *colors = packed;
}

/**
 * Sample a single texture image with (bi-)(tri-)linear sampling.
 * Return filtered color as two vectors of 16-bit fixed point values.
 *
 * Coordinates are converted to fixed point with 8 fractional bits
 * (scale by 256, arithmetic shift right 8 for the integer part,
 * AND with 0xff for the fraction); the fractional parts become the
 * 8-bit lerp weights used by the fetch/interpolation step.
 *
 * \param int_size        integer vector holding the level's width/height/depth
 * \param row_stride_vec  row stride in bytes (used as the y stride)
 * \param img_stride_vec  image/slice stride in bytes (used as the z stride,
 *                        and as the layer stride for array/cube targets)
 * \param data_ptr        base pointer of the texture data to sample
 * \param mipoffsets      per-element mip level byte offsets, or NULL
 * \param s, t, r         incoming texture coordinates
 * \param offsets         optional texel offsets per axis (entries may be NULL)
 * \param colors          returns the packed filtered texels
 */
static void
lp_build_sample_image_linear(struct lp_build_sample_context *bld,
                             LLVMValueRef int_size,
                             LLVMValueRef row_stride_vec,
                             LLVMValueRef img_stride_vec,
                             LLVMValueRef data_ptr,
                             LLVMValueRef mipoffsets,
                             LLVMValueRef s,
                             LLVMValueRef t,
                             LLVMValueRef r,
                             const LLVMValueRef *offsets,
                             LLVMValueRef *colors)
{
   const unsigned dims = bld->dims;
   LLVMBuilderRef builder = bld->gallivm->builder;
   struct lp_build_context i32;
   LLVMValueRef i32_c8, i32_c128, i32_c255;
   LLVMValueRef width_vec, height_vec, depth_vec;
   LLVMValueRef s_ipart, s_fpart, s_float;
   LLVMValueRef t_ipart = NULL, t_fpart = NULL, t_float = NULL;
   LLVMValueRef r_ipart = NULL, r_fpart = NULL, r_float = NULL;
   LLVMValueRef x_stride, y_stride, z_stride;
   LLVMValueRef x_offset0, x_offset1;
   LLVMValueRef y_offset0, y_offset1;
   LLVMValueRef z_offset0, z_offset1;
   LLVMValueRef offset[2][2][2]; /* [z][y][x] */
   LLVMValueRef x_subcoord[2], y_subcoord[2], z_subcoord[2];
   unsigned x, y, z;

   lp_build_context_init(&i32, bld->gallivm, lp_type_int_vec(32, bld->vector_width));

   lp_build_extract_image_sizes(bld,
                                &bld->int_size_bld,
                                bld->int_coord_type,
                                int_size,
                                &width_vec,
                                &height_vec,
                                &depth_vec);

   /* keep the original float coords, the wrap code needs them too */
   s_float = s; t_float = t; r_float = r;

   if (bld->static_sampler_state->normalized_coords) {
      LLVMValueRef scaled_size;
      LLVMValueRef flt_size;

      /* scale size by 256 (8 fractional bits) */
      scaled_size = lp_build_shl_imm(&bld->int_size_bld, int_size, 8);

      flt_size = lp_build_int_to_float(&bld->float_size_bld, scaled_size);

      lp_build_unnormalized_coords(bld, flt_size, &s, &t, &r);
   }
   else {
      /* scale coords by 256 (8 fractional bits) */
      s = lp_build_mul_imm(&bld->coord_bld, s, 256);
      if (dims >= 2)
         t = lp_build_mul_imm(&bld->coord_bld, t, 256);
      if (dims >= 3)
         r = lp_build_mul_imm(&bld->coord_bld, r, 256);
   }

   /* convert float to int */
   /* For correct rounding, need round to nearest, not truncation here.
    * Note that in some cases (clamp to edge, no texel offsets) we
    * could use a non-signed build context which would help archs which
    * don't have fptosi intrinsic with nearest rounding implemented.
    */
   s = lp_build_iround(&bld->coord_bld, s);
   if (dims >= 2)
      t = lp_build_iround(&bld->coord_bld, t);
   if (dims >= 3)
      r = lp_build_iround(&bld->coord_bld, r);

   /* subtract 0.5 (add -128); 128 == 0.5 in this fixed point format.
    * Skipped on axes forced to nearest filtering.
    */
   i32_c128 = lp_build_const_int_vec(bld->gallivm, i32.type, -128);
   if (!bld->static_sampler_state->force_nearest_s) {
      s = LLVMBuildAdd(builder, s, i32_c128, "");
   }
   if (dims >= 2 && !bld->static_sampler_state->force_nearest_t) {
      t = LLVMBuildAdd(builder, t, i32_c128, "");
   }
   if (dims >= 3) {
      r = LLVMBuildAdd(builder, r, i32_c128, "");
   }

   /* compute floor (shift right 8) */
   i32_c8 = lp_build_const_int_vec(bld->gallivm, i32.type, 8);
   s_ipart = LLVMBuildAShr(builder, s, i32_c8, "");
   if (dims >= 2)
      t_ipart = LLVMBuildAShr(builder, t, i32_c8, "");
   if (dims >= 3)
      r_ipart = LLVMBuildAShr(builder, r, i32_c8, "");

   /* add texel offsets */
   if (offsets[0]) {
      s_ipart = lp_build_add(&i32, s_ipart, offsets[0]);
      if (dims >= 2) {
         t_ipart = lp_build_add(&i32, t_ipart, offsets[1]);
         if (dims >= 3) {
            r_ipart = lp_build_add(&i32, r_ipart, offsets[2]);
         }
      }
   }

   /* compute fractional part (AND with 0xff) */
   i32_c255 = lp_build_const_int_vec(bld->gallivm, i32.type, 255);
   s_fpart = LLVMBuildAnd(builder, s, i32_c255, "");
   if (dims >= 2)
      t_fpart = LLVMBuildAnd(builder, t, i32_c255, "");
   if (dims >= 3)
      r_fpart = LLVMBuildAnd(builder, r, i32_c255, "");

   /* get pixel, row and image strides */
   x_stride = lp_build_const_vec(bld->gallivm, bld->int_coord_bld.type,
                                 bld->format_desc->block.bits/8);
   y_stride = row_stride_vec;
   z_stride = img_stride_vec;

   /* do texcoord wrapping and compute texel offsets */
   lp_build_sample_wrap_linear_int(bld,
                                   bld->format_desc->block.width,
                                   s_ipart, &s_fpart, s_float,
                                   width_vec, x_stride, offsets[0],
                                   bld->static_texture_state->pot_width,
                                   bld->static_sampler_state->wrap_s,
                                   &x_offset0, &x_offset1,
                                   &x_subcoord[0], &x_subcoord[1]);

   /* add potential cube/array/mip offsets now as they are constant per pixel */
   if (has_layer_coord(bld->static_texture_state->target)) {
      LLVMValueRef z_offset;
      z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
      /* The r coord is the cube face in [0,5] or array layer */
      x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, z_offset);
      x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, z_offset);
   }
   if (mipoffsets) {
      x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, mipoffsets);
      x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, mipoffsets);
   }

   /* seed all eight neighbor offsets with the two x offsets;
    * the y/z contributions are added below
    */
   for (z = 0; z < 2; z++) {
      for (y = 0; y < 2; y++) {
         offset[z][y][0] = x_offset0;
         offset[z][y][1] = x_offset1;
      }
   }

   if (dims >= 2) {
      lp_build_sample_wrap_linear_int(bld,
                                      bld->format_desc->block.height,
                                      t_ipart, &t_fpart, t_float,
                                      height_vec, y_stride, offsets[1],
                                      bld->static_texture_state->pot_height,
                                      bld->static_sampler_state->wrap_t,
                                      &y_offset0, &y_offset1,
                                      &y_subcoord[0], &y_subcoord[1]);

      for (z = 0; z < 2; z++) {
         for (x = 0; x < 2; x++) {
            offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[z][0][x], y_offset0);
            offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[z][1][x], y_offset1);
         }
      }
   }

   if (dims >= 3) {
      lp_build_sample_wrap_linear_int(bld,
                                      1, /* block length (depth) */
                                      r_ipart, &r_fpart, r_float,
                                      depth_vec, z_stride, offsets[2],
                                      bld->static_texture_state->pot_depth,
                                      bld->static_sampler_state->wrap_r,
                                      &z_offset0, &z_offset1,
                                      &z_subcoord[0], &z_subcoord[1]);
      for (y = 0; y < 2; y++) {
         for (x = 0; x < 2; x++) {
            offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[0][y][x], z_offset0);
            offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[1][y][x], z_offset1);
         }
      }
   }

   lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
                                      x_subcoord, y_subcoord,
                                      s_fpart, t_fpart, r_fpart,
                                      colors);
}


/**
 * Sample a single texture image with (bi-)(tri-)linear sampling.
 * Return filtered color as two vectors of 16-bit fixed point values.
 * Does address calcs (except offsets) with floats.
 * Useful for AVX which has support for 8x32 floats but not 8x32 ints.
 *
 * Mirrors lp_build_sample_image_linear(); takes the same parameters.
 */
static void
lp_build_sample_image_linear_afloat(struct lp_build_sample_context *bld,
                                    LLVMValueRef int_size,
                                    LLVMValueRef row_stride_vec,
                                    LLVMValueRef img_stride_vec,
                                    LLVMValueRef data_ptr,
                                    LLVMValueRef mipoffsets,
                                    LLVMValueRef s,
                                    LLVMValueRef t,
                                    LLVMValueRef r,
                                    const LLVMValueRef *offsets,
                                    LLVMValueRef *colors)
{
   const unsigned dims = bld->dims;
   LLVMValueRef width_vec, height_vec, depth_vec;
   LLVMValueRef s_fpart;
   LLVMValueRef t_fpart = NULL;
   LLVMValueRef r_fpart = NULL;
   LLVMValueRef x_stride, y_stride, z_stride;
   LLVMValueRef x_offset0, x_offset1;
   LLVMValueRef y_offset0, y_offset1;
   LLVMValueRef z_offset0, z_offset1;
   LLVMValueRef offset[2][2][2]; /* [z][y][x] */
   LLVMValueRef x_subcoord[2], y_subcoord[2];
   LLVMValueRef flt_size;
   LLVMValueRef x_icoord0, x_icoord1;
   LLVMValueRef y_icoord0, y_icoord1;
   LLVMValueRef z_icoord0, z_icoord1;
   unsigned x, y, z;

   flt_size = lp_build_int_to_float(&bld->float_size_bld, int_size);

   lp_build_extract_image_sizes(bld,
                                &bld->float_size_bld,
                                bld->coord_type,
                                flt_size,
                                &width_vec,
                                &height_vec,
                                &depth_vec);

   /* do texcoord wrapping and compute texel offsets */
   lp_build_sample_wrap_linear_float(bld,
                                     bld->format_desc->block.width,
                                     s, width_vec, offsets[0],
                                     bld->static_texture_state->pot_width,
                                     bld->static_sampler_state->wrap_s,
                                     &x_icoord0, &x_icoord1,
                                     &s_fpart,
                                     bld->static_sampler_state->force_nearest_s);

   if (dims >= 2) {
      lp_build_sample_wrap_linear_float(bld,
                                        bld->format_desc->block.height,
                                        t, height_vec, offsets[1],
                                        bld->static_texture_state->pot_height,
                                        bld->static_sampler_state->wrap_t,
                                        &y_icoord0, &y_icoord1,
                                        &t_fpart,
                                        bld->static_sampler_state->force_nearest_t);

      if (dims >= 3) {
         lp_build_sample_wrap_linear_float(bld,
                                           1, /* block length (depth) */
                                           r, depth_vec, offsets[2],
                                           bld->static_texture_state->pot_depth,
                                           bld->static_sampler_state->wrap_r,
                                           &z_icoord0, &z_icoord1,
                                           &r_fpart, 0);
      }
   }

   /*
    * From here on we deal with ints, and we should split up the 256bit
    * vectors manually for better generated code.
    */

   /* get pixel, row and image strides */
   x_stride = lp_build_const_vec(bld->gallivm,
                                 bld->int_coord_bld.type,
                                 bld->format_desc->block.bits/8);
   y_stride = row_stride_vec;
   z_stride = img_stride_vec;

   /*
    * compute texel offset -
    * cannot do offset calc with floats, difficult for block-based formats,
    * and not enough precision anyway.
    */
   lp_build_sample_partial_offset(&bld->int_coord_bld,
                                  bld->format_desc->block.width,
                                  x_icoord0, x_stride,
                                  &x_offset0, &x_subcoord[0]);
   lp_build_sample_partial_offset(&bld->int_coord_bld,
                                  bld->format_desc->block.width,
                                  x_icoord1, x_stride,
                                  &x_offset1, &x_subcoord[1]);

   /* add potential cube/array/mip offsets now as they are constant per pixel */
   if (has_layer_coord(bld->static_texture_state->target)) {
      LLVMValueRef z_offset;
      z_offset = lp_build_mul(&bld->int_coord_bld, r, img_stride_vec);
      /* The r coord is the cube face in [0,5] or array layer */
      x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, z_offset);
      x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, z_offset);
   }
   if (mipoffsets) {
      x_offset0 = lp_build_add(&bld->int_coord_bld, x_offset0, mipoffsets);
      x_offset1 = lp_build_add(&bld->int_coord_bld, x_offset1, mipoffsets);
   }

   /* seed all eight neighbor offsets with the two x offsets;
    * the y/z contributions are added below
    */
   for (z = 0; z < 2; z++) {
      for (y = 0; y < 2; y++) {
         offset[z][y][0] = x_offset0;
         offset[z][y][1] = x_offset1;
      }
   }

   if (dims >= 2) {
      lp_build_sample_partial_offset(&bld->int_coord_bld,
                                     bld->format_desc->block.height,
                                     y_icoord0, y_stride,
                                     &y_offset0, &y_subcoord[0]);
      lp_build_sample_partial_offset(&bld->int_coord_bld,
                                     bld->format_desc->block.height,
                                     y_icoord1, y_stride,
                                     &y_offset1, &y_subcoord[1]);
      for (z = 0; z < 2; z++) {
         for (x = 0; x < 2; x++) {
            offset[z][0][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[z][0][x], y_offset0);
            offset[z][1][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[z][1][x], y_offset1);
         }
      }
   }

   if (dims >= 3) {
      LLVMValueRef z_subcoord[2];
      lp_build_sample_partial_offset(&bld->int_coord_bld,
                                     1,
                                     z_icoord0, z_stride,
                                     &z_offset0, &z_subcoord[0]);
      lp_build_sample_partial_offset(&bld->int_coord_bld,
                                     1,
                                     z_icoord1, z_stride,
                                     &z_offset1, &z_subcoord[1]);
      for (y = 0; y < 2; y++) {
         for (x = 0; x < 2; x++) {
            offset[0][y][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[0][y][x], z_offset0);
            offset[1][y][x] = lp_build_add(&bld->int_coord_bld,
                                           offset[1][y][x], z_offset1);
         }
      }
   }

   lp_build_sample_fetch_image_linear(bld, data_ptr, offset,
                                      x_subcoord, y_subcoord,
                                      s_fpart, t_fpart, r_fpart,
                                      colors);
}


/**
 * Sample the texture/mipmap using given image filter and mip filter.
 * ilevel0 and ilevel1 select the two mipmap levels to sample from
 * (level sizes and data offsets are looked up internally).
 * If we're using nearest miplevel sampling, the '1' values (ilevel1,
 * lod_fpart) will be null/unused.
 *
 * \param img_filter  PIPE_TEX_FILTER_NEAREST or PIPE_TEX_FILTER_LINEAR
 * \param mip_filter  one of PIPE_TEX_MIPFILTER_x
 * \param lod_fpart   fractional part of the lod, used as the lerp weight
 *                    between the two levels when mip_filter is LINEAR
 * \param colors_var  alloca'd variable the packed result is stored into
 */
static void
lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                       unsigned img_filter,
                       unsigned mip_filter,
                       LLVMValueRef s,
                       LLVMValueRef t,
                       LLVMValueRef r,
                       const LLVMValueRef *offsets,
                       LLVMValueRef ilevel0,
                       LLVMValueRef ilevel1,
                       LLVMValueRef lod_fpart,
                       LLVMValueRef colors_var)
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   LLVMValueRef size0;
   LLVMValueRef size1;
   LLVMValueRef row_stride0_vec = NULL;
   LLVMValueRef row_stride1_vec = NULL;
   LLVMValueRef img_stride0_vec = NULL;
   LLVMValueRef img_stride1_vec = NULL;
   LLVMValueRef data_ptr0;
   LLVMValueRef data_ptr1;
   LLVMValueRef mipoff0 = NULL;
   LLVMValueRef mipoff1 = NULL;
   LLVMValueRef colors0;
   LLVMValueRef colors1;
   /* AVX1-only (no AVX2) with wide vectors: use the float-based address
    * path, which has 8x32 floats but not 8x32 ints available.
    */
   boolean use_floats = util_cpu_caps.has_avx &&
                        !util_cpu_caps.has_avx2 &&
                        bld->coord_type.length > 4;

   /* sample the first mipmap level */
   lp_build_mipmap_level_sizes(bld, ilevel0,
                               &size0,
                               &row_stride0_vec, &img_stride0_vec);
   if (bld->num_mips == 1) {
      data_ptr0 = lp_build_get_mipmap_level(bld, ilevel0);
   }
   else {
      /* This path should work for num_lods 1 too but slightly less efficient */
      data_ptr0 = bld->base_ptr;
      mipoff0 = lp_build_get_mip_offsets(bld, ilevel0);
   }

   if (use_floats) {
      if (img_filter == PIPE_TEX_FILTER_NEAREST) {
         lp_build_sample_image_nearest_afloat(bld,
                                              size0,
                                              row_stride0_vec, img_stride0_vec,
                                              data_ptr0, mipoff0, s, t, r, offsets,
                                              &colors0);
      }
      else {
         assert(img_filter == PIPE_TEX_FILTER_LINEAR);
         lp_build_sample_image_linear_afloat(bld,
                                             size0,
                                             row_stride0_vec, img_stride0_vec,
                                             data_ptr0, mipoff0, s, t, r, offsets,
                                             &colors0);
      }
   }
   else {
      if (img_filter == PIPE_TEX_FILTER_NEAREST) {
         lp_build_sample_image_nearest(bld,
                                       size0,
                                       row_stride0_vec, img_stride0_vec,
                                       data_ptr0, mipoff0, s, t, r, offsets,
                                       &colors0);
      }
      else {
         assert(img_filter == PIPE_TEX_FILTER_LINEAR);
         lp_build_sample_image_linear(bld,
                                      size0,
                                      row_stride0_vec, img_stride0_vec,
                                      data_ptr0, mipoff0, s, t, r, offsets,
                                      &colors0);
      }
   }

   /* Store the first level's colors in the output variables */
   LLVMBuildStore(builder, colors0, colors_var);

   if (mip_filter == PIPE_TEX_MIPFILTER_LINEAR) {
      /* lod_fpart is converted to 8-bit fixed point (scale by 256) to
       * match the prescaled lerp weights used below.
       */
      LLVMValueRef h16vec_scale = lp_build_const_vec(bld->gallivm,
                                                     bld->lodf_bld.type, 256.0);
      LLVMTypeRef i32vec_type = bld->lodi_bld.vec_type;
      struct lp_build_if_state if_ctx;
      LLVMValueRef need_lerp;
      unsigned num_quads = bld->coord_bld.type.length / 4;
      unsigned i;

      lod_fpart = LLVMBuildFMul(builder, lod_fpart, h16vec_scale, "");
      lod_fpart = LLVMBuildFPToSI(builder, lod_fpart, i32vec_type, "lod_fpart.fixed16");

      /* need_lerp = lod_fpart > 0 */
      if (bld->num_lods == 1) {
         need_lerp = LLVMBuildICmp(builder, LLVMIntSGT,
                                   lod_fpart, bld->lodi_bld.zero,
                                   "need_lerp");
      }
      else {
         /*
          * We'll do mip filtering if any of the quads need it.
          * It might be better to split the vectors here and only fetch/filter
          * quads which need it.
          */
         /*
          * We need to clamp lod_fpart here since we can get negative
          * values which would screw up filtering if not all
          * lod_fpart values have same sign.
          * We can however then skip the greater than comparison.
          */
         lod_fpart = lp_build_max(&bld->lodi_bld, lod_fpart,
                                  bld->lodi_bld.zero);
         need_lerp = lp_build_any_true_range(&bld->lodi_bld, bld->num_lods, lod_fpart);
      }

      lp_build_if(&if_ctx, bld->gallivm, need_lerp);
      {
         struct lp_build_context u8n_bld;

         lp_build_context_init(&u8n_bld, bld->gallivm, lp_type_unorm(8, bld->vector_width));

         /* sample the second mipmap level */
         lp_build_mipmap_level_sizes(bld, ilevel1,
                                     &size1,
                                     &row_stride1_vec, &img_stride1_vec);
         if (bld->num_mips == 1) {
            data_ptr1 = lp_build_get_mipmap_level(bld, ilevel1);
         }
         else {
            data_ptr1 = bld->base_ptr;
            mipoff1 = lp_build_get_mip_offsets(bld, ilevel1);
         }

         if (use_floats) {
            if (img_filter == PIPE_TEX_FILTER_NEAREST) {
               lp_build_sample_image_nearest_afloat(bld,
                                                    size1,
                                                    row_stride1_vec, img_stride1_vec,
                                                    data_ptr1, mipoff1, s, t, r, offsets,
                                                    &colors1);
            }
            else {
               lp_build_sample_image_linear_afloat(bld,
                                                   size1,
                                                   row_stride1_vec, img_stride1_vec,
                                                   data_ptr1, mipoff1, s, t, r, offsets,
                                                   &colors1);
            }
         }
         else {
            if (img_filter == PIPE_TEX_FILTER_NEAREST) {
               lp_build_sample_image_nearest(bld,
                                             size1,
                                             row_stride1_vec, img_stride1_vec,
                                             data_ptr1, mipoff1, s, t, r, offsets,
                                             &colors1);
            }
            else {
               lp_build_sample_image_linear(bld,
                                            size1,
                                            row_stride1_vec, img_stride1_vec,
                                            data_ptr1, mipoff1, s, t, r, offsets,
                                            &colors1);
            }
         }

         /* interpolate samples from the two mipmap levels */

         if (num_quads == 1 && bld->num_lods == 1) {
            /* single lod weight: broadcast its low byte to all channels */
            lod_fpart = LLVMBuildTrunc(builder, lod_fpart, u8n_bld.elem_type, "");
            lod_fpart = lp_build_broadcast_scalar(&u8n_bld, lod_fpart);
         }
         else {
            unsigned num_chans_per_lod = 4 * bld->coord_type.length / bld->num_lods;
            LLVMTypeRef tmp_vec_type = LLVMVectorType(u8n_bld.elem_type, bld->lodi_bld.type.length);
            LLVMValueRef shuffle[LP_MAX_VECTOR_LENGTH];

            /* Take the LSB of lod_fpart */
            lod_fpart = LLVMBuildTrunc(builder, lod_fpart, tmp_vec_type, "");

            /* Broadcast each lod weight into their respective channels */
            for (i = 0; i < u8n_bld.type.length; ++i) {
               shuffle[i] = lp_build_const_int32(bld->gallivm, i / num_chans_per_lod);
            }
            lod_fpart = LLVMBuildShuffleVector(builder, lod_fpart, LLVMGetUndef(tmp_vec_type),
                                               LLVMConstVector(shuffle, u8n_bld.type.length), "");
         }

         colors0 = lp_build_lerp(&u8n_bld, lod_fpart,
                                 colors0, colors1,
                                 LP_BLD_LERP_PRESCALED_WEIGHTS);

         LLVMBuildStore(builder, colors0, colors_var);
      }
      lp_build_endif(&if_ctx);
   }
}



/**
 * Texture sampling in AoS format. Used when sampling common 32-bit/texel
 * formats. 1D/2D/3D/cube texture supported. All mipmap sampling modes
 * but only limited texture coord wrap modes.
 *
 * \param sampler_unit  sampler index (not referenced in this path)
 * \param lod_positive  whether the lod is > 0 (minification); selects the
 *                      min vs mag image filter when the two differ
 * \param lod_fpart     fractional lod for linear mip interpolation
 * \param ilevel0, ilevel1  the two mip levels to sample from
 * \param texel_out     returns the texels converted to SoA form
 */
void
lp_build_sample_aos(struct lp_build_sample_context *bld,
                    unsigned sampler_unit,
                    LLVMValueRef s,
                    LLVMValueRef t,
                    LLVMValueRef r,
                    const LLVMValueRef *offsets,
                    LLVMValueRef lod_positive,
                    LLVMValueRef lod_fpart,
                    LLVMValueRef ilevel0,
                    LLVMValueRef ilevel1,
                    LLVMValueRef texel_out[4])
{
   LLVMBuilderRef builder = bld->gallivm->builder;
   const unsigned mip_filter = bld->static_sampler_state->min_mip_filter;
   const unsigned min_filter = bld->static_sampler_state->min_img_filter;
   const unsigned mag_filter = bld->static_sampler_state->mag_img_filter;
   const unsigned dims = bld->dims;
   LLVMValueRef packed_var, packed;
   LLVMValueRef unswizzled[4];
   struct lp_build_context u8n_bld;

   /* we only support the common/simple wrap modes at this time */
   assert(lp_is_simple_wrap_mode(bld->static_sampler_state->wrap_s));
   if (dims >= 2)
      assert(lp_is_simple_wrap_mode(bld->static_sampler_state->wrap_t));
   if (dims >= 3)
      assert(lp_is_simple_wrap_mode(bld->static_sampler_state->wrap_r));


   /* make 8-bit unorm builder context */
   lp_build_context_init(&u8n_bld, bld->gallivm, lp_type_unorm(8, bld->vector_width));

   /*
    * Get/interpolate texture colors.
    */

   /* result variable; written inside the conditional paths below */
   packed_var = lp_build_alloca(bld->gallivm, u8n_bld.vec_type, "packed_var");

   if (min_filter == mag_filter) {
      /* no need to distinguish between minification and magnification */
      lp_build_sample_mipmap(bld,
                             min_filter, mip_filter,
                             s, t, r, offsets,
                             ilevel0, ilevel1, lod_fpart,
                             packed_var);
   }
   else {
      /* Emit conditional to choose min image filter or mag image filter
       * depending on the lod being > 0 or <= 0, respectively.
       */
      struct lp_build_if_state if_ctx;

      /*
       * FIXME this should take all lods into account, if some are min
       * some max probably could hack up the weights in the linear
       * path with selects to work for nearest.
       */
      if (bld->num_lods > 1)
         lod_positive = LLVMBuildExtractElement(builder, lod_positive,
                                                lp_build_const_int32(bld->gallivm, 0), "");

      lod_positive = LLVMBuildTrunc(builder, lod_positive,
                                    LLVMInt1TypeInContext(bld->gallivm->context), "");

      lp_build_if(&if_ctx, bld->gallivm, lod_positive);
      {
         /* Use the minification filter */
         lp_build_sample_mipmap(bld,
                                min_filter, mip_filter,
                                s, t, r, offsets,
                                ilevel0, ilevel1, lod_fpart,
                                packed_var);
      }
      lp_build_else(&if_ctx);
      {
         /* Use the magnification filter */
         lp_build_sample_mipmap(bld,
                                mag_filter, PIPE_TEX_MIPFILTER_NONE,
                                s, t, r, offsets,
                                ilevel0, NULL, NULL,
                                packed_var);
      }
      lp_build_endif(&if_ctx);
   }

   packed = LLVMBuildLoad(builder, packed_var, "");

   /*
    * Convert to SoA and swizzle.
    */
   lp_build_rgba8_to_fi32_soa(bld->gallivm,
                              bld->texel_type,
                              packed, unswizzled);

   if (util_format_is_rgba8_variant(bld->format_desc)) {
      lp_build_format_swizzle_soa(bld->format_desc,
                                  &bld->texel_bld,
                                  unswizzled, texel_out);
   }
   else {
      texel_out[0] = unswizzled[0];
      texel_out[1] = unswizzled[1];
      texel_out[2] = unswizzled[2];
      texel_out[3] = unswizzled[3];
   }
}