texturing.cpp revision 96dbb4fc58fe2dcf4390e073dbb42cc77ef2f0b5
1/* libs/pixelflinger/codeflinger/texturing.cpp 2** 3** Copyright 2006, The Android Open Source Project 4** 5** Licensed under the Apache License, Version 2.0 (the "License"); 6** you may not use this file except in compliance with the License. 7** You may obtain a copy of the License at 8** 9** http://www.apache.org/licenses/LICENSE-2.0 10** 11** Unless required by applicable law or agreed to in writing, software 12** distributed under the License is distributed on an "AS IS" BASIS, 13** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14** See the License for the specific language governing permissions and 15** limitations under the License. 16*/ 17 18#include <assert.h> 19#include <stdint.h> 20#include <stdlib.h> 21#include <stdio.h> 22#include <sys/types.h> 23 24#include <cutils/log.h> 25 26#include "codeflinger/GGLAssembler.h" 27 28#include <machine/cpu-features.h> 29 30namespace android { 31 32// --------------------------------------------------------------------------- 33 34// iterators are initialized like this: 35// (intToFixedCenter(x) * dx)>>16 + x0 36// ((x<<16 + 0x8000) * dx)>>16 + x0 37// ((x<<16)*dx + (0x8000*dx))>>16 + x0 38// ( (x*dx) + dx>>1 ) + x0 39// (x*dx) + (dx>>1 + x0) 40 41void GGLAssembler::init_iterated_color(fragment_parts_t& parts, const reg_t& x) 42{ 43 context_t const* c = mBuilderContext.c; 44 const needs_t& needs = mBuilderContext.needs; 45 46 if (mSmooth) { 47 // NOTE: we could take this case in the mDithering + !mSmooth case, 48 // but this would use up to 4 more registers for the color components 49 // for only a little added quality. 50 // Currently, this causes the system to run out of registers in 51 // some case (see issue #719496) 52 53 comment("compute initial iterated color (smooth and/or dither case)"); 54 55 parts.iterated_packed = 0; 56 parts.packed = 0; 57 58 // 0x1: color component 59 // 0x2: iterators 60 const int optReload = mOptLevel >> 1; 61 if (optReload >= 3) parts.reload = 0; // reload nothing 62 else if (optReload == 2) parts.reload = 2; // reload iterators 63 else if (optReload == 1) parts.reload = 1; // reload colors 64 else if (optReload <= 0) parts.reload = 3; // reload both 65 66 if (!mSmooth) { 67 // we're not smoothing (just dithering), we never have to 68 // reload the iterators 69 parts.reload &= ~2; 70 } 71 72 Scratch scratches(registerFile()); 73 const int t0 = (parts.reload & 1) ? scratches.obtain() : 0; 74 const int t1 = (parts.reload & 2) ? scratches.obtain() : 0; 75 for (int i=0 ; i<4 ; i++) { 76 if (!mInfo[i].iterated) 77 continue; 78 79 // this component exists in the destination and is not replaced 80 // by a texture unit. 81 const int c = (parts.reload & 1) ? t0 : obtainReg(); 82 if (i==0) CONTEXT_LOAD(c, iterators.ydady); 83 if (i==1) CONTEXT_LOAD(c, iterators.ydrdy); 84 if (i==2) CONTEXT_LOAD(c, iterators.ydgdy); 85 if (i==3) CONTEXT_LOAD(c, iterators.ydbdy); 86 parts.argb[i].reg = c; 87 88 if (mInfo[i].smooth) { 89 parts.argb_dx[i].reg = (parts.reload & 2) ? t1 : obtainReg(); 90 const int dvdx = parts.argb_dx[i].reg; 91 CONTEXT_LOAD(dvdx, generated_vars.argb[i].dx); 92 MLA(AL, 0, c, x.reg, dvdx, c); 93 94 // adjust the color iterator to make sure it won't overflow 95 if (!mAA) { 96 // this is not needed when we're using anti-aliasing 97 // because we will (have to) clamp the components 98 // anyway. 99 int end = scratches.obtain(); 100 MOV(AL, 0, end, reg_imm(parts.count.reg, LSR, 16)); 101 MLA(AL, 1, end, dvdx, end, c); 102 SUB(MI, 0, c, c, end); 103 BIC(AL, 0, c, c, reg_imm(c, ASR, 31)); 104 scratches.recycle(end); 105 } 106 } 107 108 if (parts.reload & 1) { 109 CONTEXT_STORE(c, generated_vars.argb[i].c); 110 } 111 } 112 } else { 113 // We're not smoothed, so we can 114 // just use a packed version of the color and extract the 115 // components as needed (or not at all if we don't blend) 116 117 // figure out if we need the iterated color 118 int load = 0; 119 for (int i=0 ; i<4 ; i++) { 120 component_info_t& info = mInfo[i]; 121 if ((info.inDest || info.needed) && !info.replaced) 122 load |= 1; 123 } 124 125 parts.iterated_packed = 1; 126 parts.packed = (!mTextureMachine.mask && !mBlending 127 && !mFog && !mDithering); 128 parts.reload = 0; 129 if (load || parts.packed) { 130 if (mBlending || mDithering || mInfo[GGLFormat::ALPHA].needed) { 131 comment("load initial iterated color (8888 packed)"); 132 parts.iterated.setTo(obtainReg(), 133 &(c->formats[GGL_PIXEL_FORMAT_RGBA_8888])); 134 CONTEXT_LOAD(parts.iterated.reg, packed8888); 135 } else { 136 comment("load initial iterated color (dest format packed)"); 137 138 parts.iterated.setTo(obtainReg(), &mCbFormat); 139 140 // pre-mask the iterated color 141 const int bits = parts.iterated.size(); 142 const uint32_t size = ((bits>=32) ? 0 : (1LU << bits)) - 1; 143 uint32_t mask = 0; 144 if (mMasking) { 145 for (int i=0 ; i<4 ; i++) { 146 const int component_mask = 1<<i; 147 const int h = parts.iterated.format.c[i].h; 148 const int l = parts.iterated.format.c[i].l; 149 if (h && (!(mMasking & component_mask))) { 150 mask |= ((1<<(h-l))-1) << l; 151 } 152 } 153 } 154 155 if (mMasking && ((mask & size)==0)) { 156 // none of the components are present in the mask 157 } else { 158 CONTEXT_LOAD(parts.iterated.reg, packed); 159 if (mCbFormat.size == 1) { 160 AND(AL, 0, parts.iterated.reg, 161 parts.iterated.reg, imm(0xFF)); 162 } else if (mCbFormat.size == 2) { 163 MOV(AL, 0, parts.iterated.reg, 164 reg_imm(parts.iterated.reg, LSR, 16)); 165 } 166 } 167 168 // pre-mask the iterated color 169 if (mMasking) { 170 build_and_immediate(parts.iterated.reg, parts.iterated.reg, 171 mask, bits); 172 } 173 } 174 } 175 } 176} 177 178void GGLAssembler::build_iterated_color( 179 component_t& fragment, 180 const fragment_parts_t& parts, 181 int component, 182 Scratch& regs) 183{ 184 fragment.setTo( regs.obtain(), 0, 32, CORRUPTIBLE); 185 186 if (!mInfo[component].iterated) 187 return; 188 189 if (parts.iterated_packed) { 190 // iterated colors are packed, extract the one we need 191 extract(fragment, parts.iterated, component); 192 } else { 193 fragment.h = GGL_COLOR_BITS; 194 fragment.l = GGL_COLOR_BITS - 8; 195 fragment.flags |= CLEAR_LO; 196 // iterated colors are held in their own register, 197 // (smooth and/or dithering case) 198 if (parts.reload==3) { 199 // this implies mSmooth 200 Scratch scratches(registerFile()); 201 int dx = scratches.obtain(); 202 CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c); 203 CONTEXT_LOAD(dx, generated_vars.argb[component].dx); 204 ADD(AL, 0, dx, fragment.reg, dx); 205 CONTEXT_STORE(dx, generated_vars.argb[component].c); 206 } else if (parts.reload & 1) { 207 CONTEXT_LOAD(fragment.reg, generated_vars.argb[component].c); 208 } else { 209 // we don't reload, so simply rename the register and mark as 210 // non CORRUPTIBLE so that the texture env or blending code 211 // won't modify this (renamed) register 212 regs.recycle(fragment.reg); 213 fragment.reg = parts.argb[component].reg; 214 fragment.flags &= ~CORRUPTIBLE; 215 } 216 if (mInfo[component].smooth && mAA) { 217 // when using smooth shading AND anti-aliasing, we need to clamp 218 // the iterators because there is always an extra pixel on the 219 // edges, which most of the time will cause an overflow 220 // (since technically its outside of the domain). 221 BIC(AL, 0, fragment.reg, fragment.reg, 222 reg_imm(fragment.reg, ASR, 31)); 223 component_sat(fragment); 224 } 225 } 226} 227 228// --------------------------------------------------------------------------- 229 230void GGLAssembler::decodeLogicOpNeeds(const needs_t& needs) 231{ 232 // gather some informations about the components we need to process... 233 const int opcode = GGL_READ_NEEDS(LOGIC_OP, needs.n) | GGL_CLEAR; 234 switch(opcode) { 235 case GGL_COPY: 236 mLogicOp = 0; 237 break; 238 case GGL_CLEAR: 239 case GGL_SET: 240 mLogicOp = LOGIC_OP; 241 break; 242 case GGL_AND: 243 case GGL_AND_REVERSE: 244 case GGL_AND_INVERTED: 245 case GGL_XOR: 246 case GGL_OR: 247 case GGL_NOR: 248 case GGL_EQUIV: 249 case GGL_OR_REVERSE: 250 case GGL_OR_INVERTED: 251 case GGL_NAND: 252 mLogicOp = LOGIC_OP|LOGIC_OP_SRC|LOGIC_OP_DST; 253 break; 254 case GGL_NOOP: 255 case GGL_INVERT: 256 mLogicOp = LOGIC_OP|LOGIC_OP_DST; 257 break; 258 case GGL_COPY_INVERTED: 259 mLogicOp = LOGIC_OP|LOGIC_OP_SRC; 260 break; 261 }; 262} 263 264void GGLAssembler::decodeTMUNeeds(const needs_t& needs, context_t const* c) 265{ 266 uint8_t replaced=0; 267 mTextureMachine.mask = 0; 268 mTextureMachine.activeUnits = 0; 269 for (int i=GGL_TEXTURE_UNIT_COUNT-1 ; i>=0 ; i--) { 270 texture_unit_t& tmu = mTextureMachine.tmu[i]; 271 if (replaced == 0xF) { 272 // all components are replaced, skip this TMU. 273 tmu.format_idx = 0; 274 tmu.mask = 0; 275 tmu.replaced = replaced; 276 continue; 277 } 278 tmu.format_idx = GGL_READ_NEEDS(T_FORMAT, needs.t[i]); 279 tmu.format = c->formats[tmu.format_idx]; 280 tmu.bits = tmu.format.size*8; 281 tmu.swrap = GGL_READ_NEEDS(T_S_WRAP, needs.t[i]); 282 tmu.twrap = GGL_READ_NEEDS(T_T_WRAP, needs.t[i]); 283 tmu.env = ggl_needs_to_env(GGL_READ_NEEDS(T_ENV, needs.t[i])); 284 tmu.pot = GGL_READ_NEEDS(T_POT, needs.t[i]); 285 tmu.linear = GGL_READ_NEEDS(T_LINEAR, needs.t[i]) 286 && tmu.format.size!=3; // XXX: only 8, 16 and 32 modes for now 287 288 // 5551 linear filtering is not supported 289 if (tmu.format_idx == GGL_PIXEL_FORMAT_RGBA_5551) 290 tmu.linear = 0; 291 292 tmu.mask = 0; 293 tmu.replaced = replaced; 294 295 if (tmu.format_idx) { 296 mTextureMachine.activeUnits++; 297 if (tmu.format.c[0].h) tmu.mask |= 0x1; 298 if (tmu.format.c[1].h) tmu.mask |= 0x2; 299 if (tmu.format.c[2].h) tmu.mask |= 0x4; 300 if (tmu.format.c[3].h) tmu.mask |= 0x8; 301 if (tmu.env == GGL_REPLACE) { 302 replaced |= tmu.mask; 303 } else if (tmu.env == GGL_DECAL) { 304 if (!tmu.format.c[GGLFormat::ALPHA].h) { 305 // if we don't have alpha, decal does nothing 306 tmu.mask = 0; 307 } else { 308 // decal always ignores At 309 tmu.mask &= ~(1<<GGLFormat::ALPHA); 310 } 311 } 312 } 313 mTextureMachine.mask |= tmu.mask; 314 //printf("%d: mask=%08lx, replaced=%08lx\n", 315 // i, int(tmu.mask), int(tmu.replaced)); 316 } 317 mTextureMachine.replaced = replaced; 318 mTextureMachine.directTexture = 0; 319 //printf("replaced=%08lx\n", mTextureMachine.replaced); 320} 321 322 323void GGLAssembler::init_textures( 324 tex_coord_t* coords, 325 const reg_t& x, const reg_t& y) 326{ 327 context_t const* c = mBuilderContext.c; 328 const needs_t& needs = mBuilderContext.needs; 329 int Rctx = mBuilderContext.Rctx; 330 int Rx = x.reg; 331 int Ry = y.reg; 332 333 if (mTextureMachine.mask) { 334 comment("compute texture coordinates"); 335 } 336 337 // init texture coordinates for each tmu 338 const int cb_format_idx = GGL_READ_NEEDS(CB_FORMAT, needs.n); 339 const bool multiTexture = mTextureMachine.activeUnits > 1; 340 for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) { 341 const texture_unit_t& tmu = mTextureMachine.tmu[i]; 342 if (tmu.format_idx == 0) 343 continue; 344 if ((tmu.swrap == GGL_NEEDS_WRAP_11) && 345 (tmu.twrap == GGL_NEEDS_WRAP_11)) 346 { 347 // 1:1 texture 348 pointer_t& txPtr = coords[i].ptr; 349 txPtr.setTo(obtainReg(), tmu.bits); 350 CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydsdy); 351 ADD(AL, 0, Rx, Rx, reg_imm(txPtr.reg, ASR, 16)); // x += (s>>16) 352 CONTEXT_LOAD(txPtr.reg, state.texture[i].iterators.ydtdy); 353 ADD(AL, 0, Ry, Ry, reg_imm(txPtr.reg, ASR, 16)); // y += (t>>16) 354 // merge base & offset 355 CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].stride); 356 SMLABB(AL, Rx, Ry, txPtr.reg, Rx); // x+y*stride 357 CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].data); 358 base_offset(txPtr, txPtr, Rx); 359 } else { 360 Scratch scratches(registerFile()); 361 reg_t& s = coords[i].s; 362 reg_t& t = coords[i].t; 363 // s = (x * dsdx)>>16 + ydsdy 364 // s = (x * dsdx)>>16 + (y*dsdy)>>16 + s0 365 // t = (x * dtdx)>>16 + ydtdy 366 // t = (x * dtdx)>>16 + (y*dtdy)>>16 + t0 367 s.setTo(obtainReg()); 368 t.setTo(obtainReg()); 369 const int need_w = GGL_READ_NEEDS(W, needs.n); 370 if (need_w) { 371 CONTEXT_LOAD(s.reg, state.texture[i].iterators.ydsdy); 372 CONTEXT_LOAD(t.reg, state.texture[i].iterators.ydtdy); 373 } else { 374 int ydsdy = scratches.obtain(); 375 int ydtdy = scratches.obtain(); 376 CONTEXT_LOAD(s.reg, generated_vars.texture[i].dsdx); 377 CONTEXT_LOAD(ydsdy, state.texture[i].iterators.ydsdy); 378 CONTEXT_LOAD(t.reg, generated_vars.texture[i].dtdx); 379 CONTEXT_LOAD(ydtdy, state.texture[i].iterators.ydtdy); 380 MLA(AL, 0, s.reg, Rx, s.reg, ydsdy); 381 MLA(AL, 0, t.reg, Rx, t.reg, ydtdy); 382 } 383 384 if ((mOptLevel&1)==0) { 385 CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]); 386 CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]); 387 recycleReg(s.reg); 388 recycleReg(t.reg); 389 } 390 } 391 392 // direct texture? 393 if (!multiTexture && !mBlending && !mDithering && !mFog && 394 cb_format_idx == tmu.format_idx && !tmu.linear && 395 mTextureMachine.replaced == tmu.mask) 396 { 397 mTextureMachine.directTexture = i + 1; 398 } 399 } 400} 401 402void GGLAssembler::build_textures( fragment_parts_t& parts, 403 Scratch& regs) 404{ 405 context_t const* c = mBuilderContext.c; 406 const needs_t& needs = mBuilderContext.needs; 407 int Rctx = mBuilderContext.Rctx; 408 409 // We don't have a way to spill registers automatically 410 // spill depth and AA regs, when we know we may have to. 411 // build the spill list... 412 uint32_t spill_list = 0; 413 for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) { 414 const texture_unit_t& tmu = mTextureMachine.tmu[i]; 415 if (tmu.format_idx == 0) 416 continue; 417 if (tmu.linear) { 418 // we may run out of register if we have linear filtering 419 // at 1 or 4 bytes / pixel on any texture unit. 420 if (tmu.format.size == 1) { 421 // if depth and AA enabled, we'll run out of 1 register 422 if (parts.z.reg > 0 && parts.covPtr.reg > 0) 423 spill_list |= 1<<parts.covPtr.reg; 424 } 425 if (tmu.format.size == 4) { 426 // if depth or AA enabled, we'll run out of 1 or 2 registers 427 if (parts.z.reg > 0) 428 spill_list |= 1<<parts.z.reg; 429 if (parts.covPtr.reg > 0) 430 spill_list |= 1<<parts.covPtr.reg; 431 } 432 } 433 } 434 435 Spill spill(registerFile(), *this, spill_list); 436 437 const bool multiTexture = mTextureMachine.activeUnits > 1; 438 for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) { 439 const texture_unit_t& tmu = mTextureMachine.tmu[i]; 440 if (tmu.format_idx == 0) 441 continue; 442 443 pointer_t& txPtr = parts.coords[i].ptr; 444 pixel_t& texel = parts.texel[i]; 445 446 // repeat... 447 if ((tmu.swrap == GGL_NEEDS_WRAP_11) && 448 (tmu.twrap == GGL_NEEDS_WRAP_11)) 449 { // 1:1 textures 450 comment("fetch texel"); 451 texel.setTo(regs.obtain(), &tmu.format); 452 load(txPtr, texel, WRITE_BACK); 453 } else { 454 Scratch scratches(registerFile()); 455 reg_t& s = parts.coords[i].s; 456 reg_t& t = parts.coords[i].t; 457 if ((mOptLevel&1)==0) { 458 comment("reload s/t (multitexture or linear filtering)"); 459 s.reg = scratches.obtain(); 460 t.reg = scratches.obtain(); 461 CONTEXT_LOAD(s.reg, generated_vars.texture[i].spill[0]); 462 CONTEXT_LOAD(t.reg, generated_vars.texture[i].spill[1]); 463 } 464 465 comment("compute repeat/clamp"); 466 int u = scratches.obtain(); 467 int v = scratches.obtain(); 468 int width = scratches.obtain(); 469 int height = scratches.obtain(); 470 int U = 0; 471 int V = 0; 472 473 CONTEXT_LOAD(width, generated_vars.texture[i].width); 474 CONTEXT_LOAD(height, generated_vars.texture[i].height); 475 476 int FRAC_BITS = 0; 477 if (tmu.linear) { 478 // linear interpolation 479 if (tmu.format.size == 1) { 480 // for 8-bits textures, we can afford 481 // 7 bits of fractional precision at no 482 // additional cost (we can't do 8 bits 483 // because filter8 uses signed 16 bits muls) 484 FRAC_BITS = 7; 485 } else if (tmu.format.size == 2) { 486 // filter16() is internally limited to 4 bits, so: 487 // FRAC_BITS=2 generates less instructions, 488 // FRAC_BITS=3,4,5 creates unpleasant artifacts, 489 // FRAC_BITS=6+ looks good 490 FRAC_BITS = 6; 491 } else if (tmu.format.size == 4) { 492 // filter32() is internally limited to 8 bits, so: 493 // FRAC_BITS=4 looks good 494 // FRAC_BITS=5+ looks better, but generates 3 extra ipp 495 FRAC_BITS = 6; 496 } else { 497 // for all other cases we use 4 bits. 498 FRAC_BITS = 4; 499 } 500 } 501 wrapping(u, s.reg, width, tmu.swrap, FRAC_BITS); 502 wrapping(v, t.reg, height, tmu.twrap, FRAC_BITS); 503 504 if (tmu.linear) { 505 comment("compute linear filtering offsets"); 506 // pixel size scale 507 const int shift = 31 - gglClz(tmu.format.size); 508 U = scratches.obtain(); 509 V = scratches.obtain(); 510 511 // sample the texel center 512 SUB(AL, 0, u, u, imm(1<<(FRAC_BITS-1))); 513 SUB(AL, 0, v, v, imm(1<<(FRAC_BITS-1))); 514 515 // get the fractionnal part of U,V 516 AND(AL, 0, U, u, imm((1<<FRAC_BITS)-1)); 517 AND(AL, 0, V, v, imm((1<<FRAC_BITS)-1)); 518 519 // compute width-1 and height-1 520 SUB(AL, 0, width, width, imm(1)); 521 SUB(AL, 0, height, height, imm(1)); 522 523 // get the integer part of U,V and clamp/wrap 524 // and compute offset to the next texel 525 if (tmu.swrap == GGL_NEEDS_WRAP_REPEAT) { 526 // u has already been REPEATed 527 MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS)); 528 MOV(MI, 0, u, width); 529 CMP(AL, u, width); 530 MOV(LT, 0, width, imm(1 << shift)); 531 if (shift) 532 MOV(GE, 0, width, reg_imm(width, LSL, shift)); 533 RSB(GE, 0, width, width, imm(0)); 534 } else { 535 // u has not been CLAMPed yet 536 // algorithm: 537 // if ((u>>4) >= width) 538 // u = width<<4 539 // width = 0 540 // else 541 // width = 1<<shift 542 // u = u>>4; // get integer part 543 // if (u<0) 544 // u = 0 545 // width = 0 546 // generated_vars.rt = width 547 548 CMP(AL, width, reg_imm(u, ASR, FRAC_BITS)); 549 MOV(LE, 0, u, reg_imm(width, LSL, FRAC_BITS)); 550 MOV(LE, 0, width, imm(0)); 551 MOV(GT, 0, width, imm(1 << shift)); 552 MOV(AL, 1, u, reg_imm(u, ASR, FRAC_BITS)); 553 MOV(MI, 0, u, imm(0)); 554 MOV(MI, 0, width, imm(0)); 555 } 556 CONTEXT_STORE(width, generated_vars.rt); 557 558 const int stride = width; 559 CONTEXT_LOAD(stride, generated_vars.texture[i].stride); 560 if (tmu.twrap == GGL_NEEDS_WRAP_REPEAT) { 561 // v has already been REPEATed 562 MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS)); 563 MOV(MI, 0, v, height); 564 CMP(AL, v, height); 565 MOV(LT, 0, height, imm(1 << shift)); 566 if (shift) 567 MOV(GE, 0, height, reg_imm(height, LSL, shift)); 568 RSB(GE, 0, height, height, imm(0)); 569 MUL(AL, 0, height, stride, height); 570 } else { 571 // v has not been CLAMPed yet 572 CMP(AL, height, reg_imm(v, ASR, FRAC_BITS)); 573 MOV(LE, 0, v, reg_imm(height, LSL, FRAC_BITS)); 574 MOV(LE, 0, height, imm(0)); 575 if (shift) { 576 MOV(GT, 0, height, reg_imm(stride, LSL, shift)); 577 } else { 578 MOV(GT, 0, height, stride); 579 } 580 MOV(AL, 1, v, reg_imm(v, ASR, FRAC_BITS)); 581 MOV(MI, 0, v, imm(0)); 582 MOV(MI, 0, height, imm(0)); 583 } 584 CONTEXT_STORE(height, generated_vars.lb); 585 } 586 587 scratches.recycle(width); 588 scratches.recycle(height); 589 590 // iterate texture coordinates... 591 comment("iterate s,t"); 592 int dsdx = scratches.obtain(); 593 int dtdx = scratches.obtain(); 594 CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx); 595 CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx); 596 ADD(AL, 0, s.reg, s.reg, dsdx); 597 ADD(AL, 0, t.reg, t.reg, dtdx); 598 if ((mOptLevel&1)==0) { 599 CONTEXT_STORE(s.reg, generated_vars.texture[i].spill[0]); 600 CONTEXT_STORE(t.reg, generated_vars.texture[i].spill[1]); 601 scratches.recycle(s.reg); 602 scratches.recycle(t.reg); 603 } 604 scratches.recycle(dsdx); 605 scratches.recycle(dtdx); 606 607 // merge base & offset... 608 comment("merge base & offset"); 609 texel.setTo(regs.obtain(), &tmu.format); 610 txPtr.setTo(texel.reg, tmu.bits); 611 int stride = scratches.obtain(); 612 CONTEXT_LOAD(stride, generated_vars.texture[i].stride); 613 CONTEXT_LOAD(txPtr.reg, generated_vars.texture[i].data); 614 SMLABB(AL, u, v, stride, u); // u+v*stride 615 base_offset(txPtr, txPtr, u); 616 617 // load texel 618 if (!tmu.linear) { 619 comment("fetch texel"); 620 load(txPtr, texel, 0); 621 } else { 622 // recycle registers we don't need anymore 623 scratches.recycle(u); 624 scratches.recycle(v); 625 scratches.recycle(stride); 626 627 comment("fetch texel, bilinear"); 628 switch (tmu.format.size) { 629 case 1: filter8(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break; 630 case 2: filter16(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break; 631 case 3: filter24(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break; 632 case 4: filter32(parts, texel, tmu, U, V, txPtr, FRAC_BITS); break; 633 } 634 } 635 } 636 } 637} 638 639void GGLAssembler::build_iterate_texture_coordinates( 640 const fragment_parts_t& parts) 641{ 642 const bool multiTexture = mTextureMachine.activeUnits > 1; 643 for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT; i++) { 644 const texture_unit_t& tmu = mTextureMachine.tmu[i]; 645 if (tmu.format_idx == 0) 646 continue; 647 648 if ((tmu.swrap == GGL_NEEDS_WRAP_11) && 649 (tmu.twrap == GGL_NEEDS_WRAP_11)) 650 { // 1:1 textures 651 const pointer_t& txPtr = parts.coords[i].ptr; 652 ADD(AL, 0, txPtr.reg, txPtr.reg, imm(txPtr.size>>3)); 653 } else { 654 Scratch scratches(registerFile()); 655 int s = parts.coords[i].s.reg; 656 int t = parts.coords[i].t.reg; 657 if ((mOptLevel&1)==0) { 658 s = scratches.obtain(); 659 t = scratches.obtain(); 660 CONTEXT_LOAD(s, generated_vars.texture[i].spill[0]); 661 CONTEXT_LOAD(t, generated_vars.texture[i].spill[1]); 662 } 663 int dsdx = scratches.obtain(); 664 int dtdx = scratches.obtain(); 665 CONTEXT_LOAD(dsdx, generated_vars.texture[i].dsdx); 666 CONTEXT_LOAD(dtdx, generated_vars.texture[i].dtdx); 667 ADD(AL, 0, s, s, dsdx); 668 ADD(AL, 0, t, t, dtdx); 669 if ((mOptLevel&1)==0) { 670 CONTEXT_STORE(s, generated_vars.texture[i].spill[0]); 671 CONTEXT_STORE(t, generated_vars.texture[i].spill[1]); 672 } 673 } 674 } 675} 676 677void GGLAssembler::filter8( 678 const fragment_parts_t& parts, 679 pixel_t& texel, const texture_unit_t& tmu, 680 int U, int V, pointer_t& txPtr, 681 int FRAC_BITS) 682{ 683 if (tmu.format.components != GGL_ALPHA && 684 tmu.format.components != GGL_LUMINANCE) 685 { 686 // this is a packed format, and we don't support 687 // linear filtering (it's probably RGB 332) 688 // Should not happen with OpenGL|ES 689 LDRB(AL, texel.reg, txPtr.reg); 690 return; 691 } 692 693 // ------------------------ 694 // about ~22 cycles / pixel 695 Scratch scratches(registerFile()); 696 697 int pixel= scratches.obtain(); 698 int d = scratches.obtain(); 699 int u = scratches.obtain(); 700 int k = scratches.obtain(); 701 int rt = scratches.obtain(); 702 int lb = scratches.obtain(); 703 704 // RB -> U * V 705 706 CONTEXT_LOAD(rt, generated_vars.rt); 707 CONTEXT_LOAD(lb, generated_vars.lb); 708 709 int offset = pixel; 710 ADD(AL, 0, offset, lb, rt); 711 LDRB(AL, pixel, txPtr.reg, reg_scale_pre(offset)); 712 SMULBB(AL, u, U, V); 713 SMULBB(AL, d, pixel, u); 714 RSB(AL, 0, k, u, imm(1<<(FRAC_BITS*2))); 715 716 // LB -> (1-U) * V 717 RSB(AL, 0, U, U, imm(1<<FRAC_BITS)); 718 LDRB(AL, pixel, txPtr.reg, reg_scale_pre(lb)); 719 SMULBB(AL, u, U, V); 720 SMLABB(AL, d, pixel, u, d); 721 SUB(AL, 0, k, k, u); 722 723 // LT -> (1-U)*(1-V) 724 RSB(AL, 0, V, V, imm(1<<FRAC_BITS)); 725 LDRB(AL, pixel, txPtr.reg); 726 SMULBB(AL, u, U, V); 727 SMLABB(AL, d, pixel, u, d); 728 729 // RT -> U*(1-V) 730 LDRB(AL, pixel, txPtr.reg, reg_scale_pre(rt)); 731 SUB(AL, 0, u, k, u); 732 SMLABB(AL, texel.reg, pixel, u, d); 733 734 for (int i=0 ; i<4 ; i++) { 735 if (!texel.format.c[i].h) continue; 736 texel.format.c[i].h = FRAC_BITS*2+8; 737 texel.format.c[i].l = FRAC_BITS*2; // keeping 8 bits in enough 738 } 739 texel.format.size = 4; 740 texel.format.bitsPerPixel = 32; 741 texel.flags |= CLEAR_LO; 742} 743 744void GGLAssembler::filter16( 745 const fragment_parts_t& parts, 746 pixel_t& texel, const texture_unit_t& tmu, 747 int U, int V, pointer_t& txPtr, 748 int FRAC_BITS) 749{ 750 // compute the mask 751 // XXX: it would be nice if the mask below could be computed 752 // automatically. 753 uint32_t mask = 0; 754 int shift = 0; 755 int prec = 0; 756 switch (tmu.format_idx) { 757 case GGL_PIXEL_FORMAT_RGB_565: 758 // source: 00000ggg.ggg00000 | rrrrr000.000bbbbb 759 // result: gggggggg.gggrrrrr | rrrrr0bb.bbbbbbbb 760 mask = 0x07E0F81F; 761 shift = 16; 762 prec = 5; 763 break; 764 case GGL_PIXEL_FORMAT_RGBA_4444: 765 // 0000,1111,0000,1111 | 0000,1111,0000,1111 766 mask = 0x0F0F0F0F; 767 shift = 12; 768 prec = 4; 769 break; 770 case GGL_PIXEL_FORMAT_LA_88: 771 // 0000,0000,1111,1111 | 0000,0000,1111,1111 772 // AALL -> 00AA | 00LL 773 mask = 0x00FF00FF; 774 shift = 8; 775 prec = 8; 776 break; 777 default: 778 // unsupported format, do something sensical... 779 LOGE("Unsupported 16-bits texture format (%d)", tmu.format_idx); 780 LDRH(AL, texel.reg, txPtr.reg); 781 return; 782 } 783 784 const int adjust = FRAC_BITS*2 - prec; 785 const int round = 0; 786 787 // update the texel format 788 texel.format.size = 4; 789 texel.format.bitsPerPixel = 32; 790 texel.flags |= CLEAR_HI|CLEAR_LO; 791 for (int i=0 ; i<4 ; i++) { 792 if (!texel.format.c[i].h) continue; 793 const uint32_t offset = (mask & tmu.format.mask(i)) ? 0 : shift; 794 texel.format.c[i].h = tmu.format.c[i].h + offset + prec; 795 texel.format.c[i].l = texel.format.c[i].h - (tmu.format.bits(i) + prec); 796 } 797 798 // ------------------------ 799 // about ~40 cycles / pixel 800 Scratch scratches(registerFile()); 801 802 int pixel= scratches.obtain(); 803 int d = scratches.obtain(); 804 int u = scratches.obtain(); 805 int k = scratches.obtain(); 806 807 // RB -> U * V 808 int offset = pixel; 809 CONTEXT_LOAD(offset, generated_vars.rt); 810 CONTEXT_LOAD(u, generated_vars.lb); 811 ADD(AL, 0, offset, offset, u); 812 813 LDRH(AL, pixel, txPtr.reg, reg_pre(offset)); 814 SMULBB(AL, u, U, V); 815 ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift)); 816 build_and_immediate(pixel, pixel, mask, 32); 817 if (adjust) { 818 if (round) 819 ADD(AL, 0, u, u, imm(1<<(adjust-1))); 820 MOV(AL, 0, u, reg_imm(u, LSR, adjust)); 821 } 822 MUL(AL, 0, d, pixel, u); 823 RSB(AL, 0, k, u, imm(1<<prec)); 824 825 // LB -> (1-U) * V 826 CONTEXT_LOAD(offset, generated_vars.lb); 827 RSB(AL, 0, U, U, imm(1<<FRAC_BITS)); 828 LDRH(AL, pixel, txPtr.reg, reg_pre(offset)); 829 SMULBB(AL, u, U, V); 830 ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift)); 831 build_and_immediate(pixel, pixel, mask, 32); 832 if (adjust) { 833 if (round) 834 ADD(AL, 0, u, u, imm(1<<(adjust-1))); 835 MOV(AL, 0, u, reg_imm(u, LSR, adjust)); 836 } 837 MLA(AL, 0, d, pixel, u, d); 838 SUB(AL, 0, k, k, u); 839 840 // LT -> (1-U)*(1-V) 841 RSB(AL, 0, V, V, imm(1<<FRAC_BITS)); 842 LDRH(AL, pixel, txPtr.reg); 843 SMULBB(AL, u, U, V); 844 ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift)); 845 build_and_immediate(pixel, pixel, mask, 32); 846 if (adjust) { 847 if (round) 848 ADD(AL, 0, u, u, imm(1<<(adjust-1))); 849 MOV(AL, 0, u, reg_imm(u, LSR, adjust)); 850 } 851 MLA(AL, 0, d, pixel, u, d); 852 853 // RT -> U*(1-V) 854 CONTEXT_LOAD(offset, generated_vars.rt); 855 LDRH(AL, pixel, txPtr.reg, reg_pre(offset)); 856 SUB(AL, 0, u, k, u); 857 ORR(AL, 0, pixel, pixel, reg_imm(pixel, LSL, shift)); 858 build_and_immediate(pixel, pixel, mask, 32); 859 MLA(AL, 0, texel.reg, pixel, u, d); 860} 861 862void GGLAssembler::filter24( 863 const fragment_parts_t& parts, 864 pixel_t& texel, const texture_unit_t& tmu, 865 int U, int V, pointer_t& txPtr, 866 int FRAC_BITS) 867{ 868 // not supported yet (currently disabled) 869 load(txPtr, texel, 0); 870} 871 872#if __ARM_ARCH__ >= 6 873// ARMv6 version, using UXTB16, and scheduled for Cortex-A8 pipeline 874void GGLAssembler::filter32( 875 const fragment_parts_t& parts, 876 pixel_t& texel, const texture_unit_t& tmu, 877 int U, int V, pointer_t& txPtr, 878 int FRAC_BITS) 879{ 880 const int adjust = FRAC_BITS*2 - 8; 881 const int round = 0; 882 const int prescale = 16 - adjust; 883 884 Scratch scratches(registerFile()); 885 886 int pixel= scratches.obtain(); 887 int dh = scratches.obtain(); 888 int u = scratches.obtain(); 889 int k = scratches.obtain(); 890 891 int temp = scratches.obtain(); 892 int dl = scratches.obtain(); 893 894 int offsetrt = scratches.obtain(); 895 int offsetlb = scratches.obtain(); 896 897 int pixellb = offsetlb; 898 899 // RB -> U * V 900 CONTEXT_LOAD(offsetrt, generated_vars.rt); 901 CONTEXT_LOAD(offsetlb, generated_vars.lb); 902 if(!round) { 903 MOV(AL, 0, U, reg_imm(U, LSL, prescale)); 904 } 905 ADD(AL, 0, u, offsetrt, offsetlb); 906 907 LDR(AL, pixel, txPtr.reg, reg_scale_pre(u)); 908 if (round) { 909 SMULBB(AL, u, U, V); 910 RSB(AL, 0, U, U, imm(1<<FRAC_BITS)); 911 } else { 912 SMULWB(AL, u, U, V); 913 RSB(AL, 0, U, U, imm(1<<(FRAC_BITS+prescale))); 914 } 915 UXTB16(AL, temp, pixel, 0); 916 if (round) { 917 ADD(AL, 0, u, u, imm(1<<(adjust-1))); 918 MOV(AL, 0, u, reg_imm(u, LSR, adjust)); 919 } 920 LDR(AL, pixellb, txPtr.reg, reg_scale_pre(offsetlb)); 921 MUL(AL, 0, dh, temp, u); 922 UXTB16(AL, temp, pixel, 8); 923 MUL(AL, 0, dl, temp, u); 924 RSB(AL, 0, k, u, imm(0x100)); 925 926 // LB -> (1-U) * V 927 if (round) { 928 SMULBB(AL, u, U, V); 929 } else { 930 SMULWB(AL, u, U, V); 931 } 932 UXTB16(AL, temp, pixellb, 0); 933 if (round) { 934 ADD(AL, 0, u, u, imm(1<<(adjust-1))); 935 MOV(AL, 0, u, reg_imm(u, LSR, adjust)); 936 } 937 MLA(AL, 0, dh, temp, u, dh); 938 UXTB16(AL, temp, pixellb, 8); 939 MLA(AL, 0, dl, temp, u, dl); 940 SUB(AL, 0, k, k, u); 941 942 // LT -> (1-U)*(1-V) 943 RSB(AL, 0, V, V, imm(1<<FRAC_BITS)); 944 LDR(AL, pixel, txPtr.reg); 945 if (round) { 946 SMULBB(AL, u, U, V); 947 } else { 948 SMULWB(AL, u, U, V); 949 } 950 UXTB16(AL, temp, pixel, 0); 951 if (round) { 952 ADD(AL, 0, u, u, imm(1<<(adjust-1))); 953 MOV(AL, 0, u, reg_imm(u, LSR, adjust)); 954 } 955 MLA(AL, 0, dh, temp, u, dh); 956 UXTB16(AL, temp, pixel, 8); 957 MLA(AL, 0, dl, temp, u, dl); 958 959 // RT -> U*(1-V) 960 LDR(AL, pixel, txPtr.reg, reg_scale_pre(offsetrt)); 961 SUB(AL, 0, u, k, u); 962 UXTB16(AL, temp, pixel, 0); 963 MLA(AL, 0, dh, temp, u, dh); 964 UXTB16(AL, temp, pixel, 8); 965 MLA(AL, 0, dl, temp, u, dl); 966 967 UXTB16(AL, dh, dh, 8); 968 UXTB16(AL, dl, dl, 8); 969 ORR(AL, 0, texel.reg, dh, reg_imm(dl, LSL, 8)); 970} 971#else 972void GGLAssembler::filter32( 973 const fragment_parts_t& parts, 974 pixel_t& texel, const texture_unit_t& tmu, 975 int U, int V, pointer_t& txPtr, 976 int FRAC_BITS) 977{ 978 const int adjust = FRAC_BITS*2 - 8; 979 const int round = 0; 980 981 // ------------------------ 982 // about ~38 cycles / pixel 983 Scratch scratches(registerFile()); 984 985 int pixel= scratches.obtain(); 986 int dh = scratches.obtain(); 987 int u = scratches.obtain(); 988 int k = scratches.obtain(); 989 990 int temp = scratches.obtain(); 991 int dl = scratches.obtain(); 992 int mask = scratches.obtain(); 993 994 MOV(AL, 0, mask, imm(0xFF)); 995 ORR(AL, 0, mask, mask, imm(0xFF0000)); 996 997 // RB -> U * V 998 int offset = pixel; 999 CONTEXT_LOAD(offset, generated_vars.rt); 1000 CONTEXT_LOAD(u, generated_vars.lb); 1001 ADD(AL, 0, offset, offset, u); 1002 1003 LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset)); 1004 SMULBB(AL, u, U, V); 1005 AND(AL, 0, temp, mask, pixel); 1006 if (adjust) { 1007 if (round) 1008 ADD(AL, 0, u, u, imm(1<<(adjust-1))); 1009 MOV(AL, 0, u, reg_imm(u, LSR, adjust)); 1010 } 1011 MUL(AL, 0, dh, temp, u); 1012 AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8)); 1013 MUL(AL, 0, dl, temp, u); 1014 RSB(AL, 0, k, u, imm(0x100)); 1015 1016 // LB -> (1-U) * V 1017 CONTEXT_LOAD(offset, generated_vars.lb); 1018 RSB(AL, 0, U, U, imm(1<<FRAC_BITS)); 1019 LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset)); 1020 SMULBB(AL, u, U, V); 1021 AND(AL, 0, temp, mask, pixel); 1022 if (adjust) { 1023 if (round) 1024 ADD(AL, 0, u, u, imm(1<<(adjust-1))); 1025 MOV(AL, 0, u, reg_imm(u, LSR, adjust)); 1026 } 1027 MLA(AL, 0, dh, temp, u, dh); 1028 AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8)); 1029 MLA(AL, 0, dl, temp, u, dl); 1030 SUB(AL, 0, k, k, u); 1031 1032 // LT -> (1-U)*(1-V) 1033 RSB(AL, 0, V, V, imm(1<<FRAC_BITS)); 1034 LDR(AL, pixel, txPtr.reg); 1035 SMULBB(AL, u, U, V); 1036 AND(AL, 0, temp, mask, pixel); 1037 if (adjust) { 1038 if (round) 1039 ADD(AL, 0, u, u, imm(1<<(adjust-1))); 1040 MOV(AL, 0, u, reg_imm(u, LSR, adjust)); 1041 } 1042 MLA(AL, 0, dh, temp, u, dh); 1043 AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8)); 1044 MLA(AL, 0, dl, temp, u, dl); 1045 1046 // RT -> U*(1-V) 1047 CONTEXT_LOAD(offset, generated_vars.rt); 1048 LDR(AL, pixel, txPtr.reg, reg_scale_pre(offset)); 1049 SUB(AL, 0, u, k, u); 1050 AND(AL, 0, temp, mask, pixel); 1051 MLA(AL, 0, dh, temp, u, dh); 1052 AND(AL, 0, temp, mask, reg_imm(pixel, LSR, 8)); 1053 MLA(AL, 0, dl, temp, u, dl); 1054 1055 AND(AL, 0, dh, mask, reg_imm(dh, LSR, 8)); 1056 AND(AL, 0, dl, dl, reg_imm(mask, LSL, 8)); 1057 ORR(AL, 0, texel.reg, dh, dl); 1058} 1059#endif 1060 1061void GGLAssembler::build_texture_environment( 1062 component_t& fragment, 1063 const fragment_parts_t& parts, 1064 int component, 1065 Scratch& regs) 1066{ 1067 const uint32_t component_mask = 1<<component; 1068 const bool multiTexture = mTextureMachine.activeUnits > 1; 1069 for (int i=0 ; i<GGL_TEXTURE_UNIT_COUNT ; i++) { 1070 texture_unit_t& tmu = mTextureMachine.tmu[i]; 1071 1072 if (tmu.mask & component_mask) { 1073 // replace or modulate with this texture 1074 if ((tmu.replaced & component_mask) == 0) { 1075 // not replaced by a later tmu... 1076 1077 Scratch scratches(registerFile()); 1078 pixel_t texel(parts.texel[i]); 1079 if (multiTexture && 1080 tmu.swrap == GGL_NEEDS_WRAP_11 && 1081 tmu.twrap == GGL_NEEDS_WRAP_11) 1082 { 1083 texel.reg = scratches.obtain(); 1084 texel.flags |= CORRUPTIBLE; 1085 comment("fetch texel (multitexture 1:1)"); 1086 load(parts.coords[i].ptr, texel, WRITE_BACK); 1087 } 1088 1089 component_t incoming(fragment); 1090 modify(fragment, regs); 1091 1092 switch (tmu.env) { 1093 case GGL_REPLACE: 1094 extract(fragment, texel, component); 1095 break; 1096 case GGL_MODULATE: 1097 modulate(fragment, incoming, texel, component); 1098 break; 1099 case GGL_DECAL: 1100 decal(fragment, incoming, texel, component); 1101 break; 1102 case GGL_BLEND: 1103 blend(fragment, incoming, texel, component, i); 1104 break; 1105 case GGL_ADD: 1106 add(fragment, incoming, texel, component); 1107 break; 1108 } 1109 } 1110 } 1111 } 1112} 1113 1114// --------------------------------------------------------------------------- 1115 1116void GGLAssembler::wrapping( 1117 int d, 1118 int coord, int size, 1119 int tx_wrap, int tx_linear) 1120{ 1121 // notes: 1122 // if tx_linear is set, we need 4 extra bits of precision on the result 1123 // SMULL/UMULL is 3 cycles 1124 Scratch scratches(registerFile()); 1125 int c = coord; 1126 if (tx_wrap == GGL_NEEDS_WRAP_REPEAT) { 1127 // UMULL takes 4 cycles (interlocked), and we can get away with 1128 // 2 cycles using SMULWB, but we're loosing 16 bits of precision 1129 // out of 32 (this is not a problem because the iterator keeps 1130 // its full precision) 1131 // UMULL(AL, 0, size, d, c, size); 1132 // note: we can't use SMULTB because it's signed. 1133 MOV(AL, 0, d, reg_imm(c, LSR, 16-tx_linear)); 1134 SMULWB(AL, d, d, size); 1135 } else if (tx_wrap == GGL_NEEDS_WRAP_CLAMP_TO_EDGE) { 1136 if (tx_linear) { 1137 // 1 cycle 1138 MOV(AL, 0, d, reg_imm(coord, ASR, 16-tx_linear)); 1139 } else { 1140 // 4 cycles (common case) 1141 MOV(AL, 0, d, reg_imm(coord, ASR, 16)); 1142 BIC(AL, 0, d, d, reg_imm(d, ASR, 31)); 1143 CMP(AL, d, size); 1144 SUB(GE, 0, d, size, imm(1)); 1145 } 1146 } 1147} 1148 1149// --------------------------------------------------------------------------- 1150 1151void GGLAssembler::modulate( 1152 component_t& dest, 1153 const component_t& incoming, 1154 const pixel_t& incomingTexel, int component) 1155{ 1156 Scratch locals(registerFile()); 1157 integer_t texel(locals.obtain(), 32, CORRUPTIBLE); 1158 extract(texel, incomingTexel, component); 1159 1160 const int Nt = texel.size(); 1161 // Nt should always be less than 10 bits because it comes 1162 // from the TMU. 1163 1164 int Ni = incoming.size(); 1165 // Ni could be big because it comes from previous MODULATEs 1166 1167 if (Nt == 1) { 1168 // texel acts as a bit-mask 1169 // dest = incoming & ((texel << incoming.h)-texel) 1170 RSB(AL, 0, dest.reg, texel.reg, reg_imm(texel.reg, LSL, incoming.h)); 1171 AND(AL, 0, dest.reg, dest.reg, incoming.reg); 1172 dest.l = incoming.l; 1173 dest.h = incoming.h; 1174 dest.flags |= (incoming.flags & CLEAR_LO); 1175 } else if (Ni == 1) { 1176 MOV(AL, 0, dest.reg, reg_imm(incoming.reg, LSL, 31-incoming.h)); 1177 AND(AL, 0, dest.reg, texel.reg, reg_imm(dest.reg, ASR, 31)); 1178 dest.l = 0; 1179 dest.h = Nt; 1180 } else { 1181 int inReg = incoming.reg; 1182 int shift = incoming.l; 1183 if ((Nt + Ni) > 32) { 1184 // we will overflow, reduce the precision of Ni to 8 bits 1185 // (Note Nt cannot be more than 10 bits which happens with 1186 // 565 textures and GGL_LINEAR) 1187 shift += Ni-8; 1188 Ni = 8; 1189 } 1190 1191 // modulate by the component with the lowest precision 1192 if (Nt >= Ni) { 1193 if (shift) { 1194 // XXX: we should be able to avoid this shift 1195 // when shift==16 && Nt<16 && Ni<16, in which 1196 // we could use SMULBT below. 1197 MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift)); 1198 inReg = dest.reg; 1199 shift = 0; 1200 } 1201 // operation: (Cf*Ct)/((1<<Ni)-1) 1202 // approximated with: Cf*(Ct + Ct>>(Ni-1))>>Ni 1203 // this operation doesn't change texel's size 1204 ADD(AL, 0, dest.reg, inReg, reg_imm(inReg, LSR, Ni-1)); 1205 if (Nt<16 && Ni<16) SMULBB(AL, dest.reg, texel.reg, dest.reg); 1206 else MUL(AL, 0, dest.reg, texel.reg, dest.reg); 1207 dest.l = Ni; 1208 dest.h = Nt + Ni; 1209 } else { 1210 if (shift && (shift != 16)) { 1211 // if shift==16, we can use 16-bits mul instructions later 1212 MOV(AL, 0, dest.reg, reg_imm(inReg, LSR, shift)); 1213 inReg = dest.reg; 1214 shift = 0; 1215 } 1216 // operation: (Cf*Ct)/((1<<Nt)-1) 1217 // approximated with: Ct*(Cf + Cf>>(Nt-1))>>Nt 1218 // this operation doesn't change incoming's size 1219 Scratch scratches(registerFile()); 1220 int t = (texel.flags & CORRUPTIBLE) ? texel.reg : dest.reg; 1221 if (t == inReg) 1222 t = scratches.obtain(); 1223 ADD(AL, 0, t, texel.reg, reg_imm(texel.reg, LSR, Nt-1)); 1224 if (Nt<16 && Ni<16) { 1225 if (shift==16) SMULBT(AL, dest.reg, t, inReg); 1226 else SMULBB(AL, dest.reg, t, inReg); 1227 } else MUL(AL, 0, dest.reg, t, inReg); 1228 dest.l = Nt; 1229 dest.h = Nt + Ni; 1230 } 1231 1232 // low bits are not valid 1233 dest.flags |= CLEAR_LO; 1234 1235 // no need to keep more than 8 bits/component 1236 if (dest.size() > 8) 1237 dest.l = dest.h-8; 1238 } 1239} 1240 1241void GGLAssembler::decal( 1242 component_t& dest, 1243 const component_t& incoming, 1244 const pixel_t& incomingTexel, int component) 1245{ 1246 // RGBA: 1247 // Cv = Cf*(1 - At) + Ct*At = Cf + (Ct - Cf)*At 1248 // Av = Af 1249 Scratch locals(registerFile()); 1250 integer_t texel(locals.obtain(), 32, CORRUPTIBLE); 1251 integer_t factor(locals.obtain(), 32, CORRUPTIBLE); 1252 extract(texel, incomingTexel, component); 1253 extract(factor, incomingTexel, GGLFormat::ALPHA); 1254 1255 // no need to keep more than 8-bits for decal 1256 int Ni = incoming.size(); 1257 int shift = incoming.l; 1258 if (Ni > 8) { 1259 shift += Ni-8; 1260 Ni = 8; 1261 } 1262 integer_t incomingNorm(incoming.reg, Ni, incoming.flags); 1263 if (shift) { 1264 MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift)); 1265 incomingNorm.reg = dest.reg; 1266 incomingNorm.flags |= CORRUPTIBLE; 1267 } 1268 ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1)); 1269 build_blendOneMinusFF(dest, factor, incomingNorm, texel); 1270} 1271 1272void GGLAssembler::blend( 1273 component_t& dest, 1274 const component_t& incoming, 1275 const pixel_t& incomingTexel, int component, int tmu) 1276{ 1277 // RGBA: 1278 // Cv = (1 - Ct)*Cf + Ct*Cc = Cf + (Cc - Cf)*Ct 1279 // Av = At*Af 1280 1281 if (component == GGLFormat::ALPHA) { 1282 modulate(dest, incoming, incomingTexel, component); 1283 return; 1284 } 1285 1286 Scratch locals(registerFile()); 1287 integer_t color(locals.obtain(), 8, CORRUPTIBLE); 1288 integer_t factor(locals.obtain(), 32, CORRUPTIBLE); 1289 LDRB(AL, color.reg, mBuilderContext.Rctx, 1290 immed12_pre(GGL_OFFSETOF(state.texture[tmu].env_color[component]))); 1291 extract(factor, incomingTexel, component); 1292 1293 // no need to keep more than 8-bits for blend 1294 int Ni = incoming.size(); 1295 int shift = incoming.l; 1296 if (Ni > 8) { 1297 shift += Ni-8; 1298 Ni = 8; 1299 } 1300 integer_t incomingNorm(incoming.reg, Ni, incoming.flags); 1301 if (shift) { 1302 MOV(AL, 0, dest.reg, reg_imm(incomingNorm.reg, LSR, shift)); 1303 incomingNorm.reg = dest.reg; 1304 incomingNorm.flags |= CORRUPTIBLE; 1305 } 1306 ADD(AL, 0, factor.reg, factor.reg, reg_imm(factor.reg, LSR, factor.s-1)); 1307 build_blendOneMinusFF(dest, factor, incomingNorm, color); 1308} 1309 1310void GGLAssembler::add( 1311 component_t& dest, 1312 const component_t& incoming, 1313 const pixel_t& incomingTexel, int component) 1314{ 1315 // RGBA: 1316 // Cv = Cf + Ct; 1317 Scratch locals(registerFile()); 1318 1319 component_t incomingTemp(incoming); 1320 1321 // use "dest" as a temporary for extracting the texel, unless "dest" 1322 // overlaps "incoming". 1323 integer_t texel(dest.reg, 32, CORRUPTIBLE); 1324 if (dest.reg == incomingTemp.reg) 1325 texel.reg = locals.obtain(); 1326 extract(texel, incomingTexel, component); 1327 1328 if (texel.s < incomingTemp.size()) { 1329 expand(texel, texel, incomingTemp.size()); 1330 } else if (texel.s > incomingTemp.size()) { 1331 if (incomingTemp.flags & CORRUPTIBLE) { 1332 expand(incomingTemp, incomingTemp, texel.s); 1333 } else { 1334 incomingTemp.reg = locals.obtain(); 1335 expand(incomingTemp, incoming, texel.s); 1336 } 1337 } 1338 1339 if (incomingTemp.l) { 1340 ADD(AL, 0, dest.reg, texel.reg, 1341 reg_imm(incomingTemp.reg, LSR, incomingTemp.l)); 1342 } else { 1343 ADD(AL, 0, dest.reg, texel.reg, incomingTemp.reg); 1344 } 1345 dest.l = 0; 1346 dest.h = texel.size(); 1347 component_sat(dest); 1348} 1349 1350// ---------------------------------------------------------------------------- 1351 1352}; // namespace android 1353 1354