Surface.cpp revision 19bac1e08be200c31efd26f0f5fd144c9b3eefd3
1// SwiftShader Software Renderer 2// 3// Copyright(c) 2005-2012 TransGaming Inc. 4// 5// All rights reserved. No part of this software may be copied, distributed, transmitted, 6// transcribed, stored in a retrieval system, translated into any human or computer 7// language by any means, or disclosed to third parties without the explicit written 8// agreement of TransGaming Inc. Without such an agreement, no rights or licenses, express 9// or implied, including but not limited to any patent rights, are granted to you. 10// 11 12#include "Surface.hpp" 13 14#include "Color.hpp" 15#include "Context.hpp" 16#include "Renderer.hpp" 17#include "Common/Half.hpp" 18#include "Common/Memory.hpp" 19#include "Common/CPUID.hpp" 20#include "Common/Resource.hpp" 21#include "Common/Debug.hpp" 22#include "Reactor/Reactor.hpp" 23 24#include <xmmintrin.h> 25#include <emmintrin.h> 26 27#undef min 28#undef max 29 30namespace sw 31{ 32 extern bool quadLayoutEnabled; 33 extern bool complementaryDepthBuffer; 34 extern TranscendentalPrecision logPrecision; 35 36 unsigned int *Surface::palette = 0; 37 unsigned int Surface::paletteID = 0; 38 39 void Rect::clip(int minX, int minY, int maxX, int maxY) 40 { 41 x0 = sw::clamp(x0, minX, maxX); 42 y0 = sw::clamp(y0, minY, maxY); 43 x1 = sw::clamp(x1, minX, maxX); 44 y1 = sw::clamp(y1, minY, maxY); 45 } 46 47 void Surface::Buffer::write(int x, int y, int z, const Color<float> &color) 48 { 49 void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB; 50 51 write(element, color); 52 } 53 54 void Surface::Buffer::write(int x, int y, const Color<float> &color) 55 { 56 void *element = (unsigned char*)buffer + x * bytes + y * pitchB; 57 58 write(element, color); 59 } 60 61 inline void Surface::Buffer::write(void *element, const Color<float> &color) 62 { 63 switch(format) 64 { 65 case FORMAT_A8: 66 *(unsigned char*)element = unorm<8>(color.a); 67 break; 68 case FORMAT_R8: 69 *(unsigned char*)element = unorm<8>(color.r); 70 break; 71 case FORMAT_R3G3B2: 72 *(unsigned char*)element = (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0); 73 break; 74 case FORMAT_A8R3G3B2: 75 *(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<3>(color.r) << 5) | (unorm<3>(color.g) << 2) | (unorm<2>(color.b) << 0); 76 break; 77 case FORMAT_X4R4G4B4: 78 *(unsigned short*)element = 0xF000 | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0); 79 break; 80 case FORMAT_A4R4G4B4: 81 *(unsigned short*)element = (unorm<4>(color.a) << 12) | (unorm<4>(color.r) << 8) | (unorm<4>(color.g) << 4) | (unorm<4>(color.b) << 0); 82 break; 83 case FORMAT_R5G6B5: 84 *(unsigned short*)element = (unorm<5>(color.r) << 11) | (unorm<6>(color.g) << 5) | (unorm<5>(color.b) << 0); 85 break; 86 case FORMAT_A1R5G5B5: 87 *(unsigned short*)element = (unorm<1>(color.a) << 15) | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0); 88 break; 89 case FORMAT_X1R5G5B5: 90 *(unsigned short*)element = 0x8000 | (unorm<5>(color.r) << 10) | (unorm<5>(color.g) << 5) | (unorm<5>(color.b) << 0); 91 break; 92 case FORMAT_A8R8G8B8: 93 *(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0); 94 break; 95 case FORMAT_X8R8G8B8: 96 *(unsigned int*)element = 0xFF000000 | (unorm<8>(color.r) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.b) << 0); 97 break; 98 case FORMAT_A8B8G8R8: 99 *(unsigned int*)element = (unorm<8>(color.a) << 24) | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0); 100 break; 101 case FORMAT_X8B8G8R8: 102 *(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0); 103 break; 104 case FORMAT_A2R10G10B10: 105 *(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.r) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.b) << 0); 106 break; 107 case FORMAT_A2B10G10R10: 108 *(unsigned int*)element = (unorm<2>(color.a) << 30) | (unorm<10>(color.b) << 20) | (unorm<10>(color.g) << 10) | (unorm<10>(color.r) << 0); 109 break; 110 case FORMAT_G8R8: 111 *(unsigned int*)element = (unorm<8>(color.g) << 8) | (unorm<8>(color.r) << 0); 112 break; 113 case FORMAT_G16R16: 114 *(unsigned int*)element = (unorm<16>(color.g) << 16) | (unorm<16>(color.r) << 0); 115 break; 116 case FORMAT_A16B16G16R16: 117 ((unsigned short*)element)[0] = unorm<16>(color.r); 118 ((unsigned short*)element)[1] = unorm<16>(color.g); 119 ((unsigned short*)element)[2] = unorm<16>(color.b); 120 ((unsigned short*)element)[3] = unorm<16>(color.a); 121 break; 122 case FORMAT_V8U8: 123 *(unsigned short*)element = (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0); 124 break; 125 case FORMAT_L6V5U5: 126 *(unsigned short*)element = (unorm<6>(color.b) << 10) | (snorm<5>(color.g) << 5) | (snorm<5>(color.r) << 0); 127 break; 128 case FORMAT_Q8W8V8U8: 129 *(unsigned int*)element = (snorm<8>(color.a) << 24) | (snorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0); 130 break; 131 case FORMAT_X8L8V8U8: 132 *(unsigned int*)element = 0xFF000000 | (unorm<8>(color.b) << 16) | (snorm<8>(color.g) << 8) | (snorm<8>(color.r) << 0); 133 break; 134 case FORMAT_V16U16: 135 *(unsigned int*)element = (snorm<16>(color.g) << 16) | (snorm<16>(color.r) << 0); 136 break; 137 case FORMAT_A2W10V10U10: 138 *(unsigned int*)element = (unorm<2>(color.a) << 30) | (snorm<10>(color.b) << 20) | (snorm<10>(color.g) << 10) | (snorm<10>(color.r) << 0); 139 break; 140 case FORMAT_A16W16V16U16: 141 ((unsigned short*)element)[0] = snorm<16>(color.r); 142 ((unsigned short*)element)[1] = snorm<16>(color.g); 143 ((unsigned short*)element)[2] = snorm<16>(color.b); 144 ((unsigned short*)element)[3] = unorm<16>(color.a); 145 break; 146 case FORMAT_Q16W16V16U16: 147 ((unsigned short*)element)[0] = snorm<16>(color.r); 148 ((unsigned short*)element)[1] = snorm<16>(color.g); 149 ((unsigned short*)element)[2] = snorm<16>(color.b); 150 ((unsigned short*)element)[3] = snorm<16>(color.a); 151 break; 152 case FORMAT_R8G8B8: 153 ((unsigned char*)element)[0] = unorm<8>(color.b); 154 ((unsigned char*)element)[1] = unorm<8>(color.g); 155 ((unsigned char*)element)[2] = unorm<8>(color.r); 156 break; 157 case FORMAT_R16F: 158 *(half*)element = (half)color.r; 159 break; 160 case FORMAT_G16R16F: 161 ((half*)element)[0] = (half)color.r; 162 ((half*)element)[1] = (half)color.g; 163 break; 164 case FORMAT_A16B16G16R16F: 165 ((half*)element)[0] = (half)color.r; 166 ((half*)element)[1] = (half)color.g; 167 ((half*)element)[2] = (half)color.b; 168 ((half*)element)[3] = (half)color.a; 169 break; 170 case FORMAT_R32F: 171 *(float*)element = color.r; 172 break; 173 case FORMAT_G32R32F: 174 ((float*)element)[0] = color.r; 175 ((float*)element)[1] = color.g; 176 break; 177 case FORMAT_A32B32G32R32F: 178 ((float*)element)[0] = color.r; 179 ((float*)element)[1] = color.g; 180 ((float*)element)[2] = color.b; 181 ((float*)element)[3] = color.a; 182 break; 183 case FORMAT_D32F: 184 case FORMAT_D32F_LOCKABLE: 185 case FORMAT_D32F_TEXTURE: 186 case FORMAT_D32F_SHADOW: 187 *((float*)element) = color.r; 188 break; 189 case FORMAT_D32F_COMPLEMENTARY: 190 *((float*)element) = 1 - color.r; 191 break; 192 case FORMAT_S8: 193 *((unsigned char*)element) = unorm<8>(color.r); 194 break; 195 case FORMAT_L8: 196 *(unsigned char*)element = unorm<8>(color.r); 197 break; 198 case FORMAT_A4L4: 199 *(unsigned char*)element = (unorm<4>(color.a) << 4) | (unorm<4>(color.r) << 0); 200 break; 201 case FORMAT_L16: 202 *(unsigned short*)element = unorm<16>(color.r); 203 break; 204 case FORMAT_A8L8: 205 *(unsigned short*)element = (unorm<8>(color.a) << 8) | (unorm<8>(color.r) << 0); 206 break; 207 default: 208 ASSERT(false); 209 } 210 } 211 212 Color<float> Surface::Buffer::read(int x, int y, int z) const 213 { 214 void *element = (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB; 215 216 return read(element); 217 } 218 219 Color<float> Surface::Buffer::read(int x, int y) const 220 { 221 void *element = (unsigned char*)buffer + x * bytes + y * pitchB; 222 223 return read(element); 224 } 225 226 inline Color<float> Surface::Buffer::read(void *element) const 227 { 228 float r = 1; 229 float g = 1; 230 float b = 1; 231 float a = 1; 232 233 switch(format) 234 { 235 case FORMAT_P8: 236 { 237 ASSERT(palette); 238 239 unsigned int abgr = palette[*(unsigned char*)element]; 240 241 r = (abgr & 0x000000FF) * (1.0f / 0x000000FF); 242 g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00); 243 b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000); 244 a = (abgr & 0xFF000000) * (1.0f / 0xFF000000); 245 } 246 break; 247 case FORMAT_A8P8: 248 { 249 ASSERT(palette); 250 251 unsigned int bgr = palette[((unsigned char*)element)[0]]; 252 253 r = (bgr & 0x000000FF) * (1.0f / 0x000000FF); 254 g = (bgr & 0x0000FF00) * (1.0f / 0x0000FF00); 255 b = (bgr & 0x00FF0000) * (1.0f / 0x00FF0000); 256 a = ((unsigned char*)element)[1] * (1.0f / 0xFF); 257 } 258 break; 259 case FORMAT_A8: 260 r = 0; 261 g = 0; 262 b = 0; 263 a = *(unsigned char*)element * (1.0f / 0xFF); 264 break; 265 case FORMAT_R8: 266 r = *(unsigned char*)element * (1.0f / 0xFF); 267 break; 268 case FORMAT_R3G3B2: 269 { 270 unsigned char rgb = *(unsigned char*)element; 271 272 r = (rgb & 0xE0) * (1.0f / 0xE0); 273 g = (rgb & 0x1C) * (1.0f / 0x1C); 274 b = (rgb & 0x03) * (1.0f / 0x03); 275 } 276 break; 277 case FORMAT_A8R3G3B2: 278 { 279 unsigned short argb = *(unsigned short*)element; 280 281 a = (argb & 0xFF00) * (1.0f / 0xFF00); 282 r = (argb & 0x00E0) * (1.0f / 0x00E0); 283 g = (argb & 0x001C) * (1.0f / 0x001C); 284 b = (argb & 0x0003) * (1.0f / 0x0003); 285 } 286 break; 287 case FORMAT_X4R4G4B4: 288 { 289 unsigned short rgb = *(unsigned short*)element; 290 291 r = (rgb & 0x0F00) * (1.0f / 0x0F00); 292 g = (rgb & 0x00F0) * (1.0f / 0x00F0); 293 b = (rgb & 0x000F) * (1.0f / 0x000F); 294 } 295 break; 296 case FORMAT_A4R4G4B4: 297 { 298 unsigned short argb = *(unsigned short*)element; 299 300 a = (argb & 0xF000) * (1.0f / 0xF000); 301 r = (argb & 0x0F00) * (1.0f / 0x0F00); 302 g = (argb & 0x00F0) * (1.0f / 0x00F0); 303 b = (argb & 0x000F) * (1.0f / 0x000F); 304 } 305 break; 306 case FORMAT_R5G6B5: 307 { 308 unsigned short rgb = *(unsigned short*)element; 309 310 r = (rgb & 0xF800) * (1.0f / 0xF800); 311 g = (rgb & 0x07E0) * (1.0f / 0x07E0); 312 b = (rgb & 0x001F) * (1.0f / 0x001F); 313 } 314 break; 315 case FORMAT_A1R5G5B5: 316 { 317 unsigned short argb = *(unsigned short*)element; 318 319 a = (argb & 0x8000) * (1.0f / 0x8000); 320 r = (argb & 0x7C00) * (1.0f / 0x7C00); 321 g = (argb & 0x03E0) * (1.0f / 0x03E0); 322 b = (argb & 0x001F) * (1.0f / 0x001F); 323 } 324 break; 325 case FORMAT_X1R5G5B5: 326 { 327 unsigned short xrgb = *(unsigned short*)element; 328 329 r = (xrgb & 0x7C00) * (1.0f / 0x7C00); 330 g = (xrgb & 0x03E0) * (1.0f / 0x03E0); 331 b = (xrgb & 0x001F) * (1.0f / 0x001F); 332 } 333 break; 334 case FORMAT_A8R8G8B8: 335 { 336 unsigned int argb = *(unsigned int*)element; 337 338 a = (argb & 0xFF000000) * (1.0f / 0xFF000000); 339 r = (argb & 0x00FF0000) * (1.0f / 0x00FF0000); 340 g = (argb & 0x0000FF00) * (1.0f / 0x0000FF00); 341 b = (argb & 0x000000FF) * (1.0f / 0x000000FF); 342 } 343 break; 344 case FORMAT_X8R8G8B8: 345 { 346 unsigned int xrgb = *(unsigned int*)element; 347 348 r = (xrgb & 0x00FF0000) * (1.0f / 0x00FF0000); 349 g = (xrgb & 0x0000FF00) * (1.0f / 0x0000FF00); 350 b = (xrgb & 0x000000FF) * (1.0f / 0x000000FF); 351 } 352 break; 353 case FORMAT_A8B8G8R8: 354 { 355 unsigned int abgr = *(unsigned int*)element; 356 357 a = (abgr & 0xFF000000) * (1.0f / 0xFF000000); 358 b = (abgr & 0x00FF0000) * (1.0f / 0x00FF0000); 359 g = (abgr & 0x0000FF00) * (1.0f / 0x0000FF00); 360 r = (abgr & 0x000000FF) * (1.0f / 0x000000FF); 361 } 362 break; 363 case FORMAT_X8B8G8R8: 364 { 365 unsigned int xbgr = *(unsigned int*)element; 366 367 b = (xbgr & 0x00FF0000) * (1.0f / 0x00FF0000); 368 g = (xbgr & 0x0000FF00) * (1.0f / 0x0000FF00); 369 r = (xbgr & 0x000000FF) * (1.0f / 0x000000FF); 370 } 371 break; 372 case FORMAT_G8R8: 373 { 374 unsigned short gr = *(unsigned short*)element; 375 376 g = (gr & 0xFF00) * (1.0f / 0xFF00); 377 r = (gr & 0x00FF) * (1.0f / 0x00FF); 378 } 379 break; 380 case FORMAT_G16R16: 381 { 382 unsigned int gr = *(unsigned int*)element; 383 384 g = (gr & 0xFFFF0000) * (1.0f / 0xFFFF0000); 385 r = (gr & 0x0000FFFF) * (1.0f / 0x0000FFFF); 386 } 387 break; 388 case FORMAT_A2R10G10B10: 389 { 390 unsigned int argb = *(unsigned int*)element; 391 392 a = (argb & 0xC0000000) * (1.0f / 0xC0000000); 393 r = (argb & 0x3FF00000) * (1.0f / 0x3FF00000); 394 g = (argb & 0x000FFC00) * (1.0f / 0x000FFC00); 395 b = (argb & 0x000003FF) * (1.0f / 0x000003FF); 396 } 397 break; 398 case FORMAT_A2B10G10R10: 399 { 400 unsigned int abgr = *(unsigned int*)element; 401 402 a = (abgr & 0xC0000000) * (1.0f / 0xC0000000); 403 b = (abgr & 0x3FF00000) * (1.0f / 0x3FF00000); 404 g = (abgr & 0x000FFC00) * (1.0f / 0x000FFC00); 405 r = (abgr & 0x000003FF) * (1.0f / 0x000003FF); 406 } 407 break; 408 case FORMAT_A16B16G16R16: 409 r = ((unsigned short*)element)[0] * (1.0f / 0xFFFF); 410 g = ((unsigned short*)element)[1] * (1.0f / 0xFFFF); 411 b = ((unsigned short*)element)[2] * (1.0f / 0xFFFF); 412 a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF); 413 break; 414 case FORMAT_V8U8: 415 { 416 unsigned short vu = *(unsigned short*)element; 417 418 r = ((int)(vu & 0x00FF) << 24) * (1.0f / 0x7F000000); 419 g = ((int)(vu & 0xFF00) << 16) * (1.0f / 0x7F000000); 420 } 421 break; 422 case FORMAT_L6V5U5: 423 { 424 unsigned short lvu = *(unsigned short*)element; 425 426 r = ((int)(lvu & 0x001F) << 27) * (1.0f / 0x78000000); 427 g = ((int)(lvu & 0x03E0) << 22) * (1.0f / 0x78000000); 428 b = (lvu & 0xFC00) * (1.0f / 0xFC00); 429 } 430 break; 431 case FORMAT_Q8W8V8U8: 432 { 433 unsigned int qwvu = *(unsigned int*)element; 434 435 r = ((int)(qwvu & 0x000000FF) << 24) * (1.0f / 0x7F000000); 436 g = ((int)(qwvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000); 437 b = ((int)(qwvu & 0x00FF0000) << 8) * (1.0f / 0x7F000000); 438 a = ((int)(qwvu & 0xFF000000) << 0) * (1.0f / 0x7F000000); 439 } 440 break; 441 case FORMAT_X8L8V8U8: 442 { 443 unsigned int xlvu = *(unsigned int*)element; 444 445 r = ((int)(xlvu & 0x000000FF) << 24) * (1.0f / 0x7F000000); 446 g = ((int)(xlvu & 0x0000FF00) << 16) * (1.0f / 0x7F000000); 447 b = (xlvu & 0x00FF0000) * (1.0f / 0x00FF0000); 448 } 449 break; 450 case FORMAT_R8G8B8: 451 r = ((unsigned char*)element)[2] * (1.0f / 0xFF); 452 g = ((unsigned char*)element)[1] * (1.0f / 0xFF); 453 b = ((unsigned char*)element)[0] * (1.0f / 0xFF); 454 break; 455 case FORMAT_V16U16: 456 { 457 unsigned int vu = *(unsigned int*)element; 458 459 r = ((int)(vu & 0x0000FFFF) << 16) * (1.0f / 0x7FFF0000); 460 g = ((int)(vu & 0xFFFF0000) << 0) * (1.0f / 0x7FFF0000); 461 } 462 break; 463 case FORMAT_A2W10V10U10: 464 { 465 unsigned int awvu = *(unsigned int*)element; 466 467 r = ((int)(awvu & 0x000003FF) << 22) * (1.0f / 0x7FC00000); 468 g = ((int)(awvu & 0x000FFC00) << 12) * (1.0f / 0x7FC00000); 469 b = ((int)(awvu & 0x3FF00000) << 2) * (1.0f / 0x7FC00000); 470 a = (awvu & 0xC0000000) * (1.0f / 0xC0000000); 471 } 472 break; 473 case FORMAT_A16W16V16U16: 474 r = ((signed short*)element)[0] * (1.0f / 0x7FFF); 475 g = ((signed short*)element)[1] * (1.0f / 0x7FFF); 476 b = ((signed short*)element)[2] * (1.0f / 0x7FFF); 477 a = ((unsigned short*)element)[3] * (1.0f / 0xFFFF); 478 break; 479 case FORMAT_Q16W16V16U16: 480 r = ((signed short*)element)[0] * (1.0f / 0x7FFF); 481 g = ((signed short*)element)[1] * (1.0f / 0x7FFF); 482 b = ((signed short*)element)[2] * (1.0f / 0x7FFF); 483 a = ((signed short*)element)[3] * (1.0f / 0x7FFF); 484 break; 485 case FORMAT_L8: 486 r = 487 g = 488 b = *(unsigned char*)element * (1.0f / 0xFF); 489 break; 490 case FORMAT_A4L4: 491 { 492 unsigned char al = *(unsigned char*)element; 493 494 r = 495 g = 496 b = (al & 0x0F) * (1.0f / 0x0F); 497 a = (al & 0xF0) * (1.0f / 0xF0); 498 } 499 break; 500 case FORMAT_L16: 501 r = 502 g = 503 b = *(unsigned short*)element * (1.0f / 0xFFFF); 504 break; 505 case FORMAT_A8L8: 506 r = 507 g = 508 b = ((unsigned char*)element)[0] * (1.0f / 0xFF); 509 a = ((unsigned char*)element)[1] * (1.0f / 0xFF); 510 break; 511 case FORMAT_R16F: 512 r = *(half*)element; 513 break; 514 case FORMAT_G16R16F: 515 r = ((half*)element)[0]; 516 g = ((half*)element)[1]; 517 break; 518 case FORMAT_A16B16G16R16F: 519 r = ((half*)element)[0]; 520 g = ((half*)element)[1]; 521 b = ((half*)element)[2]; 522 a = ((half*)element)[3]; 523 break; 524 case FORMAT_R32F: 525 r = *(float*)element; 526 break; 527 case FORMAT_G32R32F: 528 r = ((float*)element)[0]; 529 g = ((float*)element)[1]; 530 break; 531 case FORMAT_A32B32G32R32F: 532 r = ((float*)element)[0]; 533 g = ((float*)element)[1]; 534 b = ((float*)element)[2]; 535 a = ((float*)element)[3]; 536 break; 537 case FORMAT_D32F: 538 case FORMAT_D32F_LOCKABLE: 539 case FORMAT_D32F_TEXTURE: 540 case FORMAT_D32F_SHADOW: 541 r = *(float*)element; 542 g = r; 543 b = r; 544 a = r; 545 break; 546 case FORMAT_D32F_COMPLEMENTARY: 547 r = 1 - *(float*)element; 548 g = r; 549 b = r; 550 a = r; 551 break; 552 case FORMAT_S8: 553 r = *(unsigned char*)element * (1.0f / 0xFF); 554 break; 555 default: 556 ASSERT(false); 557 } 558 559 // if(sRGB) 560 // { 561 // r = sRGBtoLinear(r); 562 // g = sRGBtoLinear(g); 563 // b = sRGBtoLinear(b); 564 // } 565 566 return Color<float>(r, g, b, a); 567 } 568 569 Color<float> Surface::Buffer::sample(float x, float y, float z) const 570 { 571 x -= 0.5f; 572 y -= 0.5f; 573 z -= 0.5f; 574 575 int x0 = clamp((int)x, 0, width - 1); 576 int x1 = (x0 + 1 >= width) ? x0 : x0 + 1; 577 578 int y0 = clamp((int)y, 0, height - 1); 579 int y1 = (y0 + 1 >= height) ? y0 : y0 + 1; 580 581 int z0 = clamp((int)z, 0, depth - 1); 582 int z1 = (z0 + 1 >= depth) ? z0 : z0 + 1; 583 584 Color<float> c000 = read(x0, y0, z0); 585 Color<float> c100 = read(x1, y0, z0); 586 Color<float> c010 = read(x0, y1, z0); 587 Color<float> c110 = read(x1, y1, z0); 588 Color<float> c001 = read(x0, y0, z1); 589 Color<float> c101 = read(x1, y0, z1); 590 Color<float> c011 = read(x0, y1, z1); 591 Color<float> c111 = read(x1, y1, z1); 592 593 float fx = x - x0; 594 float fy = y - y0; 595 float fz = z - z0; 596 597 c000 *= (1 - fx) * (1 - fy) * (1 - fz); 598 c100 *= fx * (1 - fy) * (1 - fz); 599 c010 *= (1 - fx) * fy * (1 - fz); 600 c110 *= fx * fy * (1 - fz); 601 c001 *= (1 - fx) * (1 - fy) * fz; 602 c101 *= fx * (1 - fy) * fz; 603 c011 *= (1 - fx) * fy * fz; 604 c111 *= fx * fy * fz; 605 606 return c000 + c100 + c010 + c110 + c001 + c101 + c011 + c111; 607 } 608 609 Color<float> Surface::Buffer::sample(float x, float y) const 610 { 611 x -= 0.5f; 612 y -= 0.5f; 613 614 int x0 = clamp((int)x, 0, width - 1); 615 int x1 = (x0 + 1 >= width) ? x0 : x0 + 1; 616 617 int y0 = clamp((int)y, 0, height - 1); 618 int y1 = (y0 + 1 >= height) ? y0 : y0 + 1; 619 620 Color<float> c00 = read(x0, y0); 621 Color<float> c10 = read(x1, y0); 622 Color<float> c01 = read(x0, y1); 623 Color<float> c11 = read(x1, y1); 624 625 float fx = x - x0; 626 float fy = y - y0; 627 628 c00 *= (1 - fx) * (1 - fy); 629 c10 *= fx * (1 - fy); 630 c01 *= (1 - fx) * fy; 631 c11 *= fx * fy; 632 633 return c00 + c10 + c01 + c11; 634 } 635 636 void *Surface::Buffer::lockRect(int x, int y, int z, Lock lock) 637 { 638 this->lock = lock; 639 640 switch(lock) 641 { 642 case LOCK_UNLOCKED: 643 case LOCK_READONLY: 644 break; 645 case LOCK_WRITEONLY: 646 case LOCK_READWRITE: 647 case LOCK_DISCARD: 648 dirty = true; 649 break; 650 default: 651 ASSERT(false); 652 } 653 654 switch(format) 655 { 656 #if S3TC_SUPPORT 657 case FORMAT_DXT1: 658 case FORMAT_ATI1: 659 return (unsigned char*)buffer + 8 * (x / 4) + (y / 4) * pitchB + z * sliceB; 660 case FORMAT_DXT3: 661 case FORMAT_DXT5: 662 case FORMAT_ATI2: 663 return (unsigned char*)buffer + 16 * (x / 4) + (y / 4) * pitchB + z * sliceB; 664 #endif 665 default: 666 return (unsigned char*)buffer + x * bytes + y * pitchB + z * sliceB; 667 } 668 669 return 0; 670 } 671 672 void Surface::Buffer::unlockRect() 673 { 674 lock = LOCK_UNLOCKED; 675 } 676 677 Surface::Surface(Resource *texture, int width, int height, int depth, Format format, bool lockable, bool renderTarget) : lockable(lockable), renderTarget(renderTarget) 678 { 679 resource = texture ? texture : new Resource(0); 680 hasParent = texture != 0; 681 depth = max(1, depth); 682 683 external.buffer = 0; 684 external.width = width; 685 external.height = height; 686 external.depth = depth; 687 external.format = format; 688 external.bytes = bytes(external.format); 689 external.pitchB = pitchB(external.width, external.format, renderTarget && !texture); 690 external.pitchP = pitchP(external.width, external.format, renderTarget && !texture); 691 external.sliceB = sliceB(external.width, external.height, external.format, renderTarget && !texture); 692 external.sliceP = sliceP(external.width, external.height, external.format, renderTarget && !texture); 693 external.lock = LOCK_UNLOCKED; 694 external.dirty = false; 695 external.paletteUsed = 0; 696 697 internal.buffer = 0; 698 internal.width = width; 699 internal.height = height; 700 internal.depth = depth; 701 internal.format = selectInternalFormat(format); 702 internal.bytes = bytes(internal.format); 703 internal.pitchB = pitchB(internal.width, internal.format, renderTarget); 704 internal.pitchP = pitchP(internal.width, internal.format, renderTarget); 705 internal.sliceB = sliceB(internal.width, internal.height, internal.format, renderTarget); 706 internal.sliceP = sliceP(internal.width, internal.height, internal.format, renderTarget); 707 internal.lock = LOCK_UNLOCKED; 708 internal.dirty = false; 709 internal.paletteUsed = 0; 710 711 stencil.buffer = 0; 712 stencil.width = width; 713 stencil.height = height; 714 stencil.depth = depth; 715 stencil.format = FORMAT_S8; 716 stencil.bytes = bytes(stencil.format); 717 stencil.pitchB = pitchB(stencil.width, stencil.format, renderTarget); 718 stencil.pitchP = pitchP(stencil.width, stencil.format, renderTarget); 719 stencil.sliceB = sliceB(stencil.width, stencil.height, stencil.format, renderTarget); 720 stencil.sliceP = sliceP(stencil.width, stencil.height, stencil.format, renderTarget); 721 stencil.lock = LOCK_UNLOCKED; 722 stencil.dirty = false; 723 stencil.paletteUsed = 0; 724 725 dirtyMipmaps = true; 726 } 727 728 Surface::~Surface() 729 { 730 if(!hasParent) 731 { 732 // Synchronize so we can deallocate the buffers below 733 resource->lock(DESTRUCT); 734 resource->unlock(); 735 resource->destruct(); 736 } 737 738 deallocate(external.buffer); 739 740 if(internal.buffer != external.buffer) 741 { 742 deallocate(internal.buffer); 743 } 744 745 deallocate(stencil.buffer); 746 747 external.buffer = 0; 748 internal.buffer = 0; 749 stencil.buffer = 0; 750 } 751 752 void *Surface::lockExternal(int x, int y, int z, Lock lock, Accessor client) 753 { 754 resource->lock(client); 755 756 if(!external.buffer) 757 { 758 if(internal.buffer && identicalFormats()) 759 { 760 external.buffer = internal.buffer; 761 } 762 else 763 { 764 external.buffer = allocateBuffer(external.width, external.height, external.depth, external.format); 765 } 766 } 767 768 if(internal.dirty) 769 { 770 if(lock != LOCK_DISCARD) 771 { 772 update(external, internal); 773 } 774 } 775 776 switch(lock) 777 { 778 case LOCK_READONLY: 779 break; 780 case LOCK_WRITEONLY: 781 case LOCK_READWRITE: 782 case LOCK_DISCARD: 783 dirtyMipmaps = true; 784 break; 785 default: 786 ASSERT(false); 787 } 788 789 return external.lockRect(x, y, z, lock); 790 } 791 792 void Surface::unlockExternal() 793 { 794 resource->unlock(); 795 796 external.unlockRect(); 797 } 798 799 void *Surface::lockInternal(int x, int y, int z, Lock lock, Accessor client) 800 { 801 if(lock != LOCK_UNLOCKED) 802 { 803 resource->lock(client); 804 } 805 806 if(!internal.buffer) 807 { 808 if(external.buffer && identicalFormats()) 809 { 810 internal.buffer = external.buffer; 811 } 812 else 813 { 814 internal.buffer = allocateBuffer(internal.width, internal.height, internal.depth, internal.format); 815 } 816 } 817 818 // FIXME: WHQL requires conversion to lower external precision and back 819 if(logPrecision >= WHQL) 820 { 821 if(internal.dirty && renderTarget && internal.format != external.format) 822 { 823 if(lock != LOCK_DISCARD) 824 { 825 switch(external.format) 826 { 827 case FORMAT_R3G3B2: 828 case FORMAT_A8R3G3B2: 829 case FORMAT_A1R5G5B5: 830 case FORMAT_A2R10G10B10: 831 case FORMAT_A2B10G10R10: 832 lockExternal(0, 0, 0, LOCK_READWRITE, client); 833 unlockExternal(); 834 break; 835 default: 836 // Difference passes WHQL 837 break; 838 } 839 } 840 } 841 } 842 843 if(external.dirty) 844 { 845 if(lock != LOCK_DISCARD) 846 { 847 update(internal, external); 848 } 849 } 850 851 if(isPalette(external.format) && internal.paletteUsed != Surface::paletteID) 852 { 853 update(internal, external); 854 } 855 856 switch(lock) 857 { 858 case LOCK_UNLOCKED: 859 case LOCK_READONLY: 860 break; 861 case LOCK_WRITEONLY: 862 case LOCK_READWRITE: 863 case LOCK_DISCARD: 864 dirtyMipmaps = true; 865 break; 866 default: 867 ASSERT(false); 868 } 869 870 if(lock == LOCK_READONLY && client == PUBLIC) 871 { 872 resolve(); 873 } 874 875 return internal.lockRect(x, y, z, lock); 876 } 877 878 void Surface::unlockInternal() 879 { 880 resource->unlock(); 881 882 internal.unlockRect(); 883 } 884 885 void *Surface::lockStencil(int front, Accessor client) 886 { 887 resource->lock(client); 888 889 if(!stencil.buffer) 890 { 891 stencil.buffer = allocateBuffer(stencil.width, stencil.height, stencil.depth, stencil.format); 892 } 893 894 if(external.dirty) 895 { 896 update(stencil, external); // FIXME: Only when not discarding 897 } 898 899 return stencil.lockRect(0, 0, front, LOCK_READWRITE); // FIXME 900 } 901 902 void Surface::unlockStencil() 903 { 904 resource->unlock(); 905 906 stencil.unlockRect(); 907 } 908 909 int Surface::bytes(Format format) 910 { 911 switch(format) 912 { 913 case FORMAT_NULL: return 0; 914 case FORMAT_P8: return 1; 915 case FORMAT_A8P8: return 2; 916 case FORMAT_A8: return 1; 917 case FORMAT_R8: return 1; 918 case FORMAT_R3G3B2: return 1; 919 case FORMAT_A8R3G3B2: return 2; 920 case FORMAT_R5G6B5: return 2; 921 case FORMAT_A1R5G5B5: return 2; 922 case FORMAT_X1R5G5B5: return 2; 923 case FORMAT_X4R4G4B4: return 2; 924 case FORMAT_A4R4G4B4: return 2; 925 case FORMAT_R8G8B8: return 3; 926 case FORMAT_X8R8G8B8: return 4; 927 // case FORMAT_X8G8R8B8Q: return 4; 928 case FORMAT_A8R8G8B8: return 4; 929 // case FORMAT_A8G8R8B8Q: return 4; 930 case FORMAT_X8B8G8R8: return 4; 931 case FORMAT_A8B8G8R8: return 4; 932 case FORMAT_A2R10G10B10: return 4; 933 case FORMAT_A2B10G10R10: return 4; 934 case FORMAT_G8R8: return 2; 935 case FORMAT_G16R16: return 4; 936 case FORMAT_A16B16G16R16: return 8; 937 // Compressed formats 938 #if S3TC_SUPPORT 939 case FORMAT_DXT1: return 2; // Column of four pixels 940 case FORMAT_DXT3: return 4; // Column of four pixels 941 case FORMAT_DXT5: return 4; // Column of four pixels 942 case FORMAT_ATI1: return 2; // Column of four pixels 943 case FORMAT_ATI2: return 4; // Column of four pixels 944 #endif 945 // Bumpmap formats 946 case FORMAT_V8U8: return 2; 947 case FORMAT_L6V5U5: return 2; 948 case FORMAT_Q8W8V8U8: return 4; 949 case FORMAT_X8L8V8U8: return 4; 950 case FORMAT_A2W10V10U10: return 4; 951 case FORMAT_V16U16: return 4; 952 case FORMAT_A16W16V16U16: return 8; 953 case FORMAT_Q16W16V16U16: return 8; 954 // Luminance formats 955 case FORMAT_L8: return 1; 956 case FORMAT_A4L4: return 1; 957 case FORMAT_L16: return 2; 958 case FORMAT_A8L8: return 2; 959 // Floating-point formats 960 case FORMAT_R16F: return 2; 961 case FORMAT_G16R16F: return 4; 962 case FORMAT_A16B16G16R16F: return 8; 963 case FORMAT_R32F: return 4; 964 case FORMAT_G32R32F: return 8; 965 case FORMAT_A32B32G32R32F: return 16; 966 // Depth/stencil formats 967 case FORMAT_D16: return 2; 968 case FORMAT_D32: return 4; 969 case FORMAT_D24X8: return 4; 970 case FORMAT_D24S8: return 4; 971 case FORMAT_D24FS8: return 4; 972 case FORMAT_D32F: return 4; 973 case FORMAT_D32F_COMPLEMENTARY: return 4; 974 case FORMAT_D32F_LOCKABLE: return 4; 975 case FORMAT_D32F_TEXTURE: return 4; 976 case FORMAT_D32F_SHADOW: return 4; 977 case FORMAT_DF24: return 4; 978 case FORMAT_DF16: return 2; 979 case FORMAT_INTZ: return 4; 980 case FORMAT_S8: return 1; 981 default: 982 ASSERT(false); 983 } 984 985 return 0; 986 } 987 988 int Surface::pitchB(int width, Format format, bool target) 989 { 990 if(target || isDepth(format) || isStencil(format)) 991 { 992 width = ((width + 1) & ~1); 993 } 994 995 switch(format) 996 { 997 #if S3TC_SUPPORT 998 case FORMAT_DXT1: 999 return 8 * ((width + 3) / 4); // 64 bit per 4x4 block, computed per 4 rows 1000 case FORMAT_DXT3: 1001 case FORMAT_DXT5: 1002 return 16 * ((width + 3) / 4); // 128 bit per 4x4 block, computed per 4 rows 1003 case FORMAT_ATI1: 1004 return 2 * ((width + 3) / 4); // 64 bit per 4x4 block, computed per row 1005 case FORMAT_ATI2: 1006 return 4 * ((width + 3) / 4); // 128 bit per 4x4 block, computed per row 1007 #endif 1008 default: 1009 return bytes(format) * width; 1010 } 1011 } 1012 1013 int Surface::pitchP(int width, Format format, bool target) 1014 { 1015 int B = bytes(format); 1016 1017 return B > 0 ? pitchB(width, format, target) / B : 0; 1018 } 1019 1020 int Surface::sliceB(int width, int height, Format format, bool target) 1021 { 1022 if(target || isDepth(format) || isStencil(format)) 1023 { 1024 height = ((height + 1) & ~1); 1025 } 1026 1027 switch(format) 1028 { 1029 #if S3TC_SUPPORT 1030 case FORMAT_DXT1: 1031 case FORMAT_DXT3: 1032 case FORMAT_DXT5: 1033 return pitchB(width, format, target) * ((height + 3) / 4); // Pitch computed per 4 rows 1034 case FORMAT_ATI1: // Pitch computed per row 1035 case FORMAT_ATI2: // Pitch computed per row 1036 #endif 1037 default: 1038 return pitchB(width, format, target) * height; 1039 } 1040 } 1041 1042 int Surface::sliceP(int width, int height, Format format, bool target) 1043 { 1044 int B = bytes(format); 1045 1046 return B > 0 ? sliceB(width, height, format, target) / B : 0; 1047 } 1048 1049 void Surface::update(Buffer &destination, Buffer &source) 1050 { 1051 // ASSERT(source.lock != LOCK_UNLOCKED); 1052 // ASSERT(destination.lock != LOCK_UNLOCKED); 1053 1054 if(destination.buffer != source.buffer) 1055 { 1056 ASSERT(source.dirty && !destination.dirty); 1057 1058 switch(source.format) 1059 { 1060 case FORMAT_R8G8B8: decodeR8G8B8(destination, source); break; // FIXME: Check destination format 1061 case FORMAT_X8B8G8R8: decodeX8B8G8R8(destination, source); break; // FIXME: Check destination format 1062 case FORMAT_A8B8G8R8: decodeA8B8G8R8(destination, source); break; // FIXME: Check destination format 1063 case FORMAT_R5G6B5: decodeR5G6B5(destination, source); break; // FIXME: Check destination format 1064 case FORMAT_X1R5G5B5: decodeX1R5G5B5(destination, source); break; // FIXME: Check destination format 1065 case FORMAT_A1R5G5B5: decodeA1R5G5B5(destination, source); break; // FIXME: Check destination format 1066 case FORMAT_X4R4G4B4: decodeX4R4G4B4(destination, source); break; // FIXME: Check destination format 1067 case FORMAT_A4R4G4B4: decodeA4R4G4B4(destination, source); break; // FIXME: Check destination format 1068 case FORMAT_P8: decodeP8(destination, source); break; // FIXME: Check destination format 1069 #if S3TC_SUPPORT 1070 case FORMAT_DXT1: decodeDXT1(destination, source); break; // FIXME: Check destination format 1071 case FORMAT_DXT3: decodeDXT3(destination, source); break; // FIXME: Check destination format 1072 case FORMAT_DXT5: decodeDXT5(destination, source); break; // FIXME: Check destination format 1073 case FORMAT_ATI1: decodeATI1(destination, source); break; // FIXME: Check destination format 1074 case FORMAT_ATI2: decodeATI2(destination, source); break; // FIXME: Check destination format 1075 #endif 1076 default: genericUpdate(destination, source); break; 1077 } 1078 } 1079 1080 source.dirty = false; 1081 destination.paletteUsed = Surface::paletteID; 1082 } 1083 1084 void Surface::genericUpdate(Buffer &destination, Buffer &source) 1085 { 1086 unsigned char *sourceSlice = (unsigned char*)source.buffer; 1087 unsigned char *destinationSlice = (unsigned char*)destination.buffer; 1088 1089 int depth = min(destination.depth, source.depth); 1090 int height = min(destination.height, source.height); 1091 int width = min(destination.width, source.width); 1092 int rowBytes = width * source.bytes; 1093 1094 for(int z = 0; z < depth; z++) 1095 { 1096 unsigned char *sourceRow = sourceSlice; 1097 unsigned char *destinationRow = destinationSlice; 1098 1099 for(int y = 0; y < height; y++) 1100 { 1101 if(source.format == destination.format) 1102 { 1103 memcpy(destinationRow, sourceRow, rowBytes); 1104 } 1105 else 1106 { 1107 unsigned char *sourceElement = sourceRow; 1108 unsigned char *destinationElement = destinationRow; 1109 1110 for(int x = 0; x < width; x++) 1111 { 1112 Color<float> color = source.read(sourceElement); 1113 destination.write(destinationElement, color); 1114 1115 sourceElement += source.bytes; 1116 destinationElement += destination.bytes; 1117 } 1118 } 1119 1120 sourceRow += source.pitchB; 1121 destinationRow += destination.pitchB; 1122 } 1123 1124 sourceSlice += source.sliceB; 1125 destinationSlice += destination.sliceB; 1126 } 1127 } 1128 1129 void Surface::decodeR8G8B8(Buffer &destination, const Buffer &source) 1130 { 1131 unsigned char *sourceSlice = (unsigned char*)source.buffer; 1132 unsigned char *destinationSlice = (unsigned char*)destination.buffer; 1133 1134 for(int z = 0; z < destination.depth && z < source.depth; z++) 1135 { 1136 unsigned char *sourceRow = sourceSlice; 1137 unsigned char *destinationRow = destinationSlice; 1138 1139 for(int y = 0; y < destination.height && y < source.height; y++) 1140 { 1141 unsigned char *sourceElement = sourceRow; 1142 unsigned char *destinationElement = destinationRow; 1143 1144 for(int x = 0; x < destination.width && x < source.width; x++) 1145 { 1146 unsigned int b = sourceElement[0]; 1147 unsigned int g = sourceElement[1]; 1148 unsigned int r = sourceElement[2]; 1149 1150 *(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0); 1151 1152 sourceElement += source.bytes; 1153 destinationElement += destination.bytes; 1154 } 1155 1156 sourceRow += source.pitchB; 1157 destinationRow += destination.pitchB; 1158 } 1159 1160 sourceSlice += source.sliceB; 1161 destinationSlice += destination.sliceB; 1162 } 1163 } 1164 1165 void Surface::decodeX8B8G8R8(Buffer &destination, const Buffer &source) 1166 { 1167 unsigned char *sourceSlice = (unsigned char*)source.buffer; 1168 unsigned char *destinationSlice = (unsigned char*)destination.buffer; 1169 1170 for(int z = 0; z < destination.depth && z < source.depth; z++) 1171 { 1172 unsigned char *sourceRow = sourceSlice; 1173 unsigned char *destinationRow = destinationSlice; 1174 1175 for(int y = 0; y < destination.height && y < source.height; y++) 1176 { 1177 unsigned char *sourceElement = sourceRow; 1178 unsigned char *destinationElement = destinationRow; 1179 1180 for(int x = 0; x < destination.width && x < source.width; x++) 1181 { 1182 unsigned int r = sourceElement[0]; 1183 unsigned int g = sourceElement[1]; 1184 unsigned int b = sourceElement[2]; 1185 1186 *(unsigned int*)destinationElement = 0xFF000000 | (r << 16) | (g << 8) | (b << 0); 1187 1188 sourceElement += source.bytes; 1189 destinationElement += destination.bytes; 1190 } 1191 1192 sourceRow += source.pitchB; 1193 destinationRow += destination.pitchB; 1194 } 1195 1196 sourceSlice += source.sliceB; 1197 destinationSlice += destination.sliceB; 1198 } 1199 } 1200 1201 void Surface::decodeA8B8G8R8(Buffer &destination, const Buffer &source) 1202 { 1203 unsigned char *sourceSlice = (unsigned char*)source.buffer; 1204 unsigned char *destinationSlice = (unsigned char*)destination.buffer; 1205 1206 for(int z = 0; z < destination.depth && z < source.depth; z++) 1207 { 1208 unsigned char *sourceRow = sourceSlice; 1209 unsigned char *destinationRow = destinationSlice; 1210 1211 for(int y = 0; y < destination.height && y < source.height; y++) 1212 { 1213 unsigned char *sourceElement = sourceRow; 1214 unsigned char *destinationElement = destinationRow; 1215 1216 for(int x = 0; x < destination.width && x < source.width; x++) 1217 { 1218 unsigned int r = sourceElement[0]; 1219 unsigned int g = sourceElement[1]; 1220 unsigned int b = sourceElement[2]; 1221 unsigned int a = sourceElement[3]; 1222 1223 *(unsigned int*)destinationElement = (a << 24) | (r << 16) | (g << 8) | (b << 0); 1224 1225 sourceElement += source.bytes; 1226 destinationElement += destination.bytes; 1227 } 1228 1229 sourceRow += source.pitchB; 1230 destinationRow += destination.pitchB; 1231 } 1232 1233 sourceSlice += source.sliceB; 1234 destinationSlice += destination.sliceB; 1235 } 1236 } 1237 1238 void Surface::decodeR5G6B5(Buffer &destination, const Buffer &source) 1239 { 1240 unsigned char *sourceSlice = (unsigned char*)source.buffer; 1241 unsigned char *destinationSlice = (unsigned char*)destination.buffer; 1242 1243 for(int z = 0; z < destination.depth && z < source.depth; z++) 1244 { 1245 unsigned char *sourceRow = sourceSlice; 1246 unsigned char *destinationRow = destinationSlice; 1247 1248 for(int y = 0; y < destination.height && y < source.height; y++) 1249 { 1250 unsigned char *sourceElement = sourceRow; 1251 unsigned char *destinationElement = destinationRow; 1252 1253 for(int x = 0; x < destination.width && x < source.width; x++) 1254 { 1255 unsigned int rgb = *(unsigned short*)sourceElement; 1256 1257 unsigned int r = (((rgb & 0xF800) * 67385 + 0x800000) >> 8) & 0x00FF0000; 1258 unsigned int g = (((rgb & 0x07E0) * 8289 + 0x8000) >> 8) & 0x0000FF00; 1259 unsigned int b = (((rgb & 0x001F) * 2106 + 0x80) >> 8); 1260 1261 *(unsigned int*)destinationElement = 0xFF000000 | r | g | b; 1262 1263 sourceElement += source.bytes; 1264 destinationElement += destination.bytes; 1265 } 1266 1267 sourceRow += source.pitchB; 1268 destinationRow += destination.pitchB; 1269 } 1270 1271 sourceSlice += source.sliceB; 1272 destinationSlice += destination.sliceB; 1273 } 1274 } 1275 1276 void Surface::decodeX1R5G5B5(Buffer &destination, const Buffer &source) 1277 { 1278 unsigned char *sourceSlice = (unsigned char*)source.buffer; 1279 unsigned char *destinationSlice = (unsigned char*)destination.buffer; 1280 1281 for(int z = 0; z < destination.depth && z < source.depth; z++) 1282 { 1283 unsigned char *sourceRow = sourceSlice; 1284 unsigned char *destinationRow = destinationSlice; 1285 1286 for(int y = 0; y < destination.height && y < source.height; y++) 1287 { 1288 unsigned char *sourceElement = sourceRow; 1289 unsigned char *destinationElement = destinationRow; 1290 1291 for(int x = 0; x < destination.width && x < source.width; x++) 1292 { 1293 unsigned int xrgb = *(unsigned short*)sourceElement; 1294 1295 unsigned int r = (((xrgb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000; 1296 unsigned int g = (((xrgb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00; 1297 unsigned int b = (((xrgb & 0x001F) * 2106 + 0x80) >> 8); 1298 1299 *(unsigned int*)destinationElement = 0xFF000000 | r | g | b; 1300 1301 sourceElement += source.bytes; 1302 destinationElement += destination.bytes; 1303 } 1304 1305 sourceRow += source.pitchB; 1306 destinationRow += destination.pitchB; 1307 } 1308 1309 sourceSlice += source.sliceB; 1310 destinationSlice += destination.sliceB; 1311 } 1312 } 1313 1314 void Surface::decodeA1R5G5B5(Buffer &destination, const Buffer &source) 1315 { 1316 unsigned char *sourceSlice = (unsigned char*)source.buffer; 1317 unsigned char *destinationSlice = (unsigned char*)destination.buffer; 1318 1319 for(int z = 0; z < destination.depth && z < source.depth; z++) 1320 { 1321 unsigned char *sourceRow = sourceSlice; 1322 unsigned char *destinationRow = destinationSlice; 1323 1324 for(int y = 0; y < destination.height && y < source.height; y++) 1325 { 1326 unsigned char *sourceElement = sourceRow; 1327 unsigned char *destinationElement = destinationRow; 1328 1329 for(int x = 0; x < destination.width && x < source.width; x++) 1330 { 1331 unsigned int argb = *(unsigned short*)sourceElement; 1332 1333 unsigned int a = (argb & 0x8000) * 130560; 1334 unsigned int r = (((argb & 0x7C00) * 134771 + 0x800000) >> 8) & 0x00FF0000; 1335 unsigned int g = (((argb & 0x03E0) * 16846 + 0x8000) >> 8) & 0x0000FF00; 1336 unsigned int b = (((argb & 0x001F) * 2106 + 0x80) >> 8); 1337 1338 *(unsigned int*)destinationElement = a | r | g | b; 1339 1340 sourceElement += source.bytes; 1341 destinationElement += destination.bytes; 1342 } 1343 1344 sourceRow += source.pitchB; 1345 destinationRow += destination.pitchB; 1346 } 1347 1348 sourceSlice += source.sliceB; 1349 destinationSlice += destination.sliceB; 1350 } 1351 } 1352 1353 void Surface::decodeX4R4G4B4(Buffer &destination, const Buffer &source) 1354 { 1355 unsigned char *sourceSlice = (unsigned char*)source.buffer; 1356 unsigned char *destinationSlice = (unsigned char*)destination.buffer; 1357 1358 for(int z = 0; z < destination.depth && z < source.depth; z++) 1359 { 1360 unsigned char *sourceRow = sourceSlice; 1361 unsigned char *destinationRow = destinationSlice; 1362 1363 for(int y = 0; y < destination.height && y < source.height; y++) 1364 { 1365 unsigned char *sourceElement = sourceRow; 1366 unsigned char *destinationElement = destinationRow; 1367 1368 for(int x = 0; x < destination.width && x < source.width; x++) 1369 { 1370 unsigned int xrgb = *(unsigned short*)sourceElement; 1371 1372 unsigned int r = ((xrgb & 0x0F00) * 0x00001100) & 0x00FF0000; 1373 unsigned int g = ((xrgb & 0x00F0) * 0x00000110) & 0x0000FF00; 1374 unsigned int b = (xrgb & 0x000F) * 0x00000011; 1375 1376 *(unsigned int*)destinationElement = 0xFF000000 | r | g | b; 1377 1378 sourceElement += source.bytes; 1379 destinationElement += destination.bytes; 1380 } 1381 1382 sourceRow += source.pitchB; 1383 destinationRow += destination.pitchB; 1384 } 1385 1386 sourceSlice += source.sliceB; 1387 destinationSlice += destination.sliceB; 1388 } 1389 } 1390 1391 void Surface::decodeA4R4G4B4(Buffer &destination, const Buffer &source) 1392 { 1393 unsigned char *sourceSlice = (unsigned char*)source.buffer; 1394 unsigned char *destinationSlice = (unsigned char*)destination.buffer; 1395 1396 for(int z = 0; z < destination.depth && z < source.depth; z++) 1397 { 1398 unsigned char *sourceRow = sourceSlice; 1399 unsigned char *destinationRow = destinationSlice; 1400 1401 for(int y = 0; y < destination.height && y < source.height; y++) 1402 { 1403 unsigned char *sourceElement = sourceRow; 1404 unsigned char *destinationElement = destinationRow; 1405 1406 for(int x = 0; x < destination.width && x < source.width; x++) 1407 { 1408 unsigned int argb = *(unsigned short*)sourceElement; 1409 1410 unsigned int a = ((argb & 0xF000) * 0x00011000) & 0xFF000000; 1411 unsigned int r = ((argb & 0x0F00) * 0x00001100) & 0x00FF0000; 1412 unsigned int g = ((argb & 0x00F0) * 0x00000110) & 0x0000FF00; 1413 unsigned int b = (argb & 0x000F) * 0x00000011; 1414 1415 *(unsigned int*)destinationElement = a | r | g | b; 1416 1417 sourceElement += source.bytes; 1418 destinationElement += destination.bytes; 1419 } 1420 1421 sourceRow += source.pitchB; 1422 destinationRow += destination.pitchB; 1423 } 1424 1425 sourceSlice += source.sliceB; 1426 destinationSlice += destination.sliceB; 1427 } 1428 } 1429 1430 void Surface::decodeP8(Buffer &destination, const Buffer &source) 1431 { 1432 unsigned char *sourceSlice = (unsigned char*)source.buffer; 1433 unsigned char *destinationSlice = (unsigned char*)destination.buffer; 1434 1435 for(int z = 0; z < destination.depth && z < source.depth; z++) 1436 { 1437 unsigned char *sourceRow = sourceSlice; 1438 unsigned char *destinationRow = destinationSlice; 1439 1440 for(int y = 0; y < destination.height && y < source.height; y++) 1441 { 1442 unsigned char *sourceElement = sourceRow; 1443 unsigned char *destinationElement = destinationRow; 1444 1445 for(int x = 0; x < destination.width && x < source.width; x++) 1446 { 1447 unsigned int abgr = palette[*(unsigned char*)sourceElement]; 1448 1449 unsigned int r = (abgr & 0x000000FF) << 16; 1450 unsigned int g = (abgr & 0x0000FF00) << 0; 1451 unsigned int b = (abgr & 0x00FF0000) >> 16; 1452 unsigned int a = (abgr & 0xFF000000) >> 0; 1453 1454 *(unsigned int*)destinationElement = a | r | g | b; 1455 1456 sourceElement += source.bytes; 1457 destinationElement += destination.bytes; 1458 } 1459 1460 sourceRow += source.pitchB; 1461 destinationRow += destination.pitchB; 1462 } 1463 1464 sourceSlice += source.sliceB; 1465 destinationSlice += destination.sliceB; 1466 } 1467 } 1468 1469#if S3TC_SUPPORT 1470 void Surface::decodeDXT1(Buffer &internal, const Buffer &external) 1471 { 1472 unsigned int *destSlice = (unsigned int*)internal.buffer; 1473 DXT1 *source = (DXT1*)external.buffer; 1474 1475 for(int z = 0; z < external.depth; z++) 1476 { 1477 unsigned int *dest = destSlice; 1478 1479 for(int y = 0; y < external.height; y += 4) 1480 { 1481 for(int x = 0; x < external.width; x += 4) 1482 { 1483 Color<byte> c[4]; 1484 1485 c[0] = source->c0; 1486 c[1] = source->c1; 1487 1488 if(source->c0 > source->c1) // No transparency 1489 { 1490 // c2 = 2 / 3 * c0 + 1 / 3 * c1 1491 c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3); 1492 c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3); 1493 c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3); 1494 c[2].a = 0xFF; 1495 1496 // c3 = 1 / 3 * c0 + 2 / 3 * c1 1497 c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3); 1498 c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3); 1499 c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3); 1500 c[3].a = 0xFF; 1501 } 1502 else // c3 transparent 1503 { 1504 // c2 = 1 / 2 * c0 + 1 / 2 * c1 1505 c[2].r = (byte)(((word)c[0].r + (word)c[1].r) / 2); 1506 c[2].g = (byte)(((word)c[0].g + (word)c[1].g) / 2); 1507 c[2].b = (byte)(((word)c[0].b + (word)c[1].b) / 2); 1508 c[2].a = 0xFF; 1509 1510 c[3].r = 0; 1511 c[3].g = 0; 1512 c[3].b = 0; 1513 c[3].a = 0; 1514 } 1515 1516 for(int j = 0; j < 4 && (y + j) < internal.height; j++) 1517 { 1518 for(int i = 0; i < 4 && (x + i) < internal.width; i++) 1519 { 1520 dest[(x + i) + (y + j) * internal.width] = c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4]; 1521 } 1522 } 1523 1524 source++; 1525 } 1526 } 1527 1528 (byte*&)destSlice += internal.sliceB; 1529 } 1530 } 1531 1532 void Surface::decodeDXT3(Buffer &internal, const Buffer &external) 1533 { 1534 unsigned int *destSlice = (unsigned int*)internal.buffer; 1535 DXT3 *source = (DXT3*)external.buffer; 1536 1537 for(int z = 0; z < external.depth; z++) 1538 { 1539 unsigned int *dest = destSlice; 1540 1541 for(int y = 0; y < external.height; y += 4) 1542 { 1543 for(int x = 0; x < external.width; x += 4) 1544 { 1545 Color<byte> c[4]; 1546 1547 c[0] = source->c0; 1548 c[1] = source->c1; 1549 1550 // c2 = 2 / 3 * c0 + 1 / 3 * c1 1551 c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3); 1552 c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3); 1553 c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3); 1554 1555 // c3 = 1 / 3 * c0 + 2 / 3 * c1 1556 c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3); 1557 c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3); 1558 c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3); 1559 1560 for(int j = 0; j < 4 && (y + j) < internal.height; j++) 1561 { 1562 for(int i = 0; i < 4 && (x + i) < internal.width; i++) 1563 { 1564 unsigned int a = (unsigned int)(source->a >> 4 * (i + j * 4)) & 0x0F; 1565 unsigned int color = (c[(unsigned int)(source->lut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | ((a << 28) + (a << 24)); 1566 1567 dest[(x + i) + (y + j) * internal.width] = color; 1568 } 1569 } 1570 1571 source++; 1572 } 1573 } 1574 1575 (byte*&)destSlice += internal.sliceB; 1576 } 1577 } 1578 1579 void Surface::decodeDXT5(Buffer &internal, const Buffer &external) 1580 { 1581 unsigned int *destSlice = (unsigned int*)internal.buffer; 1582 DXT5 *source = (DXT5*)external.buffer; 1583 1584 for(int z = 0; z < external.depth; z++) 1585 { 1586 unsigned int *dest = destSlice; 1587 1588 for(int y = 0; y < external.height; y += 4) 1589 { 1590 for(int x = 0; x < external.width; x += 4) 1591 { 1592 Color<byte> c[4]; 1593 1594 c[0] = source->c0; 1595 c[1] = source->c1; 1596 1597 // c2 = 2 / 3 * c0 + 1 / 3 * c1 1598 c[2].r = (byte)((2 * (word)c[0].r + (word)c[1].r + 1) / 3); 1599 c[2].g = (byte)((2 * (word)c[0].g + (word)c[1].g + 1) / 3); 1600 c[2].b = (byte)((2 * (word)c[0].b + (word)c[1].b + 1) / 3); 1601 1602 // c3 = 1 / 3 * c0 + 2 / 3 * c1 1603 c[3].r = (byte)(((word)c[0].r + 2 * (word)c[1].r + 1) / 3); 1604 c[3].g = (byte)(((word)c[0].g + 2 * (word)c[1].g + 1) / 3); 1605 c[3].b = (byte)(((word)c[0].b + 2 * (word)c[1].b + 1) / 3); 1606 1607 byte a[8]; 1608 1609 a[0] = source->a0; 1610 a[1] = source->a1; 1611 1612 if(a[0] > a[1]) 1613 { 1614 a[2] = (byte)((6 * (word)a[0] + 1 * (word)a[1] + 3) / 7); 1615 a[3] = (byte)((5 * (word)a[0] + 2 * (word)a[1] + 3) / 7); 1616 a[4] = (byte)((4 * (word)a[0] + 3 * (word)a[1] + 3) / 7); 1617 a[5] = (byte)((3 * (word)a[0] + 4 * (word)a[1] + 3) / 7); 1618 a[6] = (byte)((2 * (word)a[0] + 5 * (word)a[1] + 3) / 7); 1619 a[7] = (byte)((1 * (word)a[0] + 6 * (word)a[1] + 3) / 7); 1620 } 1621 else 1622 { 1623 a[2] = (byte)((4 * (word)a[0] + 1 * (word)a[1] + 2) / 5); 1624 a[3] = (byte)((3 * (word)a[0] + 2 * (word)a[1] + 2) / 5); 1625 a[4] = (byte)((2 * (word)a[0] + 3 * (word)a[1] + 2) / 5); 1626 a[5] = (byte)((1 * (word)a[0] + 4 * (word)a[1] + 2) / 5); 1627 a[6] = 0; 1628 a[7] = 0xFF; 1629 } 1630 1631 for(int j = 0; j < 4 && (y + j) < internal.height; j++) 1632 { 1633 for(int i = 0; i < 4 && (x + i) < internal.width; i++) 1634 { 1635 unsigned int alpha = (unsigned int)a[(unsigned int)(source->alut >> (16 + 3 * (i + j * 4))) % 8] << 24; 1636 unsigned int color = (c[(source->clut >> 2 * (i + j * 4)) % 4] & 0x00FFFFFF) | alpha; 1637 1638 dest[(x + i) + (y + j) * internal.width] = color; 1639 } 1640 } 1641 1642 source++; 1643 } 1644 } 1645 1646 (byte*&)destSlice += internal.sliceB; 1647 } 1648 } 1649 1650 void Surface::decodeATI1(Buffer &internal, const Buffer &external) 1651 { 1652 byte *destSlice = (byte*)internal.buffer; 1653 ATI1 *source = (ATI1*)external.buffer; 1654 1655 for(int z = 0; z < external.depth; z++) 1656 { 1657 byte *dest = destSlice; 1658 1659 for(int y = 0; y < external.height; y += 4) 1660 { 1661 for(int x = 0; x < external.width; x += 4) 1662 { 1663 byte r[8]; 1664 1665 r[0] = source->r0; 1666 r[1] = source->r1; 1667 1668 if(r[0] > r[1]) 1669 { 1670 r[2] = (byte)((6 * (word)r[0] + 1 * (word)r[1] + 3) / 7); 1671 r[3] = (byte)((5 * (word)r[0] + 2 * (word)r[1] + 3) / 7); 1672 r[4] = (byte)((4 * (word)r[0] + 3 * (word)r[1] + 3) / 7); 1673 r[5] = (byte)((3 * (word)r[0] + 4 * (word)r[1] + 3) / 7); 1674 r[6] = (byte)((2 * (word)r[0] + 5 * (word)r[1] + 3) / 7); 1675 r[7] = (byte)((1 * (word)r[0] + 6 * (word)r[1] + 3) / 7); 1676 } 1677 else 1678 { 1679 r[2] = (byte)((4 * (word)r[0] + 1 * (word)r[1] + 2) / 5); 1680 r[3] = (byte)((3 * (word)r[0] + 2 * (word)r[1] + 2) / 5); 1681 r[4] = (byte)((2 * (word)r[0] + 3 * (word)r[1] + 2) / 5); 1682 r[5] = (byte)((1 * (word)r[0] + 4 * (word)r[1] + 2) / 5); 1683 r[6] = 0; 1684 r[7] = 0xFF; 1685 } 1686 1687 for(int j = 0; j < 4 && (y + j) < internal.height; j++) 1688 { 1689 for(int i = 0; i < 4 && (x + i) < internal.width; i++) 1690 { 1691 dest[(x + i) + (y + j) * internal.width] = r[(unsigned int)(source->rlut >> (16 + 3 * (i + j * 4))) % 8]; 1692 } 1693 } 1694 1695 source++; 1696 } 1697 } 1698 1699 destSlice += internal.sliceB; 1700 } 1701 } 1702 1703 void Surface::decodeATI2(Buffer &internal, const Buffer &external) 1704 { 1705 word *destSlice = (word*)internal.buffer; 1706 ATI2 *source = (ATI2*)external.buffer; 1707 1708 for(int z = 0; z < external.depth; z++) 1709 { 1710 word *dest = destSlice; 1711 1712 for(int y = 0; y < external.height; y += 4) 1713 { 1714 for(int x = 0; x < external.width; x += 4) 1715 { 1716 byte X[8]; 1717 1718 X[0] = source->x0; 1719 X[1] = source->x1; 1720 1721 if(X[0] > X[1]) 1722 { 1723 X[2] = (byte)((6 * (word)X[0] + 1 * (word)X[1] + 3) / 7); 1724 X[3] = (byte)((5 * (word)X[0] + 2 * (word)X[1] + 3) / 7); 1725 X[4] = (byte)((4 * (word)X[0] + 3 * (word)X[1] + 3) / 7); 1726 X[5] = (byte)((3 * (word)X[0] + 4 * (word)X[1] + 3) / 7); 1727 X[6] = (byte)((2 * (word)X[0] + 5 * (word)X[1] + 3) / 7); 1728 X[7] = (byte)((1 * (word)X[0] + 6 * (word)X[1] + 3) / 7); 1729 } 1730 else 1731 { 1732 X[2] = (byte)((4 * (word)X[0] + 1 * (word)X[1] + 2) / 5); 1733 X[3] = (byte)((3 * (word)X[0] + 2 * (word)X[1] + 2) / 5); 1734 X[4] = (byte)((2 * (word)X[0] + 3 * (word)X[1] + 2) / 5); 1735 X[5] = (byte)((1 * (word)X[0] + 4 * (word)X[1] + 2) / 5); 1736 X[6] = 0; 1737 X[7] = 0xFF; 1738 } 1739 1740 byte Y[8]; 1741 1742 Y[0] = source->y0; 1743 Y[1] = source->y1; 1744 1745 if(Y[0] > Y[1]) 1746 { 1747 Y[2] = (byte)((6 * (word)Y[0] + 1 * (word)Y[1] + 3) / 7); 1748 Y[3] = (byte)((5 * (word)Y[0] + 2 * (word)Y[1] + 3) / 7); 1749 Y[4] = (byte)((4 * (word)Y[0] + 3 * (word)Y[1] + 3) / 7); 1750 Y[5] = (byte)((3 * (word)Y[0] + 4 * (word)Y[1] + 3) / 7); 1751 Y[6] = (byte)((2 * (word)Y[0] + 5 * (word)Y[1] + 3) / 7); 1752 Y[7] = (byte)((1 * (word)Y[0] + 6 * (word)Y[1] + 3) / 7); 1753 } 1754 else 1755 { 1756 Y[2] = (byte)((4 * (word)Y[0] + 1 * (word)Y[1] + 2) / 5); 1757 Y[3] = (byte)((3 * (word)Y[0] + 2 * (word)Y[1] + 2) / 5); 1758 Y[4] = (byte)((2 * (word)Y[0] + 3 * (word)Y[1] + 2) / 5); 1759 Y[5] = (byte)((1 * (word)Y[0] + 4 * (word)Y[1] + 2) / 5); 1760 Y[6] = 0; 1761 Y[7] = 0xFF; 1762 } 1763 1764 for(int j = 0; j < 4 && (y + j) < internal.height; j++) 1765 { 1766 for(int i = 0; i < 4 && (x + i) < internal.width; i++) 1767 { 1768 word r = X[(unsigned int)(source->xlut >> (16 + 3 * (i + j * 4))) % 8]; 1769 word g = Y[(unsigned int)(source->ylut >> (16 + 3 * (i + j * 4))) % 8]; 1770 1771 dest[(x + i) + (y + j) * internal.width] = (g << 8) + r; 1772 } 1773 } 1774 1775 source++; 1776 } 1777 } 1778 1779 (byte*&)destSlice += internal.sliceB; 1780 } 1781 } 1782#endif 1783 1784 unsigned int Surface::size(int width, int height, int depth, Format format) 1785 { 1786 // Dimensions rounded up to multiples of 4, used for DXTC formats 1787 int width4 = (width + 3) & ~3; 1788 int height4 = (height + 3) & ~3; 1789 1790 switch(format) 1791 { 1792 #if S3TC_SUPPORT 1793 case FORMAT_DXT1: 1794 case FORMAT_ATI1: 1795 return width4 * height4 * depth / 2; 1796 case FORMAT_DXT3: 1797 case FORMAT_DXT5: 1798 case FORMAT_ATI2: 1799 return width4 * height4 * depth; 1800 #endif 1801 default: 1802 return bytes(format) * width * height * depth; 1803 } 1804 1805 return 0; 1806 } 1807 1808 bool Surface::isStencil(Format format) 1809 { 1810 switch(format) 1811 { 1812 case FORMAT_D32: 1813 case FORMAT_D16: 1814 case FORMAT_D24X8: 1815 case FORMAT_D32F: 1816 case FORMAT_D32F_COMPLEMENTARY: 1817 case FORMAT_D32F_LOCKABLE: 1818 return false; 1819 case FORMAT_D24S8: 1820 case FORMAT_D24FS8: 1821 case FORMAT_S8: 1822 case FORMAT_DF24: 1823 case FORMAT_DF16: 1824 case FORMAT_D32F_TEXTURE: 1825 case FORMAT_D32F_SHADOW: 1826 case FORMAT_INTZ: 1827 return true; 1828 default: 1829 return false; 1830 } 1831 } 1832 1833 bool Surface::isDepth(Format format) 1834 { 1835 switch(format) 1836 { 1837 case FORMAT_D32: 1838 case FORMAT_D16: 1839 case FORMAT_D24X8: 1840 case FORMAT_D24S8: 1841 case FORMAT_D24FS8: 1842 case FORMAT_D32F: 1843 case FORMAT_D32F_COMPLEMENTARY: 1844 case FORMAT_D32F_LOCKABLE: 1845 case FORMAT_DF24: 1846 case FORMAT_DF16: 1847 case FORMAT_D32F_TEXTURE: 1848 case FORMAT_D32F_SHADOW: 1849 case FORMAT_INTZ: 1850 return true; 1851 case FORMAT_S8: 1852 return false; 1853 default: 1854 return false; 1855 } 1856 } 1857 1858 bool Surface::isPalette(Format format) 1859 { 1860 switch(format) 1861 { 1862 case FORMAT_P8: 1863 case FORMAT_A8P8: 1864 return true; 1865 default: 1866 return false; 1867 } 1868 } 1869 1870 bool Surface::isFloatFormat(Format format) 1871 { 1872 switch(format) 1873 { 1874 case FORMAT_X8R8G8B8: 1875 case FORMAT_A8R8G8B8: 1876 case FORMAT_G8R8: 1877 case FORMAT_G16R16: 1878 case FORMAT_A16B16G16R16: 1879 case FORMAT_V8U8: 1880 case FORMAT_Q8W8V8U8: 1881 case FORMAT_X8L8V8U8: 1882 case FORMAT_V16U16: 1883 case FORMAT_A16W16V16U16: 1884 case FORMAT_Q16W16V16U16: 1885 case FORMAT_A8: 1886 case FORMAT_R8: 1887 case FORMAT_L8: 1888 case FORMAT_L16: 1889 case FORMAT_A8L8: 1890 return false; 1891 case FORMAT_R32F: 1892 case FORMAT_G32R32F: 1893 case FORMAT_A32B32G32R32F: 1894 case FORMAT_D32F: 1895 case FORMAT_D32F_COMPLEMENTARY: 1896 case FORMAT_D32F_LOCKABLE: 1897 case FORMAT_D32F_TEXTURE: 1898 case FORMAT_D32F_SHADOW: 1899 return true; 1900 default: 1901 ASSERT(false); 1902 } 1903 1904 return false; 1905 } 1906 1907 bool Surface::isUnsignedComponent(Format format, int component) 1908 { 1909 switch(format) 1910 { 1911 case FORMAT_NULL: 1912 case FORMAT_X8R8G8B8: 1913 case FORMAT_A8R8G8B8: 1914 case FORMAT_G8R8: 1915 case FORMAT_G16R16: 1916 case FORMAT_A16B16G16R16: 1917 case FORMAT_D32F: 1918 case FORMAT_D32F_COMPLEMENTARY: 1919 case FORMAT_D32F_LOCKABLE: 1920 case FORMAT_D32F_TEXTURE: 1921 case FORMAT_D32F_SHADOW: 1922 case FORMAT_A8: 1923 case FORMAT_R8: 1924 case FORMAT_L8: 1925 case FORMAT_L16: 1926 case FORMAT_A8L8: 1927 return true; 1928 case FORMAT_V8U8: 1929 case FORMAT_X8L8V8U8: 1930 case FORMAT_V16U16: 1931 if(component < 2) 1932 { 1933 return false; 1934 } 1935 else 1936 { 1937 return true; 1938 } 1939 case FORMAT_A16W16V16U16: 1940 if(component < 3) 1941 { 1942 return false; 1943 } 1944 else 1945 { 1946 return true; 1947 } 1948 case FORMAT_Q8W8V8U8: 1949 case FORMAT_Q16W16V16U16: 1950 return false; 1951 case FORMAT_R32F: 1952 if(component < 1) 1953 { 1954 return false; 1955 } 1956 else 1957 { 1958 return true; 1959 } 1960 case FORMAT_G32R32F: 1961 if(component < 2) 1962 { 1963 return false; 1964 } 1965 else 1966 { 1967 return true; 1968 } 1969 case FORMAT_A32B32G32R32F: 1970 return false; 1971 default: 1972 ASSERT(false); 1973 } 1974 1975 return false; 1976 } 1977 1978 bool Surface::isSRGBreadable(Format format) 1979 { 1980 // Keep in sync with Capabilities::isSRGBreadable 1981 switch(format) 1982 { 1983 case FORMAT_L8: 1984 case FORMAT_A8L8: 1985 case FORMAT_R8G8B8: 1986 case FORMAT_A8R8G8B8: 1987 case FORMAT_X8R8G8B8: 1988 case FORMAT_A8B8G8R8: 1989 case FORMAT_X8B8G8R8: 1990 case FORMAT_R5G6B5: 1991 case FORMAT_X1R5G5B5: 1992 case FORMAT_A1R5G5B5: 1993 case FORMAT_A4R4G4B4: 1994 #if S3TC_SUPPORT 1995 case FORMAT_DXT1: 1996 case FORMAT_DXT3: 1997 case FORMAT_DXT5: 1998 case FORMAT_ATI1: 1999 case FORMAT_ATI2: 2000 #endif 2001 return true; 2002 default: 2003 return false; 2004 } 2005 2006 return false; 2007 } 2008 2009 bool Surface::isSRGBwritable(Format format) 2010 { 2011 // Keep in sync with Capabilities::isSRGBwritable 2012 switch(format) 2013 { 2014 case FORMAT_NULL: 2015 case FORMAT_A8R8G8B8: 2016 case FORMAT_X8R8G8B8: 2017 case FORMAT_A8B8G8R8: 2018 case FORMAT_X8B8G8R8: 2019 case FORMAT_R5G6B5: 2020 return true; 2021 default: 2022 return false; 2023 } 2024 } 2025 2026 bool Surface::isCompressed(Format format) 2027 { 2028 switch(format) 2029 { 2030 #if S3TC_SUPPORT 2031 case FORMAT_DXT1: 2032 case FORMAT_DXT3: 2033 case FORMAT_DXT5: 2034 case FORMAT_ATI1: 2035 case FORMAT_ATI2: 2036 return true; 2037 #endif 2038 default: 2039 return false; 2040 } 2041 } 2042 2043 int Surface::componentCount(Format format) 2044 { 2045 switch(format) 2046 { 2047 case FORMAT_X8R8G8B8: return 3; 2048 case FORMAT_A8R8G8B8: return 4; 2049 case FORMAT_G8R8: return 2; 2050 case FORMAT_G16R16: return 2; 2051 case FORMAT_A16B16G16R16: return 4; 2052 case FORMAT_V8U8: return 2; 2053 case FORMAT_Q8W8V8U8: return 4; 2054 case FORMAT_X8L8V8U8: return 3; 2055 case FORMAT_V16U16: return 2; 2056 case FORMAT_A16W16V16U16: return 4; 2057 case FORMAT_Q16W16V16U16: return 4; 2058 case FORMAT_R32F: return 1; 2059 case FORMAT_G32R32F: return 2; 2060 case FORMAT_A32B32G32R32F: return 4; 2061 case FORMAT_D32F_LOCKABLE: return 1; 2062 case FORMAT_D32F_TEXTURE: return 1; 2063 case FORMAT_D32F_SHADOW: return 1; 2064 case FORMAT_A8: return 1; 2065 case FORMAT_R8: return 1; 2066 case FORMAT_L8: return 1; 2067 case FORMAT_L16: return 1; 2068 case FORMAT_A8L8: return 2; 2069 default: 2070 ASSERT(false); 2071 } 2072 2073 return 1; 2074 } 2075 2076 void *Surface::allocateBuffer(int width, int height, int depth, Format format) 2077 { 2078 int width4 = (width + 3) & ~3; 2079 int height4 = (height + 3) & ~3; 2080 2081 return allocate(size(width4, height4, depth, format)); 2082 } 2083 2084 void Surface::memfill(void *buffer, int pattern, int bytes) 2085 { 2086 while((size_t)buffer & 0x1 && bytes >= 1) 2087 { 2088 *(char*)buffer = (char)pattern; 2089 (char*&)buffer += 1; 2090 bytes -= 1; 2091 } 2092 2093 while((size_t)buffer & 0x3 && bytes >= 2) 2094 { 2095 *(short*)buffer = (short)pattern; 2096 (short*&)buffer += 1; 2097 bytes -= 2; 2098 } 2099 2100 if(CPUID::supportsSSE()) 2101 { 2102 while((size_t)buffer & 0xF && bytes >= 4) 2103 { 2104 *(int*)buffer = pattern; 2105 (int*&)buffer += 1; 2106 bytes -= 4; 2107 } 2108 2109 __m128 quad = _mm_set_ps1((float&)pattern); 2110 2111 float *pointer = (float*)buffer; 2112 int qxwords = bytes / 64; 2113 bytes -= qxwords * 64; 2114 2115 while(qxwords--) 2116 { 2117 _mm_stream_ps(pointer + 0, quad); 2118 _mm_stream_ps(pointer + 4, quad); 2119 _mm_stream_ps(pointer + 8, quad); 2120 _mm_stream_ps(pointer + 12, quad); 2121 2122 pointer += 16; 2123 } 2124 2125 buffer = pointer; 2126 } 2127 2128 while(bytes >= 4) 2129 { 2130 *(int*)buffer = (int)pattern; 2131 (int*&)buffer += 1; 2132 bytes -= 4; 2133 } 2134 2135 while(bytes >= 2) 2136 { 2137 *(short*)buffer = (short)pattern; 2138 (short*&)buffer += 1; 2139 bytes -= 2; 2140 } 2141 2142 while(bytes >= 1) 2143 { 2144 *(char*)buffer = (char)pattern; 2145 (char*&)buffer += 1; 2146 bytes -= 1; 2147 } 2148 } 2149 2150 void Surface::clearColorBuffer(unsigned int color, unsigned int rgbaMask, int x0, int y0, int width, int height) 2151 { 2152 // FIXME: Also clear buffers in other formats? 2153 2154 // Not overlapping 2155 if(x0 > internal.width) return; 2156 if(y0 > internal.height) return; 2157 if(x0 + width < 0) return; 2158 if(y0 + height < 0) return; 2159 2160 // Clip against dimensions 2161 if(x0 < 0) {width += x0; x0 = 0;} 2162 if(x0 + width > internal.width) width = internal.width - x0; 2163 if(y0 < 0) {height += y0; y0 = 0;} 2164 if(y0 + height > internal.height) height = internal.height - y0; 2165 2166 const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height; 2167 const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY; 2168 2169 int width2 = (internal.width + 1) & ~1; 2170 2171 int x1 = x0 + width; 2172 int y1 = y0 + height; 2173 2174 int bytes = 4 * (x1 - x0); 2175 2176 // if(lockable || !quadLayoutEnabled) 2177 { 2178 unsigned char *buffer = (unsigned char*)lockInternal(x0, y0, 0, lock, PUBLIC); 2179 2180 unsigned char r8 = (color & 0x00FF0000) >> 16; 2181 unsigned char g8 = (color & 0x0000FF00) >> 8; 2182 unsigned char b8 = (color & 0x000000FF) >> 0; 2183 unsigned char a8 = (color & 0xFF000000) >> 24; 2184 2185 unsigned short r16 = (r8 << 8) + r8; 2186 unsigned short g16 = (g8 << 8) + g8; 2187 unsigned short b16 = (b8 << 8) + b8; 2188 unsigned short a16 = (a8 << 8) + a8; 2189 2190 float r32f = r8 / 255.0f; 2191 float g32f = g8 / 255.0f; 2192 float b32f = b8 / 255.0f; 2193 float a32f = a8 / 255.0f; 2194 2195 unsigned char g8r8[4] = {r8, g8, r8, g8}; 2196 unsigned short g16r16[2] = {r16, g16}; 2197 2198 for(int z = 0; z < internal.depth; z++) 2199 { 2200 unsigned char *target = buffer; 2201 2202 for(int y = y0; y < y1; y++) 2203 { 2204 switch(internal.format) 2205 { 2206 case FORMAT_NULL: 2207 break; 2208 case FORMAT_X8R8G8B8: 2209 case FORMAT_A8R8G8B8: 2210 // case FORMAT_X8G8R8B8Q: // FIXME 2211 // case FORMAT_A8G8R8B8Q: // FIXME 2212 if(rgbaMask == 0xF || (internal.format == FORMAT_X8R8G8B8 && rgbaMask == 0x7)) 2213 { 2214 memfill(target, color, 4 * (x1 - x0)); 2215 } 2216 else 2217 { 2218 unsigned int bgraMask = (rgbaMask & 0x1 ? 0x00FF0000 : 0) | (rgbaMask & 0x2 ? 0x0000FF00 : 0) | (rgbaMask & 0x4 ? 0x000000FF : 0) | (rgbaMask & 0x8 ? 0xFF000000 : 0); 2219 unsigned int invMask = ~bgraMask; 2220 unsigned int maskedColor = color & bgraMask; 2221 unsigned int *target32 = (unsigned int*)target; 2222 2223 for(int x = 0; x < width; x++) 2224 { 2225 target32[x] = maskedColor | (target32[x] & invMask); 2226 } 2227 } 2228 break; 2229 case FORMAT_G8R8: 2230 if((rgbaMask & 0x3) == 0x3) 2231 { 2232 memfill(target, (int&)g8r8, 2 * (x1 - x0)); 2233 } 2234 else 2235 { 2236 unsigned short rgMask = (rgbaMask & 0x1 ? 0x000000FF : 0) | (rgbaMask & 0x2 ? 0x0000FF00 : 0); 2237 unsigned short invMask = ~rgMask; 2238 unsigned short maskedColor = (unsigned short&)g8r8 & rgMask; 2239 unsigned short *target16 = (unsigned short*)target; 2240 2241 for(int x = 0; x < width; x++) 2242 { 2243 target16[x] = maskedColor | (target16[x] & invMask); 2244 } 2245 } 2246 break; 2247 case FORMAT_G16R16: 2248 if((rgbaMask & 0x3) == 0x3) 2249 { 2250 memfill(target, (int&)g16r16, 4 * (x1 - x0)); 2251 } 2252 else 2253 { 2254 unsigned int rgMask = (rgbaMask & 0x1 ? 0x0000FFFF : 0) | (rgbaMask & 0x2 ? 0xFFFF0000 : 0); 2255 unsigned int invMask = ~rgMask; 2256 unsigned int maskedColor = (unsigned int&)g16r16 & rgMask; 2257 unsigned int *target32 = (unsigned int*)target; 2258 2259 for(int x = 0; x < width; x++) 2260 { 2261 target32[x] = maskedColor | (target32[x] & invMask); 2262 } 2263 } 2264 break; 2265 case FORMAT_A16B16G16R16: 2266 if(rgbaMask == 0xF) 2267 { 2268 for(int x = 0; x < width; x++) 2269 { 2270 ((unsigned short*)target)[4 * x + 0] = r16; 2271 ((unsigned short*)target)[4 * x + 1] = g16; 2272 ((unsigned short*)target)[4 * x + 2] = b16; 2273 ((unsigned short*)target)[4 * x + 3] = a16; 2274 } 2275 } 2276 else 2277 { 2278 if(rgbaMask & 0x1) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 0] = r16; 2279 if(rgbaMask & 0x2) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 1] = g16; 2280 if(rgbaMask & 0x4) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 2] = b16; 2281 if(rgbaMask & 0x8) for(int x = 0; x < width; x++) ((unsigned short*)target)[4 * x + 3] = a16; 2282 } 2283 break; 2284 case FORMAT_R32F: 2285 if(rgbaMask & 0x1) 2286 { 2287 for(int x = 0; x < width; x++) 2288 { 2289 ((float*)target)[x] = r32f; 2290 } 2291 } 2292 break; 2293 case FORMAT_G32R32F: 2294 if((rgbaMask & 0x3) == 0x3) 2295 { 2296 for(int x = 0; x < width; x++) 2297 { 2298 ((float*)target)[2 * x + 0] = r32f; 2299 ((float*)target)[2 * x + 1] = g32f; 2300 } 2301 } 2302 else 2303 { 2304 if(rgbaMask & 0x1) for(int x = 0; x < width; x++) ((float*)target)[2 * x + 0] = r32f; 2305 if(rgbaMask & 0x2) for(int x = 0; x < width; x++) ((float*)target)[2 * x + 1] = g32f; 2306 } 2307 break; 2308 case FORMAT_A32B32G32R32F: 2309 if(rgbaMask == 0xF) 2310 { 2311 for(int x = 0; x < width; x++) 2312 { 2313 ((float*)target)[4 * x + 0] = r32f; 2314 ((float*)target)[4 * x + 1] = g32f; 2315 ((float*)target)[4 * x + 2] = b32f; 2316 ((float*)target)[4 * x + 3] = a32f; 2317 } 2318 } 2319 else 2320 { 2321 if(rgbaMask & 0x1) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 0] = r32f; 2322 if(rgbaMask & 0x2) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 1] = g32f; 2323 if(rgbaMask & 0x4) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 2] = b32f; 2324 if(rgbaMask & 0x8) for(int x = 0; x < width; x++) ((float*)target)[4 * x + 3] = a32f; 2325 } 2326 break; 2327 default: 2328 ASSERT(false); 2329 } 2330 2331 target += internal.pitchB; 2332 } 2333 2334 buffer += internal.sliceB; 2335 } 2336 2337 unlockInternal(); 2338 } 2339 /* else 2340 { 2341 // unsigned char *target = (unsigned char*&)buffer; 2342 // 2343 // for(int y = y0; y < y1; y++) 2344 // { 2345 // for(int x = x0; x < x1; x++) 2346 // { 2347 // target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 0] = (color & 0x000000FF) >> 0; 2348 // target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 4] = (color & 0x00FF0000) >> 16; 2349 // target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 8] = (color & 0x0000FF00) >> 8; 2350 // target[width2 * 4 * (y & ~1) + 2 * (y & 1) + 8 * (x & ~1) + (x & 1) + 12] = (color & 0xFF000000) >> 24; 2351 // } 2352 // } 2353 2354 unsigned char colorQ[16]; 2355 2356 colorQ[0] = (color & 0x000000FF) >> 0; 2357 colorQ[1] = (color & 0x000000FF) >> 0; 2358 colorQ[2] = (color & 0x000000FF) >> 0; 2359 colorQ[3] = (color & 0x000000FF) >> 0; 2360 colorQ[4] = (color & 0x00FF0000) >> 16; 2361 colorQ[5] = (color & 0x00FF0000) >> 16; 2362 colorQ[6] = (color & 0x00FF0000) >> 16; 2363 colorQ[7] = (color & 0x00FF0000) >> 16; 2364 colorQ[8] = (color & 0x0000FF00) >> 8; 2365 colorQ[9] = (color & 0x0000FF00) >> 8; 2366 colorQ[10] = (color & 0x0000FF00) >> 8; 2367 colorQ[11] = (color & 0x0000FF00) >> 8; 2368 colorQ[12] = (color & 0xFF000000) >> 24; 2369 colorQ[13] = (color & 0xFF000000) >> 24; 2370 colorQ[14] = (color & 0xFF000000) >> 24; 2371 colorQ[15] = (color & 0xFF000000) >> 24; 2372 2373 for(int y = y0; y < y1; y++) 2374 { 2375 unsigned char *target = (unsigned char*)lockInternal(0, 0, 0, lock) + width2 * 4 * (y & ~1) + 2 * (y & 1); // FIXME: Unlock 2376 2377 if((y & 1) == 0 && y + 1 < y1) // Fill quad line at once 2378 { 2379 if((x0 & 1) != 0) 2380 { 2381 target[8 * (x0 & ~1) + 1 + 0] = (color & 0x000000FF) >> 0; 2382 target[8 * (x0 & ~1) + 1 + 4] = (color & 0x00FF0000) >> 16; 2383 target[8 * (x0 & ~1) + 1 + 8] = (color & 0x0000FF00) >> 8; 2384 target[8 * (x0 & ~1) + 1 + 12] = (color & 0xFF000000) >> 24; 2385 2386 target[8 * (x0 & ~1) + 3 + 0] = (color & 0x000000FF) >> 0; 2387 target[8 * (x0 & ~1) + 3 + 4] = (color & 0x00FF0000) >> 16; 2388 target[8 * (x0 & ~1) + 3 + 8] = (color & 0x0000FF00) >> 8; 2389 target[8 * (x0 & ~1) + 3 + 12] = (color & 0xFF000000) >> 24; 2390 } 2391 2392 __asm 2393 { 2394 movq mm0, colorQ+0 2395 movq mm1, colorQ+8 2396 2397 mov eax, x0 2398 add eax, 1 2399 and eax, 0xFFFFFFFE 2400 cmp eax, x1 2401 jge qEnd 2402 2403 mov edi, target 2404 2405 qLoop: 2406 movntq [edi+8*eax+0], mm0 2407 movntq [edi+8*eax+8], mm1 2408 2409 add eax, 2 2410 cmp eax, x1 2411 jl qLoop 2412 qEnd: 2413 emms 2414 } 2415 2416 if((x1 & 1) != 0) 2417 { 2418 target[8 * (x1 & ~1) + 0 + 0] = (color & 0x000000FF) >> 0; 2419 target[8 * (x1 & ~1) + 0 + 4] = (color & 0x00FF0000) >> 16; 2420 target[8 * (x1 & ~1) + 0 + 8] = (color & 0x0000FF00) >> 8; 2421 target[8 * (x1 & ~1) + 0 + 12] = (color & 0xFF000000) >> 24; 2422 2423 target[8 * (x1 & ~1) + 2 + 0] = (color & 0x000000FF) >> 0; 2424 target[8 * (x1 & ~1) + 2 + 4] = (color & 0x00FF0000) >> 16; 2425 target[8 * (x1 & ~1) + 2 + 8] = (color & 0x0000FF00) >> 8; 2426 target[8 * (x1 & ~1) + 2 + 12] = (color & 0xFF000000) >> 24; 2427 } 2428 2429 y++; 2430 } 2431 else 2432 { 2433 for(int x = x0; x < x1; x++) 2434 { 2435 target[8 * (x & ~1) + (x & 1) + 0] = (color & 0x000000FF) >> 0; 2436 target[8 * (x & ~1) + (x & 1) + 4] = (color & 0x00FF0000) >> 16; 2437 target[8 * (x & ~1) + (x & 1) + 8] = (color & 0x0000FF00) >> 8; 2438 target[8 * (x & ~1) + (x & 1) + 12] = (color & 0xFF000000) >> 24; 2439 } 2440 } 2441 } 2442 }*/ 2443 } 2444 2445 void Surface::clearDepthBuffer(float depth, int x0, int y0, int width, int height) 2446 { 2447 // Not overlapping 2448 if(x0 > internal.width) return; 2449 if(y0 > internal.height) return; 2450 if(x0 + width < 0) return; 2451 if(y0 + height < 0) return; 2452 2453 // Clip against dimensions 2454 if(x0 < 0) {width += x0; x0 = 0;} 2455 if(x0 + width > internal.width) width = internal.width - x0; 2456 if(y0 < 0) {height += y0; y0 = 0;} 2457 if(y0 + height > internal.height) height = internal.height - y0; 2458 2459 const bool entire = x0 == 0 && y0 == 0 && width == internal.width && height == internal.height; 2460 const Lock lock = entire ? LOCK_DISCARD : LOCK_WRITEONLY; 2461 2462 int width2 = (internal.width + 1) & ~1; 2463 2464 int x1 = x0 + width; 2465 int y1 = y0 + height; 2466 2467 if(internal.format == FORMAT_D32F_LOCKABLE || 2468 internal.format == FORMAT_D32F_TEXTURE || 2469 internal.format == FORMAT_D32F_SHADOW) 2470 { 2471 float *target = (float*)lockInternal(0, 0, 0, lock, PUBLIC) + x0 + width2 * y0; 2472 2473 for(int z = 0; z < internal.depth; z++) 2474 { 2475 for(int y = y0; y < y1; y++) 2476 { 2477 memfill(target, (int&)depth, 4 * width); 2478 target += width2; 2479 } 2480 } 2481 2482 unlockInternal(); 2483 } 2484 else // Quad layout 2485 { 2486 if(complementaryDepthBuffer) 2487 { 2488 depth = 1 - depth; 2489 } 2490 2491 float *buffer = (float*)lockInternal(0, 0, 0, lock, PUBLIC); 2492 2493 for(int z = 0; z < internal.depth; z++) 2494 { 2495 for(int y = y0; y < y1; y++) 2496 { 2497 float *target = buffer + (y & ~1) * width2 + (y & 1) * 2; 2498 2499 if((y & 1) == 0 && y + 1 < y1) // Fill quad line at once 2500 { 2501 if((x0 & 1) != 0) 2502 { 2503 target[(x0 & ~1) * 2 + 1] = depth; 2504 target[(x0 & ~1) * 2 + 3] = depth; 2505 } 2506 2507 // for(int x2 = ((x0 + 1) & ~1) * 2; x2 < x1 * 2; x2 += 4) 2508 // { 2509 // target[x2 + 0] = depth; 2510 // target[x2 + 1] = depth; 2511 // target[x2 + 2] = depth; 2512 // target[x2 + 3] = depth; 2513 // } 2514 2515 // __asm 2516 // { 2517 // movss xmm0, depth 2518 // shufps xmm0, xmm0, 0x00 2519 // 2520 // mov eax, x0 2521 // add eax, 1 2522 // and eax, 0xFFFFFFFE 2523 // cmp eax, x1 2524 // jge qEnd 2525 // 2526 // mov edi, target 2527 // 2528 // qLoop: 2529 // movntps [edi+8*eax], xmm0 2530 // 2531 // add eax, 2 2532 // cmp eax, x1 2533 // jl qLoop 2534 // qEnd: 2535 // } 2536 2537 memfill(&target[((x0 + 1) & ~1) * 2], (int&)depth, 8 * ((x1 & ~1) - ((x0 + 1) & ~1))); 2538 2539 if((x1 & 1) != 0) 2540 { 2541 target[(x1 & ~1) * 2 + 0] = depth; 2542 target[(x1 & ~1) * 2 + 2] = depth; 2543 } 2544 2545 y++; 2546 } 2547 else 2548 { 2549 for(int x = x0; x < x1; x++) 2550 { 2551 target[(x & ~1) * 2 + (x & 1)] = depth; 2552 } 2553 } 2554 } 2555 2556 buffer += internal.sliceP; 2557 } 2558 2559 unlockInternal(); 2560 } 2561 } 2562 2563 void Surface::clearStencilBuffer(unsigned char s, unsigned char mask, int x0, int y0, int width, int height) 2564 { 2565 // Not overlapping 2566 if(x0 > internal.width) return; 2567 if(y0 > internal.height) return; 2568 if(x0 + width < 0) return; 2569 if(y0 + height < 0) return; 2570 2571 // Clip against dimensions 2572 if(x0 < 0) {width += x0; x0 = 0;} 2573 if(x0 + width > internal.width) width = internal.width - x0; 2574 if(y0 < 0) {height += y0; y0 = 0;} 2575 if(y0 + height > internal.height) height = internal.height - y0; 2576 2577 int width2 = (internal.width + 1) & ~1; 2578 2579 int x1 = x0 + width; 2580 int y1 = y0 + height; 2581 2582 unsigned char maskedS = s & mask; 2583 unsigned char invMask = ~mask; 2584 unsigned int fill = maskedS; 2585 fill = fill | (fill << 8) | (fill << 16) + (fill << 24); 2586 2587 if(false) 2588 { 2589 char *target = (char*)lockStencil(0, PUBLIC) + x0 + width2 * y0; 2590 2591 for(int z = 0; z < stencil.depth; z++) 2592 { 2593 for(int y = y0; y < y0 + height; y++) 2594 { 2595 if(mask == 0xFF) 2596 { 2597 memfill(target, fill, width); 2598 } 2599 else 2600 { 2601 for(int x = 0; x < width; x++) 2602 { 2603 target[x] = maskedS | (target[x] & invMask); 2604 } 2605 } 2606 2607 target += width2; 2608 } 2609 } 2610 2611 unlockStencil(); 2612 } 2613 else // Quad layout 2614 { 2615 char *buffer = (char*)lockStencil(0, PUBLIC); 2616 2617 if(mask == 0xFF) 2618 { 2619 for(int z = 0; z < stencil.depth; z++) 2620 { 2621 for(int y = y0; y < y1; y++) 2622 { 2623 char *target = buffer + (y & ~1) * width2 + (y & 1) * 2; 2624 2625 if((y & 1) == 0 && y + 1 < y1 && mask == 0xFF) // Fill quad line at once 2626 { 2627 if((x0 & 1) != 0) 2628 { 2629 target[(x0 & ~1) * 2 + 1] = fill; 2630 target[(x0 & ~1) * 2 + 3] = fill; 2631 } 2632 2633 memfill(&target[((x0 + 1) & ~1) * 2], fill, ((x1 + 1) & ~1) * 2 - ((x0 + 1) & ~1) * 2); 2634 2635 if((x1 & 1) != 0) 2636 { 2637 target[(x1 & ~1) * 2 + 0] = fill; 2638 target[(x1 & ~1) * 2 + 2] = fill; 2639 } 2640 2641 y++; 2642 } 2643 else 2644 { 2645 for(int x = x0; x < x1; x++) 2646 { 2647 target[(x & ~1) * 2 + (x & 1)] = maskedS | (target[x] & invMask); 2648 } 2649 } 2650 } 2651 2652 buffer += stencil.sliceP; 2653 } 2654 } 2655 2656 unlockStencil(); 2657 } 2658 } 2659 2660 void Surface::fill(const Color<float> &color, int x0, int y0, int width, int height) 2661 { 2662 unsigned char *row; 2663 Buffer *buffer; 2664 2665 if(internal.dirty) 2666 { 2667 row = (unsigned char*)lockInternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC); 2668 buffer = &internal; 2669 } 2670 else 2671 { 2672 row = (unsigned char*)lockExternal(x0, y0, 0, LOCK_WRITEONLY, PUBLIC); 2673 buffer = &external; 2674 } 2675 2676 if(buffer->bytes <= 4) 2677 { 2678 int c; 2679 buffer->write(&c, color); 2680 2681 if(buffer->bytes <= 1) c = (c << 8) | c; 2682 if(buffer->bytes <= 2) c = (c << 16) | c; 2683 2684 for(int y = 0; y < height; y++) 2685 { 2686 memfill(row, c, width * buffer->bytes); 2687 2688 row += buffer->pitchB; 2689 } 2690 } 2691 else // Generic 2692 { 2693 for(int y = 0; y < height; y++) 2694 { 2695 unsigned char *element = row; 2696 2697 for(int x = 0; x < width; x++) 2698 { 2699 buffer->write(element, color); 2700 2701 element += buffer->bytes; 2702 } 2703 2704 row += buffer->pitchB; 2705 } 2706 } 2707 2708 if(buffer == &internal) 2709 { 2710 unlockInternal(); 2711 } 2712 else 2713 { 2714 unlockExternal(); 2715 } 2716 } 2717 2718 Color<float> Surface::readExternal(int x, int y, int z) const 2719 { 2720 ASSERT(external.lock != LOCK_UNLOCKED); 2721 2722 return external.read(x, y, z); 2723 } 2724 2725 Color<float> Surface::readExternal(int x, int y) const 2726 { 2727 ASSERT(external.lock != LOCK_UNLOCKED); 2728 2729 return external.read(x, y); 2730 } 2731 2732 Color<float> Surface::sampleExternal(float x, float y, float z) const 2733 { 2734 ASSERT(external.lock != LOCK_UNLOCKED); 2735 2736 return external.sample(x, y, z); 2737 } 2738 2739 Color<float> Surface::sampleExternal(float x, float y) const 2740 { 2741 ASSERT(external.lock != LOCK_UNLOCKED); 2742 2743 return external.sample(x, y); 2744 } 2745 2746 void Surface::writeExternal(int x, int y, int z, const Color<float> &color) 2747 { 2748 ASSERT(external.lock != LOCK_UNLOCKED); 2749 2750 external.write(x, y, z, color); 2751 } 2752 2753 void Surface::writeExternal(int x, int y, const Color<float> &color) 2754 { 2755 ASSERT(external.lock != LOCK_UNLOCKED); 2756 2757 external.write(x, y, color); 2758 } 2759 2760 Color<float> Surface::readInternal(int x, int y, int z) const 2761 { 2762 ASSERT(internal.lock != LOCK_UNLOCKED); 2763 2764 return internal.read(x, y, z); 2765 } 2766 2767 Color<float> Surface::readInternal(int x, int y) const 2768 { 2769 ASSERT(internal.lock != LOCK_UNLOCKED); 2770 2771 return internal.read(x, y); 2772 } 2773 2774 Color<float> Surface::sampleInternal(float x, float y, float z) const 2775 { 2776 ASSERT(internal.lock != LOCK_UNLOCKED); 2777 2778 return internal.sample(x, y, z); 2779 } 2780 2781 Color<float> Surface::sampleInternal(float x, float y) const 2782 { 2783 ASSERT(internal.lock != LOCK_UNLOCKED); 2784 2785 return internal.sample(x, y); 2786 } 2787 2788 void Surface::writeInternal(int x, int y, int z, const Color<float> &color) 2789 { 2790 ASSERT(internal.lock != LOCK_UNLOCKED); 2791 2792 internal.write(x, y, z, color); 2793 } 2794 2795 void Surface::writeInternal(int x, int y, const Color<float> &color) 2796 { 2797 ASSERT(internal.lock != LOCK_UNLOCKED); 2798 2799 internal.write(x, y, color); 2800 } 2801 2802 bool Surface::hasStencil() const 2803 { 2804 return isStencil(external.format); 2805 } 2806 2807 bool Surface::hasDepth() const 2808 { 2809 return isDepth(external.format); 2810 } 2811 2812 bool Surface::hasPalette() const 2813 { 2814 return isPalette(external.format); 2815 } 2816 2817 bool Surface::isRenderTarget() const 2818 { 2819 return renderTarget; 2820 } 2821 2822 bool Surface::hasDirtyMipmaps() const 2823 { 2824 return dirtyMipmaps; 2825 } 2826 2827 void Surface::cleanMipmaps() 2828 { 2829 dirtyMipmaps = false; 2830 } 2831 2832 Resource *Surface::getResource() 2833 { 2834 return resource; 2835 } 2836 2837 bool Surface::identicalFormats() const 2838 { 2839 return external.format == internal.format && 2840 external.width == internal.width && 2841 external.height == internal.height && 2842 external.depth == internal.depth && 2843 external.pitchB == internal.pitchB && 2844 external.sliceB == internal.sliceB; 2845 } 2846 2847 Format Surface::selectInternalFormat(Format format) const 2848 { 2849 switch(format) 2850 { 2851 case FORMAT_NULL: 2852 return FORMAT_NULL; 2853 case FORMAT_P8: 2854 case FORMAT_A8P8: 2855 case FORMAT_A4R4G4B4: 2856 case FORMAT_A1R5G5B5: 2857 case FORMAT_A8R3G3B2: 2858 return FORMAT_A8R8G8B8; 2859 case FORMAT_A8: 2860 return FORMAT_A8; 2861 case FORMAT_R8: 2862 return FORMAT_R8; 2863 case FORMAT_A2R10G10B10: 2864 case FORMAT_A2B10G10R10: 2865 case FORMAT_A16B16G16R16: 2866 return FORMAT_A16B16G16R16; 2867 case FORMAT_G8R8: 2868 return FORMAT_G8R8; 2869 case FORMAT_G16R16: 2870 return FORMAT_G16R16; 2871 case FORMAT_A8R8G8B8: 2872 case FORMAT_A8B8G8R8: 2873 if(lockable || !quadLayoutEnabled) 2874 { 2875 return FORMAT_A8R8G8B8; 2876 } 2877 else 2878 { 2879 return FORMAT_A8G8R8B8Q; 2880 } 2881 case FORMAT_R3G3B2: 2882 case FORMAT_R5G6B5: 2883 case FORMAT_R8G8B8: 2884 case FORMAT_X4R4G4B4: 2885 case FORMAT_X1R5G5B5: 2886 case FORMAT_X8R8G8B8: 2887 case FORMAT_X8B8G8R8: 2888 if(lockable || !quadLayoutEnabled) 2889 { 2890 return FORMAT_X8R8G8B8; 2891 } 2892 else 2893 { 2894 return FORMAT_X8G8R8B8Q; 2895 } 2896 // Compressed formats 2897 #if S3TC_SUPPORT 2898 case FORMAT_DXT1: 2899 case FORMAT_DXT3: 2900 case FORMAT_DXT5: 2901 return FORMAT_A8R8G8B8; 2902 case FORMAT_ATI1: 2903 return FORMAT_R8; 2904 case FORMAT_ATI2: 2905 return FORMAT_G8R8; 2906 #endif 2907 // Bumpmap formats 2908 case FORMAT_V8U8: return FORMAT_V8U8; 2909 case FORMAT_L6V5U5: return FORMAT_X8L8V8U8; 2910 case FORMAT_Q8W8V8U8: return FORMAT_Q8W8V8U8; 2911 case FORMAT_X8L8V8U8: return FORMAT_X8L8V8U8; 2912 case FORMAT_V16U16: return FORMAT_V16U16; 2913 case FORMAT_A2W10V10U10: return FORMAT_A16W16V16U16; 2914 case FORMAT_Q16W16V16U16: return FORMAT_Q16W16V16U16; 2915 // Floating-point formats 2916 case FORMAT_R16F: return FORMAT_R32F; 2917 case FORMAT_G16R16F: return FORMAT_G32R32F; 2918 case FORMAT_A16B16G16R16F: return FORMAT_A32B32G32R32F; 2919 case FORMAT_R32F: return FORMAT_R32F; 2920 case FORMAT_G32R32F: return FORMAT_G32R32F; 2921 case FORMAT_A32B32G32R32F: return FORMAT_A32B32G32R32F; 2922 // Luminance formats 2923 case FORMAT_L8: return FORMAT_L8; 2924 case FORMAT_A4L4: return FORMAT_A8L8; 2925 case FORMAT_L16: return FORMAT_L16; 2926 case FORMAT_A8L8: return FORMAT_A8L8; 2927 // Depth/stencil formats 2928 case FORMAT_D16: 2929 case FORMAT_D32: 2930 case FORMAT_D24X8: 2931 case FORMAT_D24S8: 2932 case FORMAT_D24FS8: 2933 if(hasParent) // Texture 2934 { 2935 return FORMAT_D32F_SHADOW; 2936 } 2937 else if(complementaryDepthBuffer) 2938 { 2939 return FORMAT_D32F_COMPLEMENTARY; 2940 } 2941 else 2942 { 2943 return FORMAT_D32F; 2944 } 2945 case FORMAT_D32F_LOCKABLE: return FORMAT_D32F_LOCKABLE; 2946 case FORMAT_INTZ: return FORMAT_D32F_TEXTURE; 2947 case FORMAT_DF24: return FORMAT_D32F_SHADOW; 2948 case FORMAT_DF16: return FORMAT_D32F_SHADOW; 2949 default: 2950 ASSERT(false); 2951 } 2952 2953 return FORMAT_NULL; 2954 } 2955 2956 void Surface::setTexturePalette(unsigned int *palette) 2957 { 2958 Surface::palette = palette; 2959 Surface::paletteID++; 2960 } 2961 2962 void Surface::resolve() 2963 { 2964 if(internal.depth <= 1 || !internal.dirty || !renderTarget || internal.format == FORMAT_NULL) 2965 { 2966 return; 2967 } 2968 2969 void *source = internal.lockRect(0, 0, 0, LOCK_READWRITE); 2970 2971 int quality = internal.depth; 2972 int width = internal.width; 2973 int height = internal.height; 2974 int pitch = internal.pitchB; 2975 int slice = internal.sliceB; 2976 2977 unsigned char *source0 = (unsigned char*)source; 2978 unsigned char *source1 = source0 + slice; 2979 unsigned char *source2 = source1 + slice; 2980 unsigned char *source3 = source2 + slice; 2981 unsigned char *source4 = source3 + slice; 2982 unsigned char *source5 = source4 + slice; 2983 unsigned char *source6 = source5 + slice; 2984 unsigned char *source7 = source6 + slice; 2985 unsigned char *source8 = source7 + slice; 2986 unsigned char *source9 = source8 + slice; 2987 unsigned char *sourceA = source9 + slice; 2988 unsigned char *sourceB = sourceA + slice; 2989 unsigned char *sourceC = sourceB + slice; 2990 unsigned char *sourceD = sourceC + slice; 2991 unsigned char *sourceE = sourceD + slice; 2992 unsigned char *sourceF = sourceE + slice; 2993 2994 if(internal.format == FORMAT_X8R8G8B8 || internal.format == FORMAT_A8R8G8B8) 2995 { 2996 if(CPUID::supportsSSE2() && (width % 4) == 0) 2997 { 2998 if(internal.depth == 2) 2999 { 3000 for(int y = 0; y < height; y++) 3001 { 3002 for(int x = 0; x < width; x += 4) 3003 { 3004 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x)); 3005 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x)); 3006 3007 c0 = _mm_avg_epu8(c0, c1); 3008 3009 _mm_store_si128((__m128i*)(source0 + 4 * x), c0); 3010 } 3011 3012 source0 += pitch; 3013 source1 += pitch; 3014 } 3015 } 3016 else if(internal.depth == 4) 3017 { 3018 for(int y = 0; y < height; y++) 3019 { 3020 for(int x = 0; x < width; x += 4) 3021 { 3022 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x)); 3023 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x)); 3024 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x)); 3025 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x)); 3026 3027 c0 = _mm_avg_epu8(c0, c1); 3028 c2 = _mm_avg_epu8(c2, c3); 3029 c0 = _mm_avg_epu8(c0, c2); 3030 3031 _mm_store_si128((__m128i*)(source0 + 4 * x), c0); 3032 } 3033 3034 source0 += pitch; 3035 source1 += pitch; 3036 source2 += pitch; 3037 source3 += pitch; 3038 } 3039 } 3040 else if(internal.depth == 8) 3041 { 3042 for(int y = 0; y < height; y++) 3043 { 3044 for(int x = 0; x < width; x += 4) 3045 { 3046 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x)); 3047 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x)); 3048 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x)); 3049 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x)); 3050 __m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x)); 3051 __m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x)); 3052 __m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x)); 3053 __m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x)); 3054 3055 c0 = _mm_avg_epu8(c0, c1); 3056 c2 = _mm_avg_epu8(c2, c3); 3057 c4 = _mm_avg_epu8(c4, c5); 3058 c6 = _mm_avg_epu8(c6, c7); 3059 c0 = _mm_avg_epu8(c0, c2); 3060 c4 = _mm_avg_epu8(c4, c6); 3061 c0 = _mm_avg_epu8(c0, c4); 3062 3063 _mm_store_si128((__m128i*)(source0 + 4 * x), c0); 3064 } 3065 3066 source0 += pitch; 3067 source1 += pitch; 3068 source2 += pitch; 3069 source3 += pitch; 3070 source4 += pitch; 3071 source5 += pitch; 3072 source6 += pitch; 3073 source7 += pitch; 3074 } 3075 } 3076 else if(internal.depth == 16) 3077 { 3078 for(int y = 0; y < height; y++) 3079 { 3080 for(int x = 0; x < width; x += 4) 3081 { 3082 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x)); 3083 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x)); 3084 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x)); 3085 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x)); 3086 __m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x)); 3087 __m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x)); 3088 __m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x)); 3089 __m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x)); 3090 __m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x)); 3091 __m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x)); 3092 __m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x)); 3093 __m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x)); 3094 __m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x)); 3095 __m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x)); 3096 __m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x)); 3097 __m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x)); 3098 3099 c0 = _mm_avg_epu8(c0, c1); 3100 c2 = _mm_avg_epu8(c2, c3); 3101 c4 = _mm_avg_epu8(c4, c5); 3102 c6 = _mm_avg_epu8(c6, c7); 3103 c8 = _mm_avg_epu8(c8, c9); 3104 cA = _mm_avg_epu8(cA, cB); 3105 cC = _mm_avg_epu8(cC, cD); 3106 cE = _mm_avg_epu8(cE, cF); 3107 c0 = _mm_avg_epu8(c0, c2); 3108 c4 = _mm_avg_epu8(c4, c6); 3109 c8 = _mm_avg_epu8(c8, cA); 3110 cC = _mm_avg_epu8(cC, cE); 3111 c0 = _mm_avg_epu8(c0, c4); 3112 c8 = _mm_avg_epu8(c8, cC); 3113 c0 = _mm_avg_epu8(c0, c8); 3114 3115 _mm_store_si128((__m128i*)(source0 + 4 * x), c0); 3116 } 3117 3118 source0 += pitch; 3119 source1 += pitch; 3120 source2 += pitch; 3121 source3 += pitch; 3122 source4 += pitch; 3123 source5 += pitch; 3124 source6 += pitch; 3125 source7 += pitch; 3126 source8 += pitch; 3127 source9 += pitch; 3128 sourceA += pitch; 3129 sourceB += pitch; 3130 sourceC += pitch; 3131 sourceD += pitch; 3132 sourceE += pitch; 3133 sourceF += pitch; 3134 } 3135 } 3136 else ASSERT(false); 3137 } 3138 else 3139 { 3140 #define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7F7F7F7F) + (((x) ^ (y)) & 0x01010101)) 3141 3142 if(internal.depth == 2) 3143 { 3144 for(int y = 0; y < height; y++) 3145 { 3146 for(int x = 0; x < width; x++) 3147 { 3148 unsigned int c0 = *(unsigned int*)(source0 + 4 * x); 3149 unsigned int c1 = *(unsigned int*)(source1 + 4 * x); 3150 3151 c0 = AVERAGE(c0, c1); 3152 3153 *(unsigned int*)(source0 + 4 * x) = c0; 3154 } 3155 3156 source0 += pitch; 3157 source1 += pitch; 3158 } 3159 } 3160 else if(internal.depth == 4) 3161 { 3162 for(int y = 0; y < height; y++) 3163 { 3164 for(int x = 0; x < width; x++) 3165 { 3166 unsigned int c0 = *(unsigned int*)(source0 + 4 * x); 3167 unsigned int c1 = *(unsigned int*)(source1 + 4 * x); 3168 unsigned int c2 = *(unsigned int*)(source2 + 4 * x); 3169 unsigned int c3 = *(unsigned int*)(source3 + 4 * x); 3170 3171 c0 = AVERAGE(c0, c1); 3172 c2 = AVERAGE(c2, c3); 3173 c0 = AVERAGE(c0, c2); 3174 3175 *(unsigned int*)(source0 + 4 * x) = c0; 3176 } 3177 3178 source0 += pitch; 3179 source1 += pitch; 3180 source2 += pitch; 3181 source3 += pitch; 3182 } 3183 } 3184 else if(internal.depth == 8) 3185 { 3186 for(int y = 0; y < height; y++) 3187 { 3188 for(int x = 0; x < width; x++) 3189 { 3190 unsigned int c0 = *(unsigned int*)(source0 + 4 * x); 3191 unsigned int c1 = *(unsigned int*)(source1 + 4 * x); 3192 unsigned int c2 = *(unsigned int*)(source2 + 4 * x); 3193 unsigned int c3 = *(unsigned int*)(source3 + 4 * x); 3194 unsigned int c4 = *(unsigned int*)(source4 + 4 * x); 3195 unsigned int c5 = *(unsigned int*)(source5 + 4 * x); 3196 unsigned int c6 = *(unsigned int*)(source6 + 4 * x); 3197 unsigned int c7 = *(unsigned int*)(source7 + 4 * x); 3198 3199 c0 = AVERAGE(c0, c1); 3200 c2 = AVERAGE(c2, c3); 3201 c4 = AVERAGE(c4, c5); 3202 c6 = AVERAGE(c6, c7); 3203 c0 = AVERAGE(c0, c2); 3204 c4 = AVERAGE(c4, c6); 3205 c0 = AVERAGE(c0, c4); 3206 3207 *(unsigned int*)(source0 + 4 * x) = c0; 3208 } 3209 3210 source0 += pitch; 3211 source1 += pitch; 3212 source2 += pitch; 3213 source3 += pitch; 3214 source4 += pitch; 3215 source5 += pitch; 3216 source6 += pitch; 3217 source7 += pitch; 3218 } 3219 } 3220 else if(internal.depth == 16) 3221 { 3222 for(int y = 0; y < height; y++) 3223 { 3224 for(int x = 0; x < width; x++) 3225 { 3226 unsigned int c0 = *(unsigned int*)(source0 + 4 * x); 3227 unsigned int c1 = *(unsigned int*)(source1 + 4 * x); 3228 unsigned int c2 = *(unsigned int*)(source2 + 4 * x); 3229 unsigned int c3 = *(unsigned int*)(source3 + 4 * x); 3230 unsigned int c4 = *(unsigned int*)(source4 + 4 * x); 3231 unsigned int c5 = *(unsigned int*)(source5 + 4 * x); 3232 unsigned int c6 = *(unsigned int*)(source6 + 4 * x); 3233 unsigned int c7 = *(unsigned int*)(source7 + 4 * x); 3234 unsigned int c8 = *(unsigned int*)(source8 + 4 * x); 3235 unsigned int c9 = *(unsigned int*)(source9 + 4 * x); 3236 unsigned int cA = *(unsigned int*)(sourceA + 4 * x); 3237 unsigned int cB = *(unsigned int*)(sourceB + 4 * x); 3238 unsigned int cC = *(unsigned int*)(sourceC + 4 * x); 3239 unsigned int cD = *(unsigned int*)(sourceD + 4 * x); 3240 unsigned int cE = *(unsigned int*)(sourceE + 4 * x); 3241 unsigned int cF = *(unsigned int*)(sourceF + 4 * x); 3242 3243 c0 = AVERAGE(c0, c1); 3244 c2 = AVERAGE(c2, c3); 3245 c4 = AVERAGE(c4, c5); 3246 c6 = AVERAGE(c6, c7); 3247 c8 = AVERAGE(c8, c9); 3248 cA = AVERAGE(cA, cB); 3249 cC = AVERAGE(cC, cD); 3250 cE = AVERAGE(cE, cF); 3251 c0 = AVERAGE(c0, c2); 3252 c4 = AVERAGE(c4, c6); 3253 c8 = AVERAGE(c8, cA); 3254 cC = AVERAGE(cC, cE); 3255 c0 = AVERAGE(c0, c4); 3256 c8 = AVERAGE(c8, cC); 3257 c0 = AVERAGE(c0, c8); 3258 3259 *(unsigned int*)(source0 + 4 * x) = c0; 3260 } 3261 3262 source0 += pitch; 3263 source1 += pitch; 3264 source2 += pitch; 3265 source3 += pitch; 3266 source4 += pitch; 3267 source5 += pitch; 3268 source6 += pitch; 3269 source7 += pitch; 3270 source8 += pitch; 3271 source9 += pitch; 3272 sourceA += pitch; 3273 sourceB += pitch; 3274 sourceC += pitch; 3275 sourceD += pitch; 3276 sourceE += pitch; 3277 sourceF += pitch; 3278 } 3279 } 3280 else ASSERT(false); 3281 3282 #undef AVERAGE 3283 } 3284 } 3285 else if(internal.format == FORMAT_G16R16) 3286 { 3287 if(CPUID::supportsSSE2() && (width % 4) == 0) 3288 { 3289 if(internal.depth == 2) 3290 { 3291 for(int y = 0; y < height; y++) 3292 { 3293 for(int x = 0; x < width; x += 4) 3294 { 3295 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x)); 3296 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x)); 3297 3298 c0 = _mm_avg_epu16(c0, c1); 3299 3300 _mm_store_si128((__m128i*)(source0 + 4 * x), c0); 3301 } 3302 3303 source0 += pitch; 3304 source1 += pitch; 3305 } 3306 } 3307 else if(internal.depth == 4) 3308 { 3309 for(int y = 0; y < height; y++) 3310 { 3311 for(int x = 0; x < width; x += 4) 3312 { 3313 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x)); 3314 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x)); 3315 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x)); 3316 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x)); 3317 3318 c0 = _mm_avg_epu16(c0, c1); 3319 c2 = _mm_avg_epu16(c2, c3); 3320 c0 = _mm_avg_epu16(c0, c2); 3321 3322 _mm_store_si128((__m128i*)(source0 + 4 * x), c0); 3323 } 3324 3325 source0 += pitch; 3326 source1 += pitch; 3327 source2 += pitch; 3328 source3 += pitch; 3329 } 3330 } 3331 else if(internal.depth == 8) 3332 { 3333 for(int y = 0; y < height; y++) 3334 { 3335 for(int x = 0; x < width; x += 4) 3336 { 3337 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x)); 3338 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x)); 3339 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x)); 3340 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x)); 3341 __m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x)); 3342 __m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x)); 3343 __m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x)); 3344 __m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x)); 3345 3346 c0 = _mm_avg_epu16(c0, c1); 3347 c2 = _mm_avg_epu16(c2, c3); 3348 c4 = _mm_avg_epu16(c4, c5); 3349 c6 = _mm_avg_epu16(c6, c7); 3350 c0 = _mm_avg_epu16(c0, c2); 3351 c4 = _mm_avg_epu16(c4, c6); 3352 c0 = _mm_avg_epu16(c0, c4); 3353 3354 _mm_store_si128((__m128i*)(source0 + 4 * x), c0); 3355 } 3356 3357 source0 += pitch; 3358 source1 += pitch; 3359 source2 += pitch; 3360 source3 += pitch; 3361 source4 += pitch; 3362 source5 += pitch; 3363 source6 += pitch; 3364 source7 += pitch; 3365 } 3366 } 3367 else if(internal.depth == 16) 3368 { 3369 for(int y = 0; y < height; y++) 3370 { 3371 for(int x = 0; x < width; x += 4) 3372 { 3373 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 4 * x)); 3374 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 4 * x)); 3375 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 4 * x)); 3376 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 4 * x)); 3377 __m128i c4 = _mm_load_si128((__m128i*)(source4 + 4 * x)); 3378 __m128i c5 = _mm_load_si128((__m128i*)(source5 + 4 * x)); 3379 __m128i c6 = _mm_load_si128((__m128i*)(source6 + 4 * x)); 3380 __m128i c7 = _mm_load_si128((__m128i*)(source7 + 4 * x)); 3381 __m128i c8 = _mm_load_si128((__m128i*)(source8 + 4 * x)); 3382 __m128i c9 = _mm_load_si128((__m128i*)(source9 + 4 * x)); 3383 __m128i cA = _mm_load_si128((__m128i*)(sourceA + 4 * x)); 3384 __m128i cB = _mm_load_si128((__m128i*)(sourceB + 4 * x)); 3385 __m128i cC = _mm_load_si128((__m128i*)(sourceC + 4 * x)); 3386 __m128i cD = _mm_load_si128((__m128i*)(sourceD + 4 * x)); 3387 __m128i cE = _mm_load_si128((__m128i*)(sourceE + 4 * x)); 3388 __m128i cF = _mm_load_si128((__m128i*)(sourceF + 4 * x)); 3389 3390 c0 = _mm_avg_epu16(c0, c1); 3391 c2 = _mm_avg_epu16(c2, c3); 3392 c4 = _mm_avg_epu16(c4, c5); 3393 c6 = _mm_avg_epu16(c6, c7); 3394 c8 = _mm_avg_epu16(c8, c9); 3395 cA = _mm_avg_epu16(cA, cB); 3396 cC = _mm_avg_epu16(cC, cD); 3397 cE = _mm_avg_epu16(cE, cF); 3398 c0 = _mm_avg_epu16(c0, c2); 3399 c4 = _mm_avg_epu16(c4, c6); 3400 c8 = _mm_avg_epu16(c8, cA); 3401 cC = _mm_avg_epu16(cC, cE); 3402 c0 = _mm_avg_epu16(c0, c4); 3403 c8 = _mm_avg_epu16(c8, cC); 3404 c0 = _mm_avg_epu16(c0, c8); 3405 3406 _mm_store_si128((__m128i*)(source0 + 4 * x), c0); 3407 } 3408 3409 source0 += pitch; 3410 source1 += pitch; 3411 source2 += pitch; 3412 source3 += pitch; 3413 source4 += pitch; 3414 source5 += pitch; 3415 source6 += pitch; 3416 source7 += pitch; 3417 source8 += pitch; 3418 source9 += pitch; 3419 sourceA += pitch; 3420 sourceB += pitch; 3421 sourceC += pitch; 3422 sourceD += pitch; 3423 sourceE += pitch; 3424 sourceF += pitch; 3425 } 3426 } 3427 else ASSERT(false); 3428 } 3429 else 3430 { 3431 #define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001)) 3432 3433 if(internal.depth == 2) 3434 { 3435 for(int y = 0; y < height; y++) 3436 { 3437 for(int x = 0; x < width; x++) 3438 { 3439 unsigned int c0 = *(unsigned int*)(source0 + 4 * x); 3440 unsigned int c1 = *(unsigned int*)(source1 + 4 * x); 3441 3442 c0 = AVERAGE(c0, c1); 3443 3444 *(unsigned int*)(source0 + 4 * x) = c0; 3445 } 3446 3447 source0 += pitch; 3448 source1 += pitch; 3449 } 3450 } 3451 else if(internal.depth == 4) 3452 { 3453 for(int y = 0; y < height; y++) 3454 { 3455 for(int x = 0; x < width; x++) 3456 { 3457 unsigned int c0 = *(unsigned int*)(source0 + 4 * x); 3458 unsigned int c1 = *(unsigned int*)(source1 + 4 * x); 3459 unsigned int c2 = *(unsigned int*)(source2 + 4 * x); 3460 unsigned int c3 = *(unsigned int*)(source3 + 4 * x); 3461 3462 c0 = AVERAGE(c0, c1); 3463 c2 = AVERAGE(c2, c3); 3464 c0 = AVERAGE(c0, c2); 3465 3466 *(unsigned int*)(source0 + 4 * x) = c0; 3467 } 3468 3469 source0 += pitch; 3470 source1 += pitch; 3471 source2 += pitch; 3472 source3 += pitch; 3473 } 3474 } 3475 else if(internal.depth == 8) 3476 { 3477 for(int y = 0; y < height; y++) 3478 { 3479 for(int x = 0; x < width; x++) 3480 { 3481 unsigned int c0 = *(unsigned int*)(source0 + 4 * x); 3482 unsigned int c1 = *(unsigned int*)(source1 + 4 * x); 3483 unsigned int c2 = *(unsigned int*)(source2 + 4 * x); 3484 unsigned int c3 = *(unsigned int*)(source3 + 4 * x); 3485 unsigned int c4 = *(unsigned int*)(source4 + 4 * x); 3486 unsigned int c5 = *(unsigned int*)(source5 + 4 * x); 3487 unsigned int c6 = *(unsigned int*)(source6 + 4 * x); 3488 unsigned int c7 = *(unsigned int*)(source7 + 4 * x); 3489 3490 c0 = AVERAGE(c0, c1); 3491 c2 = AVERAGE(c2, c3); 3492 c4 = AVERAGE(c4, c5); 3493 c6 = AVERAGE(c6, c7); 3494 c0 = AVERAGE(c0, c2); 3495 c4 = AVERAGE(c4, c6); 3496 c0 = AVERAGE(c0, c4); 3497 3498 *(unsigned int*)(source0 + 4 * x) = c0; 3499 } 3500 3501 source0 += pitch; 3502 source1 += pitch; 3503 source2 += pitch; 3504 source3 += pitch; 3505 source4 += pitch; 3506 source5 += pitch; 3507 source6 += pitch; 3508 source7 += pitch; 3509 } 3510 } 3511 else if(internal.depth == 16) 3512 { 3513 for(int y = 0; y < height; y++) 3514 { 3515 for(int x = 0; x < width; x++) 3516 { 3517 unsigned int c0 = *(unsigned int*)(source0 + 4 * x); 3518 unsigned int c1 = *(unsigned int*)(source1 + 4 * x); 3519 unsigned int c2 = *(unsigned int*)(source2 + 4 * x); 3520 unsigned int c3 = *(unsigned int*)(source3 + 4 * x); 3521 unsigned int c4 = *(unsigned int*)(source4 + 4 * x); 3522 unsigned int c5 = *(unsigned int*)(source5 + 4 * x); 3523 unsigned int c6 = *(unsigned int*)(source6 + 4 * x); 3524 unsigned int c7 = *(unsigned int*)(source7 + 4 * x); 3525 unsigned int c8 = *(unsigned int*)(source8 + 4 * x); 3526 unsigned int c9 = *(unsigned int*)(source9 + 4 * x); 3527 unsigned int cA = *(unsigned int*)(sourceA + 4 * x); 3528 unsigned int cB = *(unsigned int*)(sourceB + 4 * x); 3529 unsigned int cC = *(unsigned int*)(sourceC + 4 * x); 3530 unsigned int cD = *(unsigned int*)(sourceD + 4 * x); 3531 unsigned int cE = *(unsigned int*)(sourceE + 4 * x); 3532 unsigned int cF = *(unsigned int*)(sourceF + 4 * x); 3533 3534 c0 = AVERAGE(c0, c1); 3535 c2 = AVERAGE(c2, c3); 3536 c4 = AVERAGE(c4, c5); 3537 c6 = AVERAGE(c6, c7); 3538 c8 = AVERAGE(c8, c9); 3539 cA = AVERAGE(cA, cB); 3540 cC = AVERAGE(cC, cD); 3541 cE = AVERAGE(cE, cF); 3542 c0 = AVERAGE(c0, c2); 3543 c4 = AVERAGE(c4, c6); 3544 c8 = AVERAGE(c8, cA); 3545 cC = AVERAGE(cC, cE); 3546 c0 = AVERAGE(c0, c4); 3547 c8 = AVERAGE(c8, cC); 3548 c0 = AVERAGE(c0, c8); 3549 3550 *(unsigned int*)(source0 + 4 * x) = c0; 3551 } 3552 3553 source0 += pitch; 3554 source1 += pitch; 3555 source2 += pitch; 3556 source3 += pitch; 3557 source4 += pitch; 3558 source5 += pitch; 3559 source6 += pitch; 3560 source7 += pitch; 3561 source8 += pitch; 3562 source9 += pitch; 3563 sourceA += pitch; 3564 sourceB += pitch; 3565 sourceC += pitch; 3566 sourceD += pitch; 3567 sourceE += pitch; 3568 sourceF += pitch; 3569 } 3570 } 3571 else ASSERT(false); 3572 3573 #undef AVERAGE 3574 } 3575 } 3576 else if(internal.format == FORMAT_A16B16G16R16) 3577 { 3578 if(CPUID::supportsSSE2() && (width % 2) == 0) 3579 { 3580 if(internal.depth == 2) 3581 { 3582 for(int y = 0; y < height; y++) 3583 { 3584 for(int x = 0; x < width; x += 2) 3585 { 3586 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x)); 3587 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x)); 3588 3589 c0 = _mm_avg_epu16(c0, c1); 3590 3591 _mm_store_si128((__m128i*)(source0 + 8 * x), c0); 3592 } 3593 3594 source0 += pitch; 3595 source1 += pitch; 3596 } 3597 } 3598 else if(internal.depth == 4) 3599 { 3600 for(int y = 0; y < height; y++) 3601 { 3602 for(int x = 0; x < width; x += 2) 3603 { 3604 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x)); 3605 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x)); 3606 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x)); 3607 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x)); 3608 3609 c0 = _mm_avg_epu16(c0, c1); 3610 c2 = _mm_avg_epu16(c2, c3); 3611 c0 = _mm_avg_epu16(c0, c2); 3612 3613 _mm_store_si128((__m128i*)(source0 + 8 * x), c0); 3614 } 3615 3616 source0 += pitch; 3617 source1 += pitch; 3618 source2 += pitch; 3619 source3 += pitch; 3620 } 3621 } 3622 else if(internal.depth == 8) 3623 { 3624 for(int y = 0; y < height; y++) 3625 { 3626 for(int x = 0; x < width; x += 2) 3627 { 3628 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x)); 3629 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x)); 3630 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x)); 3631 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x)); 3632 __m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x)); 3633 __m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x)); 3634 __m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x)); 3635 __m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x)); 3636 3637 c0 = _mm_avg_epu16(c0, c1); 3638 c2 = _mm_avg_epu16(c2, c3); 3639 c4 = _mm_avg_epu16(c4, c5); 3640 c6 = _mm_avg_epu16(c6, c7); 3641 c0 = _mm_avg_epu16(c0, c2); 3642 c4 = _mm_avg_epu16(c4, c6); 3643 c0 = _mm_avg_epu16(c0, c4); 3644 3645 _mm_store_si128((__m128i*)(source0 + 8 * x), c0); 3646 } 3647 3648 source0 += pitch; 3649 source1 += pitch; 3650 source2 += pitch; 3651 source3 += pitch; 3652 source4 += pitch; 3653 source5 += pitch; 3654 source6 += pitch; 3655 source7 += pitch; 3656 } 3657 } 3658 else if(internal.depth == 16) 3659 { 3660 for(int y = 0; y < height; y++) 3661 { 3662 for(int x = 0; x < width; x += 2) 3663 { 3664 __m128i c0 = _mm_load_si128((__m128i*)(source0 + 8 * x)); 3665 __m128i c1 = _mm_load_si128((__m128i*)(source1 + 8 * x)); 3666 __m128i c2 = _mm_load_si128((__m128i*)(source2 + 8 * x)); 3667 __m128i c3 = _mm_load_si128((__m128i*)(source3 + 8 * x)); 3668 __m128i c4 = _mm_load_si128((__m128i*)(source4 + 8 * x)); 3669 __m128i c5 = _mm_load_si128((__m128i*)(source5 + 8 * x)); 3670 __m128i c6 = _mm_load_si128((__m128i*)(source6 + 8 * x)); 3671 __m128i c7 = _mm_load_si128((__m128i*)(source7 + 8 * x)); 3672 __m128i c8 = _mm_load_si128((__m128i*)(source8 + 8 * x)); 3673 __m128i c9 = _mm_load_si128((__m128i*)(source9 + 8 * x)); 3674 __m128i cA = _mm_load_si128((__m128i*)(sourceA + 8 * x)); 3675 __m128i cB = _mm_load_si128((__m128i*)(sourceB + 8 * x)); 3676 __m128i cC = _mm_load_si128((__m128i*)(sourceC + 8 * x)); 3677 __m128i cD = _mm_load_si128((__m128i*)(sourceD + 8 * x)); 3678 __m128i cE = _mm_load_si128((__m128i*)(sourceE + 8 * x)); 3679 __m128i cF = _mm_load_si128((__m128i*)(sourceF + 8 * x)); 3680 3681 c0 = _mm_avg_epu16(c0, c1); 3682 c2 = _mm_avg_epu16(c2, c3); 3683 c4 = _mm_avg_epu16(c4, c5); 3684 c6 = _mm_avg_epu16(c6, c7); 3685 c8 = _mm_avg_epu16(c8, c9); 3686 cA = _mm_avg_epu16(cA, cB); 3687 cC = _mm_avg_epu16(cC, cD); 3688 cE = _mm_avg_epu16(cE, cF); 3689 c0 = _mm_avg_epu16(c0, c2); 3690 c4 = _mm_avg_epu16(c4, c6); 3691 c8 = _mm_avg_epu16(c8, cA); 3692 cC = _mm_avg_epu16(cC, cE); 3693 c0 = _mm_avg_epu16(c0, c4); 3694 c8 = _mm_avg_epu16(c8, cC); 3695 c0 = _mm_avg_epu16(c0, c8); 3696 3697 _mm_store_si128((__m128i*)(source0 + 8 * x), c0); 3698 } 3699 3700 source0 += pitch; 3701 source1 += pitch; 3702 source2 += pitch; 3703 source3 += pitch; 3704 source4 += pitch; 3705 source5 += pitch; 3706 source6 += pitch; 3707 source7 += pitch; 3708 source8 += pitch; 3709 source9 += pitch; 3710 sourceA += pitch; 3711 sourceB += pitch; 3712 sourceC += pitch; 3713 sourceD += pitch; 3714 sourceE += pitch; 3715 sourceF += pitch; 3716 } 3717 } 3718 else ASSERT(false); 3719 } 3720 else 3721 { 3722 #define AVERAGE(x, y) (((x) & (y)) + ((((x) ^ (y)) >> 1) & 0x7FFF7FFF) + (((x) ^ (y)) & 0x00010001)) 3723 3724 if(internal.depth == 2) 3725 { 3726 for(int y = 0; y < height; y++) 3727 { 3728 for(int x = 0; x < 2 * width; x++) 3729 { 3730 unsigned int c0 = *(unsigned int*)(source0 + 4 * x); 3731 unsigned int c1 = *(unsigned int*)(source1 + 4 * x); 3732 3733 c0 = AVERAGE(c0, c1); 3734 3735 *(unsigned int*)(source0 + 4 * x) = c0; 3736 } 3737 3738 source0 += pitch; 3739 source1 += pitch; 3740 } 3741 } 3742 else if(internal.depth == 4) 3743 { 3744 for(int y = 0; y < height; y++) 3745 { 3746 for(int x = 0; x < 2 * width; x++) 3747 { 3748 unsigned int c0 = *(unsigned int*)(source0 + 4 * x); 3749 unsigned int c1 = *(unsigned int*)(source1 + 4 * x); 3750 unsigned int c2 = *(unsigned int*)(source2 + 4 * x); 3751 unsigned int c3 = *(unsigned int*)(source3 + 4 * x); 3752 3753 c0 = AVERAGE(c0, c1); 3754 c2 = AVERAGE(c2, c3); 3755 c0 = AVERAGE(c0, c2); 3756 3757 *(unsigned int*)(source0 + 4 * x) = c0; 3758 } 3759 3760 source0 += pitch; 3761 source1 += pitch; 3762 source2 += pitch; 3763 source3 += pitch; 3764 } 3765 } 3766 else if(internal.depth == 8) 3767 { 3768 for(int y = 0; y < height; y++) 3769 { 3770 for(int x = 0; x < 2 * width; x++) 3771 { 3772 unsigned int c0 = *(unsigned int*)(source0 + 4 * x); 3773 unsigned int c1 = *(unsigned int*)(source1 + 4 * x); 3774 unsigned int c2 = *(unsigned int*)(source2 + 4 * x); 3775 unsigned int c3 = *(unsigned int*)(source3 + 4 * x); 3776 unsigned int c4 = *(unsigned int*)(source4 + 4 * x); 3777 unsigned int c5 = *(unsigned int*)(source5 + 4 * x); 3778 unsigned int c6 = *(unsigned int*)(source6 + 4 * x); 3779 unsigned int c7 = *(unsigned int*)(source7 + 4 * x); 3780 3781 c0 = AVERAGE(c0, c1); 3782 c2 = AVERAGE(c2, c3); 3783 c4 = AVERAGE(c4, c5); 3784 c6 = AVERAGE(c6, c7); 3785 c0 = AVERAGE(c0, c2); 3786 c4 = AVERAGE(c4, c6); 3787 c0 = AVERAGE(c0, c4); 3788 3789 *(unsigned int*)(source0 + 4 * x) = c0; 3790 } 3791 3792 source0 += pitch; 3793 source1 += pitch; 3794 source2 += pitch; 3795 source3 += pitch; 3796 source4 += pitch; 3797 source5 += pitch; 3798 source6 += pitch; 3799 source7 += pitch; 3800 } 3801 } 3802 else if(internal.depth == 16) 3803 { 3804 for(int y = 0; y < height; y++) 3805 { 3806 for(int x = 0; x < 2 * width; x++) 3807 { 3808 unsigned int c0 = *(unsigned int*)(source0 + 4 * x); 3809 unsigned int c1 = *(unsigned int*)(source1 + 4 * x); 3810 unsigned int c2 = *(unsigned int*)(source2 + 4 * x); 3811 unsigned int c3 = *(unsigned int*)(source3 + 4 * x); 3812 unsigned int c4 = *(unsigned int*)(source4 + 4 * x); 3813 unsigned int c5 = *(unsigned int*)(source5 + 4 * x); 3814 unsigned int c6 = *(unsigned int*)(source6 + 4 * x); 3815 unsigned int c7 = *(unsigned int*)(source7 + 4 * x); 3816 unsigned int c8 = *(unsigned int*)(source8 + 4 * x); 3817 unsigned int c9 = *(unsigned int*)(source9 + 4 * x); 3818 unsigned int cA = *(unsigned int*)(sourceA + 4 * x); 3819 unsigned int cB = *(unsigned int*)(sourceB + 4 * x); 3820 unsigned int cC = *(unsigned int*)(sourceC + 4 * x); 3821 unsigned int cD = *(unsigned int*)(sourceD + 4 * x); 3822 unsigned int cE = *(unsigned int*)(sourceE + 4 * x); 3823 unsigned int cF = *(unsigned int*)(sourceF + 4 * x); 3824 3825 c0 = AVERAGE(c0, c1); 3826 c2 = AVERAGE(c2, c3); 3827 c4 = AVERAGE(c4, c5); 3828 c6 = AVERAGE(c6, c7); 3829 c8 = AVERAGE(c8, c9); 3830 cA = AVERAGE(cA, cB); 3831 cC = AVERAGE(cC, cD); 3832 cE = AVERAGE(cE, cF); 3833 c0 = AVERAGE(c0, c2); 3834 c4 = AVERAGE(c4, c6); 3835 c8 = AVERAGE(c8, cA); 3836 cC = AVERAGE(cC, cE); 3837 c0 = AVERAGE(c0, c4); 3838 c8 = AVERAGE(c8, cC); 3839 c0 = AVERAGE(c0, c8); 3840 3841 *(unsigned int*)(source0 + 4 * x) = c0; 3842 } 3843 3844 source0 += pitch; 3845 source1 += pitch; 3846 source2 += pitch; 3847 source3 += pitch; 3848 source4 += pitch; 3849 source5 += pitch; 3850 source6 += pitch; 3851 source7 += pitch; 3852 source8 += pitch; 3853 source9 += pitch; 3854 sourceA += pitch; 3855 sourceB += pitch; 3856 sourceC += pitch; 3857 sourceD += pitch; 3858 sourceE += pitch; 3859 sourceF += pitch; 3860 } 3861 } 3862 else ASSERT(false); 3863 3864 #undef AVERAGE 3865 } 3866 } 3867 else if(internal.format == FORMAT_R32F) 3868 { 3869 if(CPUID::supportsSSE() && (width % 4) == 0) 3870 { 3871 if(internal.depth == 2) 3872 { 3873 for(int y = 0; y < height; y++) 3874 { 3875 for(int x = 0; x < width; x += 4) 3876 { 3877 __m128 c0 = _mm_load_ps((float*)(source0 + 4 * x)); 3878 __m128 c1 = _mm_load_ps((float*)(source1 + 4 * x)); 3879 3880 c0 = _mm_add_ps(c0, c1); 3881 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f)); 3882 3883 _mm_store_ps((float*)(source0 + 4 * x), c0); 3884 } 3885 3886 source0 += pitch; 3887 source1 += pitch; 3888 } 3889 } 3890 else if(internal.depth == 4) 3891 { 3892 for(int y = 0; y < height; y++) 3893 { 3894 for(int x = 0; x < width; x += 4) 3895 { 3896 __m128 c0 = _mm_load_ps((float*)(source0 + 4 * x)); 3897 __m128 c1 = _mm_load_ps((float*)(source1 + 4 * x)); 3898 __m128 c2 = _mm_load_ps((float*)(source2 + 4 * x)); 3899 __m128 c3 = _mm_load_ps((float*)(source3 + 4 * x)); 3900 3901 c0 = _mm_add_ps(c0, c1); 3902 c2 = _mm_add_ps(c2, c3); 3903 c0 = _mm_add_ps(c0, c2); 3904 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f)); 3905 3906 _mm_store_ps((float*)(source0 + 4 * x), c0); 3907 } 3908 3909 source0 += pitch; 3910 source1 += pitch; 3911 source2 += pitch; 3912 source3 += pitch; 3913 } 3914 } 3915 else if(internal.depth == 8) 3916 { 3917 for(int y = 0; y < height; y++) 3918 { 3919 for(int x = 0; x < width; x += 4) 3920 { 3921 __m128 c0 = _mm_load_ps((float*)(source0 + 4 * x)); 3922 __m128 c1 = _mm_load_ps((float*)(source1 + 4 * x)); 3923 __m128 c2 = _mm_load_ps((float*)(source2 + 4 * x)); 3924 __m128 c3 = _mm_load_ps((float*)(source3 + 4 * x)); 3925 __m128 c4 = _mm_load_ps((float*)(source4 + 4 * x)); 3926 __m128 c5 = _mm_load_ps((float*)(source5 + 4 * x)); 3927 __m128 c6 = _mm_load_ps((float*)(source6 + 4 * x)); 3928 __m128 c7 = _mm_load_ps((float*)(source7 + 4 * x)); 3929 3930 c0 = _mm_add_ps(c0, c1); 3931 c2 = _mm_add_ps(c2, c3); 3932 c4 = _mm_add_ps(c4, c5); 3933 c6 = _mm_add_ps(c6, c7); 3934 c0 = _mm_add_ps(c0, c2); 3935 c4 = _mm_add_ps(c4, c6); 3936 c0 = _mm_add_ps(c0, c4); 3937 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f)); 3938 3939 _mm_store_ps((float*)(source0 + 4 * x), c0); 3940 } 3941 3942 source0 += pitch; 3943 source1 += pitch; 3944 source2 += pitch; 3945 source3 += pitch; 3946 source4 += pitch; 3947 source5 += pitch; 3948 source6 += pitch; 3949 source7 += pitch; 3950 } 3951 } 3952 else if(internal.depth == 16) 3953 { 3954 for(int y = 0; y < height; y++) 3955 { 3956 for(int x = 0; x < width; x += 4) 3957 { 3958 __m128 c0 = _mm_load_ps((float*)(source0 + 4 * x)); 3959 __m128 c1 = _mm_load_ps((float*)(source1 + 4 * x)); 3960 __m128 c2 = _mm_load_ps((float*)(source2 + 4 * x)); 3961 __m128 c3 = _mm_load_ps((float*)(source3 + 4 * x)); 3962 __m128 c4 = _mm_load_ps((float*)(source4 + 4 * x)); 3963 __m128 c5 = _mm_load_ps((float*)(source5 + 4 * x)); 3964 __m128 c6 = _mm_load_ps((float*)(source6 + 4 * x)); 3965 __m128 c7 = _mm_load_ps((float*)(source7 + 4 * x)); 3966 __m128 c8 = _mm_load_ps((float*)(source8 + 4 * x)); 3967 __m128 c9 = _mm_load_ps((float*)(source9 + 4 * x)); 3968 __m128 cA = _mm_load_ps((float*)(sourceA + 4 * x)); 3969 __m128 cB = _mm_load_ps((float*)(sourceB + 4 * x)); 3970 __m128 cC = _mm_load_ps((float*)(sourceC + 4 * x)); 3971 __m128 cD = _mm_load_ps((float*)(sourceD + 4 * x)); 3972 __m128 cE = _mm_load_ps((float*)(sourceE + 4 * x)); 3973 __m128 cF = _mm_load_ps((float*)(sourceF + 4 * x)); 3974 3975 c0 = _mm_add_ps(c0, c1); 3976 c2 = _mm_add_ps(c2, c3); 3977 c4 = _mm_add_ps(c4, c5); 3978 c6 = _mm_add_ps(c6, c7); 3979 c8 = _mm_add_ps(c8, c9); 3980 cA = _mm_add_ps(cA, cB); 3981 cC = _mm_add_ps(cC, cD); 3982 cE = _mm_add_ps(cE, cF); 3983 c0 = _mm_add_ps(c0, c2); 3984 c4 = _mm_add_ps(c4, c6); 3985 c8 = _mm_add_ps(c8, cA); 3986 cC = _mm_add_ps(cC, cE); 3987 c0 = _mm_add_ps(c0, c4); 3988 c8 = _mm_add_ps(c8, cC); 3989 c0 = _mm_add_ps(c0, c8); 3990 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f)); 3991 3992 _mm_store_ps((float*)(source0 + 4 * x), c0); 3993 } 3994 3995 source0 += pitch; 3996 source1 += pitch; 3997 source2 += pitch; 3998 source3 += pitch; 3999 source4 += pitch; 4000 source5 += pitch; 4001 source6 += pitch; 4002 source7 += pitch; 4003 source8 += pitch; 4004 source9 += pitch; 4005 sourceA += pitch; 4006 sourceB += pitch; 4007 sourceC += pitch; 4008 sourceD += pitch; 4009 sourceE += pitch; 4010 sourceF += pitch; 4011 } 4012 } 4013 else ASSERT(false); 4014 } 4015 else 4016 { 4017 if(internal.depth == 2) 4018 { 4019 for(int y = 0; y < height; y++) 4020 { 4021 for(int x = 0; x < width; x++) 4022 { 4023 float c0 = *(float*)(source0 + 4 * x); 4024 float c1 = *(float*)(source1 + 4 * x); 4025 4026 c0 = c0 + c1; 4027 c0 *= 1.0f / 2.0f; 4028 4029 *(float*)(source0 + 4 * x) = c0; 4030 } 4031 4032 source0 += pitch; 4033 source1 += pitch; 4034 } 4035 } 4036 else if(internal.depth == 4) 4037 { 4038 for(int y = 0; y < height; y++) 4039 { 4040 for(int x = 0; x < width; x++) 4041 { 4042 float c0 = *(float*)(source0 + 4 * x); 4043 float c1 = *(float*)(source1 + 4 * x); 4044 float c2 = *(float*)(source2 + 4 * x); 4045 float c3 = *(float*)(source3 + 4 * x); 4046 4047 c0 = c0 + c1; 4048 c2 = c2 + c3; 4049 c0 = c0 + c2; 4050 c0 *= 1.0f / 4.0f; 4051 4052 *(float*)(source0 + 4 * x) = c0; 4053 } 4054 4055 source0 += pitch; 4056 source1 += pitch; 4057 source2 += pitch; 4058 source3 += pitch; 4059 } 4060 } 4061 else if(internal.depth == 8) 4062 { 4063 for(int y = 0; y < height; y++) 4064 { 4065 for(int x = 0; x < width; x++) 4066 { 4067 float c0 = *(float*)(source0 + 4 * x); 4068 float c1 = *(float*)(source1 + 4 * x); 4069 float c2 = *(float*)(source2 + 4 * x); 4070 float c3 = *(float*)(source3 + 4 * x); 4071 float c4 = *(float*)(source4 + 4 * x); 4072 float c5 = *(float*)(source5 + 4 * x); 4073 float c6 = *(float*)(source6 + 4 * x); 4074 float c7 = *(float*)(source7 + 4 * x); 4075 4076 c0 = c0 + c1; 4077 c2 = c2 + c3; 4078 c4 = c4 + c5; 4079 c6 = c6 + c7; 4080 c0 = c0 + c2; 4081 c4 = c4 + c6; 4082 c0 = c0 + c4; 4083 c0 *= 1.0f / 8.0f; 4084 4085 *(float*)(source0 + 4 * x) = c0; 4086 } 4087 4088 source0 += pitch; 4089 source1 += pitch; 4090 source2 += pitch; 4091 source3 += pitch; 4092 source4 += pitch; 4093 source5 += pitch; 4094 source6 += pitch; 4095 source7 += pitch; 4096 } 4097 } 4098 else if(internal.depth == 16) 4099 { 4100 for(int y = 0; y < height; y++) 4101 { 4102 for(int x = 0; x < width; x++) 4103 { 4104 float c0 = *(float*)(source0 + 4 * x); 4105 float c1 = *(float*)(source1 + 4 * x); 4106 float c2 = *(float*)(source2 + 4 * x); 4107 float c3 = *(float*)(source3 + 4 * x); 4108 float c4 = *(float*)(source4 + 4 * x); 4109 float c5 = *(float*)(source5 + 4 * x); 4110 float c6 = *(float*)(source6 + 4 * x); 4111 float c7 = *(float*)(source7 + 4 * x); 4112 float c8 = *(float*)(source8 + 4 * x); 4113 float c9 = *(float*)(source9 + 4 * x); 4114 float cA = *(float*)(sourceA + 4 * x); 4115 float cB = *(float*)(sourceB + 4 * x); 4116 float cC = *(float*)(sourceC + 4 * x); 4117 float cD = *(float*)(sourceD + 4 * x); 4118 float cE = *(float*)(sourceE + 4 * x); 4119 float cF = *(float*)(sourceF + 4 * x); 4120 4121 c0 = c0 + c1; 4122 c2 = c2 + c3; 4123 c4 = c4 + c5; 4124 c6 = c6 + c7; 4125 c8 = c8 + c9; 4126 cA = cA + cB; 4127 cC = cC + cD; 4128 cE = cE + cF; 4129 c0 = c0 + c2; 4130 c4 = c4 + c6; 4131 c8 = c8 + cA; 4132 cC = cC + cE; 4133 c0 = c0 + c4; 4134 c8 = c8 + cC; 4135 c0 = c0 + c8; 4136 c0 *= 1.0f / 16.0f; 4137 4138 *(float*)(source0 + 4 * x) = c0; 4139 } 4140 4141 source0 += pitch; 4142 source1 += pitch; 4143 source2 += pitch; 4144 source3 += pitch; 4145 source4 += pitch; 4146 source5 += pitch; 4147 source6 += pitch; 4148 source7 += pitch; 4149 source8 += pitch; 4150 source9 += pitch; 4151 sourceA += pitch; 4152 sourceB += pitch; 4153 sourceC += pitch; 4154 sourceD += pitch; 4155 sourceE += pitch; 4156 sourceF += pitch; 4157 } 4158 } 4159 else ASSERT(false); 4160 } 4161 } 4162 else if(internal.format == FORMAT_G32R32F) 4163 { 4164 if(CPUID::supportsSSE() && (width % 2) == 0) 4165 { 4166 if(internal.depth == 2) 4167 { 4168 for(int y = 0; y < height; y++) 4169 { 4170 for(int x = 0; x < width; x += 2) 4171 { 4172 __m128 c0 = _mm_load_ps((float*)(source0 + 8 * x)); 4173 __m128 c1 = _mm_load_ps((float*)(source1 + 8 * x)); 4174 4175 c0 = _mm_add_ps(c0, c1); 4176 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f)); 4177 4178 _mm_store_ps((float*)(source0 + 8 * x), c0); 4179 } 4180 4181 source0 += pitch; 4182 source1 += pitch; 4183 } 4184 } 4185 else if(internal.depth == 4) 4186 { 4187 for(int y = 0; y < height; y++) 4188 { 4189 for(int x = 0; x < width; x += 2) 4190 { 4191 __m128 c0 = _mm_load_ps((float*)(source0 + 8 * x)); 4192 __m128 c1 = _mm_load_ps((float*)(source1 + 8 * x)); 4193 __m128 c2 = _mm_load_ps((float*)(source2 + 8 * x)); 4194 __m128 c3 = _mm_load_ps((float*)(source3 + 8 * x)); 4195 4196 c0 = _mm_add_ps(c0, c1); 4197 c2 = _mm_add_ps(c2, c3); 4198 c0 = _mm_add_ps(c0, c2); 4199 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f)); 4200 4201 _mm_store_ps((float*)(source0 + 8 * x), c0); 4202 } 4203 4204 source0 += pitch; 4205 source1 += pitch; 4206 source2 += pitch; 4207 source3 += pitch; 4208 } 4209 } 4210 else if(internal.depth == 8) 4211 { 4212 for(int y = 0; y < height; y++) 4213 { 4214 for(int x = 0; x < width; x += 2) 4215 { 4216 __m128 c0 = _mm_load_ps((float*)(source0 + 8 * x)); 4217 __m128 c1 = _mm_load_ps((float*)(source1 + 8 * x)); 4218 __m128 c2 = _mm_load_ps((float*)(source2 + 8 * x)); 4219 __m128 c3 = _mm_load_ps((float*)(source3 + 8 * x)); 4220 __m128 c4 = _mm_load_ps((float*)(source4 + 8 * x)); 4221 __m128 c5 = _mm_load_ps((float*)(source5 + 8 * x)); 4222 __m128 c6 = _mm_load_ps((float*)(source6 + 8 * x)); 4223 __m128 c7 = _mm_load_ps((float*)(source7 + 8 * x)); 4224 4225 c0 = _mm_add_ps(c0, c1); 4226 c2 = _mm_add_ps(c2, c3); 4227 c4 = _mm_add_ps(c4, c5); 4228 c6 = _mm_add_ps(c6, c7); 4229 c0 = _mm_add_ps(c0, c2); 4230 c4 = _mm_add_ps(c4, c6); 4231 c0 = _mm_add_ps(c0, c4); 4232 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f)); 4233 4234 _mm_store_ps((float*)(source0 + 8 * x), c0); 4235 } 4236 4237 source0 += pitch; 4238 source1 += pitch; 4239 source2 += pitch; 4240 source3 += pitch; 4241 source4 += pitch; 4242 source5 += pitch; 4243 source6 += pitch; 4244 source7 += pitch; 4245 } 4246 } 4247 else if(internal.depth == 16) 4248 { 4249 for(int y = 0; y < height; y++) 4250 { 4251 for(int x = 0; x < width; x += 2) 4252 { 4253 __m128 c0 = _mm_load_ps((float*)(source0 + 8 * x)); 4254 __m128 c1 = _mm_load_ps((float*)(source1 + 8 * x)); 4255 __m128 c2 = _mm_load_ps((float*)(source2 + 8 * x)); 4256 __m128 c3 = _mm_load_ps((float*)(source3 + 8 * x)); 4257 __m128 c4 = _mm_load_ps((float*)(source4 + 8 * x)); 4258 __m128 c5 = _mm_load_ps((float*)(source5 + 8 * x)); 4259 __m128 c6 = _mm_load_ps((float*)(source6 + 8 * x)); 4260 __m128 c7 = _mm_load_ps((float*)(source7 + 8 * x)); 4261 __m128 c8 = _mm_load_ps((float*)(source8 + 8 * x)); 4262 __m128 c9 = _mm_load_ps((float*)(source9 + 8 * x)); 4263 __m128 cA = _mm_load_ps((float*)(sourceA + 8 * x)); 4264 __m128 cB = _mm_load_ps((float*)(sourceB + 8 * x)); 4265 __m128 cC = _mm_load_ps((float*)(sourceC + 8 * x)); 4266 __m128 cD = _mm_load_ps((float*)(sourceD + 8 * x)); 4267 __m128 cE = _mm_load_ps((float*)(sourceE + 8 * x)); 4268 __m128 cF = _mm_load_ps((float*)(sourceF + 8 * x)); 4269 4270 c0 = _mm_add_ps(c0, c1); 4271 c2 = _mm_add_ps(c2, c3); 4272 c4 = _mm_add_ps(c4, c5); 4273 c6 = _mm_add_ps(c6, c7); 4274 c8 = _mm_add_ps(c8, c9); 4275 cA = _mm_add_ps(cA, cB); 4276 cC = _mm_add_ps(cC, cD); 4277 cE = _mm_add_ps(cE, cF); 4278 c0 = _mm_add_ps(c0, c2); 4279 c4 = _mm_add_ps(c4, c6); 4280 c8 = _mm_add_ps(c8, cA); 4281 cC = _mm_add_ps(cC, cE); 4282 c0 = _mm_add_ps(c0, c4); 4283 c8 = _mm_add_ps(c8, cC); 4284 c0 = _mm_add_ps(c0, c8); 4285 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f)); 4286 4287 _mm_store_ps((float*)(source0 + 8 * x), c0); 4288 } 4289 4290 source0 += pitch; 4291 source1 += pitch; 4292 source2 += pitch; 4293 source3 += pitch; 4294 source4 += pitch; 4295 source5 += pitch; 4296 source6 += pitch; 4297 source7 += pitch; 4298 source8 += pitch; 4299 source9 += pitch; 4300 sourceA += pitch; 4301 sourceB += pitch; 4302 sourceC += pitch; 4303 sourceD += pitch; 4304 sourceE += pitch; 4305 sourceF += pitch; 4306 } 4307 } 4308 else ASSERT(false); 4309 } 4310 else 4311 { 4312 if(internal.depth == 2) 4313 { 4314 for(int y = 0; y < height; y++) 4315 { 4316 for(int x = 0; x < 2 * width; x++) 4317 { 4318 float c0 = *(float*)(source0 + 4 * x); 4319 float c1 = *(float*)(source1 + 4 * x); 4320 4321 c0 = c0 + c1; 4322 c0 *= 1.0f / 2.0f; 4323 4324 *(float*)(source0 + 4 * x) = c0; 4325 } 4326 4327 source0 += pitch; 4328 source1 += pitch; 4329 } 4330 } 4331 else if(internal.depth == 4) 4332 { 4333 for(int y = 0; y < height; y++) 4334 { 4335 for(int x = 0; x < 2 * width; x++) 4336 { 4337 float c0 = *(float*)(source0 + 4 * x); 4338 float c1 = *(float*)(source1 + 4 * x); 4339 float c2 = *(float*)(source2 + 4 * x); 4340 float c3 = *(float*)(source3 + 4 * x); 4341 4342 c0 = c0 + c1; 4343 c2 = c2 + c3; 4344 c0 = c0 + c2; 4345 c0 *= 1.0f / 4.0f; 4346 4347 *(float*)(source0 + 4 * x) = c0; 4348 } 4349 4350 source0 += pitch; 4351 source1 += pitch; 4352 source2 += pitch; 4353 source3 += pitch; 4354 } 4355 } 4356 else if(internal.depth == 8) 4357 { 4358 for(int y = 0; y < height; y++) 4359 { 4360 for(int x = 0; x < 2 * width; x++) 4361 { 4362 float c0 = *(float*)(source0 + 4 * x); 4363 float c1 = *(float*)(source1 + 4 * x); 4364 float c2 = *(float*)(source2 + 4 * x); 4365 float c3 = *(float*)(source3 + 4 * x); 4366 float c4 = *(float*)(source4 + 4 * x); 4367 float c5 = *(float*)(source5 + 4 * x); 4368 float c6 = *(float*)(source6 + 4 * x); 4369 float c7 = *(float*)(source7 + 4 * x); 4370 4371 c0 = c0 + c1; 4372 c2 = c2 + c3; 4373 c4 = c4 + c5; 4374 c6 = c6 + c7; 4375 c0 = c0 + c2; 4376 c4 = c4 + c6; 4377 c0 = c0 + c4; 4378 c0 *= 1.0f / 8.0f; 4379 4380 *(float*)(source0 + 4 * x) = c0; 4381 } 4382 4383 source0 += pitch; 4384 source1 += pitch; 4385 source2 += pitch; 4386 source3 += pitch; 4387 source4 += pitch; 4388 source5 += pitch; 4389 source6 += pitch; 4390 source7 += pitch; 4391 } 4392 } 4393 else if(internal.depth == 16) 4394 { 4395 for(int y = 0; y < height; y++) 4396 { 4397 for(int x = 0; x < 2 * width; x++) 4398 { 4399 float c0 = *(float*)(source0 + 4 * x); 4400 float c1 = *(float*)(source1 + 4 * x); 4401 float c2 = *(float*)(source2 + 4 * x); 4402 float c3 = *(float*)(source3 + 4 * x); 4403 float c4 = *(float*)(source4 + 4 * x); 4404 float c5 = *(float*)(source5 + 4 * x); 4405 float c6 = *(float*)(source6 + 4 * x); 4406 float c7 = *(float*)(source7 + 4 * x); 4407 float c8 = *(float*)(source8 + 4 * x); 4408 float c9 = *(float*)(source9 + 4 * x); 4409 float cA = *(float*)(sourceA + 4 * x); 4410 float cB = *(float*)(sourceB + 4 * x); 4411 float cC = *(float*)(sourceC + 4 * x); 4412 float cD = *(float*)(sourceD + 4 * x); 4413 float cE = *(float*)(sourceE + 4 * x); 4414 float cF = *(float*)(sourceF + 4 * x); 4415 4416 c0 = c0 + c1; 4417 c2 = c2 + c3; 4418 c4 = c4 + c5; 4419 c6 = c6 + c7; 4420 c8 = c8 + c9; 4421 cA = cA + cB; 4422 cC = cC + cD; 4423 cE = cE + cF; 4424 c0 = c0 + c2; 4425 c4 = c4 + c6; 4426 c8 = c8 + cA; 4427 cC = cC + cE; 4428 c0 = c0 + c4; 4429 c8 = c8 + cC; 4430 c0 = c0 + c8; 4431 c0 *= 1.0f / 16.0f; 4432 4433 *(float*)(source0 + 4 * x) = c0; 4434 } 4435 4436 source0 += pitch; 4437 source1 += pitch; 4438 source2 += pitch; 4439 source3 += pitch; 4440 source4 += pitch; 4441 source5 += pitch; 4442 source6 += pitch; 4443 source7 += pitch; 4444 source8 += pitch; 4445 source9 += pitch; 4446 sourceA += pitch; 4447 sourceB += pitch; 4448 sourceC += pitch; 4449 sourceD += pitch; 4450 sourceE += pitch; 4451 sourceF += pitch; 4452 } 4453 } 4454 else ASSERT(false); 4455 } 4456 } 4457 else if(internal.format == FORMAT_A32B32G32R32F) 4458 { 4459 if(CPUID::supportsSSE()) 4460 { 4461 if(internal.depth == 2) 4462 { 4463 for(int y = 0; y < height; y++) 4464 { 4465 for(int x = 0; x < width; x++) 4466 { 4467 __m128 c0 = _mm_load_ps((float*)(source0 + 16 * x)); 4468 __m128 c1 = _mm_load_ps((float*)(source1 + 16 * x)); 4469 4470 c0 = _mm_add_ps(c0, c1); 4471 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 2.0f)); 4472 4473 _mm_store_ps((float*)(source0 + 16 * x), c0); 4474 } 4475 4476 source0 += pitch; 4477 source1 += pitch; 4478 } 4479 } 4480 else if(internal.depth == 4) 4481 { 4482 for(int y = 0; y < height; y++) 4483 { 4484 for(int x = 0; x < width; x++) 4485 { 4486 __m128 c0 = _mm_load_ps((float*)(source0 + 16 * x)); 4487 __m128 c1 = _mm_load_ps((float*)(source1 + 16 * x)); 4488 __m128 c2 = _mm_load_ps((float*)(source2 + 16 * x)); 4489 __m128 c3 = _mm_load_ps((float*)(source3 + 16 * x)); 4490 4491 c0 = _mm_add_ps(c0, c1); 4492 c2 = _mm_add_ps(c2, c3); 4493 c0 = _mm_add_ps(c0, c2); 4494 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 4.0f)); 4495 4496 _mm_store_ps((float*)(source0 + 16 * x), c0); 4497 } 4498 4499 source0 += pitch; 4500 source1 += pitch; 4501 source2 += pitch; 4502 source3 += pitch; 4503 } 4504 } 4505 else if(internal.depth == 8) 4506 { 4507 for(int y = 0; y < height; y++) 4508 { 4509 for(int x = 0; x < width; x++) 4510 { 4511 __m128 c0 = _mm_load_ps((float*)(source0 + 16 * x)); 4512 __m128 c1 = _mm_load_ps((float*)(source1 + 16 * x)); 4513 __m128 c2 = _mm_load_ps((float*)(source2 + 16 * x)); 4514 __m128 c3 = _mm_load_ps((float*)(source3 + 16 * x)); 4515 __m128 c4 = _mm_load_ps((float*)(source4 + 16 * x)); 4516 __m128 c5 = _mm_load_ps((float*)(source5 + 16 * x)); 4517 __m128 c6 = _mm_load_ps((float*)(source6 + 16 * x)); 4518 __m128 c7 = _mm_load_ps((float*)(source7 + 16 * x)); 4519 4520 c0 = _mm_add_ps(c0, c1); 4521 c2 = _mm_add_ps(c2, c3); 4522 c4 = _mm_add_ps(c4, c5); 4523 c6 = _mm_add_ps(c6, c7); 4524 c0 = _mm_add_ps(c0, c2); 4525 c4 = _mm_add_ps(c4, c6); 4526 c0 = _mm_add_ps(c0, c4); 4527 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 8.0f)); 4528 4529 _mm_store_ps((float*)(source0 + 16 * x), c0); 4530 } 4531 4532 source0 += pitch; 4533 source1 += pitch; 4534 source2 += pitch; 4535 source3 += pitch; 4536 source4 += pitch; 4537 source5 += pitch; 4538 source6 += pitch; 4539 source7 += pitch; 4540 } 4541 } 4542 else if(internal.depth == 16) 4543 { 4544 for(int y = 0; y < height; y++) 4545 { 4546 for(int x = 0; x < width; x++) 4547 { 4548 __m128 c0 = _mm_load_ps((float*)(source0 + 16 * x)); 4549 __m128 c1 = _mm_load_ps((float*)(source1 + 16 * x)); 4550 __m128 c2 = _mm_load_ps((float*)(source2 + 16 * x)); 4551 __m128 c3 = _mm_load_ps((float*)(source3 + 16 * x)); 4552 __m128 c4 = _mm_load_ps((float*)(source4 + 16 * x)); 4553 __m128 c5 = _mm_load_ps((float*)(source5 + 16 * x)); 4554 __m128 c6 = _mm_load_ps((float*)(source6 + 16 * x)); 4555 __m128 c7 = _mm_load_ps((float*)(source7 + 16 * x)); 4556 __m128 c8 = _mm_load_ps((float*)(source8 + 16 * x)); 4557 __m128 c9 = _mm_load_ps((float*)(source9 + 16 * x)); 4558 __m128 cA = _mm_load_ps((float*)(sourceA + 16 * x)); 4559 __m128 cB = _mm_load_ps((float*)(sourceB + 16 * x)); 4560 __m128 cC = _mm_load_ps((float*)(sourceC + 16 * x)); 4561 __m128 cD = _mm_load_ps((float*)(sourceD + 16 * x)); 4562 __m128 cE = _mm_load_ps((float*)(sourceE + 16 * x)); 4563 __m128 cF = _mm_load_ps((float*)(sourceF + 16 * x)); 4564 4565 c0 = _mm_add_ps(c0, c1); 4566 c2 = _mm_add_ps(c2, c3); 4567 c4 = _mm_add_ps(c4, c5); 4568 c6 = _mm_add_ps(c6, c7); 4569 c8 = _mm_add_ps(c8, c9); 4570 cA = _mm_add_ps(cA, cB); 4571 cC = _mm_add_ps(cC, cD); 4572 cE = _mm_add_ps(cE, cF); 4573 c0 = _mm_add_ps(c0, c2); 4574 c4 = _mm_add_ps(c4, c6); 4575 c8 = _mm_add_ps(c8, cA); 4576 cC = _mm_add_ps(cC, cE); 4577 c0 = _mm_add_ps(c0, c4); 4578 c8 = _mm_add_ps(c8, cC); 4579 c0 = _mm_add_ps(c0, c8); 4580 c0 = _mm_mul_ps(c0, _mm_set1_ps(1.0f / 16.0f)); 4581 4582 _mm_store_ps((float*)(source0 + 16 * x), c0); 4583 } 4584 4585 source0 += pitch; 4586 source1 += pitch; 4587 source2 += pitch; 4588 source3 += pitch; 4589 source4 += pitch; 4590 source5 += pitch; 4591 source6 += pitch; 4592 source7 += pitch; 4593 source8 += pitch; 4594 source9 += pitch; 4595 sourceA += pitch; 4596 sourceB += pitch; 4597 sourceC += pitch; 4598 sourceD += pitch; 4599 sourceE += pitch; 4600 sourceF += pitch; 4601 } 4602 } 4603 else ASSERT(false); 4604 } 4605 else 4606 { 4607 if(internal.depth == 2) 4608 { 4609 for(int y = 0; y < height; y++) 4610 { 4611 for(int x = 0; x < 4 * width; x++) 4612 { 4613 float c0 = *(float*)(source0 + 4 * x); 4614 float c1 = *(float*)(source1 + 4 * x); 4615 4616 c0 = c0 + c1; 4617 c0 *= 1.0f / 2.0f; 4618 4619 *(float*)(source0 + 4 * x) = c0; 4620 } 4621 4622 source0 += pitch; 4623 source1 += pitch; 4624 } 4625 } 4626 else if(internal.depth == 4) 4627 { 4628 for(int y = 0; y < height; y++) 4629 { 4630 for(int x = 0; x < 4 * width; x++) 4631 { 4632 float c0 = *(float*)(source0 + 4 * x); 4633 float c1 = *(float*)(source1 + 4 * x); 4634 float c2 = *(float*)(source2 + 4 * x); 4635 float c3 = *(float*)(source3 + 4 * x); 4636 4637 c0 = c0 + c1; 4638 c2 = c2 + c3; 4639 c0 = c0 + c2; 4640 c0 *= 1.0f / 4.0f; 4641 4642 *(float*)(source0 + 4 * x) = c0; 4643 } 4644 4645 source0 += pitch; 4646 source1 += pitch; 4647 source2 += pitch; 4648 source3 += pitch; 4649 } 4650 } 4651 else if(internal.depth == 8) 4652 { 4653 for(int y = 0; y < height; y++) 4654 { 4655 for(int x = 0; x < 4 * width; x++) 4656 { 4657 float c0 = *(float*)(source0 + 4 * x); 4658 float c1 = *(float*)(source1 + 4 * x); 4659 float c2 = *(float*)(source2 + 4 * x); 4660 float c3 = *(float*)(source3 + 4 * x); 4661 float c4 = *(float*)(source4 + 4 * x); 4662 float c5 = *(float*)(source5 + 4 * x); 4663 float c6 = *(float*)(source6 + 4 * x); 4664 float c7 = *(float*)(source7 + 4 * x); 4665 4666 c0 = c0 + c1; 4667 c2 = c2 + c3; 4668 c4 = c4 + c5; 4669 c6 = c6 + c7; 4670 c0 = c0 + c2; 4671 c4 = c4 + c6; 4672 c0 = c0 + c4; 4673 c0 *= 1.0f / 8.0f; 4674 4675 *(float*)(source0 + 4 * x) = c0; 4676 } 4677 4678 source0 += pitch; 4679 source1 += pitch; 4680 source2 += pitch; 4681 source3 += pitch; 4682 source4 += pitch; 4683 source5 += pitch; 4684 source6 += pitch; 4685 source7 += pitch; 4686 } 4687 } 4688 else if(internal.depth == 16) 4689 { 4690 for(int y = 0; y < height; y++) 4691 { 4692 for(int x = 0; x < 4 * width; x++) 4693 { 4694 float c0 = *(float*)(source0 + 4 * x); 4695 float c1 = *(float*)(source1 + 4 * x); 4696 float c2 = *(float*)(source2 + 4 * x); 4697 float c3 = *(float*)(source3 + 4 * x); 4698 float c4 = *(float*)(source4 + 4 * x); 4699 float c5 = *(float*)(source5 + 4 * x); 4700 float c6 = *(float*)(source6 + 4 * x); 4701 float c7 = *(float*)(source7 + 4 * x); 4702 float c8 = *(float*)(source8 + 4 * x); 4703 float c9 = *(float*)(source9 + 4 * x); 4704 float cA = *(float*)(sourceA + 4 * x); 4705 float cB = *(float*)(sourceB + 4 * x); 4706 float cC = *(float*)(sourceC + 4 * x); 4707 float cD = *(float*)(sourceD + 4 * x); 4708 float cE = *(float*)(sourceE + 4 * x); 4709 float cF = *(float*)(sourceF + 4 * x); 4710 4711 c0 = c0 + c1; 4712 c2 = c2 + c3; 4713 c4 = c4 + c5; 4714 c6 = c6 + c7; 4715 c8 = c8 + c9; 4716 cA = cA + cB; 4717 cC = cC + cD; 4718 cE = cE + cF; 4719 c0 = c0 + c2; 4720 c4 = c4 + c6; 4721 c8 = c8 + cA; 4722 cC = cC + cE; 4723 c0 = c0 + c4; 4724 c8 = c8 + cC; 4725 c0 = c0 + c8; 4726 c0 *= 1.0f / 16.0f; 4727 4728 *(float*)(source0 + 4 * x) = c0; 4729 } 4730 4731 source0 += pitch; 4732 source1 += pitch; 4733 source2 += pitch; 4734 source3 += pitch; 4735 source4 += pitch; 4736 source5 += pitch; 4737 source6 += pitch; 4738 source7 += pitch; 4739 source8 += pitch; 4740 source9 += pitch; 4741 sourceA += pitch; 4742 sourceB += pitch; 4743 sourceC += pitch; 4744 sourceD += pitch; 4745 sourceE += pitch; 4746 sourceF += pitch; 4747 } 4748 } 4749 else ASSERT(false); 4750 } 4751 } 4752 else 4753 { 4754 // UNIMPLEMENTED(); 4755 } 4756 } 4757} 4758