16c354881b63935626a0700366937530d38b8b1e8krajcevski/*
26c354881b63935626a0700366937530d38b8b1e8krajcevski * Copyright 2014 Google Inc.
36c354881b63935626a0700366937530d38b8b1e8krajcevski *
46c354881b63935626a0700366937530d38b8b1e8krajcevski * Use of this source code is governed by a BSD-style license that can be
56c354881b63935626a0700366937530d38b8b1e8krajcevski * found in the LICENSE file.
66c354881b63935626a0700366937530d38b8b1e8krajcevski */
76c354881b63935626a0700366937530d38b8b1e8krajcevski
86c354881b63935626a0700366937530d38b8b1e8krajcevski#include "SkTextureCompressor.h"
9d5e46c7893afdd5976c1581a2ae81168252f5deckrajcevski#include "SkTextureCompressor_Blitter.h"
106c354881b63935626a0700366937530d38b8b1e8krajcevski
11b8ccc2f6d258a8466f79fc418e9e0a55aeaf58cekrajcevski#include "SkBlitter.h"
126c354881b63935626a0700366937530d38b8b1e8krajcevski#include "SkEndian.h"
136c354881b63935626a0700366937530d38b8b1e8krajcevski
146c354881b63935626a0700366937530d38b8b1e8krajcevski// #define COMPRESS_R11_EAC_SLOW 1
156c354881b63935626a0700366937530d38b8b1e8krajcevski// #define COMPRESS_R11_EAC_FAST 1
166c354881b63935626a0700366937530d38b8b1e8krajcevski#define COMPRESS_R11_EAC_FASTEST 1
176c354881b63935626a0700366937530d38b8b1e8krajcevski
186c354881b63935626a0700366937530d38b8b1e8krajcevski// Blocks compressed into R11 EAC are represented as follows:
196c354881b63935626a0700366937530d38b8b1e8krajcevski// 0000000000000000000000000000000000000000000000000000000000000000
// |base_cw|mul|mod|  ----------------- indices -------------------
216c354881b63935626a0700366937530d38b8b1e8krajcevski//
226c354881b63935626a0700366937530d38b8b1e8krajcevski// To reconstruct the value of a given pixel, we use the formula:
236c354881b63935626a0700366937530d38b8b1e8krajcevski// clamp[0, 2047](base_cw * 8 + 4 + mod_val*mul*8)
246c354881b63935626a0700366937530d38b8b1e8krajcevski//
256c354881b63935626a0700366937530d38b8b1e8krajcevski// mod_val is chosen from a palette of values based on the index of the
266c354881b63935626a0700366937530d38b8b1e8krajcevski// given pixel. The palette is chosen by the value stored in mod.
276c354881b63935626a0700366937530d38b8b1e8krajcevski// This formula returns a value between 0 and 2047, which is converted
286c354881b63935626a0700366937530d38b8b1e8krajcevski// to a float from 0 to 1 in OpenGL.
296c354881b63935626a0700366937530d38b8b1e8krajcevski//
306c354881b63935626a0700366937530d38b8b1e8krajcevski// If mul is zero, then we set mul = 1/8, so that the formula becomes
316c354881b63935626a0700366937530d38b8b1e8krajcevski// clamp[0, 2047](base_cw * 8 + 4 + mod_val)
326c354881b63935626a0700366937530d38b8b1e8krajcevski
// Modifier tables used by R11 EAC. The 4-bit "mod" field of a block selects
// one of the rows below, and each pixel's 3-bit index selects one entry
// (mod_val) from that row.
//
// Within a row, indices 0-3 hold negative modifiers of increasing magnitude
// and indices 4-7 hold the non-negative ones, so the entries are NOT sorted
// by value.
static const int kNumR11EACPalettes = 16;  // number of rows (values of "mod")
static const int kR11EACPaletteSize = 8;   // entries per row (values of a pixel index)
static const int kR11EACModifierPalettes[kNumR11EACPalettes][kR11EACPaletteSize] = {
    // idx:  0    1    2    3   4  5  6   7
    {       -3,  -6,  -9, -15,  2, 5, 8, 14 },  // mod  0
    {       -3,  -7, -10, -13,  2, 6, 9, 12 },  // mod  1
    {       -2,  -5,  -8, -13,  1, 4, 7, 12 },  // mod  2
    {       -2,  -4,  -6, -13,  1, 3, 5, 12 },  // mod  3
    {       -3,  -6,  -8, -12,  2, 5, 7, 11 },  // mod  4
    {       -3,  -7,  -9, -11,  2, 6, 8, 10 },  // mod  5
    {       -4,  -7,  -8, -11,  3, 6, 7, 10 },  // mod  6
    {       -3,  -5,  -8, -11,  2, 4, 7, 10 },  // mod  7
    {       -2,  -6,  -8, -10,  1, 5, 7,  9 },  // mod  8
    {       -2,  -5,  -8, -10,  1, 4, 7,  9 },  // mod  9
    {       -2,  -4,  -8, -10,  1, 3, 7,  9 },  // mod 10
    {       -2,  -5,  -7, -10,  1, 4, 6,  9 },  // mod 11
    {       -3,  -4,  -7, -10,  2, 3, 6,  9 },  // mod 12
    {       -1,  -2,  -3, -10,  0, 1, 2,  9 },  // mod 13
    {       -4,  -6,  -8,  -9,  3, 5, 7,  8 },  // mod 14
    {       -3,  -5,  -7,  -9,  2, 4, 6,  8 }   // mod 15
};
536c354881b63935626a0700366937530d38b8b1e8krajcevski
544ad76e35111585f4da662d54943f23792dd1e0aekrajcevski#if COMPRESS_R11_EAC_SLOW
554ad76e35111585f4da662d54943f23792dd1e0aekrajcevski
566c354881b63935626a0700366937530d38b8b1e8krajcevski// Pack the base codeword, palette, and multiplier into the 64 bits necessary
576c354881b63935626a0700366937530d38b8b1e8krajcevski// to decode it.
586c354881b63935626a0700366937530d38b8b1e8krajcevskistatic uint64_t pack_r11eac_block(uint16_t base_cw, uint16_t palette, uint16_t multiplier,
596c354881b63935626a0700366937530d38b8b1e8krajcevski                                  uint64_t indices) {
606c354881b63935626a0700366937530d38b8b1e8krajcevski    SkASSERT(palette < 16);
616c354881b63935626a0700366937530d38b8b1e8krajcevski    SkASSERT(multiplier < 16);
626c354881b63935626a0700366937530d38b8b1e8krajcevski    SkASSERT(indices < (static_cast<uint64_t>(1) << 48));
636c354881b63935626a0700366937530d38b8b1e8krajcevski
646c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint64_t b = static_cast<uint64_t>(base_cw) << 56;
656c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint64_t m = static_cast<uint64_t>(multiplier) << 52;
666c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint64_t p = static_cast<uint64_t>(palette) << 48;
676c354881b63935626a0700366937530d38b8b1e8krajcevski    return SkEndian_SwapBE64(b | m | p | indices);
686c354881b63935626a0700366937530d38b8b1e8krajcevski}
696c354881b63935626a0700366937530d38b8b1e8krajcevski
// Given a base codeword, a modifier (an entry of a modifier palette), and a
// multiplier, compute the value a decoder reconstructs, in the range [0, 2047]:
//
//   clamp[0, 2047](base_cw * 8 + 4 + modifier * multiplier * 8)
//
// A multiplier of zero is special: it stands for 1/8, so the modifier is
// added unscaled (base_cw * 8 + 4 + modifier) rather than being cancelled.
static uint16_t compute_r11eac_pixel(int base_cw, int modifier, int multiplier) {
    const int scaledModifier = (0 == multiplier) ? modifier : (modifier * multiplier * 8);
    const int ret = (base_cw * 8 + 4) + scaledModifier;
    if (ret > 2047) {
        return 2047;
    }
    return static_cast<uint16_t>((ret < 0) ? 0 : ret);
}
766c354881b63935626a0700366937530d38b8b1e8krajcevski
776c354881b63935626a0700366937530d38b8b1e8krajcevski// Compress a block into R11 EAC format.
786c354881b63935626a0700366937530d38b8b1e8krajcevski// The compression works as follows:
796c354881b63935626a0700366937530d38b8b1e8krajcevski// 1. Find the center of the span of the block's values. Use this as the base codeword.
806c354881b63935626a0700366937530d38b8b1e8krajcevski// 2. Choose a multiplier based roughly on the size of the span of block values
816c354881b63935626a0700366937530d38b8b1e8krajcevski// 3. Iterate through each palette and choose the one with the most accurate
826c354881b63935626a0700366937530d38b8b1e8krajcevski// modifiers.
836c354881b63935626a0700366937530d38b8b1e8krajcevskistatic inline uint64_t compress_heterogeneous_r11eac_block(const uint8_t block[16]) {
846c354881b63935626a0700366937530d38b8b1e8krajcevski    // Find the center of the data...
856c354881b63935626a0700366937530d38b8b1e8krajcevski    uint16_t bmin = block[0];
866c354881b63935626a0700366937530d38b8b1e8krajcevski    uint16_t bmax = block[0];
876c354881b63935626a0700366937530d38b8b1e8krajcevski    for (int i = 1; i < 16; ++i) {
886c354881b63935626a0700366937530d38b8b1e8krajcevski        bmin = SkTMin<uint16_t>(bmin, block[i]);
896c354881b63935626a0700366937530d38b8b1e8krajcevski        bmax = SkTMax<uint16_t>(bmax, block[i]);
906c354881b63935626a0700366937530d38b8b1e8krajcevski    }
916c354881b63935626a0700366937530d38b8b1e8krajcevski
926c354881b63935626a0700366937530d38b8b1e8krajcevski    uint16_t center = (bmax + bmin) >> 1;
936c354881b63935626a0700366937530d38b8b1e8krajcevski    SkASSERT(center <= 255);
946c354881b63935626a0700366937530d38b8b1e8krajcevski
956c354881b63935626a0700366937530d38b8b1e8krajcevski    // Based on the min and max, we can guesstimate a proper multiplier
966c354881b63935626a0700366937530d38b8b1e8krajcevski    // This is kind of a magic choice to start with.
976c354881b63935626a0700366937530d38b8b1e8krajcevski    uint16_t multiplier = (bmax - center) / 10;
986c354881b63935626a0700366937530d38b8b1e8krajcevski
996c354881b63935626a0700366937530d38b8b1e8krajcevski    // Now convert the block to 11 bits and transpose it to match
1006c354881b63935626a0700366937530d38b8b1e8krajcevski    // the proper layout
1016c354881b63935626a0700366937530d38b8b1e8krajcevski    uint16_t cblock[16];
1026c354881b63935626a0700366937530d38b8b1e8krajcevski    for (int i = 0; i < 4; ++i) {
1036c354881b63935626a0700366937530d38b8b1e8krajcevski        for (int j = 0; j < 4; ++j) {
1046c354881b63935626a0700366937530d38b8b1e8krajcevski            int srcIdx = i*4+j;
1056c354881b63935626a0700366937530d38b8b1e8krajcevski            int dstIdx = j*4+i;
1066c354881b63935626a0700366937530d38b8b1e8krajcevski            cblock[dstIdx] = (block[srcIdx] << 3) | (block[srcIdx] >> 5);
1076c354881b63935626a0700366937530d38b8b1e8krajcevski        }
1086c354881b63935626a0700366937530d38b8b1e8krajcevski    }
1096c354881b63935626a0700366937530d38b8b1e8krajcevski
1106c354881b63935626a0700366937530d38b8b1e8krajcevski    // Finally, choose the proper palette and indices
1116c354881b63935626a0700366937530d38b8b1e8krajcevski    uint32_t bestError = 0xFFFFFFFF;
1126c354881b63935626a0700366937530d38b8b1e8krajcevski    uint64_t bestIndices = 0;
1136c354881b63935626a0700366937530d38b8b1e8krajcevski    uint16_t bestPalette = 0;
1146c354881b63935626a0700366937530d38b8b1e8krajcevski    for (uint16_t paletteIdx = 0; paletteIdx < kNumR11EACPalettes; ++paletteIdx) {
1156c354881b63935626a0700366937530d38b8b1e8krajcevski        const int *palette = kR11EACModifierPalettes[paletteIdx];
1166c354881b63935626a0700366937530d38b8b1e8krajcevski
1176c354881b63935626a0700366937530d38b8b1e8krajcevski        // Iterate through each pixel to find the best palette index
1186c354881b63935626a0700366937530d38b8b1e8krajcevski        // and update the indices with the choice. Also store the error
1196c354881b63935626a0700366937530d38b8b1e8krajcevski        // for this palette to be compared against the best error...
1206c354881b63935626a0700366937530d38b8b1e8krajcevski        uint32_t error = 0;
1216c354881b63935626a0700366937530d38b8b1e8krajcevski        uint64_t indices = 0;
1226c354881b63935626a0700366937530d38b8b1e8krajcevski        for (int pixelIdx = 0; pixelIdx < 16; ++pixelIdx) {
1236c354881b63935626a0700366937530d38b8b1e8krajcevski            const uint16_t pixel = cblock[pixelIdx];
1246c354881b63935626a0700366937530d38b8b1e8krajcevski
1256c354881b63935626a0700366937530d38b8b1e8krajcevski            // Iterate through each palette value to find the best index
1266c354881b63935626a0700366937530d38b8b1e8krajcevski            // for this particular pixel for this particular palette.
1276c354881b63935626a0700366937530d38b8b1e8krajcevski            uint16_t bestPixelError =
1286c354881b63935626a0700366937530d38b8b1e8krajcevski                abs_diff(pixel, compute_r11eac_pixel(center, palette[0], multiplier));
1296c354881b63935626a0700366937530d38b8b1e8krajcevski            int bestIndex = 0;
1306c354881b63935626a0700366937530d38b8b1e8krajcevski            for (int i = 1; i < kR11EACPaletteSize; ++i) {
1316c354881b63935626a0700366937530d38b8b1e8krajcevski                const uint16_t p = compute_r11eac_pixel(center, palette[i], multiplier);
1326c354881b63935626a0700366937530d38b8b1e8krajcevski                const uint16_t perror = abs_diff(pixel, p);
1336c354881b63935626a0700366937530d38b8b1e8krajcevski
1346c354881b63935626a0700366937530d38b8b1e8krajcevski                // Is this index better?
1356c354881b63935626a0700366937530d38b8b1e8krajcevski                if (perror < bestPixelError) {
1366c354881b63935626a0700366937530d38b8b1e8krajcevski                    bestIndex = i;
1376c354881b63935626a0700366937530d38b8b1e8krajcevski                    bestPixelError = perror;
1386c354881b63935626a0700366937530d38b8b1e8krajcevski                }
1396c354881b63935626a0700366937530d38b8b1e8krajcevski            }
1406c354881b63935626a0700366937530d38b8b1e8krajcevski
1416c354881b63935626a0700366937530d38b8b1e8krajcevski            SkASSERT(bestIndex < 8);
1426c354881b63935626a0700366937530d38b8b1e8krajcevski
1436c354881b63935626a0700366937530d38b8b1e8krajcevski            error += bestPixelError;
1446c354881b63935626a0700366937530d38b8b1e8krajcevski            indices <<= 3;
1456c354881b63935626a0700366937530d38b8b1e8krajcevski            indices |= bestIndex;
1466c354881b63935626a0700366937530d38b8b1e8krajcevski        }
1476c354881b63935626a0700366937530d38b8b1e8krajcevski
1486c354881b63935626a0700366937530d38b8b1e8krajcevski        SkASSERT(indices < (static_cast<uint64_t>(1) << 48));
1496c354881b63935626a0700366937530d38b8b1e8krajcevski
1506c354881b63935626a0700366937530d38b8b1e8krajcevski        // Is this palette better?
1516c354881b63935626a0700366937530d38b8b1e8krajcevski        if (error < bestError) {
1526c354881b63935626a0700366937530d38b8b1e8krajcevski            bestPalette = paletteIdx;
1536c354881b63935626a0700366937530d38b8b1e8krajcevski            bestIndices = indices;
1546c354881b63935626a0700366937530d38b8b1e8krajcevski            bestError = error;
1556c354881b63935626a0700366937530d38b8b1e8krajcevski        }
1566c354881b63935626a0700366937530d38b8b1e8krajcevski    }
1576c354881b63935626a0700366937530d38b8b1e8krajcevski
1586c354881b63935626a0700366937530d38b8b1e8krajcevski    // Finally, pack everything together...
1596c354881b63935626a0700366937530d38b8b1e8krajcevski    return pack_r11eac_block(center, bestPalette, multiplier, bestIndices);
1606c354881b63935626a0700366937530d38b8b1e8krajcevski}
1616c354881b63935626a0700366937530d38b8b1e8krajcevski#endif // COMPRESS_R11_EAC_SLOW
1626c354881b63935626a0700366937530d38b8b1e8krajcevski
1636c354881b63935626a0700366937530d38b8b1e8krajcevski#if COMPRESS_R11_EAC_FAST
1646c354881b63935626a0700366937530d38b8b1e8krajcevski// This function takes into account that most blocks that we compress have a gradation from
1656c354881b63935626a0700366937530d38b8b1e8krajcevski// fully opaque to fully transparent. The compression scheme works by selecting the
1666c354881b63935626a0700366937530d38b8b1e8krajcevski// palette and multiplier that has the tightest fit to the 0-255 range. This is encoded
1676c354881b63935626a0700366937530d38b8b1e8krajcevski// as the block header (0x8490). The indices are then selected by considering the top
1686c354881b63935626a0700366937530d38b8b1e8krajcevski// three bits of each alpha value. For alpha masks, this reduces the dynamic range from
1696c354881b63935626a0700366937530d38b8b1e8krajcevski// 17 to 8, but the quality is still acceptable.
1706c354881b63935626a0700366937530d38b8b1e8krajcevski//
1716c354881b63935626a0700366937530d38b8b1e8krajcevski// There are a few caveats that need to be taken care of...
1726c354881b63935626a0700366937530d38b8b1e8krajcevski//
1736c354881b63935626a0700366937530d38b8b1e8krajcevski// 1. The block is read in as scanlines, so the indices are stored as:
1746c354881b63935626a0700366937530d38b8b1e8krajcevski//     0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
1756c354881b63935626a0700366937530d38b8b1e8krajcevski//    However, the decomrpession routine reads them in column-major order, so they
1766c354881b63935626a0700366937530d38b8b1e8krajcevski//    need to be packed as:
1776c354881b63935626a0700366937530d38b8b1e8krajcevski//     0 4 8 12 1 5 9 13 2 6 10 14 3 7 11 15
1786c354881b63935626a0700366937530d38b8b1e8krajcevski//    So when reading, they must be transposed.
1796c354881b63935626a0700366937530d38b8b1e8krajcevski//
1806c354881b63935626a0700366937530d38b8b1e8krajcevski// 2. We cannot use the top three bits as an index directly, since the R11 EAC palettes
1816c354881b63935626a0700366937530d38b8b1e8krajcevski//    above store the modulation values first decreasing and then increasing:
1826c354881b63935626a0700366937530d38b8b1e8krajcevski//      e.g. {-3, -6, -9, -15, 2, 5, 8, 14}
1836c354881b63935626a0700366937530d38b8b1e8krajcevski//    Hence, we need to convert the indices with the following mapping:
1846c354881b63935626a0700366937530d38b8b1e8krajcevski//      From: 0 1 2 3 4 5 6 7
1856c354881b63935626a0700366937530d38b8b1e8krajcevski//      To:   3 2 1 0 4 5 6 7
1866c354881b63935626a0700366937530d38b8b1e8krajcevskistatic inline uint64_t compress_heterogeneous_r11eac_block(const uint8_t block[16]) {
1876c354881b63935626a0700366937530d38b8b1e8krajcevski    uint64_t retVal = static_cast<uint64_t>(0x8490) << 48;
1886c354881b63935626a0700366937530d38b8b1e8krajcevski    for(int i = 0; i < 4; ++i) {
1896c354881b63935626a0700366937530d38b8b1e8krajcevski        for(int j = 0; j < 4; ++j) {
1906c354881b63935626a0700366937530d38b8b1e8krajcevski            const int shift = 45-3*(j*4+i);
1916c354881b63935626a0700366937530d38b8b1e8krajcevski            SkASSERT(shift <= 45);
1926c354881b63935626a0700366937530d38b8b1e8krajcevski            const uint64_t idx = block[i*4+j] >> 5;
1936c354881b63935626a0700366937530d38b8b1e8krajcevski            SkASSERT(idx < 8);
1946c354881b63935626a0700366937530d38b8b1e8krajcevski
1956c354881b63935626a0700366937530d38b8b1e8krajcevski            // !SPEED! This is slightly faster than having an if-statement.
1966c354881b63935626a0700366937530d38b8b1e8krajcevski            switch(idx) {
1976c354881b63935626a0700366937530d38b8b1e8krajcevski                case 0:
1986c354881b63935626a0700366937530d38b8b1e8krajcevski                case 1:
1996c354881b63935626a0700366937530d38b8b1e8krajcevski                case 2:
2006c354881b63935626a0700366937530d38b8b1e8krajcevski                case 3:
2016c354881b63935626a0700366937530d38b8b1e8krajcevski                    retVal |= (3-idx) << shift;
2026c354881b63935626a0700366937530d38b8b1e8krajcevski                    break;
2036c354881b63935626a0700366937530d38b8b1e8krajcevski                default:
2046c354881b63935626a0700366937530d38b8b1e8krajcevski                    retVal |= idx << shift;
2056c354881b63935626a0700366937530d38b8b1e8krajcevski                    break;
2066c354881b63935626a0700366937530d38b8b1e8krajcevski            }
2076c354881b63935626a0700366937530d38b8b1e8krajcevski        }
2086c354881b63935626a0700366937530d38b8b1e8krajcevski    }
2096c354881b63935626a0700366937530d38b8b1e8krajcevski
2106c354881b63935626a0700366937530d38b8b1e8krajcevski    return SkEndian_SwapBE64(retVal);
2116c354881b63935626a0700366937530d38b8b1e8krajcevski}
2126c354881b63935626a0700366937530d38b8b1e8krajcevski#endif // COMPRESS_R11_EAC_FAST
2136c354881b63935626a0700366937530d38b8b1e8krajcevski
2146c354881b63935626a0700366937530d38b8b1e8krajcevski#if (COMPRESS_R11_EAC_SLOW) || (COMPRESS_R11_EAC_FAST)
2156c354881b63935626a0700366937530d38b8b1e8krajcevskistatic uint64_t compress_r11eac_block(const uint8_t block[16]) {
2166c354881b63935626a0700366937530d38b8b1e8krajcevski    // Are all blocks a solid color?
2176c354881b63935626a0700366937530d38b8b1e8krajcevski    bool solid = true;
2186c354881b63935626a0700366937530d38b8b1e8krajcevski    for (int i = 1; i < 16; ++i) {
2196c354881b63935626a0700366937530d38b8b1e8krajcevski        if (block[i] != block[0]) {
2206c354881b63935626a0700366937530d38b8b1e8krajcevski            solid = false;
2216c354881b63935626a0700366937530d38b8b1e8krajcevski            break;
2226c354881b63935626a0700366937530d38b8b1e8krajcevski        }
2236c354881b63935626a0700366937530d38b8b1e8krajcevski    }
2246c354881b63935626a0700366937530d38b8b1e8krajcevski
2256c354881b63935626a0700366937530d38b8b1e8krajcevski    if (solid) {
2266c354881b63935626a0700366937530d38b8b1e8krajcevski        switch(block[0]) {
2276c354881b63935626a0700366937530d38b8b1e8krajcevski            // Fully transparent? We know the encoding...
2286c354881b63935626a0700366937530d38b8b1e8krajcevski            case 0:
2296c354881b63935626a0700366937530d38b8b1e8krajcevski                // (0x0020 << 48) produces the following:
2306c354881b63935626a0700366937530d38b8b1e8krajcevski                // basw_cw: 0
2316c354881b63935626a0700366937530d38b8b1e8krajcevski                // mod: 0, palette: {-3, -6, -9, -15, 2, 5, 8, 14}
2326c354881b63935626a0700366937530d38b8b1e8krajcevski                // multiplier: 2
2336c354881b63935626a0700366937530d38b8b1e8krajcevski                // mod_val: -3
2346c354881b63935626a0700366937530d38b8b1e8krajcevski                //
2356c354881b63935626a0700366937530d38b8b1e8krajcevski                // this gives the following formula:
2366c354881b63935626a0700366937530d38b8b1e8krajcevski                // clamp[0, 2047](0*8+4+(-3)*2*8) = 0
2376c354881b63935626a0700366937530d38b8b1e8krajcevski                //
2386c354881b63935626a0700366937530d38b8b1e8krajcevski                // Furthermore, it is impervious to endianness:
2396c354881b63935626a0700366937530d38b8b1e8krajcevski                // 0x0020000000002000ULL
2406c354881b63935626a0700366937530d38b8b1e8krajcevski                // Will produce one pixel with index 2, which gives:
2416c354881b63935626a0700366937530d38b8b1e8krajcevski                // clamp[0, 2047](0*8+4+(-9)*2*8) = 0
2426c354881b63935626a0700366937530d38b8b1e8krajcevski                return 0x0020000000002000ULL;
2436c354881b63935626a0700366937530d38b8b1e8krajcevski
2446c354881b63935626a0700366937530d38b8b1e8krajcevski            // Fully opaque? We know this encoding too...
2456c354881b63935626a0700366937530d38b8b1e8krajcevski            case 255:
2466c354881b63935626a0700366937530d38b8b1e8krajcevski
2476c354881b63935626a0700366937530d38b8b1e8krajcevski                // -1 produces the following:
2486c354881b63935626a0700366937530d38b8b1e8krajcevski                // basw_cw: 255
2496c354881b63935626a0700366937530d38b8b1e8krajcevski                // mod: 15, palette: {-3, -5, -7, -9, 2, 4, 6, 8}
2506c354881b63935626a0700366937530d38b8b1e8krajcevski                // mod_val: 8
2516c354881b63935626a0700366937530d38b8b1e8krajcevski                //
2526c354881b63935626a0700366937530d38b8b1e8krajcevski                // this gives the following formula:
2536c354881b63935626a0700366937530d38b8b1e8krajcevski                // clamp[0, 2047](255*8+4+8*8*8) = clamp[0, 2047](2556) = 2047
2546c354881b63935626a0700366937530d38b8b1e8krajcevski                return 0xFFFFFFFFFFFFFFFFULL;
2556c354881b63935626a0700366937530d38b8b1e8krajcevski
2566c354881b63935626a0700366937530d38b8b1e8krajcevski            default:
2576c354881b63935626a0700366937530d38b8b1e8krajcevski                // !TODO! krajcevski:
2586c354881b63935626a0700366937530d38b8b1e8krajcevski                // This will probably never happen, since we're using this format
2596c354881b63935626a0700366937530d38b8b1e8krajcevski                // primarily for compressing alpha maps. Usually the only
2606c354881b63935626a0700366937530d38b8b1e8krajcevski                // non-fullly opaque or fully transparent blocks are not a solid
2616c354881b63935626a0700366937530d38b8b1e8krajcevski                // intermediate color. If we notice that they are, then we can
2626c354881b63935626a0700366937530d38b8b1e8krajcevski                // add another optimization...
2636c354881b63935626a0700366937530d38b8b1e8krajcevski                break;
2646c354881b63935626a0700366937530d38b8b1e8krajcevski        }
2656c354881b63935626a0700366937530d38b8b1e8krajcevski    }
2666c354881b63935626a0700366937530d38b8b1e8krajcevski
2676c354881b63935626a0700366937530d38b8b1e8krajcevski    return compress_heterogeneous_r11eac_block(block);
2686c354881b63935626a0700366937530d38b8b1e8krajcevski}
2696c354881b63935626a0700366937530d38b8b1e8krajcevski
// Signature of a routine that compresses one 4x4 block of 8-bit alpha
// (16 bytes, row-major) into a single 64-bit compressed block.
typedef uint64_t (*A84x4To64BitProc)(const uint8_t block[]);

// Compress an 8-bit alpha image into consecutive 64-bit blocks, one per 4x4
// tile, visiting tiles left-to-right and then top-to-bottom.
//
//   dst      - receives (width/4)*(height/4) blocks of 8 bytes each. It does
//              not need to be 8-byte aligned.
//   src      - top-left pixel of the source image.
//   width    - image width in pixels; must be a positive multiple of 4.
//   height   - image height in pixels; must be a positive multiple of 4.
//   rowBytes - distance in bytes between the starts of consecutive src rows.
//   proc     - compressor invoked once for every 4x4 tile.
//
// Returns false, leaving dst untouched, when the dimensions are not positive
// multiples of 4; returns true once every tile has been written.
static bool compress_4x4_a8_to_64bit(uint8_t* dst, const uint8_t* src,
                                     int width, int height, int rowBytes,
                                     A84x4To64BitProc proc) {
    // Make sure that our data is well-formed enough to be considered for compression.
    // Negative sizes are rejected too, instead of reporting success after
    // compressing nothing.
    if (width <= 0 || height <= 0 || (width % 4) != 0 || (height % 4) != 0) {
        return false;
    }

    const int blocksX = width >> 2;
    const int blocksY = height >> 2;

    uint8_t block[16];
    for (int y = 0; y < blocksY; ++y) {
        for (int x = 0; x < blocksX; ++x) {
            // Gather the four 4-pixel rows of this tile into a contiguous block.
            for (int k = 0; k < 4; ++k) {
                memcpy(block + k*4, src + k*rowBytes + 4*x, 4);
            }

            // Compress it. The result is copied out byte-wise because dst is
            // a plain byte pointer with no alignment guarantee; storing
            // through a uint64_t* would be undefined for a misaligned dst.
            const uint64_t encoded = proc(block);
            memcpy(dst, &encoded, sizeof(encoded));
            dst += sizeof(encoded);
        }
        // Advance to the next band of four source rows.
        src += 4 * rowBytes;
    }

    return true;
}
3056c354881b63935626a0700366937530d38b8b1e8krajcevski#endif  // (COMPRESS_R11_EAC_SLOW) || (COMPRESS_R11_EAC_FAST)
3066c354881b63935626a0700366937530d38b8b1e8krajcevski
// Converts four packed 8-bit alpha values into four packed R11 EAC palette
// indices, one per byte, handling all four bytes at once.
//
// Only the top three bits t of each alpha byte are used, and they are
// remapped as:
//   t:      0 1 2 3 4 5 6 7
//   index:  3 2 1 0 4 5 6 7
//
// Per byte this is computed as follows:
//   1. negate t and add three, giving   3 2 1 0 -1 -2 -3 -4
//   2. for the negative results only, take the absolute value and add three,
//      giving 4 5 6 7; non-negative results are left unchanged.
//
// Every step is arranged so that no carry or borrow crosses a byte boundary.
// Most of the bit tricks come from Hacker's Delight, section 2-18.
static inline uint32_t convert_indices(uint32_t x) {
    const uint32_t kSignBits = 0x80808080;  // top bit of every byte
    const uint32_t kLowBits  = 0x7F7F7F7F;  // low seven bits of every byte
    const uint32_t kThrees   = 0x03030303;  // the value 3 in every byte

    // Keep the top three bits of each byte, moved down to the byte's low bits.
    uint32_t lanes = (x & 0xE0E0E0E0) >> 5;

    // Byte-wise two's complement negation.
    lanes = ~((kSignBits - lanes) ^ kLowBits);

    // Byte-wise "+ 3": add the low seven bits normally, then patch the sign
    // bit back in with XOR so the carry cannot leak into the next byte.
    const uint32_t lowSum = (lanes & kLowBits) + kThrees;
    lanes = ((lanes ^ kThrees) & kSignBits) ^ lowSum;

    // Per-byte helpers derived from the sign bit:
    //   isNegative - 0x80 where the byte is negative
    //   one        - 0x01 where the byte is negative
    //   three      - 0x03 where the byte is negative
    const uint32_t isNegative = lanes & kSignBits;
    const uint32_t one = isNegative >> 7;
    const uint32_t three = (isNegative >> 6) | one;

    // Byte-wise absolute value: invert the negative bytes (mask 0xFF) and
    // add one to them.
    const uint32_t invertMask = (isNegative - one) | isNegative;
    lanes = (lanes ^ invertMask) + one;

    // Finally shift the formerly negative bytes up by three (1..4 -> 4..7).
    return lanes + three;
}
345d5e46c7893afdd5976c1581a2ae81168252f5deckrajcevski
3466c354881b63935626a0700366937530d38b8b1e8krajcevski#if COMPRESS_R11_EAC_FASTEST
// Exchanges the bits of x selected by mask with the bits located `shift`
// positions above them, leaving all other bits of x unchanged.
//
// mask marks the lower member of each pair to exchange; the upper member is
// the bit at (position + shift).
template<unsigned shift>
static inline uint64_t swap_shift(uint64_t x, uint64_t mask) {
    // A set bit in `differing` means the two members of that pair differ;
    // only those pairs need to be flipped.
    const uint64_t differing = ((x >> shift) ^ x) & mask;
    const uint64_t flipBoth = differing | (differing << shift);
    return x ^ flipBoth;
}
3526c354881b63935626a0700366937530d38b8b1e8krajcevski
3536c354881b63935626a0700366937530d38b8b1e8krajcevskistatic inline uint64_t interleave6(uint64_t topRows, uint64_t bottomRows) {
3546c354881b63935626a0700366937530d38b8b1e8krajcevski    // If our 3-bit block indices are laid out as:
3556c354881b63935626a0700366937530d38b8b1e8krajcevski    // a b c d
3566c354881b63935626a0700366937530d38b8b1e8krajcevski    // e f g h
3576c354881b63935626a0700366937530d38b8b1e8krajcevski    // i j k l
3586c354881b63935626a0700366937530d38b8b1e8krajcevski    // m n o p
3596c354881b63935626a0700366937530d38b8b1e8krajcevski    //
3606c354881b63935626a0700366937530d38b8b1e8krajcevski    // This function expects topRows and bottomRows to contain the first two rows
3616c354881b63935626a0700366937530d38b8b1e8krajcevski    // of indices interleaved in the least significant bits of a and b. In other words...
3626c354881b63935626a0700366937530d38b8b1e8krajcevski    //
3636c354881b63935626a0700366937530d38b8b1e8krajcevski    // If the architecture is big endian, then topRows and bottomRows will contain the following:
3646c354881b63935626a0700366937530d38b8b1e8krajcevski    // Bits 31-0:
3656c354881b63935626a0700366937530d38b8b1e8krajcevski    // a: 00 a e 00 b f 00 c g 00 d h
3666c354881b63935626a0700366937530d38b8b1e8krajcevski    // b: 00 i m 00 j n 00 k o 00 l p
3676c354881b63935626a0700366937530d38b8b1e8krajcevski    //
3686c354881b63935626a0700366937530d38b8b1e8krajcevski    // If the architecture is little endian, then topRows and bottomRows will contain
3696c354881b63935626a0700366937530d38b8b1e8krajcevski    // the following:
3706c354881b63935626a0700366937530d38b8b1e8krajcevski    // Bits 31-0:
3716c354881b63935626a0700366937530d38b8b1e8krajcevski    // a: 00 d h 00 c g 00 b f 00 a e
3726c354881b63935626a0700366937530d38b8b1e8krajcevski    // b: 00 l p 00 k o 00 j n 00 i m
3736c354881b63935626a0700366937530d38b8b1e8krajcevski    //
3746c354881b63935626a0700366937530d38b8b1e8krajcevski    // This function returns a 48-bit packing of the form:
3756c354881b63935626a0700366937530d38b8b1e8krajcevski    // a e i m b f j n c g k o d h l p
3766c354881b63935626a0700366937530d38b8b1e8krajcevski    //
3776c354881b63935626a0700366937530d38b8b1e8krajcevski    // !SPEED! this function might be even faster if certain SIMD intrinsics are
3786c354881b63935626a0700366937530d38b8b1e8krajcevski    // used..
3796c354881b63935626a0700366937530d38b8b1e8krajcevski
3806c354881b63935626a0700366937530d38b8b1e8krajcevski    // For both architectures, we can figure out a packing of the bits by
3816c354881b63935626a0700366937530d38b8b1e8krajcevski    // using a shuffle and a few shift-rotates...
3826c354881b63935626a0700366937530d38b8b1e8krajcevski    uint64_t x = (static_cast<uint64_t>(topRows) << 32) | static_cast<uint64_t>(bottomRows);
3836c354881b63935626a0700366937530d38b8b1e8krajcevski
3846c354881b63935626a0700366937530d38b8b1e8krajcevski    // x: 00 a e 00 b f 00 c g 00 d h 00 i m 00 j n 00 k o 00 l p
3856c354881b63935626a0700366937530d38b8b1e8krajcevski
3866c354881b63935626a0700366937530d38b8b1e8krajcevski    x = swap_shift<10>(x, 0x3FC0003FC00000ULL);
3876c354881b63935626a0700366937530d38b8b1e8krajcevski
3886c354881b63935626a0700366937530d38b8b1e8krajcevski    // x: b f 00 00 00 a e c g i m 00 00 00 d h j n 00 k o 00 l p
3896c354881b63935626a0700366937530d38b8b1e8krajcevski
3906c354881b63935626a0700366937530d38b8b1e8krajcevski    x = (x | ((x << 52) & (0x3FULL << 52)) | ((x << 20) & (0x3FULL << 28))) >> 16;
3916c354881b63935626a0700366937530d38b8b1e8krajcevski
3926c354881b63935626a0700366937530d38b8b1e8krajcevski    // x: 00 00 00 00 00 00 00 00 b f l p a e c g i m k o d h j n
3936c354881b63935626a0700366937530d38b8b1e8krajcevski
3946c354881b63935626a0700366937530d38b8b1e8krajcevski    x = swap_shift<6>(x, 0xFC0000ULL);
3956c354881b63935626a0700366937530d38b8b1e8krajcevski
3966c354881b63935626a0700366937530d38b8b1e8krajcevski#if defined (SK_CPU_BENDIAN)
3976c354881b63935626a0700366937530d38b8b1e8krajcevski    // x: 00 00 00 00 00 00 00 00 b f l p a e i m c g k o d h j n
3986c354881b63935626a0700366937530d38b8b1e8krajcevski
3996c354881b63935626a0700366937530d38b8b1e8krajcevski    x = swap_shift<36>(x, 0x3FULL);
4006c354881b63935626a0700366937530d38b8b1e8krajcevski
4016c354881b63935626a0700366937530d38b8b1e8krajcevski    // x: 00 00 00 00 00 00 00 00 b f j n a e i m c g k o d h l p
4026c354881b63935626a0700366937530d38b8b1e8krajcevski
4036c354881b63935626a0700366937530d38b8b1e8krajcevski    x = swap_shift<12>(x, 0xFFF000000ULL);
4046c354881b63935626a0700366937530d38b8b1e8krajcevski#else
4056c354881b63935626a0700366937530d38b8b1e8krajcevski    // If our CPU is little endian, then the above logic will
4066c354881b63935626a0700366937530d38b8b1e8krajcevski    // produce the following indices:
4076c354881b63935626a0700366937530d38b8b1e8krajcevski    // x: 00 00 00 00 00 00 00 00 c g i m d h l p b f j n a e k o
4086c354881b63935626a0700366937530d38b8b1e8krajcevski
4096c354881b63935626a0700366937530d38b8b1e8krajcevski    x = swap_shift<36>(x, 0xFC0ULL);
4106c354881b63935626a0700366937530d38b8b1e8krajcevski
4116c354881b63935626a0700366937530d38b8b1e8krajcevski    // x: 00 00 00 00 00 00 00 00 a e i m d h l p b f j n c g k o
4126c354881b63935626a0700366937530d38b8b1e8krajcevski
4136c354881b63935626a0700366937530d38b8b1e8krajcevski    x = (x & (0xFFFULL << 36)) | ((x & 0xFFFFFFULL) << 12) | ((x >> 24) & 0xFFFULL);
4146c354881b63935626a0700366937530d38b8b1e8krajcevski#endif
4156c354881b63935626a0700366937530d38b8b1e8krajcevski
4166c354881b63935626a0700366937530d38b8b1e8krajcevski    // x: 00 00 00 00 00 00 00 00 a e i m b f j n c g k o d h l p
4176c354881b63935626a0700366937530d38b8b1e8krajcevski    return x;
4186c354881b63935626a0700366937530d38b8b1e8krajcevski}
4196c354881b63935626a0700366937530d38b8b1e8krajcevski
4206c354881b63935626a0700366937530d38b8b1e8krajcevski// This function follows the same basic procedure as compress_heterogeneous_r11eac_block
4216c354881b63935626a0700366937530d38b8b1e8krajcevski// above when COMPRESS_R11_EAC_FAST is defined, but it avoids a few loads/stores and
4226c354881b63935626a0700366937530d38b8b1e8krajcevski// tries to optimize where it can using SIMD.
4236c354881b63935626a0700366937530d38b8b1e8krajcevskistatic uint64_t compress_r11eac_block_fast(const uint8_t* src, int rowBytes) {
4246c354881b63935626a0700366937530d38b8b1e8krajcevski    // Store each row of alpha values in an integer
4256c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint32_t alphaRow1 = *(reinterpret_cast<const uint32_t*>(src));
4266c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint32_t alphaRow2 = *(reinterpret_cast<const uint32_t*>(src + rowBytes));
4276c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint32_t alphaRow3 = *(reinterpret_cast<const uint32_t*>(src + 2*rowBytes));
4286c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint32_t alphaRow4 = *(reinterpret_cast<const uint32_t*>(src + 3*rowBytes));
4296c354881b63935626a0700366937530d38b8b1e8krajcevski
4306c354881b63935626a0700366937530d38b8b1e8krajcevski    // Check for solid blocks. The explanations for these values
4316c354881b63935626a0700366937530d38b8b1e8krajcevski    // can be found in the comments of compress_r11eac_block above
4326c354881b63935626a0700366937530d38b8b1e8krajcevski    if (alphaRow1 == alphaRow2 && alphaRow1 == alphaRow3 && alphaRow1 == alphaRow4) {
4336c354881b63935626a0700366937530d38b8b1e8krajcevski        if (0 == alphaRow1) {
4346c354881b63935626a0700366937530d38b8b1e8krajcevski            // Fully transparent block
4356c354881b63935626a0700366937530d38b8b1e8krajcevski            return 0x0020000000002000ULL;
4366c354881b63935626a0700366937530d38b8b1e8krajcevski        } else if (0xFFFFFFFF == alphaRow1) {
4376c354881b63935626a0700366937530d38b8b1e8krajcevski            // Fully opaque block
4386c354881b63935626a0700366937530d38b8b1e8krajcevski            return 0xFFFFFFFFFFFFFFFFULL;
4396c354881b63935626a0700366937530d38b8b1e8krajcevski        }
4406c354881b63935626a0700366937530d38b8b1e8krajcevski    }
4416c354881b63935626a0700366937530d38b8b1e8krajcevski
4426c354881b63935626a0700366937530d38b8b1e8krajcevski    // Convert each integer of alpha values into an integer of indices
4436c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint32_t indexRow1 = convert_indices(alphaRow1);
4446c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint32_t indexRow2 = convert_indices(alphaRow2);
4456c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint32_t indexRow3 = convert_indices(alphaRow3);
4466c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint32_t indexRow4 = convert_indices(alphaRow4);
4476c354881b63935626a0700366937530d38b8b1e8krajcevski
4486c354881b63935626a0700366937530d38b8b1e8krajcevski    // Interleave the indices from the top two rows and bottom two rows
4496c354881b63935626a0700366937530d38b8b1e8krajcevski    // prior to passing them to interleave6. Since each index is at most
4506c354881b63935626a0700366937530d38b8b1e8krajcevski    // three bits, then each byte can hold two indices... The way that the
4516c354881b63935626a0700366937530d38b8b1e8krajcevski    // compression scheme expects the packing allows us to efficiently pack
4526c354881b63935626a0700366937530d38b8b1e8krajcevski    // the top two rows and bottom two rows. Interleaving each 6-bit sequence
4536c354881b63935626a0700366937530d38b8b1e8krajcevski    // and tightly packing it into a uint64_t is a little trickier, which is
4546c354881b63935626a0700366937530d38b8b1e8krajcevski    // taken care of in interleave6.
4556c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint32_t r1r2 = (indexRow1 << 3) | indexRow2;
4566c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint32_t r3r4 = (indexRow3 << 3) | indexRow4;
4576c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint64_t indices = interleave6(r1r2, r3r4);
4586c354881b63935626a0700366937530d38b8b1e8krajcevski
4596c354881b63935626a0700366937530d38b8b1e8krajcevski    // Return the packed incdices in the least significant bits with the magic header
4606c354881b63935626a0700366937530d38b8b1e8krajcevski    return SkEndian_SwapBE64(0x8490000000000000ULL | indices);
4616c354881b63935626a0700366937530d38b8b1e8krajcevski}
4626c354881b63935626a0700366937530d38b8b1e8krajcevski
4636c354881b63935626a0700366937530d38b8b1e8krajcevskistatic bool compress_a8_to_r11eac_fast(uint8_t* dst, const uint8_t* src,
4646c354881b63935626a0700366937530d38b8b1e8krajcevski                                       int width, int height, int rowBytes) {
4656c354881b63935626a0700366937530d38b8b1e8krajcevski    // Make sure that our data is well-formed enough to be considered for compression
4666c354881b63935626a0700366937530d38b8b1e8krajcevski    if (0 == width || 0 == height || (width % 4) != 0 || (height % 4) != 0) {
4676c354881b63935626a0700366937530d38b8b1e8krajcevski        return false;
4686c354881b63935626a0700366937530d38b8b1e8krajcevski    }
4696c354881b63935626a0700366937530d38b8b1e8krajcevski
4706c354881b63935626a0700366937530d38b8b1e8krajcevski    const int blocksX = width >> 2;
4716c354881b63935626a0700366937530d38b8b1e8krajcevski    const int blocksY = height >> 2;
4726c354881b63935626a0700366937530d38b8b1e8krajcevski
4736c354881b63935626a0700366937530d38b8b1e8krajcevski    uint64_t* encPtr = reinterpret_cast<uint64_t*>(dst);
4746c354881b63935626a0700366937530d38b8b1e8krajcevski    for (int y = 0; y < blocksY; ++y) {
4756c354881b63935626a0700366937530d38b8b1e8krajcevski        for (int x = 0; x < blocksX; ++x) {
4766c354881b63935626a0700366937530d38b8b1e8krajcevski            // Compress it
4776c354881b63935626a0700366937530d38b8b1e8krajcevski            *encPtr = compress_r11eac_block_fast(src + 4*x, rowBytes);
4786c354881b63935626a0700366937530d38b8b1e8krajcevski            ++encPtr;
4796c354881b63935626a0700366937530d38b8b1e8krajcevski        }
4806c354881b63935626a0700366937530d38b8b1e8krajcevski        src += 4 * rowBytes;
4816c354881b63935626a0700366937530d38b8b1e8krajcevski    }
4826c354881b63935626a0700366937530d38b8b1e8krajcevski    return true;
4836c354881b63935626a0700366937530d38b8b1e8krajcevski}
4846c354881b63935626a0700366937530d38b8b1e8krajcevski#endif // COMPRESS_R11_EAC_FASTEST
4856c354881b63935626a0700366937530d38b8b1e8krajcevski
4866c354881b63935626a0700366937530d38b8b1e8krajcevski////////////////////////////////////////////////////////////////////////////////
4876c354881b63935626a0700366937530d38b8b1e8krajcevski//
4886c354881b63935626a0700366937530d38b8b1e8krajcevski// Utility functions used by the blitter
4896c354881b63935626a0700366937530d38b8b1e8krajcevski//
4906c354881b63935626a0700366937530d38b8b1e8krajcevski////////////////////////////////////////////////////////////////////////////////
4916c354881b63935626a0700366937530d38b8b1e8krajcevski
// The R11 EAC format expects that indices are given in column-major order. Alpha
// values normally arrive in raster order, which is why the row-based path has to
// go through interleave6 above. When the indices come from the blitter, however,
// each integer already holds one column of indices and can be packed directly.
//
// This function takes the bottom three bits of each byte of x and places them in
// the least significant 12 bits of the result. The byte that comes first in
// memory ends up in the most significant of those four 3-bit fields.
static inline uint32_t pack_indices_vertical(uint32_t x) {
    // Low three bits of every byte, numbered from the least significant byte.
    const uint32_t lane0 = x & 7;
    const uint32_t lane1 = (x >> 8) & 7;
    const uint32_t lane2 = (x >> 16) & 7;
    const uint32_t lane3 = (x >> 24) & 7;

#if defined (SK_CPU_BENDIAN)
    return lane0 | (lane1 << 3) | (lane2 << 6) | (lane3 << 9);
#else
    return lane3 | (lane2 << 3) | (lane1 << 6) | (lane0 << 9);
#endif
}
5136c354881b63935626a0700366937530d38b8b1e8krajcevski
5146c354881b63935626a0700366937530d38b8b1e8krajcevski// This function returns the compressed format of a block given as four columns of
5156c354881b63935626a0700366937530d38b8b1e8krajcevski// alpha values. Each column is assumed to be loaded from top to bottom, and hence
5166c354881b63935626a0700366937530d38b8b1e8krajcevski// must first be converted to indices and then packed into the resulting 64-bit
5176c354881b63935626a0700366937530d38b8b1e8krajcevski// integer.
518d5e46c7893afdd5976c1581a2ae81168252f5deckrajcevskiinline void compress_block_vertical(uint8_t* dstPtr, const uint8_t *block) {
519d5e46c7893afdd5976c1581a2ae81168252f5deckrajcevski
520d5e46c7893afdd5976c1581a2ae81168252f5deckrajcevski    const uint32_t* src = reinterpret_cast<const uint32_t*>(block);
521d5e46c7893afdd5976c1581a2ae81168252f5deckrajcevski    uint64_t* dst = reinterpret_cast<uint64_t*>(dstPtr);
522d5e46c7893afdd5976c1581a2ae81168252f5deckrajcevski
523d5e46c7893afdd5976c1581a2ae81168252f5deckrajcevski    const uint32_t alphaColumn0 = src[0];
524d5e46c7893afdd5976c1581a2ae81168252f5deckrajcevski    const uint32_t alphaColumn1 = src[1];
525d5e46c7893afdd5976c1581a2ae81168252f5deckrajcevski    const uint32_t alphaColumn2 = src[2];
526d5e46c7893afdd5976c1581a2ae81168252f5deckrajcevski    const uint32_t alphaColumn3 = src[3];
5276c354881b63935626a0700366937530d38b8b1e8krajcevski
5286c354881b63935626a0700366937530d38b8b1e8krajcevski    if (alphaColumn0 == alphaColumn1 &&
5296c354881b63935626a0700366937530d38b8b1e8krajcevski        alphaColumn2 == alphaColumn3 &&
5306c354881b63935626a0700366937530d38b8b1e8krajcevski        alphaColumn0 == alphaColumn2) {
5316c354881b63935626a0700366937530d38b8b1e8krajcevski
5326c354881b63935626a0700366937530d38b8b1e8krajcevski        if (0 == alphaColumn0) {
5336c354881b63935626a0700366937530d38b8b1e8krajcevski            // Transparent
534d5e46c7893afdd5976c1581a2ae81168252f5deckrajcevski            *dst = 0x0020000000002000ULL;
535d5e46c7893afdd5976c1581a2ae81168252f5deckrajcevski            return;
5366c354881b63935626a0700366937530d38b8b1e8krajcevski        }
5376c354881b63935626a0700366937530d38b8b1e8krajcevski        else if (0xFFFFFFFF == alphaColumn0) {
5386c354881b63935626a0700366937530d38b8b1e8krajcevski            // Opaque
539d5e46c7893afdd5976c1581a2ae81168252f5deckrajcevski            *dst = 0xFFFFFFFFFFFFFFFFULL;
540d5e46c7893afdd5976c1581a2ae81168252f5deckrajcevski            return;
5416c354881b63935626a0700366937530d38b8b1e8krajcevski        }
5426c354881b63935626a0700366937530d38b8b1e8krajcevski    }
5436c354881b63935626a0700366937530d38b8b1e8krajcevski
5446c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint32_t indexColumn0 = convert_indices(alphaColumn0);
5456c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint32_t indexColumn1 = convert_indices(alphaColumn1);
5466c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint32_t indexColumn2 = convert_indices(alphaColumn2);
5476c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint32_t indexColumn3 = convert_indices(alphaColumn3);
5486c354881b63935626a0700366937530d38b8b1e8krajcevski
5496c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint32_t packedIndexColumn0 = pack_indices_vertical(indexColumn0);
5506c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint32_t packedIndexColumn1 = pack_indices_vertical(indexColumn1);
5516c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint32_t packedIndexColumn2 = pack_indices_vertical(indexColumn2);
5526c354881b63935626a0700366937530d38b8b1e8krajcevski    const uint32_t packedIndexColumn3 = pack_indices_vertical(indexColumn3);
5536c354881b63935626a0700366937530d38b8b1e8krajcevski
554d5e46c7893afdd5976c1581a2ae81168252f5deckrajcevski    *dst = SkEndian_SwapBE64(0x8490000000000000ULL |
5556c354881b63935626a0700366937530d38b8b1e8krajcevski                             (static_cast<uint64_t>(packedIndexColumn0) << 36) |
5566c354881b63935626a0700366937530d38b8b1e8krajcevski                             (static_cast<uint64_t>(packedIndexColumn1) << 24) |
5576c354881b63935626a0700366937530d38b8b1e8krajcevski                             static_cast<uint64_t>(packedIndexColumn2 << 12) |
5586c354881b63935626a0700366937530d38b8b1e8krajcevski                             static_cast<uint64_t>(packedIndexColumn3));
5596c354881b63935626a0700366937530d38b8b1e8krajcevski}
5606c354881b63935626a0700366937530d38b8b1e8krajcevski
5614ad76e35111585f4da662d54943f23792dd1e0aekrajcevskistatic inline int get_r11_eac_index(uint64_t block, int x, int y) {
5624ad76e35111585f4da662d54943f23792dd1e0aekrajcevski    SkASSERT(x >= 0 && x < 4);
5634ad76e35111585f4da662d54943f23792dd1e0aekrajcevski    SkASSERT(y >= 0 && y < 4);
5644ad76e35111585f4da662d54943f23792dd1e0aekrajcevski    const int idx = x*4 + y;
5654ad76e35111585f4da662d54943f23792dd1e0aekrajcevski    return (block >> ((15-idx)*3)) & 0x7;
5664ad76e35111585f4da662d54943f23792dd1e0aekrajcevski}
5674ad76e35111585f4da662d54943f23792dd1e0aekrajcevski
5684ad76e35111585f4da662d54943f23792dd1e0aekrajcevskistatic void decompress_r11_eac_block(uint8_t* dst, int dstRowBytes, const uint8_t* src) {
5694ad76e35111585f4da662d54943f23792dd1e0aekrajcevski    const uint64_t block = SkEndian_SwapBE64(*(reinterpret_cast<const uint64_t *>(src)));
5704ad76e35111585f4da662d54943f23792dd1e0aekrajcevski
5714ad76e35111585f4da662d54943f23792dd1e0aekrajcevski    const int base_cw = (block >> 56) & 0xFF;
5724ad76e35111585f4da662d54943f23792dd1e0aekrajcevski    const int mod = (block >> 52) & 0xF;
5734ad76e35111585f4da662d54943f23792dd1e0aekrajcevski    const int palette_idx = (block >> 48) & 0xF;
5744ad76e35111585f4da662d54943f23792dd1e0aekrajcevski
5754ad76e35111585f4da662d54943f23792dd1e0aekrajcevski    const int* palette = kR11EACModifierPalettes[palette_idx];
5764ad76e35111585f4da662d54943f23792dd1e0aekrajcevski
5774ad76e35111585f4da662d54943f23792dd1e0aekrajcevski    for (int j = 0; j < 4; ++j) {
5784ad76e35111585f4da662d54943f23792dd1e0aekrajcevski        for (int i = 0; i < 4; ++i) {
5794ad76e35111585f4da662d54943f23792dd1e0aekrajcevski            const int idx = get_r11_eac_index(block, i, j);
5804ad76e35111585f4da662d54943f23792dd1e0aekrajcevski            const int val = base_cw*8 + 4 + palette[idx]*mod*8;
5814ad76e35111585f4da662d54943f23792dd1e0aekrajcevski            if (val < 0) {
5824ad76e35111585f4da662d54943f23792dd1e0aekrajcevski                dst[i] = 0;
5834ad76e35111585f4da662d54943f23792dd1e0aekrajcevski            } else if (val > 2047) {
5844ad76e35111585f4da662d54943f23792dd1e0aekrajcevski                dst[i] = 0xFF;
5854ad76e35111585f4da662d54943f23792dd1e0aekrajcevski            } else {
5864ad76e35111585f4da662d54943f23792dd1e0aekrajcevski                dst[i] = (val >> 3) & 0xFF;
5874ad76e35111585f4da662d54943f23792dd1e0aekrajcevski            }
5884ad76e35111585f4da662d54943f23792dd1e0aekrajcevski        }
5894ad76e35111585f4da662d54943f23792dd1e0aekrajcevski        dst += dstRowBytes;
5904ad76e35111585f4da662d54943f23792dd1e0aekrajcevski    }
5914ad76e35111585f4da662d54943f23792dd1e0aekrajcevski}
5924ad76e35111585f4da662d54943f23792dd1e0aekrajcevski
59345a0bf505914adf0ee8c69e2647230618bbb3a63krajcevski// This is the type passed as the CompressorType argument of the compressed
59445a0bf505914adf0ee8c69e2647230618bbb3a63krajcevski// blitter for the R11 EAC format. The static functions required to be in this
59545a0bf505914adf0ee8c69e2647230618bbb3a63krajcevski// struct are documented in SkTextureCompressor_Blitter.h
59645a0bf505914adf0ee8c69e2647230618bbb3a63krajcevskistruct CompressorR11EAC {
59745a0bf505914adf0ee8c69e2647230618bbb3a63krajcevski    static inline void CompressA8Vertical(uint8_t* dst, const uint8_t* src) {
59845a0bf505914adf0ee8c69e2647230618bbb3a63krajcevski        compress_block_vertical(dst, src);
59945a0bf505914adf0ee8c69e2647230618bbb3a63krajcevski    }
60045a0bf505914adf0ee8c69e2647230618bbb3a63krajcevski
60145a0bf505914adf0ee8c69e2647230618bbb3a63krajcevski    static inline void CompressA8Horizontal(uint8_t* dst, const uint8_t* src,
60245a0bf505914adf0ee8c69e2647230618bbb3a63krajcevski                                            int srcRowBytes) {
60345a0bf505914adf0ee8c69e2647230618bbb3a63krajcevski        *(reinterpret_cast<uint64_t*>(dst)) = compress_r11eac_block_fast(src, srcRowBytes);
60445a0bf505914adf0ee8c69e2647230618bbb3a63krajcevski    }
60545a0bf505914adf0ee8c69e2647230618bbb3a63krajcevski
606a10555a354cf294bde217044472d33c3161df249krajcevski#if PEDANTIC_BLIT_RECT
607a10555a354cf294bde217044472d33c3161df249krajcevski    static inline void UpdateBlock(uint8_t* dst, const uint8_t* src, int srcRowBytes,
608a10555a354cf294bde217044472d33c3161df249krajcevski                                   const uint8_t* mask) {
609a10555a354cf294bde217044472d33c3161df249krajcevski        // TODO: krajcevski
610a10555a354cf294bde217044472d33c3161df249krajcevski        // The implementation of this function should be similar to that of LATC, since
611a10555a354cf294bde217044472d33c3161df249krajcevski        // the R11EAC indices directly correspond to pixel values.
612a10555a354cf294bde217044472d33c3161df249krajcevski        SkFAIL("Implement me!");
61345a0bf505914adf0ee8c69e2647230618bbb3a63krajcevski    }
614a10555a354cf294bde217044472d33c3161df249krajcevski#endif
61545a0bf505914adf0ee8c69e2647230618bbb3a63krajcevski};
61645a0bf505914adf0ee8c69e2647230618bbb3a63krajcevski
6176c354881b63935626a0700366937530d38b8b1e8krajcevski////////////////////////////////////////////////////////////////////////////////
6186c354881b63935626a0700366937530d38b8b1e8krajcevski
6196c354881b63935626a0700366937530d38b8b1e8krajcevskinamespace SkTextureCompressor {
6206c354881b63935626a0700366937530d38b8b1e8krajcevski
6216c354881b63935626a0700366937530d38b8b1e8krajcevskibool CompressA8ToR11EAC(uint8_t* dst, const uint8_t* src, int width, int height, int rowBytes) {
6226c354881b63935626a0700366937530d38b8b1e8krajcevski
6236c354881b63935626a0700366937530d38b8b1e8krajcevski#if (COMPRESS_R11_EAC_SLOW) || (COMPRESS_R11_EAC_FAST)
6246c354881b63935626a0700366937530d38b8b1e8krajcevski
6256c354881b63935626a0700366937530d38b8b1e8krajcevski    return compress_4x4_a8_to_64bit(dst, src, width, height, rowBytes, compress_r11eac_block);
6266c354881b63935626a0700366937530d38b8b1e8krajcevski
6276c354881b63935626a0700366937530d38b8b1e8krajcevski#elif COMPRESS_R11_EAC_FASTEST
6286c354881b63935626a0700366937530d38b8b1e8krajcevski
6296c354881b63935626a0700366937530d38b8b1e8krajcevski    return compress_a8_to_r11eac_fast(dst, src, width, height, rowBytes);
6306c354881b63935626a0700366937530d38b8b1e8krajcevski
6316c354881b63935626a0700366937530d38b8b1e8krajcevski#else
6326c354881b63935626a0700366937530d38b8b1e8krajcevski#error "Must choose R11 EAC algorithm"
6336c354881b63935626a0700366937530d38b8b1e8krajcevski#endif
6346c354881b63935626a0700366937530d38b8b1e8krajcevski}
6356c354881b63935626a0700366937530d38b8b1e8krajcevski
636b8ccc2f6d258a8466f79fc418e9e0a55aeaf58cekrajcevskiSkBlitter* CreateR11EACBlitter(int width, int height, void* outputBuffer,
637b8ccc2f6d258a8466f79fc418e9e0a55aeaf58cekrajcevski                               SkTBlitterAllocator* allocator) {
638b8ccc2f6d258a8466f79fc418e9e0a55aeaf58cekrajcevski
639b8ccc2f6d258a8466f79fc418e9e0a55aeaf58cekrajcevski    if ((width % 4) != 0 || (height % 4) != 0) {
640b8ccc2f6d258a8466f79fc418e9e0a55aeaf58cekrajcevski        return NULL;
641b8ccc2f6d258a8466f79fc418e9e0a55aeaf58cekrajcevski    }
642b8ccc2f6d258a8466f79fc418e9e0a55aeaf58cekrajcevski
643b8ccc2f6d258a8466f79fc418e9e0a55aeaf58cekrajcevski    // Memset the output buffer to an encoding that decodes to zero. We must do this
644b8ccc2f6d258a8466f79fc418e9e0a55aeaf58cekrajcevski    // in order to avoid having uninitialized values in the buffer if the blitter
645b8ccc2f6d258a8466f79fc418e9e0a55aeaf58cekrajcevski    // decides not to write certain scanlines (and skip entire rows of blocks).
646b8ccc2f6d258a8466f79fc418e9e0a55aeaf58cekrajcevski    // In the case of R11, we use the encoding from recognizing all zero pixels from above.
647b8ccc2f6d258a8466f79fc418e9e0a55aeaf58cekrajcevski    const int nBlocks = (width * height / 16);  // 4x4 pixel blocks.
648b8ccc2f6d258a8466f79fc418e9e0a55aeaf58cekrajcevski    uint64_t *dst = reinterpret_cast<uint64_t *>(outputBuffer);
649b8ccc2f6d258a8466f79fc418e9e0a55aeaf58cekrajcevski    for (int i = 0; i < nBlocks; ++i) {
650b8ccc2f6d258a8466f79fc418e9e0a55aeaf58cekrajcevski        *dst = 0x0020000000002000ULL;
651b8ccc2f6d258a8466f79fc418e9e0a55aeaf58cekrajcevski        ++dst;
652b8ccc2f6d258a8466f79fc418e9e0a55aeaf58cekrajcevski    }
653b8ccc2f6d258a8466f79fc418e9e0a55aeaf58cekrajcevski
654b8ccc2f6d258a8466f79fc418e9e0a55aeaf58cekrajcevski    return allocator->createT<
65545a0bf505914adf0ee8c69e2647230618bbb3a63krajcevski        SkTCompressedAlphaBlitter<4, 8, CompressorR11EAC>, int, int, void*>
656d5e46c7893afdd5976c1581a2ae81168252f5deckrajcevski        (width, height, outputBuffer);
6576c354881b63935626a0700366937530d38b8b1e8krajcevski}
6586c354881b63935626a0700366937530d38b8b1e8krajcevski
6594ad76e35111585f4da662d54943f23792dd1e0aekrajcevskivoid DecompressR11EAC(uint8_t* dst, int dstRowBytes, const uint8_t* src, int width, int height) {
6604ad76e35111585f4da662d54943f23792dd1e0aekrajcevski    for (int j = 0; j < height; j += 4) {
6614ad76e35111585f4da662d54943f23792dd1e0aekrajcevski        for (int i = 0; i < width; i += 4) {
6624ad76e35111585f4da662d54943f23792dd1e0aekrajcevski            decompress_r11_eac_block(dst + i, dstRowBytes, src);
6634ad76e35111585f4da662d54943f23792dd1e0aekrajcevski            src += 8;
6644ad76e35111585f4da662d54943f23792dd1e0aekrajcevski        }
6654ad76e35111585f4da662d54943f23792dd1e0aekrajcevski        dst += 4 * dstRowBytes;
6664ad76e35111585f4da662d54943f23792dd1e0aekrajcevski    }
6674ad76e35111585f4da662d54943f23792dd1e0aekrajcevski}
6684ad76e35111585f4da662d54943f23792dd1e0aekrajcevski
6696c354881b63935626a0700366937530d38b8b1e8krajcevski}  // namespace SkTextureCompressor
670