185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho/*
285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho******************************************************************************
385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho*
485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho*   Copyright (C) 2001-2008, International Business Machines
585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho*   Corporation and others.  All Rights Reserved.
685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho*
785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho******************************************************************************
885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho*   file name:  utrie2_impl.h
985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho*   encoding:   US-ASCII
1085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho*   tab size:   8 (not used)
1185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho*   indentation:4
1285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho*
1385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho*   created on: 2008sep26 (split off from utrie2.c)
1485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho*   created by: Markus W. Scherer
1585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho*
1685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho*   Definitions needed for both runtime and builder code for UTrie2,
1785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho*   used by utrie2.c and utrie2_builder.c.
1885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho*/
1985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
2085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho#ifndef __UTRIE2_IMPL_H__
2185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho#define __UTRIE2_IMPL_H__
2285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
2385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho#include "utrie2.h"
2485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
2585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho/* Public UTrie2 API implementation ----------------------------------------- */
2685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
2785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho/*
2885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * These definitions are mostly needed by utrie2.c,
2985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * but also by utrie2_serialize() and utrie2_swap().
3085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho */
3185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
3285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho/*
3385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * UTrie and UTrie2 signature values,
3485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * in platform endianness and opposite endianness.
3585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho */
3685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho#define UTRIE_SIG       0x54726965
3785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho#define UTRIE_OE_SIG    0x65697254
3885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
3985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho#define UTRIE2_SIG      0x54726932
4085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho#define UTRIE2_OE_SIG   0x32697254
4185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
4285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho/**
4385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * Trie data structure in serialized form:
4485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho *
4585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * UTrie2Header header;
4685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * uint16_t index[header.index2Length];
4785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * uint16_t data[header.shiftedDataLength<<2];  -- or uint32_t data[...]
4885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * @internal
4985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho */
5085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Hotypedef struct UTrie2Header {
5185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    /** "Tri2" in big-endian US-ASCII (0x54726932) */
5285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    uint32_t signature;
5385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
5485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    /**
5585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * options bit field:
5685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * 15.. 4   reserved (0)
5785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     *  3.. 0   UTrie2ValueBits valueBits
5885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     */
5985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    uint16_t options;
6085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
6185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    /** UTRIE2_INDEX_1_OFFSET..UTRIE2_MAX_INDEX_LENGTH */
6285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    uint16_t indexLength;
6385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
6485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    /** (UTRIE2_DATA_START_OFFSET..UTRIE2_MAX_DATA_LENGTH)>>UTRIE2_INDEX_SHIFT */
6585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    uint16_t shiftedDataLength;
6685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
6785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    /** Null index and data blocks, not shifted. */
6885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    uint16_t index2NullOffset, dataNullOffset;
6985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
7085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    /**
7185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * First code point of the single-value range ending with U+10ffff,
7285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * rounded up and then shifted right by UTRIE2_SHIFT_1.
7385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     */
7485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    uint16_t shiftedHighStart;
7585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho} UTrie2Header;
7685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
7785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho/**
7885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * Constants for use with UTrie2Header.options.
7985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * @internal
8085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho */
8185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Hoenum {
8285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    /** Mask to get the UTrie2ValueBits valueBits from options. */
8385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    UTRIE2_OPTIONS_VALUE_BITS_MASK=0xf
8485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho};
8585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
8685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho/* Building a trie ---------------------------------------------------------- */
8785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
8885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho/*
8985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * These definitions are mostly needed by utrie2_builder.c, but also by
9085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * utrie2_get32() and utrie2_enum().
9185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho */
9285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
9385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Hoenum {
9485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    /**
9585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * At build time, leave a gap in the index-2 table,
9685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * at least as long as the maximum lengths of the 2-byte UTF-8 index-2 table
9785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * and the supplementary index-1 table.
9885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * Round up to UTRIE2_INDEX_2_BLOCK_LENGTH for proper compacting.
9985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     */
10085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    UNEWTRIE2_INDEX_GAP_OFFSET=UTRIE2_INDEX_2_BMP_LENGTH,
10185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    UNEWTRIE2_INDEX_GAP_LENGTH=
10285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        ((UTRIE2_UTF8_2B_INDEX_2_LENGTH+UTRIE2_MAX_INDEX_1_LENGTH)+UTRIE2_INDEX_2_MASK)&
10385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        ~UTRIE2_INDEX_2_MASK,
10485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
10585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    /**
10685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * Maximum length of the build-time index-2 array.
10785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * Maximum number of Unicode code points (0x110000) shifted right by UTRIE2_SHIFT_2,
10885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * plus the part of the index-2 table for lead surrogate code points,
10985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * plus the build-time index gap,
11085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * plus the null index-2 block.
11185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     */
11285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    UNEWTRIE2_MAX_INDEX_2_LENGTH=
11385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        (0x110000>>UTRIE2_SHIFT_2)+
11485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        UTRIE2_LSCP_INDEX_2_LENGTH+
11585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        UNEWTRIE2_INDEX_GAP_LENGTH+
11685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho        UTRIE2_INDEX_2_BLOCK_LENGTH,
11785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
11885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    UNEWTRIE2_INDEX_1_LENGTH=0x110000>>UTRIE2_SHIFT_1
11985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho};
12085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
12185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho/**
12285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * Maximum length of the build-time data array.
12385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * One entry per 0x110000 code points, plus the illegal-UTF-8 block and the null block,
12485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * plus values for the 0x400 surrogate code units.
12585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho */
12685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho#define UNEWTRIE2_MAX_DATA_LENGTH (0x110000+0x40+0x40+0x400)
12785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
12885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho/*
12985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * Build-time trie structure.
13085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho *
13185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * Just using a boolean flag for "repeat use" could lead to data array overflow
13285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * because we would not be able to detect when a data block becomes unused.
13385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * It also leads to orphan data blocks that are kept through serialization.
13485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho *
13585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * Need to use reference counting for data blocks,
13685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * and allocDataBlock() needs to look for a free block before increasing dataLength.
13785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho *
13885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * This scheme seems like overkill for index-2 blocks since the whole index array is
13985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * preallocated anyway (unlike the growable data array).
14085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho * Just allocating multiple index-2 blocks as needed.
14185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho */
14285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Hostruct UNewTrie2 {
14385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    int32_t index1[UNEWTRIE2_INDEX_1_LENGTH];
14485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    int32_t index2[UNEWTRIE2_MAX_INDEX_2_LENGTH];
14585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    uint32_t *data;
14685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
14785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    uint32_t initialValue, errorValue;
14885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    int32_t index2Length, dataCapacity, dataLength;
14985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    int32_t firstFreeBlock;
15085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    int32_t index2NullOffset, dataNullOffset;
15185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    UChar32 highStart;
15285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    UBool isCompacted;
15385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
15485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    /**
15585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * Multi-purpose per-data-block table.
15685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     *
15785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * Before compacting:
15885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     *
15985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * Per-data-block reference counters/free-block list.
16085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     *  0: unused
16185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * >0: reference counter (number of index-2 entries pointing here)
16285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * <0: next free data block in free-block list
16385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     *
16485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * While compacting:
16585bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     *
16685bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * Map of adjusted indexes, used in compactData() and compactIndex2().
16785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     * Maps from original indexes to new ones.
16885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho     */
16985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho    int32_t map[UNEWTRIE2_MAX_DATA_LENGTH>>UTRIE2_SHIFT_2];
17085bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho};
17185bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho
17285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho#endif
173