154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius/* 254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius******************************************************************************* 359d709d503bab6e2b61931737e662dd293b40578ccornelius* Copyright (C) 2013, International Business Machines 454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius* Corporation and others. All Rights Reserved. 554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius******************************************************************************* 654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius* dictionarydata.h 754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius* 854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius* created on: 2012may31 954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius* created by: Markus W. Scherer & Maxime Serrano 1054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius*/ 1154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 1254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#ifndef __DICTIONARYDATA_H__ 1354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#define __DICTIONARYDATA_H__ 1454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 1554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#include "unicode/utypes.h" 1654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 1754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#if !UCONFIG_NO_BREAK_ITERATION 1854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 1954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#include "unicode/utext.h" 2054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#include "unicode/udata.h" 2154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#include "udataswp.h" 2254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#include "unicode/uobject.h" 2354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#include "unicode/ustringtrie.h" 2454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 2554dcd9b6a06071f647dac967e9e267abb9410720Craig CorneliusU_NAMESPACE_BEGIN 2654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 2754dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusclass UCharsTrie; 2854dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusclass BytesTrie; 2954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 3054dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusclass U_COMMON_API DictionaryData : public UMemory { 3154dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliuspublic: 3259d709d503bab6e2b61931737e662dd293b40578ccornelius static const int32_t TRIE_TYPE_BYTES; // = 0; 3359d709d503bab6e2b61931737e662dd293b40578ccornelius static const int32_t TRIE_TYPE_UCHARS; // = 1; 3459d709d503bab6e2b61931737e662dd293b40578ccornelius static const int32_t TRIE_TYPE_MASK; // = 7; 3559d709d503bab6e2b61931737e662dd293b40578ccornelius static const int32_t TRIE_HAS_VALUES; // = 8; 3659d709d503bab6e2b61931737e662dd293b40578ccornelius 3759d709d503bab6e2b61931737e662dd293b40578ccornelius static const int32_t TRANSFORM_NONE; // = 0; 3859d709d503bab6e2b61931737e662dd293b40578ccornelius static const int32_t TRANSFORM_TYPE_OFFSET; // = 0x1000000; 3959d709d503bab6e2b61931737e662dd293b40578ccornelius static const int32_t TRANSFORM_TYPE_MASK; // = 0x7f000000; 4059d709d503bab6e2b61931737e662dd293b40578ccornelius static const int32_t TRANSFORM_OFFSET_MASK; // = 0x1fffff; 4154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 4254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius enum { 4354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Byte offsets from the start of the data, after the generic header. 4454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius IX_STRING_TRIE_OFFSET, 4554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius IX_RESERVED1_OFFSET, 4654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius IX_RESERVED2_OFFSET, 4754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius IX_TOTAL_SIZE, 4854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 4954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Trie type: TRIE_HAS_VALUES | TRIE_TYPE_BYTES etc. 5054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius IX_TRIE_TYPE, 5154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // Transform specification: TRANSFORM_TYPE_OFFSET | 0xe00 etc. 5254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius IX_TRANSFORM, 5354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 5454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius IX_RESERVED6, 5554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius IX_RESERVED7, 5654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius IX_COUNT 5754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius }; 5854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius}; 5954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 6054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius/** 6154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * Wrapper class around generic dictionaries, implementing matches(). 6254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * getType() should return a TRIE_TYPE_??? constant from DictionaryData. 6354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * 6454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * All implementations of this interface must be thread-safe if they are to be used inside of the 6554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * dictionary-based break iteration code. 6654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius */ 6754dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusclass U_COMMON_API DictionaryMatcher : public UMemory { 6854dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliuspublic: 6954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius virtual ~DictionaryMatcher(); 7054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // this should emulate CompactTrieDictionary::matches() 7154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius virtual int32_t matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, 7254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t limit, int32_t *values = NULL) const = 0; 7354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius /** @return DictionaryData::TRIE_TYPE_XYZ */ 7454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius virtual int32_t getType() const = 0; 7554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius}; 7654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 7754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius// Implementation of the DictionaryMatcher interface for a UCharsTrie dictionary 7854dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusclass U_COMMON_API UCharsDictionaryMatcher : public DictionaryMatcher { 7954dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliuspublic: 8054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // constructs a new UCharsDictionaryMatcher. 8154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // The UDataMemory * will be closed on this object's destruction. 8254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UCharsDictionaryMatcher(const UChar *c, UDataMemory *f) : characters(c), file(f) { } 8354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius virtual ~UCharsDictionaryMatcher(); 8454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius virtual int32_t matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, 8554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t limit, int32_t *values = NULL) const; 8654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius virtual int32_t getType() const; 8754dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusprivate: 8854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius const UChar *characters; 8954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UDataMemory *file; 9054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius}; 9154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 9254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius// Implementation of the DictionaryMatcher interface for a BytesTrie dictionary 9354dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusclass U_COMMON_API BytesDictionaryMatcher : public DictionaryMatcher { 9454dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliuspublic: 9554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // constructs a new BytesTrieDictionaryMatcher 9654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // the transform constant should be the constant read from the file, not a masked version! 9754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius // the UDataMemory * fed in here will be closed on this object's destruction 9854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius BytesDictionaryMatcher(const char *c, int32_t t, UDataMemory *f) 9954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius : characters(c), transformConstant(t), file(f) { } 10054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius virtual ~BytesDictionaryMatcher(); 10154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius virtual int32_t matches(UText *text, int32_t maxLength, int32_t *lengths, int32_t &count, 10254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t limit, int32_t *values = NULL) const; 10354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius virtual int32_t getType() const; 10454dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusprivate: 10554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UChar32 transform(UChar32 c) const; 10654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 10754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius const char *characters; 10854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius int32_t transformConstant; 10954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius UDataMemory *file; 11054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius}; 11154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 11254dcd9b6a06071f647dac967e9e267abb9410720Craig CorneliusU_NAMESPACE_END 11354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 11454dcd9b6a06071f647dac967e9e267abb9410720Craig CorneliusU_CAPI int32_t U_EXPORT2 11554dcd9b6a06071f647dac967e9e267abb9410720Craig Corneliusudict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData, UErrorCode *pErrorCode); 11654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 11754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius/** 11854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * Format of dictionary .dict data files. 11954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * Format version 1.0. 12054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * 12154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * A dictionary .dict data file contains a byte-serialized BytesTrie or 12254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * a UChars-serialized UCharsTrie. 12354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * Such files are used in dictionary-based break iteration (DBBI). 12454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * 12554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * For a BytesTrie, a transformation type is specified for 12654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * transforming Unicode strings into byte sequences. 12754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * 12854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * A .dict file begins with a standard ICU data file header 12954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * (DataHeader, see ucmndata.h and unicode/udata.h). 13054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * The UDataInfo.dataVersion field is currently unused (set to 0.0.0.0). 13154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * 13254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * After the header, the file contains the following parts. 13354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * Constants are defined in the DictionaryData class. 13454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * 13554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * For the data structure of BytesTrie & UCharsTrie see 13654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * http://site.icu-project.org/design/struct/tries 13754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * and the bytestrie.h and ucharstrie.h header files. 13854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * 13954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * int32_t indexes[indexesLength]; -- indexesLength=indexes[IX_STRING_TRIE_OFFSET]/4; 14054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * 14154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * The first four indexes are byte offsets in ascending order. 14254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * Each byte offset marks the start of the next part in the data file, 14354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * and the end of the previous one. 14454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * When two consecutive byte offsets are the same, then the corresponding part is empty. 14554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * Byte offsets are offsets from after the header, 14654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * that is, from the beginning of the indexes[]. 14754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * Each part starts at an offset with proper alignment for its data. 14854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * If necessary, the previous part may include padding bytes to achieve this alignment. 14954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * 15054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * trieType=indexes[IX_TRIE_TYPE] defines the trie type. 15154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * transform=indexes[IX_TRANSFORM] defines the Unicode-to-bytes transformation. 15254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * If the transformation type is TRANSFORM_TYPE_OFFSET, 15354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * then the lower 21 bits contain the offset code point. 15454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * Each code point c is mapped to byte b = (c - offset). 15554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * Code points outside the range offset..(offset+0xff) cannot be mapped 15654dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * and do not occur in the dictionary. 15754dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * 15854dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * stringTrie; -- a serialized BytesTrie or UCharsTrie 15954dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * 16054dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * The dictionary maps strings to specific values (TRIE_HAS_VALUES bit set in trieType), 16154dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius * or it maps all strings to 0 (TRIE_HAS_VALUES bit not set). 16254dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius */ 16354dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius 16454dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#endif /* !UCONFIG_NO_BREAK_ITERATION */ 16554dcd9b6a06071f647dac967e9e267abb9410720Craig Cornelius#endif /* __DICTIONARYDATA_H__ */ 166