ucm.h revision 85bf2e2fbc60a9f938064abc8127d61da7d19882
1/* 2 ******************************************************************************* 3 * Copyright (C) 2003-2009, International Business Machines 4 * Corporation and others. All Rights Reserved. 5 ******************************************************************************* 6 * file name: ucm.h 7 * encoding: US-ASCII 8 * tab size: 8 (not used) 9 * indentation:4 10 * 11 * created on: 2003jun20 12 * created by: Markus W. Scherer 13 * 14 * Definitions for the .ucm file parser and handler module ucm.c. 15 */ 16 17#ifndef __UCM_H__ 18#define __UCM_H__ 19 20#include "unicode/utypes.h" 21#include "ucnvmbcs.h" 22#include "ucnv_ext.h" 23#include "filestrm.h" 24#include <stdio.h> 25 26#if !UCONFIG_NO_CONVERSION 27 28U_CDECL_BEGIN 29 30/* constants for UCMapping.moveFlag */ 31enum { 32 UCM_MOVE_TO_EXT=1, 33 UCM_REMOVE_MAPPING=2 34}; 35 36/* 37 * Per-mapping data structure 38 * 39 * u if uLen==1: Unicode code point 40 * else index to uLen code points 41 * b if bLen<=4: up to 4 bytes 42 * else index to bLen bytes 43 * uLen number of code points 44 * bLen number of words containing left-justified bytes 45 * bIsMultipleChars indicates that the bytes contain more than one sequence 46 * according to the state table 47 * f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3) 48 * same values as in the source file after | 49 */ 50typedef struct UCMapping { 51 UChar32 u; 52 union { 53 uint32_t idx; 54 uint8_t bytes[4]; 55 } b; 56 int8_t uLen, bLen, f, moveFlag; 57} UCMapping; 58 59/* constants for UCMTable.flagsType */ 60enum { 61 UCM_FLAGS_INITIAL, /* no mappings parsed yet */ 62 UCM_FLAGS_EXPLICIT, /* .ucm file has mappings with | fallback indicators */ 63 UCM_FLAGS_IMPLICIT, /* .ucm file has mappings without | fallback indicators, later wins */ 64 UCM_FLAGS_MIXED /* both implicit and explicit */ 65}; 66 67typedef struct UCMTable { 68 UCMapping *mappings; 69 int32_t mappingsCapacity, mappingsLength; 70 71 UChar32 *codePoints; 72 int32_t codePointsCapacity, codePointsLength; 73 74 uint8_t *bytes; 75 int32_t bytesCapacity, bytesLength; 76 77 /* index map for mapping by bytes first */ 78 int32_t *reverseMap; 79 80 uint8_t unicodeMask; 81 int8_t flagsType; /* UCM_FLAGS_INITIAL etc. */ 82 UBool isSorted; 83} UCMTable; 84 85enum { 86 MBCS_STATE_FLAG_DIRECT=1, 87 MBCS_STATE_FLAG_SURROGATES, 88 89 MBCS_STATE_FLAG_READY=16 90}; 91 92typedef struct UCMStates { 93 int32_t stateTable[MBCS_MAX_STATE_COUNT][256]; 94 uint32_t stateFlags[MBCS_MAX_STATE_COUNT], 95 stateOffsetSum[MBCS_MAX_STATE_COUNT]; 96 97 int32_t countStates, minCharLength, maxCharLength, countToUCodeUnits; 98 int8_t conversionType, outputType; 99} UCMStates; 100 101typedef struct UCMFile { 102 UCMTable *base, *ext; 103 UCMStates states; 104 105 char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH]; 106} UCMFile; 107 108/* simple accesses ---------------------------------------------------------- */ 109 110#define UCM_GET_CODE_POINTS(t, m) \ 111 (((m)->uLen==1) ? &(m)->u : (t)->codePoints+(m)->u) 112 113#define UCM_GET_BYTES(t, m) \ 114 (((m)->bLen<=4) ? (m)->b.bytes : (t)->bytes+(m)->b.idx) 115 116/* APIs --------------------------------------------------------------------- */ 117 118U_CAPI UCMFile * U_EXPORT2 119ucm_open(void); 120 121U_CAPI void U_EXPORT2 122ucm_close(UCMFile *ucm); 123 124U_CAPI UBool U_EXPORT2 125ucm_parseHeaderLine(UCMFile *ucm, 126 char *line, char **pKey, char **pValue); 127 128/* @return -1 illegal bytes 0 suitable for base table 1 needs to go into extension table */ 129U_CAPI int32_t U_EXPORT2 130ucm_mappingType(UCMStates *baseStates, 131 UCMapping *m, 132 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 133 uint8_t bytes[UCNV_EXT_MAX_BYTES]); 134 135/* add a mapping to the base or extension table as appropriate */ 136U_CAPI UBool U_EXPORT2 137ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, 138 UCMapping *m, 139 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 140 uint8_t bytes[UCNV_EXT_MAX_BYTES]); 141 142U_CAPI UBool U_EXPORT2 143ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates); 144 145 146U_CAPI UCMTable * U_EXPORT2 147ucm_openTable(void); 148 149U_CAPI void U_EXPORT2 150ucm_closeTable(UCMTable *table); 151 152U_CAPI void U_EXPORT2 153ucm_resetTable(UCMTable *table); 154 155U_CAPI void U_EXPORT2 156ucm_sortTable(UCMTable *t); 157 158/* 159 * Remove mappings with their move flag set from the base table 160 * and move some of them (with UCM_MOVE_TO_EXT) to the extension table. 161 */ 162U_CAPI void U_EXPORT2 163ucm_moveMappings(UCMTable *base, UCMTable *ext); 164 165/** 166 * Read a table from a .ucm file, from after the CHARMAP line to 167 * including the END CHARMAP line. 168 */ 169U_CAPI void U_EXPORT2 170ucm_readTable(UCMFile *ucm, FileStream* convFile, 171 UBool forBase, UCMStates *baseStates, 172 UErrorCode *pErrorCode); 173 174/** 175 * Check the validity of mappings against a base table's states; 176 * necessary for extension-only tables that were read before their base tables. 177 */ 178U_CAPI UBool U_EXPORT2 179ucm_checkValidity(UCMTable *ext, UCMStates *baseStates); 180 181/** 182 * Check a base table against an extension table. 183 * Set the moveTarget!=NULL if it is possible to move mappings from the base. 184 * This is the case where base and extension tables are parsed from a single file 185 * (moveTarget==ext) 186 * or when delta file mappings are subtracted from a base table. 187 * 188 * When a base table cannot be modified because a delta file is parsed in makeconv, 189 * then set moveTarget=NULL. 190 * 191 * if(intersectBase) then mappings that exist in the base table but not in 192 * the extension table are moved to moveTarget instead of showing an error. 193 * 194 * Special mode: 195 * If intersectBase==2 for a DBCS extension table, then SBCS mappings are 196 * not moved out of the base unless their Unicode input requires it. 197 * This helps ucmkbase generate base tables for DBCS-only extension .cnv files. 198 * 199 * For both tables in the same file, the extension table is automatically 200 * built. 201 * For separate files, the extension file can use a complete mapping table (.ucm file), 202 * so that common mappings need not be stripped out manually. 203 * 204 * 205 * Sort both tables, and then for each mapping direction: 206 * 207 * If intersectBase is TRUE and the base table contains a mapping 208 * that does not exist in the extension table, then this mapping is moved 209 * to moveTarget. 210 * 211 * - otherwise - 212 * 213 * If the base table contains a mapping for which the input sequence is 214 * the same as the extension input, then 215 * - if the output is the same: remove the extension mapping 216 * - else: error 217 * 218 * If the base table contains a mapping for which the input sequence is 219 * a prefix of the extension input, then 220 * - if moveTarget!=NULL: move the base mapping to the moveTarget table 221 * - else: error 222 * 223 * @return FALSE in case of an irreparable error 224 */ 225U_CAPI UBool U_EXPORT2 226ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, 227 UCMTable *moveTarget, UBool intersectBase); 228 229U_CAPI void U_EXPORT2 230ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode); 231 232U_CAPI void U_EXPORT2 233ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f); 234 235 236U_CAPI void U_EXPORT2 237ucm_addState(UCMStates *states, const char *s); 238 239U_CAPI void U_EXPORT2 240ucm_processStates(UCMStates *states); 241 242U_CAPI int32_t U_EXPORT2 243ucm_countChars(UCMStates *states, 244 const uint8_t *bytes, int32_t length); 245 246 247U_CAPI int8_t U_EXPORT2 248ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps); 249 250U_CAPI UBool U_EXPORT2 251ucm_parseMappingLine(UCMapping *m, 252 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 253 uint8_t bytes[UCNV_EXT_MAX_BYTES], 254 const char *line); 255 256U_CAPI void U_EXPORT2 257ucm_addMapping(UCMTable *table, 258 UCMapping *m, 259 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 260 uint8_t bytes[UCNV_EXT_MAX_BYTES]); 261 262/* very makeconv-specific functions ----------------------------------------- */ 263 264/* finalize and optimize states after the toUnicode mappings are processed */ 265U_CAPI void U_EXPORT2 266ucm_optimizeStates(UCMStates *states, 267 uint16_t **pUnicodeCodeUnits, 268 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 269 UBool verbose); 270 271/* moved here because it is used inside ucmstate.c */ 272U_CAPI int32_t U_EXPORT2 273ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 274 uint32_t offset); 275 276/* very rptp2ucm-specific functions ----------------------------------------- */ 277 278/* 279 * Input: Separate tables with mappings from/to Unicode, 280 * subchar and subchar1 (0 if none). 281 * All mappings must have flag 0. 282 * 283 * Output: fromUTable will contain the union of mappings with the correct 284 * precision flags, and be sorted. 285 */ 286U_CAPI void U_EXPORT2 287ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, 288 const uint8_t *subchar, int32_t subcharLength, 289 uint8_t subchar1); 290 291U_CAPI UBool U_EXPORT2 292ucm_separateMappings(UCMFile *ucm, UBool isSISO); 293 294U_CDECL_END 295 296#endif 297 298#endif 299 300