1// Copyright (C) 2016 and later: Unicode, Inc. and others. 2// License & terms of use: http://www.unicode.org/copyright.html 3/* 4 ******************************************************************************* 5 * Copyright (C) 2003-2013, International Business Machines 6 * Corporation and others. All Rights Reserved. 7 ******************************************************************************* 8 * file name: ucm.h 9 * encoding: US-ASCII 10 * tab size: 8 (not used) 11 * indentation:4 12 * 13 * created on: 2003jun20 14 * created by: Markus W. Scherer 15 * 16 * Definitions for the .ucm file parser and handler module ucm.c. 17 */ 18 19#ifndef __UCM_H__ 20#define __UCM_H__ 21 22#include "unicode/utypes.h" 23#include "ucnvmbcs.h" 24#include "ucnv_ext.h" 25#include "filestrm.h" 26#include <stdio.h> 27 28#if !UCONFIG_NO_CONVERSION 29 30U_CDECL_BEGIN 31 32/* constants for UCMapping.moveFlag */ 33enum { 34 UCM_MOVE_TO_EXT=1, 35 UCM_REMOVE_MAPPING=2 36}; 37 38/* 39 * Per-mapping data structure 40 * 41 * u if uLen==1: Unicode code point 42 * else index to uLen code points 43 * b if bLen<=4: up to 4 bytes 44 * else index to bLen bytes 45 * uLen number of code points 46 * bLen number of words containing left-justified bytes 47 * bIsMultipleChars indicates that the bytes contain more than one sequence 48 * according to the state table 49 * f flag for roundtrip (0), fallback (1), sub mapping (2), reverse fallback (3) 50 * or "good one-way" mapping (4). 51 * Same values as in the source file after | 52 */ 53typedef struct UCMapping { 54 UChar32 u; 55 union { 56 uint32_t idx; 57 uint8_t bytes[4]; 58 } b; 59 int8_t uLen, bLen, f, moveFlag; 60} UCMapping; 61 62/* constants for UCMTable.flagsType */ 63enum { 64 UCM_FLAGS_INITIAL, /* no mappings parsed yet */ 65 UCM_FLAGS_EXPLICIT, /* .ucm file has mappings with | fallback indicators */ 66 UCM_FLAGS_IMPLICIT, /* .ucm file has mappings without | fallback indicators, later wins */ 67 UCM_FLAGS_MIXED /* both implicit and explicit */ 68}; 69 70typedef struct UCMTable { 71 UCMapping *mappings; 72 int32_t mappingsCapacity, mappingsLength; 73 74 UChar32 *codePoints; 75 int32_t codePointsCapacity, codePointsLength; 76 77 uint8_t *bytes; 78 int32_t bytesCapacity, bytesLength; 79 80 /* index map for mapping by bytes first */ 81 int32_t *reverseMap; 82 83 uint8_t unicodeMask; 84 int8_t flagsType; /* UCM_FLAGS_INITIAL etc. */ 85 UBool isSorted; 86} UCMTable; 87 88enum { 89 MBCS_STATE_FLAG_DIRECT=1, 90 MBCS_STATE_FLAG_SURROGATES, 91 92 MBCS_STATE_FLAG_READY=16 93}; 94 95typedef struct UCMStates { 96 int32_t stateTable[MBCS_MAX_STATE_COUNT][256]; 97 uint32_t stateFlags[MBCS_MAX_STATE_COUNT], 98 stateOffsetSum[MBCS_MAX_STATE_COUNT]; 99 100 int32_t countStates, minCharLength, maxCharLength, countToUCodeUnits; 101 int8_t conversionType, outputType; 102} UCMStates; 103 104typedef struct UCMFile { 105 UCMTable *base, *ext; 106 UCMStates states; 107 108 char baseName[UCNV_MAX_CONVERTER_NAME_LENGTH]; 109} UCMFile; 110 111/* simple accesses ---------------------------------------------------------- */ 112 113#define UCM_GET_CODE_POINTS(t, m) \ 114 (((m)->uLen==1) ? &(m)->u : (t)->codePoints+(m)->u) 115 116#define UCM_GET_BYTES(t, m) \ 117 (((m)->bLen<=4) ? (m)->b.bytes : (t)->bytes+(m)->b.idx) 118 119/* APIs --------------------------------------------------------------------- */ 120 121U_CAPI UCMFile * U_EXPORT2 122ucm_open(void); 123 124U_CAPI void U_EXPORT2 125ucm_close(UCMFile *ucm); 126 127U_CAPI UBool U_EXPORT2 128ucm_parseHeaderLine(UCMFile *ucm, 129 char *line, char **pKey, char **pValue); 130 131/* @return -1 illegal bytes 0 suitable for base table 1 needs to go into extension table */ 132U_CAPI int32_t U_EXPORT2 133ucm_mappingType(UCMStates *baseStates, 134 UCMapping *m, 135 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 136 uint8_t bytes[UCNV_EXT_MAX_BYTES]); 137 138/* add a mapping to the base or extension table as appropriate */ 139U_CAPI UBool U_EXPORT2 140ucm_addMappingAuto(UCMFile *ucm, UBool forBase, UCMStates *baseStates, 141 UCMapping *m, 142 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 143 uint8_t bytes[UCNV_EXT_MAX_BYTES]); 144 145U_CAPI UBool U_EXPORT2 146ucm_addMappingFromLine(UCMFile *ucm, const char *line, UBool forBase, UCMStates *baseStates); 147 148 149U_CAPI UCMTable * U_EXPORT2 150ucm_openTable(void); 151 152U_CAPI void U_EXPORT2 153ucm_closeTable(UCMTable *table); 154 155U_CAPI void U_EXPORT2 156ucm_resetTable(UCMTable *table); 157 158U_CAPI void U_EXPORT2 159ucm_sortTable(UCMTable *t); 160 161/* 162 * Remove mappings with their move flag set from the base table 163 * and move some of them (with UCM_MOVE_TO_EXT) to the extension table. 164 */ 165U_CAPI void U_EXPORT2 166ucm_moveMappings(UCMTable *base, UCMTable *ext); 167 168/** 169 * Read a table from a .ucm file, from after the CHARMAP line to 170 * including the END CHARMAP line. 171 */ 172U_CAPI void U_EXPORT2 173ucm_readTable(UCMFile *ucm, FileStream* convFile, 174 UBool forBase, UCMStates *baseStates, 175 UErrorCode *pErrorCode); 176 177/** 178 * Check the validity of mappings against a base table's states; 179 * necessary for extension-only tables that were read before their base tables. 180 */ 181U_CAPI UBool U_EXPORT2 182ucm_checkValidity(UCMTable *ext, UCMStates *baseStates); 183 184/** 185 * Check a base table against an extension table. 186 * Set the moveTarget!=NULL if it is possible to move mappings from the base. 187 * This is the case where base and extension tables are parsed from a single file 188 * (moveTarget==ext) 189 * or when delta file mappings are subtracted from a base table. 190 * 191 * When a base table cannot be modified because a delta file is parsed in makeconv, 192 * then set moveTarget=NULL. 193 * 194 * if(intersectBase) then mappings that exist in the base table but not in 195 * the extension table are moved to moveTarget instead of showing an error. 196 * 197 * Special mode: 198 * If intersectBase==2 for a DBCS extension table, then SBCS mappings are 199 * not moved out of the base unless their Unicode input requires it. 200 * This helps ucmkbase generate base tables for DBCS-only extension .cnv files. 201 * 202 * For both tables in the same file, the extension table is automatically 203 * built. 204 * For separate files, the extension file can use a complete mapping table (.ucm file), 205 * so that common mappings need not be stripped out manually. 206 * 207 * 208 * Sort both tables, and then for each mapping direction: 209 * 210 * If intersectBase is TRUE and the base table contains a mapping 211 * that does not exist in the extension table, then this mapping is moved 212 * to moveTarget. 213 * 214 * - otherwise - 215 * 216 * If the base table contains a mapping for which the input sequence is 217 * the same as the extension input, then 218 * - if the output is the same: remove the extension mapping 219 * - else: error 220 * 221 * If the base table contains a mapping for which the input sequence is 222 * a prefix of the extension input, then 223 * - if moveTarget!=NULL: move the base mapping to the moveTarget table 224 * - else: error 225 * 226 * @return FALSE in case of an irreparable error 227 */ 228U_CAPI UBool U_EXPORT2 229ucm_checkBaseExt(UCMStates *baseStates, UCMTable *base, UCMTable *ext, 230 UCMTable *moveTarget, UBool intersectBase); 231 232U_CAPI void U_EXPORT2 233ucm_printTable(UCMTable *table, FILE *f, UBool byUnicode); 234 235U_CAPI void U_EXPORT2 236ucm_printMapping(UCMTable *table, UCMapping *m, FILE *f); 237 238 239U_CAPI void U_EXPORT2 240ucm_addState(UCMStates *states, const char *s); 241 242U_CAPI void U_EXPORT2 243ucm_processStates(UCMStates *states, UBool ignoreSISOCheck); 244 245U_CAPI int32_t U_EXPORT2 246ucm_countChars(UCMStates *states, 247 const uint8_t *bytes, int32_t length); 248 249 250U_CAPI int8_t U_EXPORT2 251ucm_parseBytes(uint8_t bytes[UCNV_EXT_MAX_BYTES], const char *line, const char **ps); 252 253U_CAPI UBool U_EXPORT2 254ucm_parseMappingLine(UCMapping *m, 255 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 256 uint8_t bytes[UCNV_EXT_MAX_BYTES], 257 const char *line); 258 259U_CAPI void U_EXPORT2 260ucm_addMapping(UCMTable *table, 261 UCMapping *m, 262 UChar32 codePoints[UCNV_EXT_MAX_UCHARS], 263 uint8_t bytes[UCNV_EXT_MAX_BYTES]); 264 265/* very makeconv-specific functions ----------------------------------------- */ 266 267/* finalize and optimize states after the toUnicode mappings are processed */ 268U_CAPI void U_EXPORT2 269ucm_optimizeStates(UCMStates *states, 270 uint16_t **pUnicodeCodeUnits, 271 _MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 272 UBool verbose); 273 274/* moved here because it is used inside ucmstate.c */ 275U_CAPI int32_t U_EXPORT2 276ucm_findFallback(_MBCSToUFallback *toUFallbacks, int32_t countToUFallbacks, 277 uint32_t offset); 278 279/* very rptp2ucm-specific functions ----------------------------------------- */ 280 281/* 282 * Input: Separate tables with mappings from/to Unicode, 283 * subchar and subchar1 (0 if none). 284 * All mappings must have flag 0. 285 * 286 * Output: fromUTable will contain the union of mappings with the correct 287 * precision flags, and be sorted. 288 */ 289U_CAPI void U_EXPORT2 290ucm_mergeTables(UCMTable *fromUTable, UCMTable *toUTable, 291 const uint8_t *subchar, int32_t subcharLength, 292 uint8_t subchar1); 293 294U_CAPI UBool U_EXPORT2 295ucm_separateMappings(UCMFile *ucm, UBool isSISO); 296 297U_CDECL_END 298 299#endif 300 301#endif 302 303