1b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru/* 2b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *************************************************************************** 3b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * Copyright (C) 2008-2009, International Business Machines Corporation 4b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * and others. All Rights Reserved. 5b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *************************************************************************** 6b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * file name: uspoof_build.cpp 7b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * encoding: US-ASCII 8b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * tab size: 8 (not used) 9b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * indentation:4 10b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * 11b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * created on: 2008 Dec 8 12b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * created by: Andy Heninger 13b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * 14b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * Unicode Spoof Detection Data Builder 15b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * Builder-related functions are kept in separate files so that applications not needing 16b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * the builder can more easily exclude them, typically by means of static linking. 17b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * 18b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * There are three relatively independent sets of Spoof data, 19b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * Confusables, 20b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * Whole Script Confusables 21b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * ID character extensions. 22b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * 23b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * The data tables for each are built separately, each from its own definitions 24b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru */ 25b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 26b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/utypes.h" 27b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/uspoof.h" 28b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/unorm.h" 29b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/uregex.h" 30b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/ustring.h" 31b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "cmemory.h" 32b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "uspoof_impl.h" 33b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "uhash.h" 34b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "uvector.h" 35b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "uassert.h" 36b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "uarrsort.h" 3750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "uspoof_conf.h" 3850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "uspoof_wsconf.h" 39b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 40b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#if !UCONFIG_NO_NORMALIZATION 41b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 42b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_NAMESPACE_USE 43b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 44b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 45b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru// The main data building function 46b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 47b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI USpoofChecker * U_EXPORT2 48b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruuspoof_openFromSource(const char *confusables, int32_t confusablesLen, 49b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru const char *confusablesWholeScript, int32_t confusablesWholeScriptLen, 50b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru int32_t *errorType, UParseError *pe, UErrorCode *status) { 51b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 52b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (U_FAILURE(*status)) { 53b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return NULL; 54b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 55b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#if UCONFIG_NO_REGULAR_EXPRESSIONS 56b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *status = U_UNSUPPORTED_ERROR; 57b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return NULL; 58b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#else 59b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (errorType!=NULL) { 60b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *errorType = 0; 61b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 62b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (pe != NULL) { 63b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru pe->line = 0; 64b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru pe->offset = 0; 65b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru pe->preContext[0] = 0; 66b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru pe->postContext[0] = 0; 67b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 68b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 69b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Set up a shell of a spoof detector, with empty data. 70b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru SpoofData *newSpoofData = new SpoofData(*status); 71b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru SpoofImpl *This = new SpoofImpl(newSpoofData, *status); 72b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 73b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru // Compile the binary data from the source (text) format. 74b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ConfusabledataBuilder::buildConfusableData(This, confusables, confusablesLen, errorType, pe, *status); 75b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru buildWSConfusableData(This, confusablesWholeScript, confusablesWholeScriptLen, pe, *status); 76b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 77b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru if (U_FAILURE(*status)) { 78b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru delete This; 79b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru This = NULL; 80b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru } 81b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru return (USpoofChecker *)This; 82b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#endif // UCONFIG_NO_REGULAR_EXPRESSIONS 83b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru} 84b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru 85b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#endif 86