1b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru/*
2b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ***************************************************************************
3b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * Copyright (C) 2008-2009, International Business Machines Corporation
4b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru * and others. All Rights Reserved.
5b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru ***************************************************************************
6b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *   file name:  uspoof_build.cpp
7b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *   encoding:   US-ASCII
8b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *   tab size:   8 (not used)
9b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *   indentation:4
10b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *
11b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *   created on: 2008 Dec 8
12b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *   created by: Andy Heninger
13b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *
14b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *   Unicode Spoof Detection Data Builder
15b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *   Builder-related functions are kept in separate files so that applications not needing
16b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *   the builder can more easily exclude them, typically by means of static linking.
17b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *
18b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *   There are three relatively independent sets of Spoof data:
19b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *      Confusables,
20b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *      Whole Script Confusables,
21b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *      ID character extensions.
22b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *
23b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru *   The data tables for each are built separately, each from its own definitions.
24b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru */
25b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
26b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/utypes.h"
27b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/uspoof.h"
28b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/unorm.h"
29b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/uregex.h"
30b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "unicode/ustring.h"
31b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "cmemory.h"
32b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "uspoof_impl.h"
33b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "uhash.h"
34b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "uvector.h"
35b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "uassert.h"
36b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#include "uarrsort.h"
3750294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "uspoof_conf.h"
3850294ead5e5d23f5bbfed76e00e6b510bd41eee1claireho#include "uspoof_wsconf.h"
39b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
40b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#if !UCONFIG_NO_NORMALIZATION
41b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
42b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_NAMESPACE_USE
43b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
44b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
45b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru// The main data building function
46b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
47b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste QueruU_CAPI USpoofChecker * U_EXPORT2
48b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queruuspoof_openFromSource(const char *confusables,  int32_t confusablesLen,
49b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                      const char *confusablesWholeScript, int32_t confusablesWholeScriptLen,
50b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru                      int32_t *errorType, UParseError *pe, UErrorCode *status) {
51b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
52b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (U_FAILURE(*status)) {
53b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        return NULL;
54b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
55b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#if UCONFIG_NO_REGULAR_EXPRESSIONS
56b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    *status = U_UNSUPPORTED_ERROR;
57b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    return NULL;
58b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#else
59b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (errorType!=NULL) {
60b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        *errorType = 0;
61b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
62b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (pe != NULL) {
63b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        pe->line = 0;
64b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        pe->offset = 0;
65b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        pe->preContext[0] = 0;
66b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        pe->postContext[0] = 0;
67b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
68b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
69b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    // Set up a shell of a spoof detector, with empty data.
70b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    SpoofData *newSpoofData = new SpoofData(*status);
71b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    SpoofImpl *This = new SpoofImpl(newSpoofData, *status);
72b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
73b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    // Compile the binary data from the source (text) format.
74b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    ConfusabledataBuilder::buildConfusableData(This, confusables, confusablesLen, errorType, pe, *status);
75b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    buildWSConfusableData(This, confusablesWholeScript, confusablesWholeScriptLen, pe, *status);
76b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
77b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    if (U_FAILURE(*status)) {
78b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        delete This;
79b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru        This = NULL;
80b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    }
81b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru    return (USpoofChecker *)This;
82b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#endif // UCONFIG_NO_REGULAR_EXPRESSIONS
83b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru}
84b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru
85b0ac937921a2c196d8b9da665135bf6ba01a1ccfJean-Baptiste Queru#endif
86