1f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)/*
2f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ***************************************************************************
3f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * Copyright (C) 2008-2009, International Business Machines Corporation
4f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) * and others. All Rights Reserved.
5f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) ***************************************************************************
6f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   file name:  uspoof_build.cpp
7f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   encoding:   US-ASCII
8f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   tab size:   8 (not used)
9f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   indentation:4
10f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
11f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   created on: 2008 Dec 8
12f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   created by: Andy Heninger
13f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
14f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   Unicode Spoof Detection Data Builder
15f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   Builder-related functions are kept in separate files so that applications not needing
16f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   the builder can more easily exclude them, typically by means of static linking.
17f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
18f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   There are three relatively independent sets of Spoof data,
19f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *      Confusables,
20f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *      Whole Script Confusables
21f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *      ID character extensions.
22f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *
23f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) *   The data tables for each are built separately, each from its own definitions
24f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles) */
25f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
26f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/utypes.h"
27f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/uspoof.h"
28f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/unorm.h"
29f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/uregex.h"
30f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "unicode/ustring.h"
31f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "cmemory.h"
32f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "uspoof_impl.h"
33f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "uhash.h"
34f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "uvector.h"
35f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "uassert.h"
36f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "uarrsort.h"
37f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "uspoof_conf.h"
38f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#include "uspoof_wsconf.h"
39f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
40f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#if !UCONFIG_NO_NORMALIZATION
41f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
42f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_NAMESPACE_USE
43f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
44f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
45f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)// The main data building function
46f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
47f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)U_CAPI USpoofChecker * U_EXPORT2
48f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)uspoof_openFromSource(const char *confusables,  int32_t confusablesLen,
49f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                      const char *confusablesWholeScript, int32_t confusablesWholeScriptLen,
50f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)                      int32_t *errorType, UParseError *pe, UErrorCode *status) {
51f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
52f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if (U_FAILURE(*status)) {
53f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        return NULL;
54f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
55f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#if UCONFIG_NO_REGULAR_EXPRESSIONS
56f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    *status = U_UNSUPPORTED_ERROR;
57f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    return NULL;
58f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#else
59f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if (errorType!=NULL) {
60f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        *errorType = 0;
61f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
62f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if (pe != NULL) {
63f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        pe->line = 0;
64f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        pe->offset = 0;
65f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        pe->preContext[0] = 0;
66f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        pe->postContext[0] = 0;
67f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
68f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
69f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // Set up a shell of a spoof detector, with empty data.
70f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    SpoofData *newSpoofData = new SpoofData(*status);
71f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    SpoofImpl *This = new SpoofImpl(newSpoofData, *status);
72f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
73f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    // Compile the binary data from the source (text) format.
74f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    ConfusabledataBuilder::buildConfusableData(This, confusables, confusablesLen, errorType, pe, *status);
75f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    buildWSConfusableData(This, confusablesWholeScript, confusablesWholeScriptLen, pe, *status);
76f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
77f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    if (U_FAILURE(*status)) {
78f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        delete This;
79f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)        This = NULL;
80f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    }
81f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)    return (USpoofChecker *)This;
82f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#endif // UCONFIG_NO_REGULAR_EXPRESSIONS
83f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)}
84f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)
85f4ed1cf5d184064c4cf0e4359c6d5d8aadb50afaTorne (Richard Coles)#endif
86