164339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// Copyright (C) 2016 and later: Unicode, Inc. and others. 264339d36f8bd4db5025fe2988eda22b491a9219cFredrik Roubert// License & terms of use: http://www.unicode.org/copyright.html 3ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 4ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// file: rbbirb.cpp 5ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius// Copyright (C) 2002-2011, International Business Machines Corporation and others. 7ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// All Rights Reserved. 8ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 9ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// This file contains the RBBIRuleBuilder class implementation. This is the main class for 10ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// building (compiling) break rules into the tables required by the runtime 11ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// RBBI engine. 12ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 13ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 14ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/utypes.h" 15ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 16ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#if !UCONFIG_NO_BREAK_ITERATION 17ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 18ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/brkiter.h" 19ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/rbbi.h" 20ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/ubrk.h" 21ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/unistr.h" 22ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uniset.h" 23ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uchar.h" 24ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/uchriter.h" 25ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/parsepos.h" 26ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "unicode/parseerr.h" 27ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cmemory.h" 28ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "cstring.h" 29ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 30ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "rbbirb.h" 31ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "rbbinode.h" 32ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 33ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "rbbiscan.h" 34ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "rbbisetb.h" 35ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "rbbitblb.h" 36ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#include "rbbidata.h" 37ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 38ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 39ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_BEGIN 40ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 41ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 42ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//---------------------------------------------------------------------------------------- 43ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 44ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Constructor. 45ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 46ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//---------------------------------------------------------------------------------------- 47ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruRBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString &rules, 4885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho UParseError *parseErr, 49ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode &status) 50ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru : fRules(rules) 51ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 52ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fStatus = &status; // status is checked below 5385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho fParseError = parseErr; 54ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fDebugEnv = NULL; 55ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifdef RBBI_DEBUG 56ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fDebugEnv = getenv("U_RBBIDEBUG"); 57ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 58ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 59ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 60ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fForwardTree = NULL; 61ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fReverseTree = NULL; 62ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSafeFwdTree = NULL; 63ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSafeRevTree = NULL; 64ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fDefaultTree = &fForwardTree; 65ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fForwardTables = NULL; 66ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fReverseTables = NULL; 67ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSafeFwdTables = NULL; 68ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSafeRevTables = NULL; 69ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fRuleStatusVals = NULL; 70ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fChainRules = FALSE; 71ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fLBCMNoChain = FALSE; 72ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fLookAheadHardBreak = FALSE; 73ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fUSetNodes = NULL; 74ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fRuleStatusVals = NULL; 75ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fScanner = NULL; 76ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSetBuilder = NULL; 7785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho if (parseErr) { 7885bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho uprv_memset(parseErr, 0, sizeof(UParseError)); 7985bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho } 80ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 81ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_FAILURE(status)) { 82ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 83ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 84ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 85ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fUSetNodes = new UVector(status); // bcos status gets overwritten here 86ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fRuleStatusVals = new UVector(status); 87ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fScanner = new RBBIRuleScanner(this); 88ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSetBuilder = new RBBISetBuilder(this); 89ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_FAILURE(status)) { 90ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return; 91ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 92ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if(fSetBuilder == 0 || fScanner == 0 || fUSetNodes == 0 || fRuleStatusVals == 0) { 93ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 94ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 95ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 96ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 97ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 98ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 99ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//---------------------------------------------------------------------------------------- 100ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 101ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// Destructor 102ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 103ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//---------------------------------------------------------------------------------------- 104ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruRBBIRuleBuilder::~RBBIRuleBuilder() { 105ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 106ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int i; 107ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (i=0; ; i++) { 108ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBINode *n = (RBBINode *)fUSetNodes->elementAt(i); 109ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (n==NULL) { 110ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru break; 111ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 112ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete n; 113ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 114ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 115ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete fUSetNodes; 116ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete fSetBuilder; 117ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete fForwardTables; 118ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete fReverseTables; 119ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete fSafeFwdTables; 120ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete fSafeRevTables; 121ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 122ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete fForwardTree; 123ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete fReverseTree; 124ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete fSafeFwdTree; 125ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete fSafeRevTree; 126ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete fScanner; 127ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete fRuleStatusVals; 128ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 129ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 130ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 131ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 132ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 133ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 134ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//---------------------------------------------------------------------------------------- 135ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 136ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// flattenData() - Collect up the compiled RBBI rule data and put it into 137ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// the format for saving in ICU data files, 138ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// which is also the format needed by the RBBI runtime engine. 139ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 140ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//---------------------------------------------------------------------------------------- 141ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Querustatic int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;} 142ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 143ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruRBBIDataHeader *RBBIRuleBuilder::flattenData() { 144ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t i; 145ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 146ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_FAILURE(*fStatus)) { 147ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 148ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 149ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 150ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Remove comments and whitespace from the rules to make it smaller. 151ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UnicodeString strippedRules((const UnicodeString&)RBBIRuleScanner::stripRules(fRules)); 152ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 153ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Calculate the size of each section in the data. 154ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Sizes here are padded up to a multiple of 8 for better memory alignment. 155ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Sections sizes actually stored in the header are for the actual data 156ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // without the padding. 157ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 158ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t headerSize = align8(sizeof(RBBIDataHeader)); 159ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t forwardTableSize = align8(fForwardTables->getTableSize()); 160ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t reverseTableSize = align8(fReverseTables->getTableSize()); 161ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t safeFwdTableSize = align8(fSafeFwdTables->getTableSize()); 162ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t safeRevTableSize = align8(fSafeRevTables->getTableSize()); 163ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t trieSize = align8(fSetBuilder->getTrieSize()); 164ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t statusTableSize = align8(fRuleStatusVals->size() * sizeof(int32_t)); 165ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t rulesSize = align8((strippedRules.length()+1) * sizeof(UChar)); 166ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 167ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t totalSize = headerSize + forwardTableSize + reverseTableSize 168ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru + safeFwdTableSize + safeRevTableSize 169ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru + statusTableSize + trieSize + rulesSize; 170ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 171ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBIDataHeader *data = (RBBIDataHeader *)uprv_malloc(totalSize); 172ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (data == NULL) { 173ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru *fStatus = U_MEMORY_ALLOCATION_ERROR; 174ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 175ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 176ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memset(data, 0, totalSize); 177ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 178ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 179ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fMagic = 0xb1a0; 180ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fFormatVersion[0] = 3; 181ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fFormatVersion[1] = 1; 182ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fFormatVersion[2] = 0; 183ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fFormatVersion[3] = 0; 184ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fLength = totalSize; 185ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fCatCount = fSetBuilder->getNumCharCategories(); 186ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 187ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fFTable = headerSize; 188ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fFTableLen = forwardTableSize; 189ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fRTable = data->fFTable + forwardTableSize; 190ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fRTableLen = reverseTableSize; 191ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fSFTable = data->fRTable + reverseTableSize; 192ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fSFTableLen = safeFwdTableSize; 193ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fSRTable = data->fSFTable + safeFwdTableSize; 194ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fSRTableLen = safeRevTableSize; 195ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 196ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fTrie = data->fSRTable + safeRevTableSize; 197ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fTrieLen = fSetBuilder->getTrieSize(); 198ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fStatusTable = data->fTrie + trieSize; 199ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fStatusTableLen= statusTableSize; 200ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fRuleSource = data->fStatusTable + statusTableSize; 201ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru data->fRuleSourceLen = strippedRules.length() * sizeof(UChar); 202ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 203ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru uprv_memset(data->fReserved, 0, sizeof(data->fReserved)); 204ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 205ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fForwardTables->exportTable((uint8_t *)data + data->fFTable); 206ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fReverseTables->exportTable((uint8_t *)data + data->fRTable); 207ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSafeFwdTables->exportTable((uint8_t *)data + data->fSFTable); 208ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSafeRevTables->exportTable((uint8_t *)data + data->fSRTable); 209ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru fSetBuilder->serializeTrie ((uint8_t *)data + data->fTrie); 210ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 211ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru int32_t *ruleStatusTable = (int32_t *)((uint8_t *)data + data->fStatusTable); 212ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru for (i=0; i<fRuleStatusVals->size(); i++) { 213ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru ruleStatusTable[i] = fRuleStatusVals->elementAti(i); 214ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 215ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 216ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru strippedRules.extract((UChar *)((uint8_t *)data+data->fRuleSource), rulesSize/2+1, *fStatus); 217ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 218ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return data; 219ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 220ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 221ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 222ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 223ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 224ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 225ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 226ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//---------------------------------------------------------------------------------------- 227ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 228ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// createRuleBasedBreakIterator construct from source rules that are passed in 229ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// in a UnicodeString 230ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru// 231ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru//---------------------------------------------------------------------------------------- 232ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruBreakIterator * 233ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruRBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString &rules, 23485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho UParseError *parseError, 235ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru UErrorCode &status) 236ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru{ 237ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // status checked below 238ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 239ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 240ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Read the input rules, generate a parse tree, symbol table, 241ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // and list of all Unicode Sets referenced by the rules. 242ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 243ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBIRuleBuilder builder(rules, parseError, status); 244ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_FAILURE(status)) { // status checked here bcos build below doesn't 245ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 246ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 24785bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho builder.fScanner->parse(); 248ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 249ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 250ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // UnicodeSet processing. 251ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Munge the Unicode Sets to create a set of character categories. 252ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Generate the mapping tables (TRIE) from input 32-bit characters to 253ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // the character categories. 254ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 255ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru builder.fSetBuilder->build(); 256ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 257ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 258ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 259ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Generate the DFA state transition table. 260ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 261ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru builder.fForwardTables = new RBBITableBuilder(&builder, &builder.fForwardTree); 262ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru builder.fReverseTables = new RBBITableBuilder(&builder, &builder.fReverseTree); 263ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru builder.fSafeFwdTables = new RBBITableBuilder(&builder, &builder.fSafeFwdTree); 264ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru builder.fSafeRevTables = new RBBITableBuilder(&builder, &builder.fSafeRevTree); 26583a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius if (builder.fForwardTables == NULL || builder.fReverseTables == NULL || 26683a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius builder.fSafeFwdTables == NULL || builder.fSafeRevTables == NULL) 267ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru { 268ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 26983a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius delete builder.fForwardTables; builder.fForwardTables = NULL; 27083a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius delete builder.fReverseTables; builder.fReverseTables = NULL; 27183a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius delete builder.fSafeFwdTables; builder.fSafeFwdTables = NULL; 27283a171d1a62abf406f7f44ae671823d5ec20db7dCraig Cornelius delete builder.fSafeRevTables; builder.fSafeRevTables = NULL; 273ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return NULL; 274ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 275ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 276ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru builder.fForwardTables->build(); 277ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru builder.fReverseTables->build(); 278ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru builder.fSafeFwdTables->build(); 279ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru builder.fSafeRevTables->build(); 280ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 281ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#ifdef RBBI_DEBUG 282ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (builder.fDebugEnv && uprv_strstr(builder.fDebugEnv, "states")) { 283ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru builder.fForwardTables->printRuleStatusTable(); 284ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 285ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif 286ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 287ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 288ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Package up the compiled data into a memory image 289ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // in the run-time format. 290ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 291ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RBBIDataHeader *data = builder.flattenData(); // returns NULL if error 29285bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho if (U_FAILURE(*builder.fStatus)) { 29385bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho return NULL; 29485bf2e2fbc60a9f938064abc8127d61da7d19882Claire Ho } 295ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 296ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 297ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 298ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Clean up the compiler related stuff 299ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 300ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 301ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 302ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 303ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // Create a break iterator from the compiled rules. 304ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // (Identical to creation from stored pre-compiled rules) 305ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // 306ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru // status is checked after init in construction. 307ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status); 308ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru if (U_FAILURE(status)) { 309ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru delete This; 310ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru This = NULL; 311ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 312ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru else if(This == NULL) { // test for NULL 313ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru status = U_MEMORY_ALLOCATION_ERROR; 314ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru } 315ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru return This; 316ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru} 317ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 318ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste QueruU_NAMESPACE_END 319ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru 320ac04d0bbe12b3ef54518635711412f178cb4d16Jean-Baptiste Queru#endif /* #if !UCONFIG_NO_BREAK_ITERATION */ 321