1/**
2 *******************************************************************************
3 * Copyright (C) 2006-2008, International Business Machines Corporation        *
4 * and others. All Rights Reserved.                                            *
5 *******************************************************************************
6 */
7
8#include "unicode/utypes.h"
9
10#if !UCONFIG_NO_BREAK_ITERATION
11
12#include "triedict.h"
13#include "unicode/chariter.h"
14#include "unicode/uchriter.h"
15#include "unicode/strenum.h"
16#include "unicode/uenum.h"
17#include "unicode/udata.h"
18#include "cmemory.h"
19#include "udataswp.h"
20#include "uvector.h"
21#include "uvectr32.h"
22#include "uarrsort.h"
23
24//#define DEBUG_TRIE_DICT 1
25
26#ifdef DEBUG_TRIE_DICT
27#include <sys/times.h>
28#include <limits.h>
29#include <stdio.h>
30#endif
31
32U_NAMESPACE_BEGIN
33
34/*******************************************************************
35 * TrieWordDictionary
36 */
37
38TrieWordDictionary::TrieWordDictionary() {
39}
40
41TrieWordDictionary::~TrieWordDictionary() {
42}
43
44/*******************************************************************
45 * MutableTrieDictionary
46 */
47
48// Node structure for the ternary, uncompressed trie
49struct TernaryNode : public UMemory {
50    UChar       ch;         // UTF-16 code unit
51    uint16_t    flags;      // Flag word
52    TernaryNode *low;       // Less-than link
53    TernaryNode *equal;     // Equal link
54    TernaryNode *high;      // Greater-than link
55
56    TernaryNode(UChar uc);
57    ~TernaryNode();
58};
59
60enum MutableTrieNodeFlags {
61    kEndsWord = 0x0001      // This node marks the end of a valid word
62};
63
64inline
65TernaryNode::TernaryNode(UChar uc) {
66    ch = uc;
67    flags = 0;
68    low = NULL;
69    equal = NULL;
70    high = NULL;
71}
72
73// Not inline since it's recursive
74TernaryNode::~TernaryNode() {
75    delete low;
76    delete equal;
77    delete high;
78}
79
80MutableTrieDictionary::MutableTrieDictionary( UChar median, UErrorCode &status ) {
81    // Start the trie off with something. Having the root node already present
82    // cuts a special case out of the search/insertion functions.
83    // Making it a median character cuts the worse case for searches from
84    // 4x a balanced trie to 2x a balanced trie. It's best to choose something
85    // that starts a word that is midway in the list.
86    fTrie = new TernaryNode(median);
87    if (fTrie == NULL) {
88        status = U_MEMORY_ALLOCATION_ERROR;
89    }
90    fIter = utext_openUChars(NULL, NULL, 0, &status);
91    if (U_SUCCESS(status) && fIter == NULL) {
92        status = U_MEMORY_ALLOCATION_ERROR;
93    }
94}
95
96MutableTrieDictionary::MutableTrieDictionary( UErrorCode &status ) {
97    fTrie = NULL;
98    fIter = utext_openUChars(NULL, NULL, 0, &status);
99    if (U_SUCCESS(status) && fIter == NULL) {
100        status = U_MEMORY_ALLOCATION_ERROR;
101    }
102}
103
104MutableTrieDictionary::~MutableTrieDictionary() {
105    delete fTrie;
106    utext_close(fIter);
107}
108
109int32_t
110MutableTrieDictionary::search( UText *text,
111                                   int32_t maxLength,
112                                   int32_t *lengths,
113                                   int &count,
114                                   int limit,
115                                   TernaryNode *&parent,
116                                   UBool &pMatched ) const {
117    // TODO: current implementation works in UTF-16 space
118    const TernaryNode *up = NULL;
119    const TernaryNode *p = fTrie;
120    int mycount = 0;
121    pMatched = TRUE;
122    int i;
123
124    UChar uc = utext_current32(text);
125    for (i = 0; i < maxLength && p != NULL; ++i) {
126        while (p != NULL) {
127            if (uc < p->ch) {
128                up = p;
129                p = p->low;
130            }
131            else if (uc == p->ch) {
132                break;
133            }
134            else {
135                up = p;
136                p = p->high;
137            }
138        }
139        if (p == NULL) {
140            pMatched = FALSE;
141            break;
142        }
143        // Must be equal to get here
144        if (limit > 0 && (p->flags & kEndsWord)) {
145            lengths[mycount++] = i+1;
146            --limit;
147        }
148        up = p;
149        p = p->equal;
150        uc = utext_next32(text);
151        uc = utext_current32(text);
152    }
153
154    // Note that there is no way to reach here with up == 0 unless
155    // maxLength is 0 coming in.
156    parent = (TernaryNode *)up;
157    count = mycount;
158    return i;
159}
160
161void
162MutableTrieDictionary::addWord( const UChar *word,
163                                int32_t length,
164                                UErrorCode &status ) {
165#if 0
166    if (length <= 0) {
167        status = U_ILLEGAL_ARGUMENT_ERROR;
168        return;
169    }
170#endif
171    TernaryNode *parent;
172    UBool pMatched;
173    int count;
174    fIter = utext_openUChars(fIter, word, length, &status);
175
176    int matched;
177    matched = search(fIter, length, NULL, count, 0, parent, pMatched);
178
179    while (matched++ < length) {
180        UChar32 uc = utext_next32(fIter);  // TODO:  supplemetary support?
181        U_ASSERT(uc != U_SENTINEL);
182        TernaryNode *newNode = new TernaryNode(uc);
183        if (newNode == NULL) {
184            status = U_MEMORY_ALLOCATION_ERROR;
185            return;
186        }
187        if (pMatched) {
188            parent->equal = newNode;
189        }
190        else {
191            pMatched = TRUE;
192            if (uc < parent->ch) {
193                parent->low = newNode;
194            }
195            else {
196                parent->high = newNode;
197            }
198        }
199        parent = newNode;
200    }
201
202    parent->flags |= kEndsWord;
203}
204
205#if 0
206void
207MutableTrieDictionary::addWords( UEnumeration *words,
208                                  UErrorCode &status ) {
209    int32_t length;
210    const UChar *word;
211    while ((word = uenum_unext(words, &length, &status)) && U_SUCCESS(status)) {
212        addWord(word, length, status);
213    }
214}
215#endif
216
217int32_t
218MutableTrieDictionary::matches( UText *text,
219                                int32_t maxLength,
220                                int32_t *lengths,
221                                int &count,
222                                int limit ) const {
223    TernaryNode *parent;
224    UBool pMatched;
225    return search(text, maxLength, lengths, count, limit, parent, pMatched);
226}
227
228// Implementation of iteration for MutableTrieDictionary
229class MutableTrieEnumeration : public StringEnumeration {
230private:
231    UStack      fNodeStack;     // Stack of nodes to process
232    UVector32   fBranchStack;   // Stack of which branch we are working on
233    TernaryNode *fRoot;         // Root node
234    enum StackBranch {
235        kLessThan,
236        kEqual,
237        kGreaterThan,
238        kDone
239    };
240
241public:
242    static UClassID U_EXPORT2 getStaticClassID(void);
243    virtual UClassID getDynamicClassID(void) const;
244public:
245    MutableTrieEnumeration(TernaryNode *root, UErrorCode &status)
246        : fNodeStack(status), fBranchStack(status) {
247        fRoot = root;
248        fNodeStack.push(root, status);
249        fBranchStack.push(kLessThan, status);
250        unistr.remove();
251    }
252
253    virtual ~MutableTrieEnumeration() {
254    }
255
256    virtual StringEnumeration *clone() const {
257        UErrorCode status = U_ZERO_ERROR;
258        return new MutableTrieEnumeration(fRoot, status);
259    }
260
261    virtual const UnicodeString *snext(UErrorCode &status) {
262        if (fNodeStack.empty() || U_FAILURE(status)) {
263            return NULL;
264        }
265        TernaryNode *node = (TernaryNode *) fNodeStack.peek();
266        StackBranch where = (StackBranch) fBranchStack.peeki();
267        while (!fNodeStack.empty() && U_SUCCESS(status)) {
268            UBool emit;
269            UBool equal;
270
271            switch (where) {
272            case kLessThan:
273                if (node->low != NULL) {
274                    fBranchStack.setElementAt(kEqual, fBranchStack.size()-1);
275                    node = (TernaryNode *) fNodeStack.push(node->low, status);
276                    where = (StackBranch) fBranchStack.push(kLessThan, status);
277                    break;
278                }
279            case kEqual:
280                emit = (node->flags & kEndsWord) != 0;
281                equal = (node->equal != NULL);
282                // If this node should be part of the next emitted string, append
283                // the UChar to the string, and make sure we pop it when we come
284                // back to this node. The character should only be in the string
285                // for as long as we're traversing the equal subtree of this node
286                if (equal || emit) {
287                    unistr.append(node->ch);
288                    fBranchStack.setElementAt(kGreaterThan, fBranchStack.size()-1);
289                }
290                if (equal) {
291                    node = (TernaryNode *) fNodeStack.push(node->equal, status);
292                    where = (StackBranch) fBranchStack.push(kLessThan, status);
293                }
294                if (emit) {
295                    return &unistr;
296                }
297                if (equal) {
298                    break;
299                }
300            case kGreaterThan:
301                // If this node's character is in the string, remove it.
302                if (node->equal != NULL || (node->flags & kEndsWord)) {
303                    unistr.truncate(unistr.length()-1);
304                }
305                if (node->high != NULL) {
306                    fBranchStack.setElementAt(kDone, fBranchStack.size()-1);
307                    node = (TernaryNode *) fNodeStack.push(node->high, status);
308                    where = (StackBranch) fBranchStack.push(kLessThan, status);
309                    break;
310                }
311            case kDone:
312                fNodeStack.pop();
313                fBranchStack.popi();
314                node = (TernaryNode *) fNodeStack.peek();
315                where = (StackBranch) fBranchStack.peeki();
316                break;
317            default:
318                return NULL;
319            }
320        }
321        return NULL;
322    }
323
324    // Very expensive, but this should never be used.
325    virtual int32_t count(UErrorCode &status) const {
326        MutableTrieEnumeration counter(fRoot, status);
327        int32_t result = 0;
328        while (counter.snext(status) != NULL && U_SUCCESS(status)) {
329            ++result;
330        }
331        return result;
332    }
333
334    virtual void reset(UErrorCode &status) {
335        fNodeStack.removeAllElements();
336        fBranchStack.removeAllElements();
337        fNodeStack.push(fRoot, status);
338        fBranchStack.push(kLessThan, status);
339        unistr.remove();
340    }
341};
342
343UOBJECT_DEFINE_RTTI_IMPLEMENTATION(MutableTrieEnumeration)
344
345StringEnumeration *
346MutableTrieDictionary::openWords( UErrorCode &status ) const {
347    if (U_FAILURE(status)) {
348        return NULL;
349    }
350    return new MutableTrieEnumeration(fTrie, status);
351}
352
353/*******************************************************************
354 * CompactTrieDictionary
355 */
356
357struct CompactTrieHeader {
358    uint32_t        size;           // Size of the data in bytes
359    uint32_t        magic;          // Magic number (including version)
360    uint16_t        nodeCount;      // Number of entries in offsets[]
361    uint16_t        root;           // Node number of the root node
362    uint32_t        offsets[1];      // Offsets to nodes from start of data
363};
364
365// Note that to avoid platform-specific alignment issues, all members of the node
366// structures should be the same size, or should contain explicit padding to
367// natural alignment boundaries.
368
369// We can't use a bitfield for the flags+count field, because the layout of those
370// is not portable. 12 bits of count allows for up to 4096 entries in a node.
371struct CompactTrieNode {
372    uint16_t        flagscount;     // Count of sub-entries, plus flags
373};
374
375enum CompactTrieNodeFlags {
376    kVerticalNode   = 0x1000,       // This is a vertical node
377    kParentEndsWord = 0x2000,       // The node whose equal link points to this ends a word
378    kReservedFlag1  = 0x4000,
379    kReservedFlag2  = 0x8000,
380    kCountMask      = 0x0FFF,       // The count portion of flagscount
381    kFlagMask       = 0xF000        // The flags portion of flagscount
382};
383
384// The two node types are distinguished by the kVerticalNode flag.
385
386struct CompactTrieHorizontalEntry {
387    uint16_t        ch;             // UChar
388    uint16_t        equal;          // Equal link node index
389};
390
391// We don't use inheritance here because C++ does not guarantee that the
392// base class comes first in memory!!
393
394struct CompactTrieHorizontalNode {
395    uint16_t        flagscount;     // Count of sub-entries, plus flags
396    CompactTrieHorizontalEntry      entries[1];
397};
398
399struct CompactTrieVerticalNode {
400    uint16_t        flagscount;     // Count of sub-entries, plus flags
401    uint16_t        equal;          // Equal link node index
402    uint16_t        chars[1];       // Code units
403};
404
405// {'Dic', 1}, version 1
406#define COMPACT_TRIE_MAGIC_1 0x44696301
407
408CompactTrieDictionary::CompactTrieDictionary(UDataMemory *dataObj,
409                                                UErrorCode &status )
410: fUData(dataObj)
411{
412    fData = (const CompactTrieHeader *) udata_getMemory(dataObj);
413    fOwnData = FALSE;
414    if (fData->magic != COMPACT_TRIE_MAGIC_1) {
415        status = U_ILLEGAL_ARGUMENT_ERROR;
416        fData = NULL;
417    }
418}
419CompactTrieDictionary::CompactTrieDictionary( const void *data,
420                                                UErrorCode &status )
421: fUData(NULL)
422{
423    fData = (const CompactTrieHeader *) data;
424    fOwnData = FALSE;
425    if (fData->magic != COMPACT_TRIE_MAGIC_1) {
426        status = U_ILLEGAL_ARGUMENT_ERROR;
427        fData = NULL;
428    }
429}
430
431CompactTrieDictionary::CompactTrieDictionary( const MutableTrieDictionary &dict,
432                                                UErrorCode &status )
433: fUData(NULL)
434{
435    fData = compactMutableTrieDictionary(dict, status);
436    fOwnData = !U_FAILURE(status);
437}
438
439CompactTrieDictionary::~CompactTrieDictionary() {
440    if (fOwnData) {
441        uprv_free((void *)fData);
442    }
443    if (fUData) {
444        udata_close(fUData);
445    }
446}
447
448uint32_t
449CompactTrieDictionary::dataSize() const {
450    return fData->size;
451}
452
453const void *
454CompactTrieDictionary::data() const {
455    return fData;
456}
457
458// This function finds the address of a node for us, given its node ID
459static inline const CompactTrieNode *
460getCompactNode(const CompactTrieHeader *header, uint16_t node) {
461    return (const CompactTrieNode *)((const uint8_t *)header + header->offsets[node]);
462}
463
464int32_t
465CompactTrieDictionary::matches( UText *text,
466                                int32_t maxLength,
467                                int32_t *lengths,
468                                int &count,
469                                int limit ) const {
470    // TODO: current implementation works in UTF-16 space
471    const CompactTrieNode *node = getCompactNode(fData, fData->root);
472    int mycount = 0;
473
474    UChar uc = utext_current32(text);
475    int i = 0;
476
477    while (node != NULL) {
478        // Check if the node we just exited ends a word
479        if (limit > 0 && (node->flagscount & kParentEndsWord)) {
480            lengths[mycount++] = i;
481            --limit;
482        }
483        // Check that we haven't exceeded the maximum number of input characters.
484        // We have to do that here rather than in the while condition so that
485        // we can check for ending a word, above.
486        if (i >= maxLength) {
487            break;
488        }
489
490        int nodeCount = (node->flagscount & kCountMask);
491        if (nodeCount == 0) {
492            // Special terminal node; return now
493            break;
494        }
495        if (node->flagscount & kVerticalNode) {
496            // Vertical node; check all the characters in it
497            const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
498            for (int j = 0; j < nodeCount && i < maxLength; ++j) {
499                if (uc != vnode->chars[j]) {
500                    // We hit a non-equal character; return
501                    goto exit;
502                }
503                utext_next32(text);
504                uc = utext_current32(text);
505                ++i;
506            }
507            // To get here we must have come through the whole list successfully;
508            // go on to the next node. Note that a word cannot end in the middle
509            // of a vertical node.
510            node = getCompactNode(fData, vnode->equal);
511        }
512        else {
513            // Horizontal node; do binary search
514            const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
515            int low = 0;
516            int high = nodeCount-1;
517            int middle;
518            node = NULL;    // If we don't find a match, we'll fall out of the loop
519            while (high >= low) {
520                middle = (high+low)/2;
521                if (uc == hnode->entries[middle].ch) {
522                    // We hit a match; get the next node and next character
523                    node = getCompactNode(fData, hnode->entries[middle].equal);
524                    utext_next32(text);
525                    uc = utext_current32(text);
526                    ++i;
527                    break;
528                }
529                else if (uc < hnode->entries[middle].ch) {
530                    high = middle-1;
531                }
532                else {
533                    low = middle+1;
534                }
535            }
536        }
537    }
538exit:
539    count = mycount;
540    return i;
541}
542
543// Implementation of iteration for CompactTrieDictionary
544class CompactTrieEnumeration : public StringEnumeration {
545private:
546    UVector32               fNodeStack;     // Stack of nodes to process
547    UVector32               fIndexStack;    // Stack of where in node we are
548    const CompactTrieHeader *fHeader;       // Trie data
549
550public:
551    static UClassID U_EXPORT2 getStaticClassID(void);
552    virtual UClassID getDynamicClassID(void) const;
553public:
554    CompactTrieEnumeration(const CompactTrieHeader *header, UErrorCode &status)
555        : fNodeStack(status), fIndexStack(status) {
556        fHeader = header;
557        fNodeStack.push(header->root, status);
558        fIndexStack.push(0, status);
559        unistr.remove();
560    }
561
562    virtual ~CompactTrieEnumeration() {
563    }
564
565    virtual StringEnumeration *clone() const {
566        UErrorCode status = U_ZERO_ERROR;
567        return new CompactTrieEnumeration(fHeader, status);
568    }
569
570    virtual const UnicodeString * snext(UErrorCode &status);
571
572    // Very expensive, but this should never be used.
573    virtual int32_t count(UErrorCode &status) const {
574        CompactTrieEnumeration counter(fHeader, status);
575        int32_t result = 0;
576        while (counter.snext(status) != NULL && U_SUCCESS(status)) {
577            ++result;
578        }
579        return result;
580    }
581
582    virtual void reset(UErrorCode &status) {
583        fNodeStack.removeAllElements();
584        fIndexStack.removeAllElements();
585        fNodeStack.push(fHeader->root, status);
586        fIndexStack.push(0, status);
587        unistr.remove();
588    }
589};
590
591UOBJECT_DEFINE_RTTI_IMPLEMENTATION(CompactTrieEnumeration)
592
593const UnicodeString *
594CompactTrieEnumeration::snext(UErrorCode &status) {
595    if (fNodeStack.empty() || U_FAILURE(status)) {
596        return NULL;
597    }
598    const CompactTrieNode *node = getCompactNode(fHeader, fNodeStack.peeki());
599    int where = fIndexStack.peeki();
600    while (!fNodeStack.empty() && U_SUCCESS(status)) {
601        int nodeCount = (node->flagscount & kCountMask);
602        UBool goingDown = FALSE;
603        if (nodeCount == 0) {
604            // Terminal node; go up immediately
605            fNodeStack.popi();
606            fIndexStack.popi();
607            node = getCompactNode(fHeader, fNodeStack.peeki());
608            where = fIndexStack.peeki();
609        }
610        else if (node->flagscount & kVerticalNode) {
611            // Vertical node
612            const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
613            if (where == 0) {
614                // Going down
615                unistr.append((const UChar *)vnode->chars, (int32_t) nodeCount);
616                fIndexStack.setElementAt(1, fIndexStack.size()-1);
617                node = getCompactNode(fHeader, fNodeStack.push(vnode->equal, status));
618                where = fIndexStack.push(0, status);
619                goingDown = TRUE;
620            }
621            else {
622                // Going up
623                unistr.truncate(unistr.length()-nodeCount);
624                fNodeStack.popi();
625                fIndexStack.popi();
626                node = getCompactNode(fHeader, fNodeStack.peeki());
627                where = fIndexStack.peeki();
628            }
629        }
630        else {
631            // Horizontal node
632            const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
633            if (where > 0) {
634                // Pop previous char
635                unistr.truncate(unistr.length()-1);
636            }
637            if (where < nodeCount) {
638                // Push on next node
639                unistr.append((UChar)hnode->entries[where].ch);
640                fIndexStack.setElementAt(where+1, fIndexStack.size()-1);
641                node = getCompactNode(fHeader, fNodeStack.push(hnode->entries[where].equal, status));
642                where = fIndexStack.push(0, status);
643                goingDown = TRUE;
644            }
645            else {
646                // Going up
647                fNodeStack.popi();
648                fIndexStack.popi();
649                node = getCompactNode(fHeader, fNodeStack.peeki());
650                where = fIndexStack.peeki();
651            }
652        }
653        // Check if the parent of the node we've just gone down to ends a
654        // word. If so, return it.
655        if (goingDown && (node->flagscount & kParentEndsWord)) {
656            return &unistr;
657        }
658    }
659    return NULL;
660}
661
662StringEnumeration *
663CompactTrieDictionary::openWords( UErrorCode &status ) const {
664    if (U_FAILURE(status)) {
665        return NULL;
666    }
667    return new CompactTrieEnumeration(fData, status);
668}
669
670//
671// Below here is all code related to converting a ternary trie to a compact trie
672// and back again
673//
674
675// Helper classes to construct the compact trie
676class BuildCompactTrieNode: public UMemory {
677 public:
678    UBool           fParentEndsWord;
679    UBool           fVertical;
680    UBool           fHasDuplicate;
681    int32_t         fNodeID;
682    UnicodeString   fChars;
683
684 public:
685    BuildCompactTrieNode(UBool parentEndsWord, UBool vertical, UStack &nodes, UErrorCode &status) {
686        fParentEndsWord = parentEndsWord;
687        fHasDuplicate = FALSE;
688        fVertical = vertical;
689        fNodeID = nodes.size();
690        nodes.push(this, status);
691    }
692
693    virtual ~BuildCompactTrieNode() {
694    }
695
696    virtual uint32_t size() {
697        return sizeof(uint16_t);
698    }
699
700    virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &/*translate*/) {
701        // Write flag/count
702        *((uint16_t *)(bytes+offset)) = (fChars.length() & kCountMask)
703            | (fVertical ? kVerticalNode : 0) | (fParentEndsWord ? kParentEndsWord : 0 );
704        offset += sizeof(uint16_t);
705    }
706};
707
708class BuildCompactTrieHorizontalNode: public BuildCompactTrieNode {
709 public:
710    UStack          fLinks;
711
712 public:
713    BuildCompactTrieHorizontalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status)
714        : BuildCompactTrieNode(parentEndsWord, FALSE, nodes, status), fLinks(status) {
715    }
716
717    virtual ~BuildCompactTrieHorizontalNode() {
718    }
719
720    virtual uint32_t size() {
721        return offsetof(CompactTrieHorizontalNode,entries) +
722                (fChars.length()*sizeof(CompactTrieHorizontalEntry));
723    }
724
725    virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
726        BuildCompactTrieNode::write(bytes, offset, translate);
727        int32_t count = fChars.length();
728        for (int32_t i = 0; i < count; ++i) {
729            CompactTrieHorizontalEntry *entry = (CompactTrieHorizontalEntry *)(bytes+offset);
730            entry->ch = fChars[i];
731            entry->equal = translate.elementAti(((BuildCompactTrieNode *)fLinks[i])->fNodeID);
732#ifdef DEBUG_TRIE_DICT
733            if (entry->equal == 0) {
734                fprintf(stderr, "ERROR: horizontal link %d, logical node %d maps to physical node zero\n",
735                        i, ((BuildCompactTrieNode *)fLinks[i])->fNodeID);
736            }
737#endif
738            offset += sizeof(CompactTrieHorizontalEntry);
739        }
740    }
741
742    void addNode(UChar ch, BuildCompactTrieNode *link, UErrorCode &status) {
743        fChars.append(ch);
744        fLinks.push(link, status);
745    }
746};
747
748class BuildCompactTrieVerticalNode: public BuildCompactTrieNode {
749 public:
750    BuildCompactTrieNode    *fEqual;
751
752 public:
753    BuildCompactTrieVerticalNode(UBool parentEndsWord, UStack &nodes, UErrorCode &status)
754        : BuildCompactTrieNode(parentEndsWord, TRUE, nodes, status) {
755        fEqual = NULL;
756    }
757
758    virtual ~BuildCompactTrieVerticalNode() {
759    }
760
761    virtual uint32_t size() {
762        return offsetof(CompactTrieVerticalNode,chars) + (fChars.length()*sizeof(uint16_t));
763    }
764
765    virtual void write(uint8_t *bytes, uint32_t &offset, const UVector32 &translate) {
766        CompactTrieVerticalNode *node = (CompactTrieVerticalNode *)(bytes+offset);
767        BuildCompactTrieNode::write(bytes, offset, translate);
768        node->equal = translate.elementAti(fEqual->fNodeID);
769        offset += sizeof(node->equal);
770#ifdef DEBUG_TRIE_DICT
771        if (node->equal == 0) {
772            fprintf(stderr, "ERROR: vertical link, logical node %d maps to physical node zero\n",
773                    fEqual->fNodeID);
774        }
775#endif
776        fChars.extract(0, fChars.length(), (UChar *)node->chars);
777        offset += sizeof(uint16_t)*fChars.length();
778    }
779
780    void addChar(UChar ch) {
781        fChars.append(ch);
782    }
783
784    void setLink(BuildCompactTrieNode *node) {
785        fEqual = node;
786    }
787};
788
789// Forward declaration
790static void walkHorizontal(const TernaryNode *node,
791                            BuildCompactTrieHorizontalNode *building,
792                            UStack &nodes,
793                            UErrorCode &status);
794
795// Convert one node. Uses recursion.
796
797static BuildCompactTrieNode *
798compactOneNode(const TernaryNode *node, UBool parentEndsWord, UStack &nodes, UErrorCode &status) {
799    if (U_FAILURE(status)) {
800        return NULL;
801    }
802    BuildCompactTrieNode *result = NULL;
803    UBool horizontal = (node->low != NULL || node->high != NULL);
804    if (horizontal) {
805        BuildCompactTrieHorizontalNode *hResult =
806                new BuildCompactTrieHorizontalNode(parentEndsWord, nodes, status);
807        if (hResult == NULL) {
808            status = U_MEMORY_ALLOCATION_ERROR;
809            return NULL;
810        }
811        if (U_SUCCESS(status)) {
812            walkHorizontal(node, hResult, nodes, status);
813            result = hResult;
814        }
815    }
816    else {
817        BuildCompactTrieVerticalNode *vResult =
818                new BuildCompactTrieVerticalNode(parentEndsWord, nodes, status);
819        if (vResult == NULL) {
820            status = U_MEMORY_ALLOCATION_ERROR;
821        }
822        else if (U_SUCCESS(status)) {
823            UBool   endsWord = FALSE;
824            // Take up nodes until we end a word, or hit a node with < or > links
825            do {
826                vResult->addChar(node->ch);
827                endsWord = (node->flags & kEndsWord) != 0;
828                node = node->equal;
829            }
830            while(node != NULL && !endsWord && node->low == NULL && node->high == NULL);
831            if (node == NULL) {
832                if (!endsWord) {
833                    status = U_ILLEGAL_ARGUMENT_ERROR;  // Corrupt input trie
834                }
835                else {
836                    vResult->setLink((BuildCompactTrieNode *)nodes[1]);
837                }
838            }
839            else {
840                vResult->setLink(compactOneNode(node, endsWord, nodes, status));
841            }
842            result = vResult;
843        }
844    }
845    return result;
846}
847
848// Walk the set of peers at the same level, to build a horizontal node.
849// Uses recursion.
850
851static void walkHorizontal(const TernaryNode *node,
852                            BuildCompactTrieHorizontalNode *building,
853                            UStack &nodes,
854                            UErrorCode &status) {
855    while (U_SUCCESS(status) && node != NULL) {
856        if (node->low != NULL) {
857            walkHorizontal(node->low, building, nodes, status);
858        }
859        BuildCompactTrieNode *link = NULL;
860        if (node->equal != NULL) {
861            link = compactOneNode(node->equal, (node->flags & kEndsWord) != 0, nodes, status);
862        }
863        else if (node->flags & kEndsWord) {
864            link = (BuildCompactTrieNode *)nodes[1];
865        }
866        if (U_SUCCESS(status) && link != NULL) {
867            building->addNode(node->ch, link, status);
868        }
869        // Tail recurse manually instead of leaving it to the compiler.
870        //if (node->high != NULL) {
871        //    walkHorizontal(node->high, building, nodes, status);
872        //}
873        node = node->high;
874    }
875}
876
877U_NAMESPACE_END
878U_NAMESPACE_USE
879U_CDECL_BEGIN
880static int32_t U_CALLCONV
881_sortBuildNodes(const void * /*context*/, const void *voidl, const void *voidr) {
882    BuildCompactTrieNode *left = *(BuildCompactTrieNode **)voidl;
883    BuildCompactTrieNode *right = *(BuildCompactTrieNode **)voidr;
884    // Check for comparing a node to itself, to avoid spurious duplicates
885    if (left == right) {
886        return 0;
887    }
888    // Most significant is type of node. Can never coalesce.
889    if (left->fVertical != right->fVertical) {
890        return left->fVertical - right->fVertical;
891    }
892    // Next, the "parent ends word" flag. If that differs, we cannot coalesce.
893    if (left->fParentEndsWord != right->fParentEndsWord) {
894        return left->fParentEndsWord - right->fParentEndsWord;
895    }
896    // Next, the string. If that differs, we can never coalesce.
897    int32_t result = left->fChars.compare(right->fChars);
898    if (result != 0) {
899        return result;
900    }
901    // We know they're both the same node type, so branch for the two cases.
902    if (left->fVertical) {
903        result = ((BuildCompactTrieVerticalNode *)left)->fEqual->fNodeID
904                            - ((BuildCompactTrieVerticalNode *)right)->fEqual->fNodeID;
905    }
906    else {
907        // We need to compare the links vectors. They should be the
908        // same size because the strings were equal.
909        // We compare the node IDs instead of the pointers, to handle
910        // coalesced nodes.
911        BuildCompactTrieHorizontalNode *hleft, *hright;
912        hleft = (BuildCompactTrieHorizontalNode *)left;
913        hright = (BuildCompactTrieHorizontalNode *)right;
914        int32_t count = hleft->fLinks.size();
915        for (int32_t i = 0; i < count && result == 0; ++i) {
916            result = ((BuildCompactTrieNode *)(hleft->fLinks[i]))->fNodeID -
917                     ((BuildCompactTrieNode *)(hright->fLinks[i]))->fNodeID;
918        }
919    }
920    // If they are equal to each other, mark them (speeds coalescing)
921    if (result == 0) {
922        left->fHasDuplicate = TRUE;
923        right->fHasDuplicate = TRUE;
924    }
925    return result;
926}
927U_CDECL_END
928U_NAMESPACE_BEGIN
929
930static void coalesceDuplicates(UStack &nodes, UErrorCode &status) {
931    // We sort the array of nodes to place duplicates next to each other
932    if (U_FAILURE(status)) {
933        return;
934    }
935    int32_t size = nodes.size();
936    void **array = (void **)uprv_malloc(sizeof(void *)*size);
937    if (array == NULL) {
938        status = U_MEMORY_ALLOCATION_ERROR;
939        return;
940    }
941    (void) nodes.toArray(array);
942
943    // Now repeatedly identify duplicates until there are no more
944    int32_t dupes = 0;
945    long    passCount = 0;
946#ifdef DEBUG_TRIE_DICT
947    long    totalDupes = 0;
948#endif
949    do {
950        BuildCompactTrieNode *node;
951        BuildCompactTrieNode *first = NULL;
952        BuildCompactTrieNode **p;
953        BuildCompactTrieNode **pFirst = NULL;
954        int32_t counter = size - 2;
955        // Sort the array, skipping nodes 0 and 1. Use quicksort for the first
956        // pass for speed. For the second and subsequent passes, we use stable
957        // (insertion) sort for two reasons:
958        // 1. The array is already mostly ordered, so we get better performance.
959        // 2. The way we find one and only one instance of a set of duplicates is to
960        //    check that the node ID equals the array index. If we used an unstable
961        //    sort for the second or later passes, it's possible that none of the
962        //    duplicates would wind up with a node ID equal to its array index.
963        //    The sort stability guarantees that, because as we coalesce more and
964        //    more groups, the first element of the resultant group will be one of
965        //    the first elements of the groups being coalesced.
966        // To use quicksort for the second and subsequent passes, we would have to
967        // find the minimum of the node numbers in a group, and set all the nodes
968        // in the group to that node number.
969        uprv_sortArray(array+2, counter, sizeof(void *), _sortBuildNodes, NULL, (passCount > 0), &status);
970        dupes = 0;
971        for (p = (BuildCompactTrieNode **)array + 2; counter > 0; --counter, ++p) {
972            node = *p;
973            if (node->fHasDuplicate) {
974                if (first == NULL) {
975                    first = node;
976                    pFirst = p;
977                }
978                else if (_sortBuildNodes(NULL, pFirst, p) != 0) {
979                    // Starting a new run of dupes
980                    first = node;
981                    pFirst = p;
982                }
983                else if (node->fNodeID != first->fNodeID) {
984                    // Slave one to the other, note duplicate
985                    node->fNodeID = first->fNodeID;
986                    dupes += 1;
987                }
988            }
989            else {
990                // This node has no dupes
991                first = NULL;
992                pFirst = NULL;
993            }
994        }
995        passCount += 1;
996#ifdef DEBUG_TRIE_DICT
997        totalDupes += dupes;
998        fprintf(stderr, "Trie node dupe removal, pass %d: %d nodes tagged\n", passCount, dupes);
999#endif
1000    }
1001    while (dupes > 0);
1002#ifdef DEBUG_TRIE_DICT
1003    fprintf(stderr, "Trie node dupe removal complete: %d tagged in %d passes\n", totalDupes, passCount);
1004#endif
1005
1006    // We no longer need the temporary array, as the nodes have all been marked appropriately.
1007    uprv_free(array);
1008}
1009
1010U_NAMESPACE_END
1011U_CDECL_BEGIN
1012static void U_CALLCONV _deleteBuildNode(void *obj) {
1013    delete (BuildCompactTrieNode *) obj;
1014}
1015U_CDECL_END
1016U_NAMESPACE_BEGIN
1017
1018CompactTrieHeader *
1019CompactTrieDictionary::compactMutableTrieDictionary( const MutableTrieDictionary &dict,
1020                                UErrorCode &status ) {
1021    if (U_FAILURE(status)) {
1022        return NULL;
1023    }
1024#ifdef DEBUG_TRIE_DICT
1025    struct tms timing;
1026    struct tms previous;
1027    (void) ::times(&previous);
1028#endif
1029    UStack nodes(_deleteBuildNode, NULL, status);      // Index of nodes
1030
1031    // Add node 0, used as the NULL pointer/sentinel.
1032    nodes.addElement((int32_t)0, status);
1033
1034    // Start by creating the special empty node we use to indicate that the parent
1035    // terminates a word. This must be node 1, because the builder assumes
1036    // that.
1037    if (U_FAILURE(status)) {
1038        return NULL;
1039    }
1040    BuildCompactTrieNode *terminal = new BuildCompactTrieNode(TRUE, FALSE, nodes, status);
1041    if (terminal == NULL) {
1042        status = U_MEMORY_ALLOCATION_ERROR;
1043    }
1044
1045    // This call does all the work of building the new trie structure. The root
1046    // will be node 2.
1047    BuildCompactTrieNode *root = compactOneNode(dict.fTrie, FALSE, nodes, status);
1048#ifdef DEBUG_TRIE_DICT
1049    (void) ::times(&timing);
1050    fprintf(stderr, "Compact trie built, %d nodes, time user %f system %f\n",
1051        nodes.size(), (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1052        (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1053    previous = timing;
1054#endif
1055
1056    // Now coalesce all duplicate nodes.
1057    coalesceDuplicates(nodes, status);
1058#ifdef DEBUG_TRIE_DICT
1059    (void) ::times(&timing);
1060    fprintf(stderr, "Duplicates coalesced, time user %f system %f\n",
1061        (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1062        (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1063    previous = timing;
1064#endif
1065
1066    // Next, build the output trie.
1067    // First we compute all the sizes and build the node ID translation table.
1068    uint32_t totalSize = offsetof(CompactTrieHeader,offsets);
1069    int32_t count = nodes.size();
1070    int32_t nodeCount = 1;              // The sentinel node we already have
1071    BuildCompactTrieNode *node;
1072    int32_t i;
1073    UVector32 translate(count, status); // Should be no growth needed after this
1074    translate.push(0, status);          // The sentinel node
1075
1076    if (U_FAILURE(status)) {
1077        return NULL;
1078    }
1079
1080    for (i = 1; i < count; ++i) {
1081        node = (BuildCompactTrieNode *)nodes[i];
1082        if (node->fNodeID == i) {
1083            // Only one node out of each duplicate set is used
1084            if (i >= translate.size()) {
1085                // Logically extend the mapping table
1086                translate.setSize(i+1);
1087            }
1088            translate.setElementAt(nodeCount++, i);
1089            totalSize += node->size();
1090        }
1091    }
1092
1093    // Check for overflowing 16 bits worth of nodes.
1094    if (nodeCount > 0x10000) {
1095        status = U_ILLEGAL_ARGUMENT_ERROR;
1096        return NULL;
1097    }
1098
1099    // Add enough room for the offsets.
1100    totalSize += nodeCount*sizeof(uint32_t);
1101#ifdef DEBUG_TRIE_DICT
1102    (void) ::times(&timing);
1103    fprintf(stderr, "Sizes/mapping done, time user %f system %f\n",
1104        (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1105        (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1106    previous = timing;
1107    fprintf(stderr, "%d nodes, %d unique, %d bytes\n", nodes.size(), nodeCount, totalSize);
1108#endif
1109    uint8_t *bytes = (uint8_t *)uprv_malloc(totalSize);
1110    if (bytes == NULL) {
1111        status = U_MEMORY_ALLOCATION_ERROR;
1112        return NULL;
1113    }
1114
1115    CompactTrieHeader *header = (CompactTrieHeader *)bytes;
1116    header->size = totalSize;
1117    header->nodeCount = nodeCount;
1118    header->offsets[0] = 0;                     // Sentinel
1119    header->root = translate.elementAti(root->fNodeID);
1120#ifdef DEBUG_TRIE_DICT
1121    if (header->root == 0) {
1122        fprintf(stderr, "ERROR: root node %d translate to physical zero\n", root->fNodeID);
1123    }
1124#endif
1125    uint32_t offset = offsetof(CompactTrieHeader,offsets)+(nodeCount*sizeof(uint32_t));
1126    nodeCount = 1;
1127    // Now write the data
1128    for (i = 1; i < count; ++i) {
1129        node = (BuildCompactTrieNode *)nodes[i];
1130        if (node->fNodeID == i) {
1131            header->offsets[nodeCount++] = offset;
1132            node->write(bytes, offset, translate);
1133        }
1134    }
1135#ifdef DEBUG_TRIE_DICT
1136    (void) ::times(&timing);
1137    fprintf(stderr, "Trie built, time user %f system %f\n",
1138        (double)(timing.tms_utime-previous.tms_utime)/CLK_TCK,
1139        (double)(timing.tms_stime-previous.tms_stime)/CLK_TCK);
1140    previous = timing;
1141    fprintf(stderr, "Final offset is %d\n", offset);
1142
1143    // Collect statistics on node types and sizes
1144    int hCount = 0;
1145    int vCount = 0;
1146    size_t hSize = 0;
1147    size_t vSize = 0;
1148    size_t hItemCount = 0;
1149    size_t vItemCount = 0;
1150    uint32_t previousOff = offset;
1151    for (uint16_t nodeIdx = nodeCount-1; nodeIdx >= 2; --nodeIdx) {
1152        const CompactTrieNode *node = getCompactNode(header, nodeIdx);
1153        if (node->flagscount & kVerticalNode) {
1154            vCount += 1;
1155            vItemCount += (node->flagscount & kCountMask);
1156            vSize += previousOff-header->offsets[nodeIdx];
1157        }
1158        else {
1159            hCount += 1;
1160            hItemCount += (node->flagscount & kCountMask);
1161            hSize += previousOff-header->offsets[nodeIdx];
1162        }
1163        previousOff = header->offsets[nodeIdx];
1164    }
1165    fprintf(stderr, "Horizontal nodes: %d total, average %f bytes with %f items\n", hCount,
1166                (double)hSize/hCount, (double)hItemCount/hCount);
1167    fprintf(stderr, "Vertical nodes: %d total, average %f bytes with %f items\n", vCount,
1168                (double)vSize/vCount, (double)vItemCount/vCount);
1169#endif
1170
1171    if (U_FAILURE(status)) {
1172        uprv_free(bytes);
1173        header = NULL;
1174    }
1175    else {
1176        header->magic = COMPACT_TRIE_MAGIC_1;
1177    }
1178    return header;
1179}
1180
1181// Forward declaration
1182static TernaryNode *
1183unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status );
1184
1185
1186// Convert a horizontal node (or subarray thereof) into a ternary subtrie
1187static TernaryNode *
1188unpackHorizontalArray( const CompactTrieHeader *header, const CompactTrieHorizontalEntry *array,
1189                            int low, int high, UErrorCode &status ) {
1190    if (U_FAILURE(status) || low > high) {
1191        return NULL;
1192    }
1193    int middle = (low+high)/2;
1194    TernaryNode *result = new TernaryNode(array[middle].ch);
1195    if (result == NULL) {
1196        status = U_MEMORY_ALLOCATION_ERROR;
1197        return NULL;
1198    }
1199    const CompactTrieNode *equal = getCompactNode(header, array[middle].equal);
1200    if (equal->flagscount & kParentEndsWord) {
1201        result->flags |= kEndsWord;
1202    }
1203    result->low = unpackHorizontalArray(header, array, low, middle-1, status);
1204    result->high = unpackHorizontalArray(header, array, middle+1, high, status);
1205    result->equal = unpackOneNode(header, equal, status);
1206    return result;
1207}
1208
1209// Convert one compact trie node into a ternary subtrie
1210static TernaryNode *
1211unpackOneNode( const CompactTrieHeader *header, const CompactTrieNode *node, UErrorCode &status ) {
1212    int nodeCount = (node->flagscount & kCountMask);
1213    if (nodeCount == 0 || U_FAILURE(status)) {
1214        // Failure, or terminal node
1215        return NULL;
1216    }
1217    if (node->flagscount & kVerticalNode) {
1218        const CompactTrieVerticalNode *vnode = (const CompactTrieVerticalNode *)node;
1219        TernaryNode *head = NULL;
1220        TernaryNode *previous = NULL;
1221        TernaryNode *latest = NULL;
1222        for (int i = 0; i < nodeCount; ++i) {
1223            latest = new TernaryNode(vnode->chars[i]);
1224            if (latest == NULL) {
1225                status = U_MEMORY_ALLOCATION_ERROR;
1226                break;
1227            }
1228            if (head == NULL) {
1229                head = latest;
1230            }
1231            if (previous != NULL) {
1232                previous->equal = latest;
1233            }
1234            previous = latest;
1235        }
1236        if (latest != NULL) {
1237            const CompactTrieNode *equal = getCompactNode(header, vnode->equal);
1238            if (equal->flagscount & kParentEndsWord) {
1239                latest->flags |= kEndsWord;
1240            }
1241            latest->equal = unpackOneNode(header, equal, status);
1242        }
1243        return head;
1244    }
1245    else {
1246        // Horizontal node
1247        const CompactTrieHorizontalNode *hnode = (const CompactTrieHorizontalNode *)node;
1248        return unpackHorizontalArray(header, &hnode->entries[0], 0, nodeCount-1, status);
1249    }
1250}
1251
1252MutableTrieDictionary *
1253CompactTrieDictionary::cloneMutable( UErrorCode &status ) const {
1254    MutableTrieDictionary *result = new MutableTrieDictionary( status );
1255    if (result == NULL) {
1256        status = U_MEMORY_ALLOCATION_ERROR;
1257        return NULL;
1258    }
1259    TernaryNode *root = unpackOneNode(fData, getCompactNode(fData, fData->root), status);
1260    if (U_FAILURE(status)) {
1261        delete root;    // Clean up
1262        delete result;
1263        return NULL;
1264    }
1265    result->fTrie = root;
1266    return result;
1267}
1268
1269U_NAMESPACE_END
1270
1271U_CAPI int32_t U_EXPORT2
1272triedict_swap(const UDataSwapper *ds, const void *inData, int32_t length, void *outData,
1273           UErrorCode *status) {
1274
1275    if (status == NULL || U_FAILURE(*status)) {
1276        return 0;
1277    }
1278    if(ds==NULL || inData==NULL || length<-1 || (length>0 && outData==NULL)) {
1279        *status=U_ILLEGAL_ARGUMENT_ERROR;
1280        return 0;
1281    }
1282
1283    //
1284    //  Check that the data header is for for dictionary data.
1285    //    (Header contents are defined in genxxx.cpp)
1286    //
1287    const UDataInfo *pInfo = (const UDataInfo *)((const uint8_t *)inData+4);
1288    if(!(  pInfo->dataFormat[0]==0x54 &&   /* dataFormat="TrDc" */
1289           pInfo->dataFormat[1]==0x72 &&
1290           pInfo->dataFormat[2]==0x44 &&
1291           pInfo->dataFormat[3]==0x63 &&
1292           pInfo->formatVersion[0]==1  )) {
1293        udata_printError(ds, "triedict_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized\n",
1294                         pInfo->dataFormat[0], pInfo->dataFormat[1],
1295                         pInfo->dataFormat[2], pInfo->dataFormat[3],
1296                         pInfo->formatVersion[0]);
1297        *status=U_UNSUPPORTED_ERROR;
1298        return 0;
1299    }
1300
1301    //
1302    // Swap the data header.  (This is the generic ICU Data Header, not the
1303    //                         CompactTrieHeader).  This swap also conveniently gets us
1304    //                         the size of the ICU d.h., which lets us locate the start
1305    //                         of the RBBI specific data.
1306    //
1307    int32_t headerSize=udata_swapDataHeader(ds, inData, length, outData, status);
1308
1309    //
1310    // Get the CompactTrieHeader, and check that it appears to be OK.
1311    //
1312    const uint8_t  *inBytes =(const uint8_t *)inData+headerSize;
1313    const CompactTrieHeader *header = (const CompactTrieHeader *)inBytes;
1314    if (ds->readUInt32(header->magic) != COMPACT_TRIE_MAGIC_1
1315            || ds->readUInt32(header->size) < sizeof(CompactTrieHeader))
1316    {
1317        udata_printError(ds, "triedict_swap(): CompactTrieHeader is invalid.\n");
1318        *status=U_UNSUPPORTED_ERROR;
1319        return 0;
1320    }
1321
1322    //
1323    // Prefight operation?  Just return the size
1324    //
1325    uint32_t totalSize = ds->readUInt32(header->size);
1326    int32_t sizeWithUData = (int32_t)totalSize + headerSize;
1327    if (length < 0) {
1328        return sizeWithUData;
1329    }
1330
1331    //
1332    // Check that length passed in is consistent with length from RBBI data header.
1333    //
1334    if (length < sizeWithUData) {
1335        udata_printError(ds, "triedict_swap(): too few bytes (%d after ICU Data header) for trie data.\n",
1336                            totalSize);
1337        *status=U_INDEX_OUTOFBOUNDS_ERROR;
1338        return 0;
1339        }
1340
1341    //
1342    // Swap the Data.  Do the data itself first, then the CompactTrieHeader, because
1343    //                 we need to reference the header to locate the data, and an
1344    //                 inplace swap of the header leaves it unusable.
1345    //
1346    uint8_t             *outBytes = (uint8_t *)outData + headerSize;
1347    CompactTrieHeader   *outputHeader = (CompactTrieHeader *)outBytes;
1348
1349#if 0
1350    //
1351    // If not swapping in place, zero out the output buffer before starting.
1352    //
1353    if (inBytes != outBytes) {
1354        uprv_memset(outBytes, 0, totalSize);
1355    }
1356
1357    // We need to loop through all the nodes in the offset table, and swap each one.
1358    uint16_t nodeCount = ds->readUInt16(header->nodeCount);
1359    // Skip node 0, which should always be 0.
1360    for (int i = 1; i < nodeCount; ++i) {
1361        uint32_t nodeOff = ds->readUInt32(header->offsets[i]);
1362        const CompactTrieNode *inNode = (const CompactTrieNode *)(inBytes + nodeOff);
1363        CompactTrieNode *outNode = (CompactTrieNode *)(outBytes + nodeOff);
1364        uint16_t flagscount = ds->readUInt16(inNode->flagscount);
1365        uint16_t itemCount = flagscount & kCountMask;
1366        ds->writeUInt16(&outNode->flagscount, flagscount);
1367        if (itemCount > 0) {
1368            if (flagscount & kVerticalNode) {
1369                ds->swapArray16(ds, inBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars),
1370                                    itemCount*sizeof(uint16_t),
1371                                    outBytes+nodeOff+offsetof(CompactTrieVerticalNode,chars), status);
1372                uint16_t equal = ds->readUInt16(inBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal);
1373                ds->writeUInt16(outBytes+nodeOff+offsetof(CompactTrieVerticalNode,equal));
1374            }
1375            else {
1376                const CompactTrieHorizontalNode *inHNode = (const CompactTrieHorizontalNode *)inNode;
1377                CompactTrieHorizontalNode *outHNode = (CompactTrieHorizontalNode *)outNode;
1378                for (int j = 0; j < itemCount; ++j) {
1379                    uint16_t word = ds->readUInt16(inHNode->entries[j].ch);
1380                    ds->writeUInt16(&outHNode->entries[j].ch, word);
1381                    word = ds->readUInt16(inHNode->entries[j].equal);
1382                    ds->writeUInt16(&outHNode->entries[j].equal, word);
1383                }
1384            }
1385        }
1386    }
1387#endif
1388
1389    // All the data in all the nodes consist of 16 bit items. Swap them all at once.
1390    uint16_t nodeCount = ds->readUInt16(header->nodeCount);
1391    uint32_t nodesOff = offsetof(CompactTrieHeader,offsets)+((uint32_t)nodeCount*sizeof(uint32_t));
1392    ds->swapArray16(ds, inBytes+nodesOff, totalSize-nodesOff, outBytes+nodesOff, status);
1393
1394    // Swap the header
1395    ds->writeUInt32(&outputHeader->size, totalSize);
1396    uint32_t magic = ds->readUInt32(header->magic);
1397    ds->writeUInt32(&outputHeader->magic, magic);
1398    ds->writeUInt16(&outputHeader->nodeCount, nodeCount);
1399    uint16_t root = ds->readUInt16(header->root);
1400    ds->writeUInt16(&outputHeader->root, root);
1401    ds->swapArray32(ds, inBytes+offsetof(CompactTrieHeader,offsets),
1402            sizeof(uint32_t)*(int32_t)nodeCount,
1403            outBytes+offsetof(CompactTrieHeader,offsets), status);
1404
1405    return sizeWithUData;
1406}
1407
1408#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1409