1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3
4// edits.h
5// created: 2016dec30 Markus W. Scherer
6
7#ifndef __EDITS_H__
8#define __EDITS_H__
9
10#include "unicode/utypes.h"
11#include "unicode/uobject.h"
12
13/**
14 * \file
15 * \brief C++ API: C++ class Edits for low-level string transformations on styled text.
16 */
17
18U_NAMESPACE_BEGIN
19
20#ifndef U_HIDE_DRAFT_API
21
22/**
23 * Records lengths of string edits but not replacement text.
24 * Supports replacements, insertions, deletions in linear progression.
25 * Does not support moving/reordering of text.
26 *
27 * An Edits object tracks a separate UErrorCode, but ICU string transformation functions
28 * (e.g., case mapping functions) merge any such errors into their API's UErrorCode.
29 *
30 * @draft ICU 59
31 */
32class U_COMMON_API Edits U_FINAL : public UMemory {
33public:
34    /**
35     * Constructs an empty object.
36     * @draft ICU 59
37     */
38    Edits() :
39            array(stackArray), capacity(STACK_CAPACITY), length(0), delta(0), numChanges(0),
40            errorCode_(U_ZERO_ERROR) {}
41    /**
42     * Copy constructor.
43     * @param other source edits
44     * @draft ICU 60
45     */
46    Edits(const Edits &other) :
47            array(stackArray), capacity(STACK_CAPACITY), length(other.length),
48            delta(other.delta), numChanges(other.numChanges),
49            errorCode_(other.errorCode_) {
50        copyArray(other);
51    }
52    /**
53     * Move constructor, might leave src empty.
54     * This object will have the same contents that the source object had.
55     * @param src source edits
56     * @draft ICU 60
57     */
58    Edits(Edits &&src) U_NOEXCEPT :
59            array(stackArray), capacity(STACK_CAPACITY), length(src.length),
60            delta(src.delta), numChanges(src.numChanges),
61            errorCode_(src.errorCode_) {
62        moveArray(src);
63    }
64
65    /**
66     * Destructor.
67     * @draft ICU 59
68     */
69    ~Edits();
70
71    /**
72     * Assignment operator.
73     * @param other source edits
74     * @return *this
75     * @draft ICU 60
76     */
77    Edits &operator=(const Edits &other);
78
79    /**
80     * Move assignment operator, might leave src empty.
81     * This object will have the same contents that the source object had.
82     * The behavior is undefined if *this and src are the same object.
83     * @param src source edits
84     * @return *this
85     * @draft ICU 60
86     */
87    Edits &operator=(Edits &&src) U_NOEXCEPT;
88
89    /**
90     * Resets the data but may not release memory.
91     * @draft ICU 59
92     */
93    void reset() U_NOEXCEPT;
94
95    /**
96     * Adds a record for an unchanged segment of text.
97     * Normally called from inside ICU string transformation functions, not user code.
98     * @draft ICU 59
99     */
100    void addUnchanged(int32_t unchangedLength);
101    /**
102     * Adds a record for a text replacement/insertion/deletion.
103     * Normally called from inside ICU string transformation functions, not user code.
104     * @draft ICU 59
105     */
106    void addReplace(int32_t oldLength, int32_t newLength);
107    /**
108     * Sets the UErrorCode if an error occurred while recording edits.
109     * Preserves older error codes in the outErrorCode.
110     * Normally called from inside ICU string transformation functions, not user code.
111     * @param outErrorCode Set to an error code if it does not contain one already
112     *                  and an error occurred while recording edits.
113     *                  Otherwise unchanged.
114     * @return TRUE if U_FAILURE(outErrorCode)
115     * @draft ICU 59
116     */
117    UBool copyErrorTo(UErrorCode &outErrorCode);
118
119    /**
120     * How much longer is the new text compared with the old text?
121     * @return new length minus old length
122     * @draft ICU 59
123     */
124    int32_t lengthDelta() const { return delta; }
125    /**
126     * @return TRUE if there are any change edits
127     * @draft ICU 59
128     */
129    UBool hasChanges() const { return numChanges != 0; }
130
131    /**
132     * @return the number of change edits
133     * @draft ICU 60
134     */
135    int32_t numberOfChanges() const { return numChanges; }
136
137    /**
138     * Access to the list of edits.
139     * @see getCoarseIterator
140     * @see getFineIterator
141     * @draft ICU 59
142     */
143    struct U_COMMON_API Iterator U_FINAL : public UMemory {
144        /**
145         * Default constructor, empty iterator.
146         * @draft ICU 60
147         */
148        Iterator() :
149                array(nullptr), index(0), length(0),
150                remaining(0), onlyChanges_(FALSE), coarse(FALSE),
151                dir(0), changed(FALSE), oldLength_(0), newLength_(0),
152                srcIndex(0), replIndex(0), destIndex(0) {}
153        /**
154         * Copy constructor.
155         * @draft ICU 59
156         */
157        Iterator(const Iterator &other) = default;
158        /**
159         * Assignment operator.
160         * @draft ICU 59
161         */
162        Iterator &operator=(const Iterator &other) = default;
163
164        /**
165         * Advances to the next edit.
166         * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
167         *                  or else the function returns immediately. Check for U_FAILURE()
168         *                  on output or use with function chaining. (See User Guide for details.)
169         * @return TRUE if there is another edit
170         * @draft ICU 59
171         */
172        UBool next(UErrorCode &errorCode) { return next(onlyChanges_, errorCode); }
173
174        /**
175         * Finds the edit that contains the source index.
176         * The source index may be found in a non-change
177         * even if normal iteration would skip non-changes.
178         * Normal iteration can continue from a found edit.
179         *
180         * The iterator state before this search logically does not matter.
181         * (It may affect the performance of the search.)
182         *
183         * The iterator state after this search is undefined
184         * if the source index is out of bounds for the source string.
185         *
186         * @param i source index
187         * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
188         *                  or else the function returns immediately. Check for U_FAILURE()
189         *                  on output or use with function chaining. (See User Guide for details.)
190         * @return TRUE if the edit for the source index was found
191         * @draft ICU 59
192         */
193        UBool findSourceIndex(int32_t i, UErrorCode &errorCode) {
194            return findIndex(i, TRUE, errorCode) == 0;
195        }
196
197        /**
198         * Finds the edit that contains the destination index.
199         * The destination index may be found in a non-change
200         * even if normal iteration would skip non-changes.
201         * Normal iteration can continue from a found edit.
202         *
203         * The iterator state before this search logically does not matter.
204         * (It may affect the performance of the search.)
205         *
206         * The iterator state after this search is undefined
207         * if the source index is out of bounds for the source string.
208         *
209         * @param i destination index
210         * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
211         *                  or else the function returns immediately. Check for U_FAILURE()
212         *                  on output or use with function chaining. (See User Guide for details.)
213         * @return TRUE if the edit for the destination index was found
214         * @draft ICU 60
215         */
216        UBool findDestinationIndex(int32_t i, UErrorCode &errorCode) {
217            return findIndex(i, FALSE, errorCode) == 0;
218        }
219
220        /**
221         * Returns the destination index corresponding to the given source index.
222         * If the source index is inside a change edit (not at its start),
223         * then the destination index at the end of that edit is returned,
224         * since there is no information about index mapping inside a change edit.
225         *
226         * (This means that indexes to the start and middle of an edit,
227         * for example around a grapheme cluster, are mapped to indexes
228         * encompassing the entire edit.
229         * The alternative, mapping an interior index to the start,
230         * would map such an interval to an empty one.)
231         *
232         * This operation will usually but not always modify this object.
233         * The iterator state after this search is undefined.
234         *
235         * @param i source index
236         * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
237         *                  or else the function returns immediately. Check for U_FAILURE()
238         *                  on output or use with function chaining. (See User Guide for details.)
239         * @return destination index; undefined if i is not 0..string length
240         * @draft ICU 60
241         */
242        int32_t destinationIndexFromSourceIndex(int32_t i, UErrorCode &errorCode);
243
244        /**
245         * Returns the source index corresponding to the given destination index.
246         * If the destination index is inside a change edit (not at its start),
247         * then the source index at the end of that edit is returned,
248         * since there is no information about index mapping inside a change edit.
249         *
250         * (This means that indexes to the start and middle of an edit,
251         * for example around a grapheme cluster, are mapped to indexes
252         * encompassing the entire edit.
253         * The alternative, mapping an interior index to the start,
254         * would map such an interval to an empty one.)
255         *
256         * This operation will usually but not always modify this object.
257         * The iterator state after this search is undefined.
258         *
259         * @param i destination index
260         * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
261         *                  or else the function returns immediately. Check for U_FAILURE()
262         *                  on output or use with function chaining. (See User Guide for details.)
263         * @return source index; undefined if i is not 0..string length
264         * @draft ICU 60
265         */
266        int32_t sourceIndexFromDestinationIndex(int32_t i, UErrorCode &errorCode);
267
268        /**
269         * @return TRUE if this edit replaces oldLength() units with newLength() different ones.
270         *         FALSE if oldLength units remain unchanged.
271         * @draft ICU 59
272         */
273        UBool hasChange() const { return changed; }
274        /**
275         * @return the number of units in the original string which are replaced or remain unchanged.
276         * @draft ICU 59
277         */
278        int32_t oldLength() const { return oldLength_; }
279        /**
280         * @return the number of units in the modified string, if hasChange() is TRUE.
281         *         Same as oldLength if hasChange() is FALSE.
282         * @draft ICU 59
283         */
284        int32_t newLength() const { return newLength_; }
285
286        /**
287         * @return the current index into the source string
288         * @draft ICU 59
289         */
290        int32_t sourceIndex() const { return srcIndex; }
291        /**
292         * @return the current index into the replacement-characters-only string,
293         *         not counting unchanged spans
294         * @draft ICU 59
295         */
296        int32_t replacementIndex() const { return replIndex; }
297        /**
298         * @return the current index into the full destination string
299         * @draft ICU 59
300         */
301        int32_t destinationIndex() const { return destIndex; }
302
303    private:
304        friend class Edits;
305
306        Iterator(const uint16_t *a, int32_t len, UBool oc, UBool crs);
307
308        int32_t readLength(int32_t head);
309        void updateNextIndexes();
310        void updatePreviousIndexes();
311        UBool noNext();
312        UBool next(UBool onlyChanges, UErrorCode &errorCode);
313        UBool previous(UErrorCode &errorCode);
314        /** @return -1: error or i<0; 0: found; 1: i>=string length */
315        int32_t findIndex(int32_t i, UBool findSource, UErrorCode &errorCode);
316
317        const uint16_t *array;
318        int32_t index, length;
319        // 0 if we are not within compressed equal-length changes.
320        // Otherwise the number of remaining changes, including the current one.
321        int32_t remaining;
322        UBool onlyChanges_, coarse;
323
324        int8_t dir;  // iteration direction: back(<0), initial(0), forward(>0)
325        UBool changed;
326        int32_t oldLength_, newLength_;
327        int32_t srcIndex, replIndex, destIndex;
328    };
329
330    /**
331     * Returns an Iterator for coarse-grained changes for simple string updates.
332     * Skips non-changes.
333     * @return an Iterator that merges adjacent changes.
334     * @draft ICU 59
335     */
336    Iterator getCoarseChangesIterator() const {
337        return Iterator(array, length, TRUE, TRUE);
338    }
339
340    /**
341     * Returns an Iterator for coarse-grained changes and non-changes for simple string updates.
342     * @return an Iterator that merges adjacent changes.
343     * @draft ICU 59
344     */
345    Iterator getCoarseIterator() const {
346        return Iterator(array, length, FALSE, TRUE);
347    }
348
349    /**
350     * Returns an Iterator for fine-grained changes for modifying styled text.
351     * Skips non-changes.
352     * @return an Iterator that separates adjacent changes.
353     * @draft ICU 59
354     */
355    Iterator getFineChangesIterator() const {
356        return Iterator(array, length, TRUE, FALSE);
357    }
358
359    /**
360     * Returns an Iterator for fine-grained changes and non-changes for modifying styled text.
361     * @return an Iterator that separates adjacent changes.
362     * @draft ICU 59
363     */
364    Iterator getFineIterator() const {
365        return Iterator(array, length, FALSE, FALSE);
366    }
367
368    /**
369     * Merges the two input Edits and appends the result to this object.
370     *
371     * Consider two string transformations (for example, normalization and case mapping)
372     * where each records Edits in addition to writing an output string.<br>
373     * Edits ab reflect how substrings of input string a
374     * map to substrings of intermediate string b.<br>
375     * Edits bc reflect how substrings of intermediate string b
376     * map to substrings of output string c.<br>
377     * This function merges ab and bc such that the additional edits
378     * recorded in this object reflect how substrings of input string a
379     * map to substrings of output string c.
380     *
381     * If unrelated Edits are passed in where the output string of the first
382     * has a different length than the input string of the second,
383     * then a U_ILLEGAL_ARGUMENT_ERROR is reported.
384     *
385     * @param ab reflects how substrings of input string a
386     *     map to substrings of intermediate string b.
387     * @param bc reflects how substrings of intermediate string b
388     *     map to substrings of output string c.
389     * @param errorCode ICU error code. Its input value must pass the U_SUCCESS() test,
390     *                  or else the function returns immediately. Check for U_FAILURE()
391     *                  on output or use with function chaining. (See User Guide for details.)
392     * @return *this, with the merged edits appended
393     * @draft ICU 60
394     */
395    Edits &mergeAndAppend(const Edits &ab, const Edits &bc, UErrorCode &errorCode);
396
397private:
398    void releaseArray() U_NOEXCEPT;
399    Edits &copyArray(const Edits &other);
400    Edits &moveArray(Edits &src) U_NOEXCEPT;
401
402    void setLastUnit(int32_t last) { array[length - 1] = (uint16_t)last; }
403    int32_t lastUnit() const { return length > 0 ? array[length - 1] : 0xffff; }
404
405    void append(int32_t r);
406    UBool growArray();
407
408    static const int32_t STACK_CAPACITY = 100;
409    uint16_t *array;
410    int32_t capacity;
411    int32_t length;
412    int32_t delta;
413    int32_t numChanges;
414    UErrorCode errorCode_;
415    uint16_t stackArray[STACK_CAPACITY];
416};
417
418#endif  // U_HIDE_DRAFT_API
419
420U_NAMESPACE_END
421
422#endif  // __EDITS_H__
423