1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4 *******************************************************************************
5 * Copyright (C) 1996-2014, International Business Machines Corporation and
6 * others. All Rights Reserved.
7 *******************************************************************************
8 */
9package com.ibm.icu.text;
10
11import java.text.CharacterIterator;
12
13import com.ibm.icu.lang.UCharacter;
14import com.ibm.icu.util.ICUCloneNotSupportedException;
15import com.ibm.icu.util.ULocale;
16
17
18/**
19 * Inserts the specified characters at word breaks. To restrict it to particular characters, use a filter.
20 * TODO: this is an internal class, and only temporary. Remove it once we have \b notation in Transliterator.
21 */
22final class BreakTransliterator extends Transliterator {
    // Break iterator used to locate boundaries; may be null until getBreakIterator() creates it.
    private BreakIterator bi;
    // Text inserted at every boundary selected by handleTransliterate().
    private String insertion;
    // Scratch stack of boundary offsets collected during one handleTransliterate() call;
    // it is replaced by an array of twice the size when it fills up.
    private int[] boundaries = new int[50];
    // Number of valid entries in boundaries; reset to 0 at the start of each handleTransliterate() call.
    private int boundaryCount = 0;
27
28    public BreakTransliterator(String ID, UnicodeFilter filter, BreakIterator bi, String insertion) {
29        super(ID, filter);
30        this.bi = bi;
31        this.insertion = insertion;
32    }
33
    /**
     * Creates a transliterator with no break iterator yet (it is created lazily by
     * {@link #getBreakIterator()}) and a single space as the insertion string.
     *
     * @param ID     identifier handed on to the four-argument constructor
     * @param filter filter handed on to the four-argument constructor
     */
    public BreakTransliterator(String ID, UnicodeFilter filter) {
        this(ID, filter, null, " ");
    }
37
38    ///CLOVER:OFF
39    // The following method is not called by anything and can't be reached
40    public String getInsertion() {
41        return insertion;
42    }
43    ///CLOVER:ON
44
45    ///CLOVER:OFF
46    // The following method is not called by anything and can't be reached
    /**
     * Replaces the string that is inserted at each selected boundary.
     *
     * @param insertion the new insertion string; handleTransliterate() calls
     *                  {@code length()} on it, so it is expected to be non-null
     */
    public void setInsertion(String insertion) {
        this.insertion = insertion;
    }
50    ///CLOVER:ON
51
52    public BreakIterator getBreakIterator() {
53        // Defer initialization of BreakIterator because it is slow,
54        // typically over 2000 ms.
55        if (bi == null) bi = BreakIterator.getWordInstance(new ULocale("th_TH"));
56        return bi;
57    }
58
59    ///CLOVER:OFF
60    // The following method is not called by anything and can't be reached
    /**
     * Replaces the break iterator used to find boundaries.
     *
     * @param bi the new break iterator; if null, {@link #getBreakIterator()} creates
     *           the default one again on its next call
     */
    public void setBreakIterator(BreakIterator bi) {
        this.bi = bi;
    }
64    ///CLOVER:ON
65
    /**
     * Bit mask with one bit set per general category that counts as a letter or a mark.
     * handleTransliterate() tests {@code (1 << type) & LETTER_OR_MARK_MASK} for the
     * characters on both sides of a boundary.
     */
    // NOTE(review): the bits come from java.lang.Character constants, while the tested type
    // comes from UCharacter.getType(); presumably the numeric category values coincide - confirm.
    static final int LETTER_OR_MARK_MASK =
          (1<<Character.UPPERCASE_LETTER)
        | (1<<Character.LOWERCASE_LETTER)
        | (1<<Character.TITLECASE_LETTER)
        | (1<<Character.MODIFIER_LETTER)
        | (1<<Character.OTHER_LETTER)
        | (1<<Character.COMBINING_SPACING_MARK)
        | (1<<Character.NON_SPACING_MARK)
        | (1<<Character.ENCLOSING_MARK)
        ;
76    @Override
77    protected synchronized void handleTransliterate(Replaceable text, Position pos, boolean incremental) {
78        boundaryCount = 0;
79        int boundary = 0;
80        getBreakIterator(); // Lazy-create it if necessary
81        bi.setText(new ReplaceableCharacterIterator(text, pos.start, pos.limit, pos.start));
82        // TODO: fix clumsy workaround used below.
83        /*
84        char[] tempBuffer = new char[text.length()];
85        text.getChars(0, text.length(), tempBuffer, 0);
86        bi.setText(new StringCharacterIterator(new String(tempBuffer), pos.start, pos.limit, pos.start));
87        */
88        // end debugging
89
90        // To make things much easier, we will stack the boundaries, and then insert at the end.
91        // generally, we won't need too many, since we will be filtered.
92
93        for(boundary = bi.first(); boundary != BreakIterator.DONE && boundary < pos.limit; boundary = bi.next()) {
94            if (boundary == 0) continue;
95            // HACK: Check to see that preceeding item was a letter
96
97            int cp = UTF16.charAt(text, boundary-1);
98            int type = UCharacter.getType(cp);
99            //System.out.println(Integer.toString(cp,16) + " (before): " + type);
100            if (((1<<type) & LETTER_OR_MARK_MASK) == 0) continue;
101
102            cp = UTF16.charAt(text, boundary);
103            type = UCharacter.getType(cp);
104            //System.out.println(Integer.toString(cp,16) + " (after): " + type);
105            if (((1<<type) & LETTER_OR_MARK_MASK) == 0) continue;
106
107            if (boundaryCount >= boundaries.length) {       // realloc if necessary
108                int[] temp = new int[boundaries.length * 2];
109                System.arraycopy(boundaries, 0, temp, 0, boundaries.length);
110                boundaries = temp;
111            }
112
113            boundaries[boundaryCount++] = boundary;
114            //System.out.println(boundary);
115        }
116
117        int delta = 0;
118        int lastBoundary = 0;
119
120        if (boundaryCount != 0) { // if we found something, adjust
121            delta = boundaryCount * insertion.length();
122            lastBoundary = boundaries[boundaryCount-1];
123
124            // we do this from the end backwards, so that we don't have to keep updating.
125
126            while (boundaryCount > 0) {
127                boundary = boundaries[--boundaryCount];
128                text.replace(boundary, boundary, insertion);
129            }
130        }
131
132        // Now fix up the return values
133        pos.contextLimit += delta;
134        pos.limit += delta;
135        pos.start = incremental ? lastBoundary + delta : pos.limit;
136    }
137
138
139    /**
140     * Registers standard variants with the system.  Called by
141     * Transliterator during initialization.
142     */
143    static void register() {
144        // false means that it is invisible
145        Transliterator trans = new BreakTransliterator("Any-BreakInternal", null);
146        Transliterator.registerInstance(trans, false);
147        /*
148        Transliterator.registerFactory("Any-Break", new Transliterator.Factory() {
149            public Transliterator getInstance(String ID) {
150                return new BreakTransliterator("Any-Break", null);
151            }
152        });
153        */
154    }
155
156    // Hack, just to get a real character iterator.
157    static final class ReplaceableCharacterIterator implements CharacterIterator
158    {
159        private Replaceable text;
160        private int begin;
161        private int end;
162        // invariant: begin <= pos <= end
163        private int pos;
164
165        /**
166        * Constructs an iterator with an initial index of 0.
167        */
168        /*public ReplaceableCharacterIterator(Replaceable text)
169        {
170            this(text, 0);
171        }*/
172
173        /**
174        * Constructs an iterator with the specified initial index.
175        *
176        * @param  text   The String to be iterated over
177        * @param  pos    Initial iterator position
178        */
179        /*public ReplaceableCharacterIterator(Replaceable text, int pos)
180        {
181            this(text, 0, text.length(), pos);
182        }*/
183
184        /**
185        * Constructs an iterator over the given range of the given string, with the
186        * index set at the specified position.
187        *
188        * @param  text   The String to be iterated over
189        * @param  begin  Index of the first character
190        * @param  end    Index of the character following the last character
191        * @param  pos    Initial iterator position
192        */
193        public ReplaceableCharacterIterator(Replaceable text, int begin, int end, int pos) {
194            if (text == null) {
195                throw new NullPointerException();
196            }
197            this.text = text;
198
199            if (begin < 0 || begin > end || end > text.length()) {
200                throw new IllegalArgumentException("Invalid substring range");
201            }
202
203            if (pos < begin || pos > end) {
204                throw new IllegalArgumentException("Invalid position");
205            }
206
207            this.begin = begin;
208            this.end = end;
209            this.pos = pos;
210        }
211
212        /**
213        * Reset this iterator to point to a new string.  This package-visible
214        * method is used by other java.text classes that want to avoid allocating
215        * new ReplaceableCharacterIterator objects every time their setText method
216        * is called.
217        *
218        * @param  text   The String to be iterated over
219        */
220        public void setText(Replaceable text) {
221            if (text == null) {
222                throw new NullPointerException();
223            }
224            this.text = text;
225            this.begin = 0;
226            this.end = text.length();
227            this.pos = 0;
228        }
229
230        /**
231        * Implements CharacterIterator.first() for String.
232        * @see CharacterIterator#first
233        */
234        @Override
235        public char first()
236        {
237            pos = begin;
238            return current();
239        }
240
241        /**
242        * Implements CharacterIterator.last() for String.
243        * @see CharacterIterator#last
244        */
245        @Override
246        public char last()
247        {
248            if (end != begin) {
249                pos = end - 1;
250            } else {
251                pos = end;
252            }
253            return current();
254        }
255
256        /**
257        * Implements CharacterIterator.setIndex() for String.
258        * @see CharacterIterator#setIndex
259        */
260        @Override
261        public char setIndex(int p)
262        {
263        if (p < begin || p > end) {
264                throw new IllegalArgumentException("Invalid index");
265        }
266            pos = p;
267            return current();
268        }
269
270        /**
271        * Implements CharacterIterator.current() for String.
272        * @see CharacterIterator#current
273        */
274        @Override
275        public char current()
276        {
277            if (pos >= begin && pos < end) {
278                return text.charAt(pos);
279            }
280            else {
281                return DONE;
282            }
283        }
284
285        /**
286        * Implements CharacterIterator.next() for String.
287        * @see CharacterIterator#next
288        */
289        @Override
290        public char next()
291        {
292            if (pos < end - 1) {
293                pos++;
294                return text.charAt(pos);
295            }
296            else {
297                pos = end;
298                return DONE;
299            }
300        }
301
302        /**
303        * Implements CharacterIterator.previous() for String.
304        * @see CharacterIterator#previous
305        */
306        @Override
307        public char previous()
308        {
309            if (pos > begin) {
310                pos--;
311                return text.charAt(pos);
312            }
313            else {
314                return DONE;
315            }
316        }
317
318        /**
319        * Implements CharacterIterator.getBeginIndex() for String.
320        * @see CharacterIterator#getBeginIndex
321        */
322        @Override
323        public int getBeginIndex()
324        {
325            return begin;
326        }
327
328        /**
329        * Implements CharacterIterator.getEndIndex() for String.
330        * @see CharacterIterator#getEndIndex
331        */
332        @Override
333        public int getEndIndex()
334        {
335            return end;
336        }
337
338        /**
339        * Implements CharacterIterator.getIndex() for String.
340        * @see CharacterIterator#getIndex
341        */
342        @Override
343        public int getIndex()
344        {
345            return pos;
346        }
347
348        /**
349        * Compares the equality of two ReplaceableCharacterIterator objects.
350        * @param obj the ReplaceableCharacterIterator object to be compared with.
351        * @return true if the given obj is the same as this
352        * ReplaceableCharacterIterator object; false otherwise.
353        */
354        @Override
355        public boolean equals(Object obj)
356        {
357            if (this == obj) {
358                return true;
359            }
360            if (!(obj instanceof ReplaceableCharacterIterator)) {
361                return false;
362            }
363
364            ReplaceableCharacterIterator that = (ReplaceableCharacterIterator) obj;
365
366            if (hashCode() != that.hashCode()) {
367                return false;
368            }
369            if (!text.equals(that.text)) {
370                return false;
371            }
372            if (pos != that.pos || begin != that.begin || end != that.end) {
373                return false;
374            }
375            return true;
376        }
377
378        /**
379        * Computes a hashcode for this iterator.
380        * @return A hash code
381        */
382        @Override
383        public int hashCode()
384        {
385            return text.hashCode() ^ pos ^ begin ^ end;
386        }
387
388        /**
389        * Creates a copy of this iterator.
390        * @return A copy of this
391        */
392        @Override
393        public Object clone()
394        {
395            try {
396                ReplaceableCharacterIterator other
397                = (ReplaceableCharacterIterator) super.clone();
398                return other;
399            }
400            catch (CloneNotSupportedException e) {
401                throw new ICUCloneNotSupportedException();
402            }
403        }
404
405    }
406    /* (non-Javadoc)
407     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
408     */
409    @Override
410    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
411        UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
412        // Doesn't actually modify the source characters, so leave them alone.
413        // add the characters inserted
414        if (myFilter.size() != 0) {
415            targetSet.addAll(insertion);
416        }
417    }
418
419}
420