1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html#License
3/*
4**********************************************************************
5*   Copyright (c) 2001-2011, International Business Machines
6*   Corporation and others.  All Rights Reserved.
7**********************************************************************
8*   Date        Name        Description
9*   11/19/2001  aliu        Creation.
10**********************************************************************
11*/
12package com.ibm.icu.text;
13import com.ibm.icu.impl.Utility;
14import com.ibm.icu.lang.UCharacter;
15
16/**
17 * A transliterator that converts Unicode escape forms to the
18 * characters they represent.  Escape forms have a prefix, a suffix, a
19 * radix, and minimum and maximum digit counts.
20 *
21 * <p>This class is package private.  It registers several standard
22 * variants with the system which are then accessed via their IDs.
23 *
24 * @author Alan Liu
25 */
26class UnescapeTransliterator extends Transliterator {
27
28    /**
29     * The encoded pattern specification.  The pattern consists of
30     * zero or more forms.  Each form consists of a prefix, suffix,
31     * radix, minimum digit count, and maximum digit count.  These
32     * values are stored as a five character header.  That is, their
33     * numeric values are cast to 16-bit characters and stored in the
34     * string.  Following these five characters, the prefix
35     * characters, then suffix characters are stored.  Each form thus
36     * takes n+5 characters, where n is the total length of the prefix
37     * and suffix.  The end is marked by a header of length one
38     * consisting of the character END.
39     */
40    private char spec[];
41
42    /**
43     * Special character marking the end of the spec[] array.
44     */
45    private static final char END = 0xFFFF;
46
47    /**
48     * Registers standard variants with the system.  Called by
49     * Transliterator during initialization.
50     */
51    static void register() {
52        // Unicode: "U+10FFFF" hex, min=4, max=6
53        Transliterator.registerFactory("Hex-Any/Unicode", new Transliterator.Factory() {
54            @Override
55            public Transliterator getInstance(String ID) {
56                return new UnescapeTransliterator("Hex-Any/Unicode", new char[] {
57                    2, 0, 16, 4, 6, 'U', '+',
58                    END
59                });
60            }
61        });
62
63        // Java: "\\uFFFF" hex, min=4, max=4
64        Transliterator.registerFactory("Hex-Any/Java", new Transliterator.Factory() {
65            @Override
66            public Transliterator getInstance(String ID) {
67                return new UnescapeTransliterator("Hex-Any/Java", new char[] {
68                    2, 0, 16, 4, 4, '\\', 'u',
69                    END
70                });
71            }
72        });
73
74        // C: "\\uFFFF" hex, min=4, max=4; \\U0010FFFF hex, min=8, max=8
75        Transliterator.registerFactory("Hex-Any/C", new Transliterator.Factory() {
76            @Override
77            public Transliterator getInstance(String ID) {
78                return new UnescapeTransliterator("Hex-Any/C", new char[] {
79                    2, 0, 16, 4, 4, '\\', 'u',
80                    2, 0, 16, 8, 8, '\\', 'U',
81                    END
82                });
83            }
84        });
85
86        // XML: "&#x10FFFF;" hex, min=1, max=6
87        Transliterator.registerFactory("Hex-Any/XML", new Transliterator.Factory() {
88            @Override
89            public Transliterator getInstance(String ID) {
90                return new UnescapeTransliterator("Hex-Any/XML", new char[] {
91                    3, 1, 16, 1, 6, '&', '#', 'x', ';',
92                    END
93                });
94            }
95        });
96
97        // XML10: "&1114111;" dec, min=1, max=7 (not really "Hex-Any")
98        Transliterator.registerFactory("Hex-Any/XML10", new Transliterator.Factory() {
99            @Override
100            public Transliterator getInstance(String ID) {
101                return new UnescapeTransliterator("Hex-Any/XML10", new char[] {
102                    2, 1, 10, 1, 7, '&', '#', ';',
103                    END
104                });
105            }
106        });
107
108        // Perl: "\\x{263A}" hex, min=1, max=6
109        Transliterator.registerFactory("Hex-Any/Perl", new Transliterator.Factory() {
110            @Override
111            public Transliterator getInstance(String ID) {
112                return new UnescapeTransliterator("Hex-Any/Perl", new char[] {
113                    3, 1, 16, 1, 6, '\\', 'x', '{', '}',
114                    END
115                });
116            }
117        });
118
119        // All: Java, C, Perl, XML, XML10, Unicode
120        Transliterator.registerFactory("Hex-Any", new Transliterator.Factory() {
121            @Override
122            public Transliterator getInstance(String ID) {
123                return new UnescapeTransliterator("Hex-Any", new char[] {
124                    2, 0, 16, 4, 6, 'U', '+',            // Unicode
125                    2, 0, 16, 4, 4, '\\', 'u',           // Java
126                    2, 0, 16, 8, 8, '\\', 'U',           // C (surrogates)
127                    3, 1, 16, 1, 6, '&', '#', 'x', ';',  // XML
128                    2, 1, 10, 1, 7, '&', '#', ';',       // XML10
129                    3, 1, 16, 1, 6, '\\', 'x', '{', '}', // Perl
130                    END
131                });
132            }
133        });
134    }
135
136    /**
137     * Package private constructor.  Takes the encoded spec array.
138     */
139    UnescapeTransliterator(String ID, char spec[]) {
140        super(ID, null);
141        this.spec = spec;
142    }
143
144    /**
145     * Implements {@link Transliterator#handleTransliterate}.
146     */
147    @Override
148    protected void handleTransliterate(Replaceable text,
149                                       Position pos, boolean isIncremental) {
150        int start = pos.start;
151        int limit = pos.limit;
152        int i, ipat;
153
154      loop:
155        while (start < limit) {
156            // Loop over the forms in spec[].  Exit this loop when we
157            // match one of the specs.  Exit the outer loop if a
158            // partial match is detected and isIncremental is true.
159            for (ipat = 0; spec[ipat] != END;) {
160
161                // Read the header
162                int prefixLen = spec[ipat++];
163                int suffixLen = spec[ipat++];
164                int radix     = spec[ipat++];
165                int minDigits = spec[ipat++];
166                int maxDigits = spec[ipat++];
167
168                // s is a copy of start that is advanced over the
169                // characters as we parse them.
170                int s = start;
171                boolean match = true;
172
173                for (i=0; i<prefixLen; ++i) {
174                    if (s >= limit) {
175                        if (i > 0) {
176                            // We've already matched a character.  This is
177                            // a partial match, so we return if in
178                            // incremental mode.  In non-incremental mode,
179                            // go to the next spec.
180                            if (isIncremental) {
181                                break loop;
182                            }
183                            match = false;
184                            break;
185                        }
186                    }
187                    char c = text.charAt(s++);
188                    if (c != spec[ipat + i]) {
189                        match = false;
190                        break;
191                    }
192                }
193
194                if (match) {
195                    int u = 0;
196                    int digitCount = 0;
197                    for (;;) {
198                        if (s >= limit) {
199                            // Check for partial match in incremental mode.
200                            if (s > start && isIncremental) {
201                                break loop;
202                            }
203                            break;
204                        }
205                        int ch = text.char32At(s);
206                        int digit = UCharacter.digit(ch, radix);
207                        if (digit < 0) {
208                            break;
209                        }
210                        s += UTF16.getCharCount(ch);
211                        u = (u * radix) + digit;
212                        if (++digitCount == maxDigits) {
213                            break;
214                        }
215                    }
216
217                    match = (digitCount >= minDigits);
218
219                    if (match) {
220                        for (i=0; i<suffixLen; ++i) {
221                            if (s >= limit) {
222                                // Check for partial match in incremental mode.
223                                if (s > start && isIncremental) {
224                                    break loop;
225                                }
226                                match = false;
227                                break;
228                            }
229                            char c = text.charAt(s++);
230                            if (c != spec[ipat + prefixLen + i]) {
231                                match = false;
232                                break;
233                            }
234                        }
235
236                        if (match) {
237                            // At this point, we have a match
238                            String str = UTF16.valueOf(u);
239                            text.replace(start, s, str);
240                            limit -= s - start - str.length();
241                            // The following break statement leaves the
242                            // loop that is traversing the forms in
243                            // spec[].  We then parse the next input
244                            // character.
245                            break;
246                        }
247                    }
248                }
249
250                ipat += prefixLen + suffixLen;
251            }
252
253            if (start < limit) {
254                start += UTF16.getCharCount(text.char32At(start));
255            }
256        }
257
258        pos.contextLimit += limit - pos.limit;
259        pos.limit = limit;
260        pos.start = start;
261    }
262
263    /* (non-Javadoc)
264     * @see com.ibm.icu.text.Transliterator#addSourceTargetSet(com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet, com.ibm.icu.text.UnicodeSet)
265     */
266    @Override
267    public void addSourceTargetSet(UnicodeSet inputFilter, UnicodeSet sourceSet, UnicodeSet targetSet) {
268        // Each form consists of a prefix, suffix,
269        // * radix, minimum digit count, and maximum digit count.  These
270        // * values are stored as a five character header. ...
271        UnicodeSet myFilter = getFilterAsUnicodeSet(inputFilter);
272        UnicodeSet items = new UnicodeSet();
273        StringBuilder buffer = new StringBuilder();
274        for (int i = 0; spec[i] != END;) {
275            // first 5 items are header
276            int end = i + spec[i] + spec[i+1] + 5;
277            int radix = spec[i+2];
278            for (int j = 0; j < radix; ++j) {
279                Utility.appendNumber(buffer, j, radix, 0);
280            }
281            // then add the characters
282            for (int j = i + 5; j < end; ++j) {
283                items.add(spec[j]);
284            }
285            // and go to next block
286            i = end;
287        }
288        items.addAll(buffer.toString());
289        items.retainAll(myFilter);
290
291        if (items.size() > 0) {
292            sourceSet.addAll(items);
293            targetSet.addAll(0,0x10FFFF); // assume we can produce any character
294        }
295    }
296}
297