PrettyPrinter.java revision 7935b1839a081ed19ae0d33029ad3c09632a2caa
1/**
2 *******************************************************************************
3 * Copyright (C) 1996-2012, International Business Machines Corporation and    *
4 * others. All Rights Reserved.                                                *
5 **********************************************************************
6 * Author: Mark Davis
7 **********************************************************************
8 */
9
10package com.ibm.icu.dev.util;
11
12import java.io.IOException;
13import java.text.FieldPosition;
14import java.util.Comparator;
15import java.util.TreeSet;
16
17import com.ibm.icu.impl.Utility;
18import com.ibm.icu.lang.UCharacter;
19import com.ibm.icu.text.StringTransform;
20import com.ibm.icu.text.UTF16;
21import com.ibm.icu.text.UTF16.StringComparator;
22import com.ibm.icu.text.UnicodeSet;
23import com.ibm.icu.text.UnicodeSetIterator;
24
25/** Provides more flexible formatting of UnicodeSet patterns.
26 */
27public class PrettyPrinter {
28    private static final StringComparator CODEPOINT_ORDER = new UTF16.StringComparator(true,false,0);
29    private static final UnicodeSet PATTERN_WHITESPACE = (UnicodeSet) new UnicodeSet("[[:Cn:][:Default_Ignorable_Code_Point:][:patternwhitespace:]]").freeze();
30    private static final UnicodeSet SORT_AT_END = (UnicodeSet) new UnicodeSet("[[:Cn:][:Cs:][:Co:][:Ideographic:]]").freeze();
31    private static final UnicodeSet QUOTED_SYNTAX = (UnicodeSet) new UnicodeSet("[\\[\\]\\-\\^\\&\\\\\\{\\}\\$\\:]").addAll(PATTERN_WHITESPACE).freeze();
32
33    private boolean first = true;
34    private StringBuffer target = new StringBuffer();
35    private int firstCodePoint = -2;
36    private int lastCodePoint = -2;
37    private boolean compressRanges = true;
38    private String lastString = "";
39    private UnicodeSet toQuote = new UnicodeSet(PATTERN_WHITESPACE);
40    private StringTransform quoter = null;
41
42    private Comparator<String> ordering;
43    private Comparator<String> spaceComp;
44
45    public PrettyPrinter() {
46    }
47
48    public StringTransform getQuoter() {
49        return quoter;
50    }
51
52    public PrettyPrinter setQuoter(StringTransform quoter) {
53        this.quoter = quoter;
54        return this; // for chaining
55    }
56
57    public boolean isCompressRanges() {
58        return compressRanges;
59    }
60
61    /**
62     * @param compressRanges if you want abcde instead of a-e, make this false
63     * @return
64     */
65    public PrettyPrinter setCompressRanges(boolean compressRanges) {
66        this.compressRanges = compressRanges;
67        return this;
68    }
69
70    public Comparator<String> getOrdering() {
71        return ordering;
72    }
73
74    /**
75     * @param ordering the resulting  ordering of the list of characters in the pattern
76     * @return
77     */
78    public PrettyPrinter setOrdering(Comparator ordering) {
79        this.ordering = ordering == null ? CODEPOINT_ORDER : new com.ibm.icu.impl.MultiComparator<String>(ordering, CODEPOINT_ORDER);
80        return this;
81    }
82
83    public Comparator<String> getSpaceComparator() {
84        return spaceComp;
85    }
86
87    /**
88     * @param spaceComp if the comparison returns non-zero, then a space will be inserted between characters
89     * @return this, for chaining
90     */
91    public PrettyPrinter setSpaceComparator(Comparator spaceComp) {
92        this.spaceComp = spaceComp;
93        return this;
94    }
95
96    public UnicodeSet getToQuote() {
97        return toQuote;
98    }
99
100    /**
101     * a UnicodeSet of extra characters to quote with \\uXXXX-style escaping (will automatically quote pattern whitespace)
102     * @param toQuote
103     */
104    public PrettyPrinter setToQuote(UnicodeSet toQuote) {
105        if (toQuote != null) {
106            toQuote = (UnicodeSet)toQuote.cloneAsThawed();
107            toQuote.addAll(PATTERN_WHITESPACE);
108            this.toQuote = toQuote;
109        }
110        return this;
111    }
112
113
114    /**
115     * Get the pattern for a particular set.
116     * @param uset
117     * @return formatted UnicodeSet
118     */
119    public String format(UnicodeSet uset) {
120        first = true;
121        UnicodeSet putAtEnd = new UnicodeSet(uset).retainAll(SORT_AT_END); // remove all the unassigned gorp for now
122        // make sure that comparison separates all strings, even canonically equivalent ones
123        TreeSet<String> orderedStrings = new TreeSet<String>(ordering);
124        for (UnicodeSetIterator it = new UnicodeSetIterator(uset); it.nextRange();) {
125            if (it.codepoint == UnicodeSetIterator.IS_STRING) {
126                orderedStrings.add(it.string);
127            } else {
128                for (int i = it.codepoint; i <= it.codepointEnd; ++i) {
129                    if (!putAtEnd.contains(i)) {
130                        orderedStrings.add(UTF16.valueOf(i));
131                    }
132                }
133            }
134        }
135        target.setLength(0);
136        target.append("[");
137        for (String item : orderedStrings) {
138            appendUnicodeSetItem(item);
139        }
140        for (UnicodeSetIterator it = new UnicodeSetIterator(putAtEnd); it.next();) { // add back the unassigned gorp
141            appendUnicodeSetItem(it.codepoint); // we know that these are only codepoints, not strings, so this is safe
142        }
143        flushLast();
144        target.append("]");
145        String sresult = target.toString();
146
147        // double check the results. This can be removed once we have more tests.
148        //        try {
149        //            UnicodeSet  doubleCheck = new UnicodeSet(sresult);
150        //            if (!uset.equals(doubleCheck)) {
151        //                throw new IllegalStateException("Failure to round-trip in pretty-print " + uset + " => " + sresult + Utility.LINE_SEPARATOR + " source-result: " + new UnicodeSet(uset).removeAll(doubleCheck) +  Utility.LINE_SEPARATOR + " result-source: " + new UnicodeSet(doubleCheck).removeAll(uset));
152        //            }
153        //        } catch (RuntimeException e) {
154        //            throw (RuntimeException) new IllegalStateException("Failure to round-trip in pretty-print " + uset).initCause(e);
155        //        }
156        return sresult;
157    }
158
159    private PrettyPrinter appendUnicodeSetItem(String s) {
160        if (UTF16.hasMoreCodePointsThan(s, 1)) {
161            flushLast();
162            addSpaceAsNeededBefore(s);
163            appendQuoted(s);
164            lastString = s;
165        } else {
166            appendUnicodeSetItem(UTF16.charAt(s, 0));
167        }
168        return this;
169    }
170
171    private void appendUnicodeSetItem(int cp) {
172        if (!compressRanges)
173            flushLast();
174        if (cp == lastCodePoint + 1) {
175            lastCodePoint = cp; // continue range
176        } else { // start range
177            flushLast();
178            firstCodePoint = lastCodePoint = cp;
179        }
180    }
181    /**
182     *
183     */
184    private void addSpaceAsNeededBefore(String s) {
185        if (first) {
186            first = false;
187        } else if (spaceComp != null && spaceComp.compare(s, lastString) != 0) {
188            target.append(' ');
189        } else {
190            int cp = UTF16.charAt(s,0);
191            if (!toQuote.contains(cp) && !QUOTED_SYNTAX.contains(cp)) {
192                int type = UCharacter.getType(cp);
193                if (type == UCharacter.NON_SPACING_MARK || type == UCharacter.ENCLOSING_MARK) {
194                    target.append(' ');
195                } else if (type == UCharacter.SURROGATE && cp >= UTF16.TRAIL_SURROGATE_MIN_VALUE) {
196                    target.append(' '); // make sure we don't accidentally merge two surrogates
197                }
198            }
199        }
200    }
201
202    private void addSpaceAsNeededBefore(int codepoint) {
203        addSpaceAsNeededBefore(UTF16.valueOf(codepoint));
204    }
205
206    private void flushLast() {
207        if (lastCodePoint >= 0) {
208            addSpaceAsNeededBefore(firstCodePoint);
209            if (firstCodePoint != lastCodePoint) {
210                appendQuoted(firstCodePoint);
211                if (firstCodePoint + 1 != lastCodePoint) {
212                    target.append('-');
213                } else {
214                    addSpaceAsNeededBefore(lastCodePoint);
215                }
216            }
217            appendQuoted(lastCodePoint);
218            lastString = UTF16.valueOf(lastCodePoint);
219            firstCodePoint = lastCodePoint = -2;
220        }
221    }
222
223
224    private void appendQuoted(String s) {
225        if (toQuote.containsSome(s) && quoter != null) {
226            target.append(quoter.transform(s));
227        } else {
228            int cp;
229            target.append("{");
230            for (int i = 0; i < s.length(); i += UTF16.getCharCount(cp)) {
231                appendQuoted(cp = UTF16.charAt(s, i));
232            }
233            target.append("}");
234        }
235    }
236
237    PrettyPrinter appendQuoted(int codePoint) {
238        if (toQuote.contains(codePoint)) {
239            if (quoter != null) {
240                target.append(quoter.transform(UTF16.valueOf(codePoint)));
241                return this;
242            }
243            if (codePoint > 0xFFFF) {
244                target.append("\\U");
245                target.append(Utility.hex(codePoint,8));
246            } else {
247                target.append("\\u");
248                target.append(Utility.hex(codePoint,4));
249            }
250            return this;
251        }
252        switch (codePoint) {
253        case '[': // SET_OPEN:
254        case ']': // SET_CLOSE:
255        case '-': // HYPHEN:
256        case '^': // COMPLEMENT:
257        case '&': // INTERSECTION:
258        case '\\': //BACKSLASH:
259        case '{':
260        case '}':
261        case '$':
262        case ':':
263            target.append('\\');
264            break;
265        default:
266            // Escape whitespace
267            if (PATTERN_WHITESPACE.contains(codePoint)) {
268                target.append('\\');
269            }
270            break;
271        }
272        UTF16.append(target, codePoint);
273        return this;
274    }
275    //  Appender append(String s) {
276    //  target.append(s);
277    //  return this;
278    //  }
279    //  public String toString() {
280    //  return target.toString();
281    //  }
282
283    public Appendable format(UnicodeSet obj, Appendable toAppendTo, FieldPosition pos) {
284        try {
285            return toAppendTo.append(format(obj));
286        } catch (IOException e) {
287            throw new IllegalArgumentException(e);
288        }
289    }
290}
291