PluralSamples.java revision 7935b1839a081ed19ae0d33029ad3c09632a2caa
19d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish/*
29d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish *******************************************************************************
39d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish * Copyright (C) 2013-2014, International Business Machines Corporation and    *
4e07896813c99dccd0e27e837fc74b02ef00b7360Jonathan Gerrish * others. All Rights Reserved.                                                *
5e07896813c99dccd0e27e837fc74b02ef00b7360Jonathan Gerrish *******************************************************************************
69d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish */
79d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrishpackage com.ibm.icu.text;
8e07896813c99dccd0e27e837fc74b02ef00b7360Jonathan Gerrish
99d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrishimport java.util.ArrayList;
109d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrishimport java.util.Collection;
119d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrishimport java.util.Collections;
129d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrishimport java.util.Comparator;
139d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrishimport java.util.HashMap;
149d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrishimport java.util.HashSet;
159d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrishimport java.util.LinkedHashSet;
169d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrishimport java.util.List;
179d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrishimport java.util.Map;
18e07896813c99dccd0e27e837fc74b02ef00b7360Jonathan Gerrishimport java.util.Map.Entry;
19e07896813c99dccd0e27e837fc74b02ef00b7360Jonathan Gerrishimport java.util.Set;
209d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrishimport java.util.TreeSet;
219d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish
229d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrishimport com.ibm.icu.text.PluralRules.FixedDecimal;
239d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrishimport com.ibm.icu.text.PluralRules.KeywordStatus;
249d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrishimport com.ibm.icu.text.PluralRules.StandardPluralCategories;
259d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrishimport com.ibm.icu.util.Output;
269d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish
279d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish/**
289d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish * @author markdavis
299d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish * Refactor samples as first step to moving into CLDR
309d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish *
319d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish * @internal
329d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish * @deprecated This API is ICU internal only.
339d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish */
349d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish@Deprecated
359d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrishpublic class PluralSamples {
369d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish
379d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish    private PluralRules pluralRules;
389d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish    private final Map<String, List<Double>> _keySamplesMap;
399d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish
409d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish    /**
419d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish     * @internal
429d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish     * @deprecated This API is ICU internal only.
439d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish     */
449d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish    @Deprecated
459d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish    public final Map<String, Boolean> _keyLimitedMap;
469d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish    private final Map<String, Set<FixedDecimal>> _keyFractionSamplesMap;
479d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish    private final Set<FixedDecimal> _fractionSamples;
489d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish
499d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish    /**
509d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish     * @internal
519d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish     * @deprecated This API is ICU internal only.
529d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish     */
539d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish    @Deprecated
549d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish    public PluralSamples(PluralRules pluralRules) {
559d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish        this.pluralRules = pluralRules;
569d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish        Set<String> keywords = pluralRules.getKeywords();
57e07896813c99dccd0e27e837fc74b02ef00b7360Jonathan Gerrish        // ensure both _keySamplesMap and _keyLimitedMap are initialized.
58e07896813c99dccd0e27e837fc74b02ef00b7360Jonathan Gerrish        // If this were allowed to vary on a per-call basis, we'd have to recheck and
59e07896813c99dccd0e27e837fc74b02ef00b7360Jonathan Gerrish        // possibly rebuild the samples cache.  Doesn't seem worth it.
60e07896813c99dccd0e27e837fc74b02ef00b7360Jonathan Gerrish        // This 'max samples' value only applies to keywords that are unlimited, for
619d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish        // other keywords all the matching values are returned.  This might be a lot.
629d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish        final int MAX_SAMPLES = 3;
639d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish
649d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish        Map<String, Boolean> temp = new HashMap<String, Boolean>();
659d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish        for (String k : keywords) {
669d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish            temp.put(k, pluralRules.isLimited(k));
679d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish        }
689d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish        _keyLimitedMap = temp;
699d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish
709d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish        Map<String, List<Double>> sampleMap = new HashMap<String, List<Double>>();
719d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish        int keywordsRemaining = keywords.size();
729d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish
739d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish        int limit = 128; // Math.max(5, getRepeatLimit() * MAX_SAMPLES) * 2;
749d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish
759d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish        for (int i = 0; keywordsRemaining > 0 && i < limit; ++i) {
769d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish            keywordsRemaining = addSimpleSamples(pluralRules, MAX_SAMPLES, sampleMap, keywordsRemaining, i / 2.0);
779d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish        }
789d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish        // Hack for Celtic
799d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish        keywordsRemaining = addSimpleSamples(pluralRules, MAX_SAMPLES, sampleMap, keywordsRemaining, 1000000);
809d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish
819d68c9a82b0c22bb716270222ec8174a1da48eafJonathan Gerrish
82        // collect explicit samples
83        Map<String, Set<FixedDecimal>> sampleFractionMap = new HashMap<String, Set<FixedDecimal>>();
84        Set<FixedDecimal> mentioned = new TreeSet<FixedDecimal>();
85        // make sure that there is at least one 'other' value
86        Map<String, Set<FixedDecimal>> foundKeywords = new HashMap<String, Set<FixedDecimal>>();
87        for (FixedDecimal s : mentioned) {
88            String keyword = pluralRules.select(s);
89            addRelation(foundKeywords, keyword, s);
90        }
91        main:
92            if (foundKeywords.size() != keywords.size()) {
93                for (int i = 1; i < 1000; ++i) {
94                    boolean done = addIfNotPresent(i, mentioned, foundKeywords);
95                    if (done) break main;
96                }
97                // if we are not done, try tenths
98                for (int i = 10; i < 1000; ++i) {
99                    boolean done = addIfNotPresent(i/10d, mentioned, foundKeywords);
100                    if (done) break main;
101                }
102                System.out.println("Failed to find sample for each keyword: " + foundKeywords + "\n\t" + pluralRules + "\n\t" + mentioned);
103            }
104        mentioned.add(new FixedDecimal(0)); // always there
105        mentioned.add(new FixedDecimal(1)); // always there
106        mentioned.add(new FixedDecimal(2)); // always there
107        mentioned.add(new FixedDecimal(0.1,1)); // always there
108        mentioned.add(new FixedDecimal(1.99,2)); // always there
109        mentioned.addAll(fractions(mentioned));
110        for (FixedDecimal s : mentioned) {
111            String keyword = pluralRules.select(s);
112            Set<FixedDecimal> list = sampleFractionMap.get(keyword);
113            if (list == null) {
114                list = new LinkedHashSet<FixedDecimal>(); // will be sorted because the iteration is
115                sampleFractionMap.put(keyword, list);
116            }
117            list.add(s);
118        }
119
120        if (keywordsRemaining > 0) {
121            for (String k : keywords) {
122                if (!sampleMap.containsKey(k)) {
123                    sampleMap.put(k, Collections.<Double>emptyList());
124                }
125                if (!sampleFractionMap.containsKey(k)) {
126                    sampleFractionMap.put(k, Collections.<FixedDecimal>emptySet());
127                }
128            }
129        }
130
131        // Make lists immutable so we can return them directly
132        for (Entry<String, List<Double>> entry : sampleMap.entrySet()) {
133            sampleMap.put(entry.getKey(), Collections.unmodifiableList(entry.getValue()));
134        }
135        for (Entry<String, Set<FixedDecimal>> entry : sampleFractionMap.entrySet()) {
136            sampleFractionMap.put(entry.getKey(), Collections.unmodifiableSet(entry.getValue()));
137        }
138        _keySamplesMap = sampleMap;
139        _keyFractionSamplesMap = sampleFractionMap;
140        _fractionSamples = Collections.unmodifiableSet(mentioned);
141    }
142
143    private int addSimpleSamples(PluralRules pluralRules, final int MAX_SAMPLES, Map<String, List<Double>> sampleMap,
144            int keywordsRemaining, double val) {
145        String keyword = pluralRules.select(val);
146        boolean keyIsLimited = _keyLimitedMap.get(keyword);
147
148        List<Double> list = sampleMap.get(keyword);
149        if (list == null) {
150            list = new ArrayList<Double>(MAX_SAMPLES);
151            sampleMap.put(keyword, list);
152        } else if (!keyIsLimited && list.size() == MAX_SAMPLES) {
153            return keywordsRemaining;
154        }
155        list.add(Double.valueOf(val));
156
157        if (!keyIsLimited && list.size() == MAX_SAMPLES) {
158            --keywordsRemaining;
159        }
160        return keywordsRemaining;
161    }
162
163    private void addRelation(Map<String, Set<FixedDecimal>> foundKeywords, String keyword, FixedDecimal s) {
164        Set<FixedDecimal> set = foundKeywords.get(keyword);
165        if (set == null) {
166            foundKeywords.put(keyword, set = new HashSet<FixedDecimal>());
167        }
168        set.add(s);
169    }
170
171    private boolean addIfNotPresent(double d, Set<FixedDecimal> mentioned, Map<String, Set<FixedDecimal>> foundKeywords) {
172        FixedDecimal numberInfo = new FixedDecimal(d);
173        String keyword = pluralRules.select(numberInfo);
174        if (!foundKeywords.containsKey(keyword) || keyword.equals("other")) {
175            addRelation(foundKeywords, keyword, numberInfo);
176            mentioned.add(numberInfo);
177            if (keyword.equals("other")) {
178                if (foundKeywords.get("other").size() > 1) {
179                    return true;
180                }
181            }
182        }
183        return false;
184    }
185
186    private static final int[] TENS = {1, 10, 100, 1000, 10000, 100000, 1000000};
187
188    private static final int LIMIT_FRACTION_SAMPLES = 3;
189
190
191    private Set<FixedDecimal> fractions(Set<FixedDecimal> original) {
192        Set<FixedDecimal> toAddTo = new HashSet<FixedDecimal>();
193
194        Set<Integer> result = new HashSet<Integer>();
195        for (FixedDecimal base1 : original) {
196            result.add((int)base1.integerValue);
197        }
198        List<Integer> ints = new ArrayList<Integer>(result);
199        Set<String> keywords = new HashSet<String>();
200
201        for (int j = 0; j < ints.size(); ++j) {
202            Integer base = ints.get(j);
203            String keyword = pluralRules.select(base);
204            if (keywords.contains(keyword)) {
205                continue;
206            }
207            keywords.add(keyword);
208            toAddTo.add(new FixedDecimal(base,1)); // add .0
209            toAddTo.add(new FixedDecimal(base,2)); // add .00
210            Integer fract = getDifferentCategory(ints, keyword);
211            if (fract >= TENS[LIMIT_FRACTION_SAMPLES-1]) { // make sure that we always get the value
212                toAddTo.add(new FixedDecimal(base + "." + fract));
213            } else {
214                for (int visibleFractions = 1; visibleFractions < LIMIT_FRACTION_SAMPLES; ++visibleFractions) {
215                    for (int i = 1; i <= visibleFractions; ++i) {
216                        // with visible fractions = 3, and fract = 1, then we should get x.10, 0.01
217                        // with visible fractions = 3, and fract = 15, then we should get x.15, x.15
218                        if (fract >= TENS[i]) {
219                            continue;
220                        }
221                        toAddTo.add(new FixedDecimal(base + fract/(double)TENS[i], visibleFractions));
222                    }
223                }
224            }
225        }
226        return toAddTo;
227    }
228
229    private Integer getDifferentCategory(List<Integer> ints, String keyword) {
230        for (int i = ints.size() - 1; i >= 0; --i) {
231            Integer other = ints.get(i);
232            String keywordOther = pluralRules.select(other);
233            if (!keywordOther.equals(keyword)) {
234                return other;
235            }
236        }
237        return 37;
238    }
239
240    @SuppressWarnings("unused")
241    private static final Comparator<String> KEYWORD_COMPARATOR = new Comparator<String> () {
242        public int compare(String arg0, String arg1) {
243            StandardPluralCategories a = StandardPluralCategories.forString(arg0);
244            StandardPluralCategories b = StandardPluralCategories.forString(arg1);
245            return a == null
246                    ? (b == null ? arg0.compareTo(arg1) : -1)
247                            : (b == null ? 1 : a.compareTo(b));
248        }
249    };
250
251    /**
252     * @internal
253     * @deprecated This API is ICU internal only.
254     */
255    @Deprecated
256    public KeywordStatus getStatus(String keyword, int offset, Set<Double> explicits, Output<Double> uniqueValue) {
257        if (uniqueValue != null) {
258            uniqueValue.value = null;
259        }
260
261        if (!pluralRules.getKeywords().contains(keyword)) {
262            return KeywordStatus.INVALID;
263        }
264        Collection<Double> values = pluralRules.getAllKeywordValues(keyword);
265        if (values == null) {
266            return KeywordStatus.UNBOUNDED;
267        }
268        int originalSize = values.size();
269
270        if (explicits == null) {
271            explicits = Collections.emptySet();
272        }
273
274        // Quick check on whether there are multiple elements
275
276        if (originalSize > explicits.size()) {
277            if (originalSize == 1) {
278                if (uniqueValue != null) {
279                    uniqueValue.value = values.iterator().next();
280                }
281                return KeywordStatus.UNIQUE;
282            }
283            return KeywordStatus.BOUNDED;
284        }
285
286        // Compute if the quick test is insufficient.
287
288        HashSet<Double> subtractedSet = new HashSet<Double>(values);
289        for (Double explicit : explicits) {
290            subtractedSet.remove(explicit - offset);
291        }
292        if (subtractedSet.size() == 0) {
293            return KeywordStatus.SUPPRESSED;
294        }
295
296        if (uniqueValue != null && subtractedSet.size() == 1) {
297            uniqueValue.value = subtractedSet.iterator().next();
298        }
299
300        return originalSize == 1 ? KeywordStatus.UNIQUE : KeywordStatus.BOUNDED;
301    }
302
303    Map<String, List<Double>> getKeySamplesMap() {
304        return _keySamplesMap;
305    }
306
307    Map<String, Set<FixedDecimal>> getKeyFractionSamplesMap() {
308        return _keyFractionSamplesMap;
309    }
310
311    Set<FixedDecimal> getFractionSamples() {
312        return _fractionSamples;
313    }
314
315    /**
316     * Returns all the values that trigger this keyword, or null if the number of such
317     * values is unlimited.
318     *
319     * @param keyword the keyword
320     * @return the values that trigger this keyword, or null.  The returned collection
321     * is immutable. It will be empty if the keyword is not defined.
322     * @stable ICU 4.8
323     */
324
325    Collection<Double> getAllKeywordValues(String keyword) {
326        // HACK for now
327        if (!pluralRules.getKeywords().contains(keyword)) {
328            return Collections.<Double>emptyList();
329        }
330        Collection<Double> result = getKeySamplesMap().get(keyword);
331
332        // We depend on MAX_SAMPLES here.  It's possible for a conjunction
333        // of unlimited rules that 'looks' unlimited to return a limited
334        // number of values.  There's no bounds to this limited number, in
335        // general, because you can construct arbitrarily complex rules.  Since
336        // we always generate 3 samples if a rule is really unlimited, that's
337        // where we put the cutoff.
338        if (result.size() > 2 && !_keyLimitedMap.get(keyword)) {
339            return null;
340        }
341        return result;
342    }
343}
344