1/*
2 * Copyright (C) 2008-2012  OMRON SOFTWARE Co., Ltd.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 *      http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17package jp.co.omronsoft.openwnn.JAJP;
18
19import jp.co.omronsoft.openwnn.*;
20import java.util.*;
21
22/**
23 * The penWnn Clause Converter class for Japanese IME.
24 *
25 * @author Copyright (C) 2009 OMRON SOFTWARE CO., LTD.  All Rights Reserved.
26 */
27public class OpenWnnClauseConverterJAJP {
28    /** Score(frequency value) of word in the learning dictionary */
29    private static final int FREQ_LEARN = 600;
30    /** Score(frequency value) of word in the user dictionary */
31    private static final int FREQ_USER  = 500;
32
33    /** Maximum limit length of input */
34    public static final int MAX_INPUT_LENGTH = 50;
35
36    /** search cache for unique independent words (jiritsugo) */
37    private HashMap<String, ArrayList<WnnWord>> mIndepWordBag;
38    /** search cache for all independent words (jiritsugo) */
39    private HashMap<String, ArrayList<WnnWord>> mAllIndepWordBag;
40    /** search cache for ancillary words (fuzokugo) */
41    private HashMap<String, ArrayList<WnnWord>> mFzkPatterns;
42
43    /** connect matrix for generating a clause */
44    private byte[][] mConnectMatrix;
45
46    /** dictionaries */
47    private WnnDictionary mDictionary;
48
49    /** candidates of conversion */
50    private LinkedList mConvertResult;
51
52    /** work area for consecutive clause conversion */
53    private WnnSentence[] mSentenceBuffer;
54
55    /** part of speech (default) */
56    private WnnPOS mPosDefault;
57    /** part of speech (end of clause/not end of sentence) */
58    private WnnPOS mPosEndOfClause1;
59    /** part of speech (end of clause/any place) */
60    private WnnPOS mPosEndOfClause2;
61    /** part of speech (end of sentence) */
62    private WnnPOS mPosEndOfClause3;
63
64    /** cost value of a clause */
65    private static final int CLAUSE_COST = -1000;
66
67    /** The candidate filter */
68    private CandidateFilter mFilter = null;
69
70    /**
71     * Constructor
72     */
73    public OpenWnnClauseConverterJAJP() {
74        mIndepWordBag  = new HashMap<String, ArrayList<WnnWord>>();
75        mAllIndepWordBag  = new HashMap<String, ArrayList<WnnWord>>();
76        mFzkPatterns   = new HashMap();
77        mConvertResult = new LinkedList();
78
79        mSentenceBuffer = new WnnSentence[MAX_INPUT_LENGTH];
80    }
81
82    /**
83     * Set the dictionary
84     *
85     * @param dict  The dictionary for phrase conversion
86     */
87    public void setDictionary(WnnDictionary dict) {
88        /* get connect matrix */
89        mConnectMatrix = dict.getConnectMatrix();
90
91        /* clear dictionary settings */
92        mDictionary = dict;
93        dict.clearDictionary();
94        dict.clearApproxPattern();
95
96        /* clear work areas */
97        mIndepWordBag.clear();
98        mAllIndepWordBag.clear();
99        mFzkPatterns.clear();
100
101        /* get part of speech tags */
102        mPosDefault      = dict.getPOS(WnnDictionary.POS_TYPE_MEISI);
103        mPosEndOfClause1 = dict.getPOS(WnnDictionary.POS_TYPE_V1);
104        mPosEndOfClause2 = dict.getPOS(WnnDictionary.POS_TYPE_V2);
105        mPosEndOfClause3 = dict.getPOS(WnnDictionary.POS_TYPE_V3);
106    }
107
108    /**
109     * Set the candidate filter
110     *
111     * @param filter	The candidate filter
112     */
113    public void setFilter(CandidateFilter filter) {
114    	mFilter = filter;
115    }
116
117    /**
118     * Kana-to-Kanji conversion (single clause).
119     * <br>
120     * This method execute single clause conversion.
121      *
122     * @param input		The input string
123     * @return			The candidates of conversion; {@code null} if an error occurs.
124     */
125     public Iterator convert(String input) {
126        /* do nothing if no dictionary is specified */
127        if (mConnectMatrix == null || mDictionary == null) {
128            return null;
129        }
130        /* do nothing if the length of input exceeds the limit */
131        if (input.length() > MAX_INPUT_LENGTH) {
132            return null;
133        }
134
135        /* clear the candidates list */
136        mConvertResult.clear();
137
138        /* try single clause conversion */
139        if (!singleClauseConvert(mConvertResult, input, mPosEndOfClause2, true)) {
140            return null;
141        }
142        return mConvertResult.iterator();
143    }
144
145    /**
146     * Consecutive clause conversion.
147     *
148     * @param input		The input string
149     * @return			The result of consecutive clause conversion; {@code null} if fail.
150     */
151    public WnnSentence consecutiveClauseConvert(String input) {
152        LinkedList clauses = new LinkedList();
153
154        /* clear the cache which is not matched */
155        for (int i = 0; i < input.length(); i++) {
156            mSentenceBuffer[i] = null;
157        }
158        WnnSentence[] sentence = mSentenceBuffer;
159
160        /* consecutive clause conversion */
161        for (int start = 0; start < input.length(); start++) {
162            if (start != 0 && sentence[start-1] == null) {
163                continue;
164            }
165
166            /* limit the length of a clause */
167            int end = input.length();
168            if (end > start + 20) {
169                end = start + 20;
170            }
171            /* make clauses */
172            for ( ; end > start; end--) {
173                int idx = end - 1;
174
175                /* cutting a branch */
176                if (sentence[idx] != null) {
177                    if (start != 0) {
178                        if (sentence[idx].frequency > sentence[start-1].frequency + CLAUSE_COST + FREQ_LEARN) {
179                            /* there may be no way to be the best sequence from the 'start' */
180                            break;
181                        }
182                    } else {
183                        if (sentence[idx].frequency > CLAUSE_COST + FREQ_LEARN) {
184                            /* there may be no way to be the best sequence from the 'start' */
185                            break;
186                        }
187                    }
188                }
189
190                String key = input.substring(start, end);
191                clauses.clear();
192                WnnClause bestClause = null;
193                if (end == input.length()) {
194                    /* get the clause which can be the end of the sentence */
195                    singleClauseConvert(clauses, key, mPosEndOfClause1, false);
196                } else {
197                    /* get the clause which is not the end of the sentence */
198                    singleClauseConvert(clauses, key, mPosEndOfClause3, false);
199                }
200                if (clauses.isEmpty()) {
201                    bestClause = defaultClause(key);
202                } else {
203                    bestClause = (WnnClause)clauses.get(0);
204                }
205
206                /* make a sub-sentence */
207                WnnSentence ws;
208                if (start == 0) {
209                    ws = new WnnSentence(key, bestClause);
210                } else {
211                    ws = new WnnSentence(sentence[start-1], bestClause);
212                }
213                ws.frequency += CLAUSE_COST;
214
215                /* update the best sub-sentence on the cache buffer */
216                if (sentence[idx] == null || (sentence[idx].frequency < ws.frequency)) {
217                    sentence[idx] = ws;
218                }
219            }
220        }
221
222        /* return the result of the consecutive clause conversion */
223        if (sentence[input.length() - 1] != null) {
224            return sentence[input.length() - 1];
225        }
226        return null;
227    }
228
229    /**
230     * Consecutive clause conversion.
231     *
232     * @param resultList	Where to store the result
233     * @param input			Input string
234     * @return				{@code true} if success; {@code false} if fail.
235     */
236    private boolean consecutiveClauseConvert(LinkedList resultList, String input) {
237        WnnSentence sentence = consecutiveClauseConvert(input);
238
239        /* set the result of the consecutive clause conversion on the top of the list */
240        if (sentence != null) {
241            resultList.add(0, sentence);
242            return true;
243        }
244        return false;
245    }
246
247    /**
248     * Single clause conversion.
249     *
250     * @param clauseList	Where to store the results
251     * @param input			Input string
252     * @param terminal		Part of speech tag at the terminal
253     * @param all			Get all candidates or not
254     * @return				{@code true} if success; {@code false} if fail.
255     */
256    private boolean singleClauseConvert(LinkedList clauseList, String input, WnnPOS terminal, boolean all) {
257        boolean ret = false;
258
259        /* get clauses without ancillary word */
260        ArrayList<WnnWord> stems = getIndependentWords(input, all);
261        if (stems != null && (!stems.isEmpty())) {
262            Iterator<WnnWord> stemsi = stems.iterator();
263            while (stemsi.hasNext()) {
264                WnnWord stem = stemsi.next();
265                if (addClause(clauseList, input, stem, null, terminal, all)) {
266                    ret = true;
267                }
268            }
269        }
270
271        /* get clauses with ancillary word */
272        int max = CLAUSE_COST * 2;
273        for (int split = 1; split < input.length(); split++) {
274            /* get ancillary patterns */
275            String str = input.substring(split);
276            ArrayList<WnnWord> fzks = getAncillaryPattern(str);
277            if (fzks == null || fzks.isEmpty()) {
278                continue;
279            }
280
281            /* get candidates of stem in a clause */
282            str = input.substring(0, split);
283            stems = getIndependentWords(str, all);
284            if (stems == null || stems.isEmpty()) {
285                if (mDictionary.searchWord(WnnDictionary.SEARCH_PREFIX, WnnDictionary.ORDER_BY_FREQUENCY, str) <= 0) {
286                    break;
287                } else {
288                    continue;
289                }
290            }
291            /* make clauses */
292            Iterator<WnnWord> stemsi = stems.iterator();
293            while (stemsi.hasNext()) {
294                WnnWord stem = stemsi.next();
295                if (all || stem.frequency > max) {
296                    Iterator<WnnWord> fzksi  = fzks.iterator();
297                    while (fzksi.hasNext()) {
298                        WnnWord fzk = fzksi.next();
299                        if (addClause(clauseList, input, stem, fzk, terminal, all)) {
300                            ret = true;
301                            max = stem.frequency;
302                        }
303                    }
304                }
305            }
306        }
307        return ret;
308    }
309
310    /**
311     * Add valid clause to the candidates list.
312     *
313     * @param clauseList	Where to store the results
314     * @param input			Input string
315     * @param stem			Stem of the clause (a independent word)
316     * @param fzk			Ancillary pattern
317     * @param terminal		Part of speech tag at the terminal
318     * @param all			Get all candidates or not
319     * @return				{@code true} if add the clause to the list; {@code false} if not.
320     */
321    private boolean addClause(LinkedList<WnnClause> clauseList, String input, WnnWord stem, WnnWord fzk,
322                              WnnPOS terminal, boolean all) {
323        WnnClause clause = null;
324        /* check if the part of speech is valid */
325        if (fzk == null) {
326            if (connectible(stem.partOfSpeech.right, terminal.left)) {
327                clause = new WnnClause(input, stem);
328            }
329        } else {
330            if (connectible(stem.partOfSpeech.right, fzk.partOfSpeech.left)
331                && connectible(fzk.partOfSpeech.right, terminal.left)) {
332                clause = new WnnClause(input, stem, fzk);
333            }
334        }
335        if (clause == null) {
336            return false;
337        }
338        if (mFilter != null && !mFilter.isAllowed(clause)) {
339        	return false;
340        }
341
342        /* store to the list */
343        if (clauseList.isEmpty()) {
344            /* add if the list is empty */
345            clauseList.add(0, clause);
346            return true;
347        } else {
348            if (!all) {
349                /* reserve only the best clause */
350                WnnClause best = (WnnClause)clauseList.get(0);
351                if (best.frequency < clause.frequency) {
352                    clauseList.set(0, clause);
353                    return true;
354                }
355            } else {
356                /* reserve all clauses */
357                Iterator clauseListi = clauseList.iterator();
358                int index = 0;
359                while (clauseListi.hasNext()) {
360                    WnnClause clausei = (WnnClause)clauseListi.next();
361                    if (clausei.frequency < clause.frequency) {
362                        break;
363                    }
364                    index++;
365                }
366                clauseList.add(index, clause);
367                return true;
368            }
369        }
370
371        return false;
372    }
373
374    /**
375     * Check the part-of-speeches are connectable.
376     *
377     * @param right		Right attribute of the preceding word/clause
378     * @param left		Left attribute of the following word/clause
379     * @return			{@code true} if there are connectable; {@code false} if otherwise
380     */
381    private boolean connectible(int right, int left) {
382        try {
383            if (mConnectMatrix[left][right] != 0) {
384                return true;
385            }
386        } catch (Exception ex) {
387        }
388        return false;
389    }
390
391    /**
392     * Get all exact matched ancillary words(Fuzokugo) list.
393     *
394     * @param input		Search key
395     * @return			List of ancillary words
396     */
397    private ArrayList<WnnWord> getAncillaryPattern(String input) {
398        if (input.length() == 0) {
399            return null;
400        }
401
402        HashMap<String,ArrayList<WnnWord>> fzkPat = mFzkPatterns;
403        ArrayList<WnnWord> fzks = fzkPat.get(input);
404        if (fzks != null) {
405            return fzks;
406        }
407
408        /* set dictionaries */
409        WnnDictionary dict = mDictionary;
410        dict.clearDictionary();
411        dict.clearApproxPattern();
412        dict.setDictionary(6, 400, 500);
413
414        for (int start = input.length() - 1; start >= 0; start--) {
415            String key = input.substring(start);
416
417            fzks = fzkPat.get(key);
418            if (fzks != null) {
419                continue;
420            }
421
422            fzks = new ArrayList<WnnWord>();
423            mFzkPatterns.put(key, fzks);
424
425            /* search ancillary words */
426            dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, key);
427            WnnWord word;
428            while ((word = dict.getNextWord()) != null) {
429                fzks.add(word);
430            }
431
432            /* concatenate sequence of ancillary words */
433            for (int end = input.length() - 1; end > start; end--) {
434                ArrayList<WnnWord> followFzks = fzkPat.get(input.substring(end));
435                if (followFzks == null ||  followFzks.isEmpty()) {
436                    continue;
437                }
438                dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, input.substring(start, end));
439                while ((word = dict.getNextWord()) != null) {
440                    Iterator<WnnWord> followFzksi = followFzks.iterator();
441                    while (followFzksi.hasNext()) {
442                        WnnWord follow = followFzksi.next();
443                        if (connectible(word.partOfSpeech.right, follow.partOfSpeech.left)) {
444                            fzks.add(new WnnWord(key, key, new WnnPOS(word.partOfSpeech.left, follow.partOfSpeech.right)));
445                        }
446                    }
447                }
448            }
449        }
450        return fzks;
451    }
452
453    /**
454     * Get all exact matched independent words(Jiritsugo) list.
455     *
456     * @param input    Search key
457     * @param all      {@code true} if list all words; {@code false} if list words which has an unique part of speech tag.
458     * @return			List of words; {@code null} if {@code input.length() == 0}.
459     */
460    private ArrayList<WnnWord> getIndependentWords(String input, boolean all) {
461        if (input.length() == 0) {
462            return null;
463        }
464
465        ArrayList<WnnWord> words = (all)? mAllIndepWordBag.get(input) : mIndepWordBag.get(input);
466
467        if (words == null) {
468            /* set dictionaries */
469            WnnDictionary dict = mDictionary;
470            dict.clearDictionary();
471            dict.clearApproxPattern();
472            dict.setDictionary(4, 0, 10);
473            dict.setDictionary(5, 400, 500);
474            dict.setDictionary(WnnDictionary.INDEX_USER_DICTIONARY, FREQ_USER, FREQ_USER);
475            dict.setDictionary(WnnDictionary.INDEX_LEARN_DICTIONARY, FREQ_LEARN, FREQ_LEARN);
476
477            words = new ArrayList<WnnWord>();
478            WnnWord word;
479            if (all) {
480            	mAllIndepWordBag.put(input, words);
481                dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, input);
482                /* store all words */
483                while ((word = dict.getNextWord()) != null) {
484                    if (input.equals(word.stroke)) {
485                        words.add(word);
486                    }
487                }
488            } else {
489            	mIndepWordBag.put(input, words);
490                dict.searchWord(WnnDictionary.SEARCH_EXACT, WnnDictionary.ORDER_BY_FREQUENCY, input);
491                /* store a word which has an unique part of speech tag */
492                while ((word = dict.getNextWord()) != null) {
493                    if (input.equals(word.stroke)) {
494                        Iterator<WnnWord> list = words.iterator();
495                        boolean found = false;
496                        while (list.hasNext()) {
497                            WnnWord w = (WnnWord)list.next();
498                                if (w.partOfSpeech.right == word.partOfSpeech.right) {
499                                    found = true;
500                                    break;
501                                }
502                        }
503                        if (!found) {
504                            words.add(word);
505                        }
506                        if (word.frequency < 400) {
507                            break;
508                        }
509                    }
510                }
511            }
512            addAutoGeneratedCandidates(input, words, all);
513        }
514        return words;
515    }
516
517    /**
518     * Add some words not including in the dictionary.
519     * <br>
520     * This method adds some words which are not in the dictionary.
521     *
522     * @param input     Input string
523     * @param wordList  List to store words
524     * @param all       Get all candidates or not
525     */
526    private void addAutoGeneratedCandidates(String input, ArrayList wordList, boolean all) {
527        wordList.add(new WnnWord(input, input, mPosDefault, (CLAUSE_COST - 1) * input.length()));
528    }
529
530    /**
531     * Get a default clause.
532     * <br>
533     * This method generates a clause which has a string same as input
534     * and the default part-of-speech tag.
535     *
536     * @param input    Input string
537     * @return			Default clause
538     */
539    private WnnClause defaultClause(String input) {
540        return (new WnnClause(input, input, mPosDefault, (CLAUSE_COST - 1) * input.length()));
541    }
542}