/*
 * Copyright (C) 2009 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
167898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
177898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#ifndef PINYINIME_INCLUDE_NGRAM_H__
187898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#define PINYINIME_INCLUDE_NGRAM_H__
197898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
207898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#include <stdio.h>
217898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#include <stdlib.h>
227898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#include "./dictdef.h"
237898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
247898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectnamespace ime_pinyin {
257898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
// Element type of the per-lemma index array (see NGram::lma_freq_idx_).
// Presumably each value selects one entry of the score code book -- confirm
// against the implementation.
typedef unsigned char CODEBOOK_TYPE;

// Number of entries in the code book. 256 is the number of distinct values an
// 8-bit CODEBOOK_TYPE can represent.
static const size_t kCodeBookSize = 256;
297898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
307898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Projectclass NGram {
317898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project public:
327898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  // The maximum score of a lemma item.
337898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  static const LmaScoreType kMaxScore = 0x3fff;
347898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
357898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  // In order to reduce the storage size, the original log value is amplified by
367898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  // kScoreAmplifier, and we use LmaScoreType to store.
377898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  // After this process, an item with a lower score has a higher frequency.
387898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  static const int kLogValueAmplifier = -800;
397898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
407898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  // System words' total frequency. It is not the real total frequency, instead,
417898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  // It is only used to adjust system lemmas' scores when the user dictionary's
427898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  // total frequency changes.
437898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  // In this version, frequencies of system lemmas are fixed. We are considering
447898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  // to make them changable in next version.
457898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  static const size_t kSysDictTotalFreq = 100000000;
467898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
477898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project private:
487898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
497898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  static NGram* instance_;
507898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
517898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  bool initialized_;
527898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  size_t idx_num_;
537898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
547898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  size_t total_freq_none_sys_;
557898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
567898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  // Score compensation for system dictionary lemmas.
577898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  // Because after user adds some user lemmas, the total frequency changes, and
587898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  // we use this value to normalize the score.
597898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  float sys_score_compensation_;
607898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
617898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#ifdef ___BUILD_MODEL___
627898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  double *freq_codes_df_;
637898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#endif
647898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  LmaScoreType *freq_codes_;
657898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  CODEBOOK_TYPE *lma_freq_idx_;
667898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
677898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project public:
687898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  NGram();
697898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  ~NGram();
707898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
717898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  static NGram& get_instance();
727898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
737898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  bool save_ngram(FILE *fp);
747898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  bool load_ngram(FILE *fp);
757898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
767898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  // Set the total frequency of all none system dictionaries.
777898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  void set_total_freq_none_sys(size_t freq_none_sys);
787898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
797898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  float get_uni_psb(LemmaIdType lma_id);
807898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
817898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  // Convert a probability to score. Actually, the score will be limited to
827898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  // kMaxScore, but at runtime, we also need float expression to get accurate
837898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  // value of the score.
847898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  // After the conversion, a lower score indicates a higher probability of the
857898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  // item.
867898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  static float convert_psb_to_score(double psb);
877898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
887898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#ifdef ___BUILD_MODEL___
897898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  // For constructing the unigram mode model.
907898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project  bool build_unigram(LemmaEntry *lemma_arr, size_t num,
917898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project                     LemmaIdType next_idx_unused);
927898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#endif
937898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project};
947898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project}
957898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project
967898d76cc005bbe1c5893a9f57439561e0771ccThe Android Open Source Project#endif  // PINYINIME_INCLUDE_NGRAM_H__
97