15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_ 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_ 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/letterscript_enum.h" 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/compact_lang_det_impl.h" 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace getone { 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static const int kMaxScriptBuffer = 4096; 135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2; 145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static const int kMaxScriptBytes = kMaxScriptBuffer- 8; // Leave some room 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static const int kMaxAnswerBuffer = 256; 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) typedef enum UnicodeLScript ULScript; 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) typedef struct { 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) char* text; // Pointer to the span, somewhere 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int text_bytes; // Number of bytes of text in the span 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int offset; // Offset of start of span in original input buffer 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ULScript script; // Script of all the letters in this span 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Language lang; // Language identified for this span 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool truncated; // true if buffer filled up before a 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // different script or EOF was found 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } LangSpan; 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static inline bool IsContinuationByte(char c) { 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return static_cast<signed char>(c) < -64; 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Gets lscript number for letters; always returns 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // 0 (common script) for non-letters 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int GetUTF8LetterScriptNum(const char* src); 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Update src pointer to point to next quadgram, +2..+5 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Looks at src[0..4] 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* AdvanceQuad(const char* src); 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // end namespace getone 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class ScriptScanner { 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text); 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ~ScriptScanner(); 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Copy next run of same-script non-tag letters to buffer [NUL terminated] 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool GetOneScriptSpan(getone::LangSpan* span); 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Force Latin and Cyrillic scripts to be lowercase 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void LowerScriptSpan(getone::LangSpan* span); 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Copy next run of same-script non-tag letters to buffer [NUL terminated] 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Force Latin and Cyrillic scripts to be lowercase 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool GetOneScriptSpanLower(getone::LangSpan* span); 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int SkipToFrontOfSpan(const char* src, int len, int* script); 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* start_byte_; 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* next_byte_; 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* next_byte_limit_; 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int byte_length_; 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool is_plain_text_; 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) char* script_buffer_; // Holds text with expanded entities 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) char* script_buffer_lower_; // Holds lowercased text 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class LangScanner { 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj, 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) getone::LangSpan* spn, int smoothwidth, int smoothcandidates, 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int maxlangs, int minlangspan); 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) ~LangScanner(); 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int script() {return script_;} 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Use new text 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Keep smoothing state if same script, otherwise reinit smoothing 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void NewText(getone::LangSpan* spn); 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool GetOneShortLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool GetOneLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // The real ones 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj, 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) getone::LangSpan* span); 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj, 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) getone::LangSpan* span); 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Increases language bias by delta 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj, 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Language key, int delta); 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // For debugging output 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int next_answer_; 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) char answer_buffer_[getone::kMaxAnswerBuffer]; 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) char answer_buffer2_[getone::kMaxAnswerBuffer]; 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) char answer_buffer3_[getone::kMaxAnswerBuffer]; 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) char answer_buffer4_[getone::kMaxAnswerBuffer]; 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* start_byte_; 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* next_byte_limit_; 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* next_byte_; 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const char* onelangspan_begin_; 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int byte_length_; 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int script_; 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) Language spanlang_; 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int smoothwidth_; 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int smoothwidth_2_; 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int smoothcandidates_; 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int maxlangs_; 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int minlangspan_; 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int rb_size_; 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int next_rb_; 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int rb_mask_; 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) uint32* rb_; 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int* offset_rb_; 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_ 132