// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/letterscript_enum.h"
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "encodings/compact_lang_det/compact_lang_det_impl.h"
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace getone {
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static const int kMaxScriptBuffer = 4096;
135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2;
145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static const int kMaxScriptBytes = kMaxScriptBuffer- 8;   // Leave some room
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static const int kMaxAnswerBuffer = 256;
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  typedef enum UnicodeLScript ULScript;
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  typedef struct {
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    char* text;             // Pointer to the span, somewhere
215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int text_bytes;         // Number of bytes of text in the span
225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int offset;             // Offset of start of span in original input buffer
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    ULScript script;        // Script of all the letters in this span
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    Language lang;          // Language identified for this span
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    bool truncated;         // true if buffer filled up before a
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                            // different script or EOF was found
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  } LangSpan;
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static inline bool IsContinuationByte(char c) {
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return static_cast<signed char>(c) < -64;
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Gets lscript number for letters; always returns
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  //   0 (common script) for non-letters
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int GetUTF8LetterScriptNum(const char* src);
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Update src pointer to point to next quadgram, +2..+5
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Looks at src[0..4]
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* AdvanceQuad(const char* src);
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}     // end namespace getone
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class ScriptScanner {
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text);
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ~ScriptScanner();
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Copy next run of same-script non-tag letters to buffer [NUL terminated]
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool GetOneScriptSpan(getone::LangSpan* span);
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Force Latin and Cyrillic scripts to be lowercase
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void LowerScriptSpan(getone::LangSpan* span);
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Copy next run of same-script non-tag letters to buffer [NUL terminated]
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Force Latin and Cyrillic scripts to be lowercase
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool GetOneScriptSpanLower(getone::LangSpan* span);
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int SkipToFrontOfSpan(const char* src, int len, int* script);
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* start_byte_;
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* next_byte_;
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* next_byte_limit_;
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int byte_length_;
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool is_plain_text_;
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  char* script_buffer_;           // Holds text with expanded entities
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  char* script_buffer_lower_;     // Holds lowercased text
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class LangScanner {
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj,
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)              getone::LangSpan* spn, int smoothwidth, int smoothcandidates,
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)              int maxlangs, int minlangspan);
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  ~LangScanner();
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int script() {return script_;}
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Use new text
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Keep smoothing state if same script, otherwise reinit smoothing
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void NewText(getone::LangSpan* spn);
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool GetOneShortLangSpanBoot(getone::LangSpan* span);  // Just for bootstrapping
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool GetOneLangSpanBoot(getone::LangSpan* span);       // Just for bootstrapping
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // The real ones
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                           getone::LangSpan* span);
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj,
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                      getone::LangSpan* span);
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Increases language bias by delta
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj,
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                       Language key, int delta);
1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // For debugging output
1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int next_answer_;
1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  char answer_buffer_[getone::kMaxAnswerBuffer];
1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  char answer_buffer2_[getone::kMaxAnswerBuffer];
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  char answer_buffer3_[getone::kMaxAnswerBuffer];
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  char answer_buffer4_[getone::kMaxAnswerBuffer];
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* start_byte_;
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* next_byte_limit_;
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* next_byte_;
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const char* onelangspan_begin_;
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int byte_length_;
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int script_;
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  Language spanlang_;
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int smoothwidth_;
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int smoothwidth_2_;
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int smoothcandidates_;
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int maxlangs_;
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int minlangspan_;
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int rb_size_;
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int next_rb_;
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int rb_mask_;
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  uint32* rb_;
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int* offset_rb_;
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#endif  // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_
132