diff options
Diffstat (limited to 'third_party/cld/encodings/compact_lang_det/getonescriptspan.h')
-rw-r--r-- | third_party/cld/encodings/compact_lang_det/getonescriptspan.h | 131 |
1 files changed, 131 insertions, 0 deletions
diff --git a/third_party/cld/encodings/compact_lang_det/getonescriptspan.h b/third_party/cld/encodings/compact_lang_det/getonescriptspan.h new file mode 100644 index 0000000..936aab4 --- /dev/null +++ b/third_party/cld/encodings/compact_lang_det/getonescriptspan.h @@ -0,0 +1,131 @@ +// Copyright (c) 2006-2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_ +#define ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_ + +#include "encodings/compact_lang_det/letterscript_enum.h" +#include "encodings/compact_lang_det/compact_lang_det_impl.h" + +namespace getone { + static const int kMaxScriptBuffer = 4096; + static const int kMaxScriptLowerBuffer = (kMaxScriptBuffer * 3) / 2; + static const int kMaxScriptBytes = kMaxScriptBuffer- 8; // Leave some room + static const int kMaxAnswerBuffer = 256; + + typedef enum UnicodeLScript ULScript; + + typedef struct { + char* text; // Pointer to the span, somewhere + int text_bytes; // Number of bytes of text in the span + int offset; // Offset of start of span in original input buffer + ULScript script; // Script of all the letters in this span + Language lang; // Language identified for this span + bool truncated; // true if buffer filled up before a + // different script or EOF was found + } LangSpan; + + + static inline bool IsContinuationByte(char c) { + return static_cast<signed char>(c) < -64; + } + + // Gets lscript number for letters; always returns + // 0 (common script) for non-letters + int GetUTF8LetterScriptNum(const char* src); + + + // Update src pointer to point to next quadgram, +2..+5 + // Looks at src[0..4] + const char* AdvanceQuad(const char* src); +} // end namespace getone + + + + + + +class ScriptScanner { + public: + ScriptScanner(const char* buffer, int buffer_length, bool is_plain_text); + ~ScriptScanner(); + + // Copy next run of same-script non-tag letters to buffer [NUL terminated] + bool GetOneScriptSpan(getone::LangSpan* span); + + // Force Latin and Cyrillic scripts to be lowercase + void LowerScriptSpan(getone::LangSpan* span); + + // Copy next run of same-script non-tag letters to buffer [NUL terminated] + // Force Latin and Cyrillic scripts to be lowercase + bool GetOneScriptSpanLower(getone::LangSpan* span); + + private: + int SkipToFrontOfSpan(const char* src, int len, int* script); + + const char* start_byte_; + const char* next_byte_; + const char* next_byte_limit_; + int byte_length_; + bool is_plain_text_; + char* script_buffer_; // Holds text with expanded entities + char* script_buffer_lower_; // Holds lowercased text +}; + + +class LangScanner { + public: + LangScanner(const CompactLangDetImpl::LangDetObj* langdetobj, + getone::LangSpan* spn, int smoothwidth, int smoothcandidates, + int maxlangs, int minlangspan); + ~LangScanner(); + + + int script() {return script_;} + + // Use new text + // Keep smoothing state if same script, otherwise reinit smoothing + void NewText(getone::LangSpan* spn); + + bool GetOneShortLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping + bool GetOneLangSpanBoot(getone::LangSpan* span); // Just for bootstrapping + + // The real ones + bool GetOneShortLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj, + getone::LangSpan* span); + bool GetOneLangSpan(const CompactLangDetImpl::LangDetObj* langdetobj, + getone::LangSpan* span); + + // Increases language bias by delta + void SetLanguageBias(const CompactLangDetImpl::LangDetObj* langdetobj, + Language key, int delta); + + // For debugging output + int next_answer_; + char answer_buffer_[getone::kMaxAnswerBuffer]; + char answer_buffer2_[getone::kMaxAnswerBuffer]; + char answer_buffer3_[getone::kMaxAnswerBuffer]; + char answer_buffer4_[getone::kMaxAnswerBuffer]; + + private: + const char* start_byte_; + const char* next_byte_limit_; + const char* next_byte_; + const char* onelangspan_begin_; + int byte_length_; + int script_; + Language spanlang_; + int smoothwidth_; + int smoothwidth_2_; + int smoothcandidates_; + int maxlangs_; + int minlangspan_; + int rb_size_; + int next_rb_; + int rb_mask_; + uint32* rb_; + int* offset_rb_; +}; + +#endif // ENCODINGS_COMPACT_LANG_DET_GETONESCRIPTSPAN_H_ |