diff options
Diffstat (limited to 'third_party/libphonenumber/cpp/src/utf/unicodetext.h')
-rw-r--r-- | third_party/libphonenumber/cpp/src/utf/unicodetext.h | 456 |
1 files changed, 456 insertions, 0 deletions
diff --git a/third_party/libphonenumber/cpp/src/utf/unicodetext.h b/third_party/libphonenumber/cpp/src/utf/unicodetext.h new file mode 100644 index 0000000..fb37a33 --- /dev/null +++ b/third_party/libphonenumber/cpp/src/utf/unicodetext.h @@ -0,0 +1,456 @@ +// Copyright (C) 2006 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Author: Jim Meehan + +#ifndef UTIL_UTF8_UNICODETEXT_H__ +#define UTIL_UTF8_UNICODETEXT_H__ + +#include <iterator> +#include <string> +#include <utility> +#include "base/basictypes.h" +//#include "util/utf8/public/config.h" + +using std::string; +using std::bidirectional_iterator_tag; +using std::pair; + +// ***************************** UnicodeText ************************** +// +// A UnicodeText object is a container for a sequence of Unicode +// codepoint values. It has default, copy, and assignment constructors. +// Data can be appended to it from another UnicodeText, from +// iterators, or from a single codepoint. +// +// The internal representation of the text is UTF-8. Since UTF-8 is a +// variable-width format, UnicodeText does not provide random access +// to the text, and changes to the text are permitted only at the end. +// +// The UnicodeText class defines a const_iterator. The dereferencing +// operator (*) returns a codepoint (char32). The iterator is a +// bidirectional, read-only iterator. It becomes invalid if the text +// is changed. +// +// There are methods for appending and retrieving UTF-8 data directly. +// The 'utf8_data' method returns a const char* that contains the +// UTF-8-encoded version of the text; 'utf8_length' returns the number +// of bytes in the UTF-8 data. An iterator's 'get' method stores up to +// 4 bytes of UTF-8 data in a char array and returns the number of +// bytes that it stored. +// +// Codepoints are integers in the range [0, 0xD7FF] or [0xE000, +// 0x10FFFF], but UnicodeText has the additional restriction that it +// can contain only those characters that are valid for interchange on +// the Web. This excludes all of the control codes except for carriage +// return, line feed, and horizontal tab. It also excludes +// non-characters, but codepoints that are in the Private Use regions +// are allowed, as are codepoints that are unassigned. (See the +// Unicode reference for details.) The function UniLib::IsInterchangeValid +// can be used as a test for this property. +// +// UnicodeTexts are safe. Every method that constructs or modifies a +// UnicodeText tests for interchange-validity, and will substitute a +// space for the invalid data. Such cases are reported via +// LOG(WARNING). +// +// MEMORY MANAGEMENT: copy, take ownership, or point to +// +// A UnicodeText is either an "owner", meaning that it owns the memory +// for the data buffer and will free it when the UnicodeText is +// destroyed, or it is an "alias", meaning that it does not. +// +// There are three methods for storing UTF-8 data in a UnicodeText: +// +// CopyUTF8(buffer, len) copies buffer. +// +// TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer. +// +// PointToUTF8(buffer, size) creates an alias pointing to buffer. +// +// All three methods perform a validity check on the buffer. There are +// private, "unsafe" versions of these functions that bypass the +// validity check. They are used internally and by friend-functions +// that are handling UTF-8 data that has already been validated. +// +// The purpose of an alias is to avoid making an unnecessary copy of a +// UTF-8 buffer while still providing access to the Unicode values +// within that text through iterators or the fast scanners that are +// based on UTF-8 state tables. The lifetime of an alias must not +// exceed the lifetime of the buffer from which it was constructed. +// +// The semantics of an alias might be described as "copy on write or +// repair." The source data is never modified. If push_back() or +// append() is called on an alias, a copy of the data will be created, +// and the UnicodeText will become an owner. If clear() is called on +// an alias, it becomes an (empty) owner. +// +// The copy constructor and the assignment operator produce an owner. +// That is, after direct initialization ("UnicodeText x(y);") or copy +// initialization ("UnicodeText x = y;") x will be an owner, even if y +// was an alias. The assignment operator ("x = y;") also produces an +// owner unless x and y are the same object and y is an alias. +// +// Aliases should be used with care. If the source from which an alias +// was created is freed, or if the contents are changed, while the +// alias is still in use, fatal errors could result. But it can be +// quite useful to have a UnicodeText "window" through which to see a +// UTF-8 buffer without having to pay the price of making a copy. +// +// UTILITIES +// +// The interfaces in util/utf8/public/textutils.h provide higher-level +// utilities for dealing with UnicodeTexts, including routines for +// creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or +// strings, creating strings from UnicodeTexts, normalizing text for +// efficient matching or display, and others. + +class UnicodeText { + public: + class const_iterator; + + typedef char32 value_type; + + // Constructors. These always produce owners. + UnicodeText(); // Create an empty text. + UnicodeText(const UnicodeText& src); // copy constructor + // Construct a substring (copies the data). + UnicodeText(const const_iterator& first, const const_iterator& last); + + // Assignment operator. This copies the data and produces an owner + // unless this == &src, e.g., "x = x;", which is a no-op. + UnicodeText& operator=(const UnicodeText& src); + + // x.Copy(y) copies the data from y into x. + UnicodeText& Copy(const UnicodeText& src); + inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); } + + // x.PointTo(y) changes x so that it points to y's data. + // It does not copy y or take ownership of y's data. + UnicodeText& PointTo(const UnicodeText& src); + UnicodeText& PointTo(const const_iterator& first, + const const_iterator& last); + + ~UnicodeText(); + + void clear(); // Clear text. + bool empty() { return repr_.size_ == 0; } // Test if text is empty. + + // Add a codepoint to the end of the text. + // If the codepoint is not interchange-valid, add a space instead + // and log a warning. + void push_back(char32 codepoint); + + // Generic appending operation. + // iterator_traits<ForwardIterator>::value_type must be implicitly + // convertible to char32. Typical uses of this method might include: + // char32 chars[] = {0x1, 0x2, ...}; + // vector<char32> more_chars = ...; + // utext.append(chars, chars+arraysize(chars)); + // utext.append(more_chars.begin(), more_chars.end()); + template<typename ForwardIterator> + UnicodeText& append(ForwardIterator first, const ForwardIterator last) { + while (first != last) { push_back(*first++); } + return *this; + } + + // A specialization of the generic append() method. + UnicodeText& append(const const_iterator& first, const const_iterator& last); + + // An optimization of append(source.begin(), source.end()). + UnicodeText& append(const UnicodeText& source); + + int size() const; // the number of Unicode characters (codepoints) + + friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); + friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs); + + class const_iterator { + typedef const_iterator CI; + public: + typedef bidirectional_iterator_tag iterator_category; + typedef char32 value_type; + typedef ptrdiff_t difference_type; + typedef void pointer; // (Not needed.) + typedef const char32 reference; // (Needed for const_reverse_iterator) + + // Iterators are default-constructible. + const_iterator(); + + // It's safe to make multiple passes over a UnicodeText. + const_iterator(const const_iterator& other); + const_iterator& operator=(const const_iterator& other); + + char32 operator*() const; // Dereference + + const_iterator& operator++(); // Advance (++iter) + const_iterator operator++(int) { // (iter++) + const_iterator result(*this); + ++*this; + return result; + } + + const_iterator& operator--(); // Retreat (--iter) + const_iterator operator--(int) { // (iter--) + const_iterator result(*this); + --*this; + return result; + } + + // We love relational operators. + friend bool operator==(const CI& lhs, const CI& rhs) { + return lhs.it_ == rhs.it_; } + friend bool operator!=(const CI& lhs, const CI& rhs) { + return !(lhs == rhs); } + friend bool operator<(const CI& lhs, const CI& rhs); + friend bool operator>(const CI& lhs, const CI& rhs) { + return rhs < lhs; } + friend bool operator<=(const CI& lhs, const CI& rhs) { + return !(rhs < lhs); } + friend bool operator>=(const CI& lhs, const CI& rhs) { + return !(lhs < rhs); } + + friend difference_type distance(const CI& first, const CI& last); + + // UTF-8-specific methods + // Store the UTF-8 encoding of the current codepoint into buf, + // which must be at least 4 bytes long. Return the number of + // bytes written. + int get_utf8(char* buf) const; + // Return the iterator's pointer into the UTF-8 data. + const char* utf8_data() const { return it_; } + + string DebugString() const; + + private: + friend class UnicodeText; + friend class UnicodeTextUtils; + friend class UTF8StateTableProperty; + explicit const_iterator(const char* it) : it_(it) {} + + const char* it_; + }; + + const_iterator begin() const; + const_iterator end() const; + + class const_reverse_iterator : public std::reverse_iterator<const_iterator> { + public: + const_reverse_iterator(const_iterator it) : + std::reverse_iterator<const_iterator>(it) {} + const char* utf8_data() const { + const_iterator tmp_it = base(); + return (--tmp_it).utf8_data(); + } + int get_utf8(char* buf) const { + const_iterator tmp_it = base(); + return (--tmp_it).get_utf8(buf); + } + }; + const_reverse_iterator rbegin() const { + return const_reverse_iterator(end()); + } + const_reverse_iterator rend() const { + return const_reverse_iterator(begin()); + } + + // Substring searching. Returns the beginning of the first + // occurrence of "look", or end() if not found. + const_iterator find(const UnicodeText& look, const_iterator start_pos) const; + // Equivalent to find(look, begin()) + const_iterator find(const UnicodeText& look) const; + + // Returns whether this contains the character U+FFFD. This can + // occur, for example, if the input to Encodings::Decode() had byte + // sequences that were invalid in the source encoding. + bool HasReplacementChar() const; + + // UTF-8-specific methods + // + // Return the data, length, and capacity of UTF-8-encoded version of + // the text. Length and capacity are measured in bytes. + const char* utf8_data() const { return repr_.data_; } + int utf8_length() const { return repr_.size_; } + int utf8_capacity() const { return repr_.capacity_; } + + // Return the UTF-8 data as a string. + static string UTF8Substring(const const_iterator& first, + const const_iterator& last); + + // There are three methods for initializing a UnicodeText from UTF-8 + // data. They vary in details of memory management. In all cases, + // the data is tested for interchange-validity. If it is not + // interchange-valid, a LOG(WARNING) is issued, and each + // structurally invalid byte and each interchange-invalid codepoint + // is replaced with a space. + + // x.CopyUTF8(buf, len) copies buf into x. + UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length); + + // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of + // buf. buf is not copied. + UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer, + int byte_length, + int byte_capacity); + + // x.PointToUTF8(buf,len) changes x so that it points to buf + // ("becomes an alias"). It does not take ownership or copy buf. + // If the buffer is not valid, this has the same effect as + // CopyUTF8(utf8_buffer, byte_length). + UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length); + + // Occasionally it is necessary to use functions that operate on the + // pointer returned by utf8_data(). MakeIterator(p) provides a way + // to get back to the UnicodeText level. It uses CHECK to ensure + // that p is a pointer within this object's UTF-8 data, and that it + // points to the beginning of a character. + const_iterator MakeIterator(const char* p) const; + + string DebugString() const; + + private: + friend class const_iterator; + friend class UnicodeTextUtils; + + class Repr { // A byte-string. + public: + char* data_; + int size_; + int capacity_; + bool ours_; // Do we own data_? + + Repr() : data_(NULL), size_(0), capacity_(0), ours_(true) {} + ~Repr() { if (ours_) delete[] data_; } + + void clear(); + void reserve(int capacity); + void resize(int size); + + void append(const char* bytes, int byte_length); + void Copy(const char* data, int size); + void TakeOwnershipOf(char* data, int size, int capacity); + void PointTo(const char* data, int size); + + string DebugString() const; + + private: + Repr& operator=(const Repr&); + Repr(const Repr& other); + }; + + Repr repr_; + + // UTF-8-specific private methods. + // These routines do not perform a validity check when compiled + // in opt mode. + // It is an error to call these methods with UTF-8 data that + // is not interchange-valid. + // + UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length); + UnicodeText& UnsafeTakeOwnershipOfUTF8( + char* utf8_buffer, int byte_length, int byte_capacity); + UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length); + UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length); + const_iterator UnsafeFind(const UnicodeText& look, + const_iterator start_pos) const; +}; + +bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); + +inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) { + return !(lhs == rhs); +} + +// UnicodeTextRange is a pair of iterators, useful for specifying text +// segments. If the iterators are ==, the segment is empty. +typedef pair<UnicodeText::const_iterator, + UnicodeText::const_iterator> UnicodeTextRange; + +inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) { + return r.first == r.second; +} + + +// *************************** Utilities ************************* + +// A factory function for creating a UnicodeText from a buffer of +// UTF-8 data. The new UnicodeText takes ownership of the buffer. (It +// is an "owner.") +// +// Each byte that is structurally invalid will be replaced with a +// space. Each codepoint that is interchange-invalid will also be +// replaced with a space, even if the codepoint was represented with a +// multibyte sequence in the UTF-8 data. +// +inline UnicodeText MakeUnicodeTextAcceptingOwnership( + char* utf8_buffer, int byte_length, int byte_capacity) { + return UnicodeText().TakeOwnershipOfUTF8( + utf8_buffer, byte_length, byte_capacity); +} + +// A factory function for creating a UnicodeText from a buffer of +// UTF-8 data. The new UnicodeText does not take ownership of the +// buffer. (It is an "alias.") +// +inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership( + const char* utf8_buffer, int byte_length) { + return UnicodeText().PointToUTF8(utf8_buffer, byte_length); +} + +// Create a UnicodeText from a UTF-8 string or buffer. +// +// If do_copy is true, then a copy of the string is made. The copy is +// owned by the resulting UnicodeText object and will be freed when +// the object is destroyed. This UnicodeText object is referred to +// as an "owner." +// +// If do_copy is false, then no copy is made. The resulting +// UnicodeText object does NOT take ownership of the string; in this +// case, the lifetime of the UnicodeText object must not exceed the +// lifetime of the string. This Unicodetext object is referred to as +// an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership. +// +// If the input string does not contain valid UTF-8, then a copy is +// made (as if do_copy were true) and coerced to valid UTF-8 by +// replacing each invalid byte with a space. +// +inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, + bool do_copy) { + UnicodeText t; + if (do_copy) { + t.CopyUTF8(utf8_buf, len); + } else { + t.PointToUTF8(utf8_buf, len); + } + return t; +} + +inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) { + return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy); +} + +inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) { + return UTF8ToUnicodeText(utf8_buf, len, true); +} +inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) { + return UTF8ToUnicodeText(utf8_string, true); +} + +// Return a string containing the UTF-8 encoded version of all the +// Unicode characters in t. +inline string UnicodeTextToUTF8(const UnicodeText& t) { + return string(t.utf8_data(), t.utf8_length()); +} + +#endif // UTIL_UTF8_UNICODETEXT_H__ |