diff options
Diffstat (limited to 'third_party/libphonenumber/cpp/src/utf')
-rw-r--r-- | third_party/libphonenumber/cpp/src/utf/README | 1 | ||||
-rw-r--r-- | third_party/libphonenumber/cpp/src/utf/rune.c | 350 | ||||
-rw-r--r-- | third_party/libphonenumber/cpp/src/utf/stringpiece.h | 24 | ||||
-rw-r--r-- | third_party/libphonenumber/cpp/src/utf/stringprintf.h | 22 | ||||
-rw-r--r-- | third_party/libphonenumber/cpp/src/utf/unicodetext.cc | 515 | ||||
-rw-r--r-- | third_party/libphonenumber/cpp/src/utf/unicodetext.h | 456 | ||||
-rw-r--r-- | third_party/libphonenumber/cpp/src/utf/unilib.cc | 64 | ||||
-rw-r--r-- | third_party/libphonenumber/cpp/src/utf/unilib.h | 95 | ||||
-rw-r--r-- | third_party/libphonenumber/cpp/src/utf/utf.h | 251 | ||||
-rw-r--r-- | third_party/libphonenumber/cpp/src/utf/utfdef.h | 28 |
10 files changed, 0 insertions, 1806 deletions
diff --git a/third_party/libphonenumber/cpp/src/utf/README b/third_party/libphonenumber/cpp/src/utf/README deleted file mode 100644 index 986e9e3..0000000 --- a/third_party/libphonenumber/cpp/src/utf/README +++ /dev/null @@ -1 +0,0 @@ -These files come from lib9 (http://code.google.com/p/go/source/browse). diff --git a/third_party/libphonenumber/cpp/src/utf/rune.c b/third_party/libphonenumber/cpp/src/utf/rune.c deleted file mode 100644 index 5a37368..0000000 --- a/third_party/libphonenumber/cpp/src/utf/rune.c +++ /dev/null @@ -1,350 +0,0 @@ -/* - * The authors of this software are Rob Pike and Ken Thompson. - * Copyright (c) 2002 by Lucent Technologies. - * Permission to use, copy, modify, and distribute this software for any - * purpose without fee is hereby granted, provided that this entire notice - * is included in all copies of any software which is or includes a copy - * or modification of this software and in all copies of the supporting - * documentation for such software. - * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED - * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY - * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY - * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. - */ -#include <stdarg.h> -#include <string.h> -#include "utf.h" -#include "utfdef.h" - -enum -{ - Bit1 = 7, - Bitx = 6, - Bit2 = 5, - Bit3 = 4, - Bit4 = 3, - Bit5 = 2, - - T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ - Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ - T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ - T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ - T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ - T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ - Rune4 = (1<<(Bit4+3*Bitx))-1, - /* 0001 1111 1111 1111 1111 1111 */ - - Maskx = (1<<Bitx)-1, /* 0011 1111 */ - Testx = Maskx ^ 0xFF, /* 1100 0000 */ - - Bad = Runeerror, -}; - -/* - * Modified by Wei-Hwa Huang, Google Inc., on 2004-09-24 - * This is a slower but "safe" version of the old chartorune - * that works on strings that are not necessarily null-terminated. - * - * If you know for sure that your string is null-terminated, - * chartorune will be a bit faster. - * - * It is guaranteed not to attempt to access "length" - * past the incoming pointer. This is to avoid - * possible access violations. If the string appears to be - * well-formed but incomplete (i.e., to get the whole Rune - * we'd need to read past str+length) then we'll set the Rune - * to Bad and return 0. - * - * Note that if we have decoding problems for other - * reasons, we return 1 instead of 0. - */ -int -charntorune(Rune *rune, const char *str, int length) -{ - int c, c1, c2, c3; - long l; - - /* When we're not allowed to read anything */ - if(length <= 0) { - goto badlen; - } - - /* - * one character sequence (7-bit value) - * 00000-0007F => T1 - */ - c = *(uchar*)str; - if(c < Tx) { - *rune = c; - return 1; - } - - // If we can't read more than one character we must stop - if(length <= 1) { - goto badlen; - } - - /* - * two character sequence (11-bit value) - * 0080-07FF => T2 Tx - */ - c1 = *(uchar*)(str+1) ^ Tx; - if(c1 & Testx) - goto bad; - if(c < T3) { - if(c < T2) - goto bad; - l = ((c << Bitx) | c1) & Rune2; - if(l <= Rune1) - goto bad; - *rune = l; - return 2; - } - - // If we can't read more than two characters we must stop - if(length <= 2) { - goto badlen; - } - - /* - * three character sequence (16-bit value) - * 0800-FFFF => T3 Tx Tx - */ - c2 = *(uchar*)(str+2) ^ Tx; - if(c2 & Testx) - goto bad; - if(c < T4) { - l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; - if(l <= Rune2) - goto bad; - *rune = l; - return 3; - } - - if (length <= 3) - goto badlen; - - /* - * four character sequence (21-bit value) - * 10000-1FFFFF => T4 Tx Tx Tx - */ - c3 = *(uchar*)(str+3) ^ Tx; - if (c3 & Testx) - goto bad; - if (c < T5) { - l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; - if (l <= Rune3) - goto bad; - *rune = l; - return 4; - } - - // Support for 5-byte or longer UTF-8 would go here, but - // since we don't have that, we'll just fall through to bad. - - /* - * bad decoding - */ -bad: - *rune = Bad; - return 1; -badlen: - *rune = Bad; - return 0; - -} - - -/* - * This is the older "unsafe" version, which works fine on - * null-terminated strings. - */ -int -chartorune(Rune *rune, const char *str) -{ - int c, c1, c2, c3; - long l; - - /* - * one character sequence - * 00000-0007F => T1 - */ - c = *(uchar*)str; - if(c < Tx) { - *rune = c; - return 1; - } - - /* - * two character sequence - * 0080-07FF => T2 Tx - */ - c1 = *(uchar*)(str+1) ^ Tx; - if(c1 & Testx) - goto bad; - if(c < T3) { - if(c < T2) - goto bad; - l = ((c << Bitx) | c1) & Rune2; - if(l <= Rune1) - goto bad; - *rune = l; - return 2; - } - - /* - * three character sequence - * 0800-FFFF => T3 Tx Tx - */ - c2 = *(uchar*)(str+2) ^ Tx; - if(c2 & Testx) - goto bad; - if(c < T4) { - l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; - if(l <= Rune2) - goto bad; - *rune = l; - return 3; - } - - /* - * four character sequence (21-bit value) - * 10000-1FFFFF => T4 Tx Tx Tx - */ - c3 = *(uchar*)(str+3) ^ Tx; - if (c3 & Testx) - goto bad; - if (c < T5) { - l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; - if (l <= Rune3) - goto bad; - *rune = l; - return 4; - } - - /* - * Support for 5-byte or longer UTF-8 would go here, but - * since we don't have that, we'll just fall through to bad. - */ - - /* - * bad decoding - */ -bad: - *rune = Bad; - return 1; -} - -int -isvalidcharntorune(const char* str, int length, Rune* rune, int* consumed) { - *consumed = charntorune(rune, str, length); - return *rune != Runeerror || *consumed == 3; -} - -int -runetochar(char *str, const Rune *rune) -{ - /* Runes are signed, so convert to unsigned for range check. */ - unsigned long c; - - /* - * one character sequence - * 00000-0007F => 00-7F - */ - c = *rune; - if(c <= Rune1) { - str[0] = c; - return 1; - } - - /* - * two character sequence - * 0080-07FF => T2 Tx - */ - if(c <= Rune2) { - str[0] = T2 | (c >> 1*Bitx); - str[1] = Tx | (c & Maskx); - return 2; - } - - /* - * If the Rune is out of range, convert it to the error rune. - * Do this test here because the error rune encodes to three bytes. - * Doing it earlier would duplicate work, since an out of range - * Rune wouldn't have fit in one or two bytes. - */ - if (c > Runemax) - c = Runeerror; - - /* - * three character sequence - * 0800-FFFF => T3 Tx Tx - */ - if (c <= Rune3) { - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; - } - - /* - * four character sequence (21-bit value) - * 10000-1FFFFF => T4 Tx Tx Tx - */ - str[0] = T4 | (c >> 3*Bitx); - str[1] = Tx | ((c >> 2*Bitx) & Maskx); - str[2] = Tx | ((c >> 1*Bitx) & Maskx); - str[3] = Tx | (c & Maskx); - return 4; -} - -int -runelen(Rune rune) -{ - char str[10]; - - return runetochar(str, &rune); -} - -int -runenlen(const Rune *r, int nrune) -{ - int nb, c; - - nb = 0; - while(nrune--) { - c = *r++; - if (c <= Rune1) - nb++; - else if (c <= Rune2) - nb += 2; - else if (c <= Rune3) - nb += 3; - else /* assert(c <= Rune4) */ - nb += 4; - } - return nb; -} - -int -fullrune(const char *str, int n) -{ - if (n > 0) { - int c = *(uchar*)str; - if (c < Tx) - return 1; - if (n > 1) { - if (c < T3) - return 1; - if (n > 2) { - if (c < T4 || n > 3) - return 1; - } - } - } - return 0; -} diff --git a/third_party/libphonenumber/cpp/src/utf/stringpiece.h b/third_party/libphonenumber/cpp/src/utf/stringpiece.h deleted file mode 100644 index 7b56772..0000000 --- a/third_party/libphonenumber/cpp/src/utf/stringpiece.h +++ /dev/null @@ -1,24 +0,0 @@ -/** - * Copyright 2010 Google Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef STRINGS_STRINGPIECE_H_ -#define STRINGS_STRINGPIECE_H_ - -//#include "third_party/chromium/src/base/string_piece.h" -#include "base/string_piece.h" - -using base::StringPiece; - -#endif // STRINGS_STRINGPIECE_H_ diff --git a/third_party/libphonenumber/cpp/src/utf/stringprintf.h b/third_party/libphonenumber/cpp/src/utf/stringprintf.h deleted file mode 100644 index 208d338f..0000000 --- a/third_party/libphonenumber/cpp/src/utf/stringprintf.h +++ /dev/null @@ -1,22 +0,0 @@ -/** - * Copyright 2010 Google Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#ifndef STRINGS_STRINGPRINTF_H_ -#define STRINGS_STRINGPRINTF_H_ - -//#include "third_party/chromium/src/base/string_util.h" -#include "base/string_util.h" - -#endif // STRINGS_STRINGPRINTF_H_ diff --git a/third_party/libphonenumber/cpp/src/utf/unicodetext.cc b/third_party/libphonenumber/cpp/src/utf/unicodetext.cc deleted file mode 100644 index 82c1b42..0000000 --- a/third_party/libphonenumber/cpp/src/utf/unicodetext.cc +++ /dev/null @@ -1,515 +0,0 @@ -// Copyright (C) 2006 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Author: Jim Meehan - -#include <iostream> -#include <sstream> -#include <cassert> - -#include "utf/unicodetext.h" -//#include "base/logging.h" -#include "utf/stringpiece.h" -//#include "utf/stringprintf.h" -#include "utf/utf.h" -#include "utf/unilib.h" - -using std::stringstream; -using std::max; -using std::hex; -using std::dec; -using std::cerr; -using std::endl; - -static int CodepointDistance(const char* start, const char* end) { - int n = 0; - // Increment n on every non-trail-byte. - for (const char* p = start; p < end; ++p) { - n += (*reinterpret_cast<const signed char*>(p) >= -0x40); - } - return n; -} - -static int CodepointCount(const char* utf8, int len) { - return CodepointDistance(utf8, utf8 + len); -} - -UnicodeText::const_iterator::difference_type -distance(const UnicodeText::const_iterator& first, - const UnicodeText::const_iterator& last) { - return CodepointDistance(first.it_, last.it_); -} - -// ---------- Utility ---------- - -static int ConvertToInterchangeValid(char* start, int len) { - // This routine is called only when we've discovered that a UTF-8 buffer - // that was passed to CopyUTF8, TakeOwnershipOfUTF8, or PointToUTF8 - // was not interchange valid. This indicates a bug in the caller, and - // a LOG(WARNING) is done in that case. - // This is similar to CoerceToInterchangeValid, but it replaces each - // structurally valid byte with a space, and each non-interchange - // character with a space, even when that character requires more - // than one byte in UTF8. E.g., "\xEF\xB7\x90" (U+FDD0) is - // structurally valid UTF8, but U+FDD0 is not an interchange-valid - // code point. The result should contain one space, not three. - // - // Since the conversion never needs to write more data than it - // reads, it is safe to change the buffer in place. It returns the - // number of bytes written. - char* const in = start; - char* out = start; - char* const end = start + len; - while (start < end) { - int good = UniLib::SpanInterchangeValid(start, end - start); - if (good > 0) { - if (out != start) { - memmove(out, start, good); - } - out += good; - start += good; - if (start == end) { - break; - } - } - // Is the current string invalid UTF8 or just non-interchange UTF8? - char32 rune; - int n; - if (isvalidcharntorune(start, end - start, &rune, &n)) { - // structurally valid UTF8, but not interchange valid - start += n; // Skip over the whole character. - } else { // bad UTF8 - start += 1; // Skip over just one byte - } - *out++ = ' '; - } - return out - in; -} - - -// *************** Data representation ********** - -// Note: the copy constructor is undefined. - -// After reserve(), resize(), or clear(), we're an owner, not an alias. - -void UnicodeText::Repr::reserve(int new_capacity) { - // If there's already enough capacity, and we're an owner, do nothing. - if (capacity_ >= new_capacity && ours_) return; - - // Otherwise, allocate a new buffer. - capacity_ = max(new_capacity, (3 * capacity_) / 2 + 20); - char* new_data = new char[capacity_]; - - // If there is an old buffer, copy it into the new buffer. - if (data_) { - memcpy(new_data, data_, size_); - if (ours_) delete[] data_; // If we owned the old buffer, free it. - } - data_ = new_data; - ours_ = true; // We own the new buffer. - // size_ is unchanged. -} - -void UnicodeText::Repr::resize(int new_size) { - if (new_size == 0) { - clear(); - } else { - if (!ours_ || new_size > capacity_) reserve(new_size); - // Clear the memory in the expanded part. - if (size_ < new_size) memset(data_ + size_, 0, new_size - size_); - size_ = new_size; - ours_ = true; - } -} - -// This implementation of clear() deallocates the buffer if we're an owner. -// That's not strictly necessary; we could just set size_ to 0. -void UnicodeText::Repr::clear() { - if (ours_) delete[] data_; - data_ = NULL; - size_ = capacity_ = 0; - ours_ = true; -} - -void UnicodeText::Repr::Copy(const char* data, int size) { - resize(size); - memcpy(data_, data, size); -} - -void UnicodeText::Repr::TakeOwnershipOf(char* data, int size, int capacity) { - if (data == data_) return; // We already own this memory. (Weird case.) - if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. - data_ = data; - size_ = size; - capacity_ = capacity; - ours_ = true; -} - -void UnicodeText::Repr::PointTo(const char* data, int size) { - if (ours_ && data_) delete[] data_; // If we owned the old buffer, free it. - data_ = const_cast<char*>(data); - size_ = size; - capacity_ = size; - ours_ = false; -} - -void UnicodeText::Repr::append(const char* bytes, int byte_length) { - reserve(size_ + byte_length); - memcpy(data_ + size_, bytes, byte_length); - size_ += byte_length; -} - -string UnicodeText::Repr::DebugString() const { - stringstream ss; - - ss << "{Repr " << hex << this << " data=" << data_ << " size=" << dec - << size_ << " capacity=" << capacity_ << " " - << (ours_ ? "Owned" : "Alias") << "}"; - - string result; - ss >> result; - - return result; -} - - - -// *************** UnicodeText ****************** - -// ----- Constructors ----- - -// Default constructor -UnicodeText::UnicodeText() { -} - -// Copy constructor -UnicodeText::UnicodeText(const UnicodeText& src) { - Copy(src); -} - -// Substring constructor -UnicodeText::UnicodeText(const UnicodeText::const_iterator& first, - const UnicodeText::const_iterator& last) { - assert(first <= last && "Incompatible iterators"); - repr_.append(first.it_, last.it_ - first.it_); -} - -string UnicodeText::UTF8Substring(const const_iterator& first, - const const_iterator& last) { - assert(first <= last && "Incompatible iterators"); - return string(first.it_, last.it_ - first.it_); -} - - -// ----- Copy ----- - -UnicodeText& UnicodeText::operator=(const UnicodeText& src) { - if (this != &src) { - Copy(src); - } - return *this; -} - -UnicodeText& UnicodeText::Copy(const UnicodeText& src) { - repr_.Copy(src.repr_.data_, src.repr_.size_); - return *this; -} - -UnicodeText& UnicodeText::CopyUTF8(const char* buffer, int byte_length) { - repr_.Copy(buffer, byte_length); - if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { - cerr << "UTF-8 buffer is not interchange-valid." << endl; - repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); - } - return *this; -} - -UnicodeText& UnicodeText::UnsafeCopyUTF8(const char* buffer, - int byte_length) { - repr_.Copy(buffer, byte_length); - return *this; -} - -// ----- TakeOwnershipOf ----- - -UnicodeText& UnicodeText::TakeOwnershipOfUTF8(char* buffer, - int byte_length, - int byte_capacity) { - repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); - if (!UniLib:: IsInterchangeValid(buffer, byte_length)) { - cerr << "UTF-8 buffer is not interchange-valid." << endl; - repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); - } - return *this; -} - -UnicodeText& UnicodeText::UnsafeTakeOwnershipOfUTF8(char* buffer, - int byte_length, - int byte_capacity) { - repr_.TakeOwnershipOf(buffer, byte_length, byte_capacity); - return *this; -} - -// ----- PointTo ----- - -UnicodeText& UnicodeText::PointToUTF8(const char* buffer, int byte_length) { - if (UniLib:: IsInterchangeValid(buffer, byte_length)) { - repr_.PointTo(buffer, byte_length); - } else { - cerr << "UTF-8 buffer is not interchange-valid." << endl; - repr_.Copy(buffer, byte_length); - repr_.size_ = ConvertToInterchangeValid(repr_.data_, byte_length); - } - return *this; -} - -UnicodeText& UnicodeText::UnsafePointToUTF8(const char* buffer, - int byte_length) { - repr_.PointTo(buffer, byte_length); - return *this; -} - -UnicodeText& UnicodeText::PointTo(const UnicodeText& src) { - repr_.PointTo(src.repr_.data_, src.repr_.size_); - return *this; -} - -UnicodeText& UnicodeText::PointTo(const const_iterator &first, - const const_iterator &last) { - assert(first <= last && " Incompatible iterators"); - repr_.PointTo(first.utf8_data(), last.utf8_data() - first.utf8_data()); - return *this; -} - -// ----- Append ----- - -UnicodeText& UnicodeText::append(const UnicodeText& u) { - repr_.append(u.repr_.data_, u.repr_.size_); - return *this; -} - -UnicodeText& UnicodeText::append(const const_iterator& first, - const const_iterator& last) { - assert(first <= last && "Incompatible iterators"); - repr_.append(first.it_, last.it_ - first.it_); - return *this; -} - -UnicodeText& UnicodeText::UnsafeAppendUTF8(const char* utf8, int len) { - repr_.append(utf8, len); - return *this; -} - -// ----- substring searching ----- - -UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look, - const_iterator start_pos) const { - assert(start_pos.utf8_data() >= utf8_data()); - assert(start_pos.utf8_data() <= utf8_data() + utf8_length()); - return UnsafeFind(look, start_pos); -} - -UnicodeText::const_iterator UnicodeText::find(const UnicodeText& look) const { - return UnsafeFind(look, begin()); -} - -UnicodeText::const_iterator UnicodeText::UnsafeFind( - const UnicodeText& look, const_iterator start_pos) const { - // Due to the magic of the UTF8 encoding, searching for a sequence of - // letters is equivalent to substring search. - StringPiece searching(utf8_data(), utf8_length()); - StringPiece look_piece(look.utf8_data(), look.utf8_length()); - StringPiece::size_type found = - searching.find(look_piece, start_pos.utf8_data() - utf8_data()); - if (found == StringPiece::npos) return end(); - return const_iterator(utf8_data() + found); -} - -bool UnicodeText::HasReplacementChar() const { - // Equivalent to: - // UnicodeText replacement_char; - // replacement_char.push_back(0xFFFD); - // return find(replacement_char) != end(); - StringPiece searching(utf8_data(), utf8_length()); - StringPiece looking_for("\xEF\xBF\xBD", 3); - return searching.find(looking_for) != StringPiece::npos; -} - -// ----- other methods ----- - -// Clear operator -void UnicodeText::clear() { - repr_.clear(); -} - -// Destructor -UnicodeText::~UnicodeText() {} - - -void UnicodeText::push_back(char32 c) { - if (UniLib::IsValidCodepoint(c)) { - char buf[UTFmax]; - int len = runetochar(buf, &c); - if (UniLib::IsInterchangeValid(buf, len)) { - repr_.append(buf, len); - } else { - cerr << "Unicode value 0x" << hex << c - << " is not valid for interchange" << endl; - repr_.append(" ", 1); - } - } else { - cerr << "Illegal Unicode value: 0x" << hex << c << endl; - repr_.append(" ", 1); - } -} - -int UnicodeText::size() const { - return CodepointCount(repr_.data_, repr_.size_); -} - -bool operator==(const UnicodeText& lhs, const UnicodeText& rhs) { - if (&lhs == &rhs) return true; - if (lhs.repr_.size_ != rhs.repr_.size_) return false; - return memcmp(lhs.repr_.data_, rhs.repr_.data_, lhs.repr_.size_) == 0; -} - -string UnicodeText::DebugString() const { - stringstream ss; - - ss << "{UnicodeText " << hex << this << dec << " chars=" - << size() << " repr=" << repr_.DebugString() << "}"; -#if 0 - return StringPrintf("{UnicodeText %p chars=%d repr=%s}", - this, - size(), - repr_.DebugString().c_str()); -#endif - string result; - ss >> result; - - return result; -} - - -// ******************* UnicodeText::const_iterator ********************* - -// The implementation of const_iterator would be nicer if it -// inherited from boost::iterator_facade -// (http://boost.org/libs/iterator/doc/iterator_facade.html). - -UnicodeText::const_iterator::const_iterator() : it_(0) {} - -UnicodeText::const_iterator::const_iterator(const const_iterator& other) - : it_(other.it_) { -} - -UnicodeText::const_iterator& -UnicodeText::const_iterator::operator=(const const_iterator& other) { - if (&other != this) - it_ = other.it_; - return *this; -} - -UnicodeText::const_iterator UnicodeText::begin() const { - return const_iterator(repr_.data_); -} - -UnicodeText::const_iterator UnicodeText::end() const { - return const_iterator(repr_.data_ + repr_.size_); -} - -bool operator<(const UnicodeText::const_iterator& lhs, - const UnicodeText::const_iterator& rhs) { - return lhs.it_ < rhs.it_; -} - -char32 UnicodeText::const_iterator::operator*() const { - // (We could call chartorune here, but that does some - // error-checking, and we're guaranteed that our data is valid - // UTF-8. Also, we expect this routine to be called very often. So - // for speed, we do the calculation ourselves.) - - // Convert from UTF-8 - int byte1 = it_[0]; - if (byte1 < 0x80) - return byte1; - - int byte2 = it_[1]; - if (byte1 < 0xE0) - return ((byte1 & 0x1F) << 6) - | (byte2 & 0x3F); - - int byte3 = it_[2]; - if (byte1 < 0xF0) - return ((byte1 & 0x0F) << 12) - | ((byte2 & 0x3F) << 6) - | (byte3 & 0x3F); - - int byte4 = it_[3]; - return ((byte1 & 0x07) << 18) - | ((byte2 & 0x3F) << 12) - | ((byte3 & 0x3F) << 6) - | (byte4 & 0x3F); -} - -UnicodeText::const_iterator& UnicodeText::const_iterator::operator++() { - it_ += UniLib::OneCharLen(it_); - return *this; -} - -UnicodeText::const_iterator& UnicodeText::const_iterator::operator--() { - while (UniLib::IsTrailByte(*--it_)); - return *this; -} - -int UnicodeText::const_iterator::get_utf8(char* utf8_output) const { - utf8_output[0] = it_[0]; - if (static_cast<unsigned char>(it_[0]) < 0x80) - return 1; - - utf8_output[1] = it_[1]; - if (static_cast<unsigned char>(it_[0]) < 0xE0) - return 2; - - utf8_output[2] = it_[2]; - if (static_cast<unsigned char>(it_[0]) < 0xF0) - return 3; - - utf8_output[3] = it_[3]; - return 4; -} - - -UnicodeText::const_iterator UnicodeText::MakeIterator(const char* p) const { - assert(p != NULL); - const char* start = utf8_data(); - int len = utf8_length(); - const char* end = start + len; - assert(p >= start); - assert(p <= end); - assert(p == end || !UniLib::IsTrailByte(*p)); - return const_iterator(p); -} - -string UnicodeText::const_iterator::DebugString() const { - stringstream ss; - - ss << "{iter " << hex << it_ << "}"; - string result; - ss >> result; - - return result; -} - diff --git a/third_party/libphonenumber/cpp/src/utf/unicodetext.h b/third_party/libphonenumber/cpp/src/utf/unicodetext.h deleted file mode 100644 index fb37a33..0000000 --- a/third_party/libphonenumber/cpp/src/utf/unicodetext.h +++ /dev/null @@ -1,456 +0,0 @@ -// Copyright (C) 2006 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -// Author: Jim Meehan - -#ifndef UTIL_UTF8_UNICODETEXT_H__ -#define UTIL_UTF8_UNICODETEXT_H__ - -#include <iterator> -#include <string> -#include <utility> -#include "base/basictypes.h" -//#include "util/utf8/public/config.h" - -using std::string; -using std::bidirectional_iterator_tag; -using std::pair; - -// ***************************** UnicodeText ************************** -// -// A UnicodeText object is a container for a sequence of Unicode -// codepoint values. It has default, copy, and assignment constructors. -// Data can be appended to it from another UnicodeText, from -// iterators, or from a single codepoint. -// -// The internal representation of the text is UTF-8. Since UTF-8 is a -// variable-width format, UnicodeText does not provide random access -// to the text, and changes to the text are permitted only at the end. -// -// The UnicodeText class defines a const_iterator. The dereferencing -// operator (*) returns a codepoint (char32). The iterator is a -// bidirectional, read-only iterator. It becomes invalid if the text -// is changed. -// -// There are methods for appending and retrieving UTF-8 data directly. -// The 'utf8_data' method returns a const char* that contains the -// UTF-8-encoded version of the text; 'utf8_length' returns the number -// of bytes in the UTF-8 data. An iterator's 'get' method stores up to -// 4 bytes of UTF-8 data in a char array and returns the number of -// bytes that it stored. -// -// Codepoints are integers in the range [0, 0xD7FF] or [0xE000, -// 0x10FFFF], but UnicodeText has the additional restriction that it -// can contain only those characters that are valid for interchange on -// the Web. This excludes all of the control codes except for carriage -// return, line feed, and horizontal tab. It also excludes -// non-characters, but codepoints that are in the Private Use regions -// are allowed, as are codepoints that are unassigned. (See the -// Unicode reference for details.) The function UniLib::IsInterchangeValid -// can be used as a test for this property. -// -// UnicodeTexts are safe. Every method that constructs or modifies a -// UnicodeText tests for interchange-validity, and will substitute a -// space for the invalid data. Such cases are reported via -// LOG(WARNING). -// -// MEMORY MANAGEMENT: copy, take ownership, or point to -// -// A UnicodeText is either an "owner", meaning that it owns the memory -// for the data buffer and will free it when the UnicodeText is -// destroyed, or it is an "alias", meaning that it does not. -// -// There are three methods for storing UTF-8 data in a UnicodeText: -// -// CopyUTF8(buffer, len) copies buffer. -// -// TakeOwnershipOfUTF8(buffer, size, capacity) takes ownership of buffer. -// -// PointToUTF8(buffer, size) creates an alias pointing to buffer. -// -// All three methods perform a validity check on the buffer. There are -// private, "unsafe" versions of these functions that bypass the -// validity check. They are used internally and by friend-functions -// that are handling UTF-8 data that has already been validated. -// -// The purpose of an alias is to avoid making an unnecessary copy of a -// UTF-8 buffer while still providing access to the Unicode values -// within that text through iterators or the fast scanners that are -// based on UTF-8 state tables. The lifetime of an alias must not -// exceed the lifetime of the buffer from which it was constructed. -// -// The semantics of an alias might be described as "copy on write or -// repair." The source data is never modified. If push_back() or -// append() is called on an alias, a copy of the data will be created, -// and the UnicodeText will become an owner. If clear() is called on -// an alias, it becomes an (empty) owner. -// -// The copy constructor and the assignment operator produce an owner. -// That is, after direct initialization ("UnicodeText x(y);") or copy -// initialization ("UnicodeText x = y;") x will be an owner, even if y -// was an alias. The assignment operator ("x = y;") also produces an -// owner unless x and y are the same object and y is an alias. -// -// Aliases should be used with care. If the source from which an alias -// was created is freed, or if the contents are changed, while the -// alias is still in use, fatal errors could result. But it can be -// quite useful to have a UnicodeText "window" through which to see a -// UTF-8 buffer without having to pay the price of making a copy. -// -// UTILITIES -// -// The interfaces in util/utf8/public/textutils.h provide higher-level -// utilities for dealing with UnicodeTexts, including routines for -// creating UnicodeTexts (both owners and aliases) from UTF-8 buffers or -// strings, creating strings from UnicodeTexts, normalizing text for -// efficient matching or display, and others. - -class UnicodeText { - public: - class const_iterator; - - typedef char32 value_type; - - // Constructors. These always produce owners. - UnicodeText(); // Create an empty text. - UnicodeText(const UnicodeText& src); // copy constructor - // Construct a substring (copies the data). - UnicodeText(const const_iterator& first, const const_iterator& last); - - // Assignment operator. This copies the data and produces an owner - // unless this == &src, e.g., "x = x;", which is a no-op. - UnicodeText& operator=(const UnicodeText& src); - - // x.Copy(y) copies the data from y into x. - UnicodeText& Copy(const UnicodeText& src); - inline UnicodeText& assign(const UnicodeText& src) { return Copy(src); } - - // x.PointTo(y) changes x so that it points to y's data. - // It does not copy y or take ownership of y's data. - UnicodeText& PointTo(const UnicodeText& src); - UnicodeText& PointTo(const const_iterator& first, - const const_iterator& last); - - ~UnicodeText(); - - void clear(); // Clear text. - bool empty() { return repr_.size_ == 0; } // Test if text is empty. - - // Add a codepoint to the end of the text. - // If the codepoint is not interchange-valid, add a space instead - // and log a warning. - void push_back(char32 codepoint); - - // Generic appending operation. - // iterator_traits<ForwardIterator>::value_type must be implicitly - // convertible to char32. Typical uses of this method might include: - // char32 chars[] = {0x1, 0x2, ...}; - // vector<char32> more_chars = ...; - // utext.append(chars, chars+arraysize(chars)); - // utext.append(more_chars.begin(), more_chars.end()); - template<typename ForwardIterator> - UnicodeText& append(ForwardIterator first, const ForwardIterator last) { - while (first != last) { push_back(*first++); } - return *this; - } - - // A specialization of the generic append() method. - UnicodeText& append(const const_iterator& first, const const_iterator& last); - - // An optimization of append(source.begin(), source.end()). - UnicodeText& append(const UnicodeText& source); - - int size() const; // the number of Unicode characters (codepoints) - - friend bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); - friend bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs); - - class const_iterator { - typedef const_iterator CI; - public: - typedef bidirectional_iterator_tag iterator_category; - typedef char32 value_type; - typedef ptrdiff_t difference_type; - typedef void pointer; // (Not needed.) - typedef const char32 reference; // (Needed for const_reverse_iterator) - - // Iterators are default-constructible. - const_iterator(); - - // It's safe to make multiple passes over a UnicodeText. - const_iterator(const const_iterator& other); - const_iterator& operator=(const const_iterator& other); - - char32 operator*() const; // Dereference - - const_iterator& operator++(); // Advance (++iter) - const_iterator operator++(int) { // (iter++) - const_iterator result(*this); - ++*this; - return result; - } - - const_iterator& operator--(); // Retreat (--iter) - const_iterator operator--(int) { // (iter--) - const_iterator result(*this); - --*this; - return result; - } - - // We love relational operators. - friend bool operator==(const CI& lhs, const CI& rhs) { - return lhs.it_ == rhs.it_; } - friend bool operator!=(const CI& lhs, const CI& rhs) { - return !(lhs == rhs); } - friend bool operator<(const CI& lhs, const CI& rhs); - friend bool operator>(const CI& lhs, const CI& rhs) { - return rhs < lhs; } - friend bool operator<=(const CI& lhs, const CI& rhs) { - return !(rhs < lhs); } - friend bool operator>=(const CI& lhs, const CI& rhs) { - return !(lhs < rhs); } - - friend difference_type distance(const CI& first, const CI& last); - - // UTF-8-specific methods - // Store the UTF-8 encoding of the current codepoint into buf, - // which must be at least 4 bytes long. Return the number of - // bytes written. - int get_utf8(char* buf) const; - // Return the iterator's pointer into the UTF-8 data. - const char* utf8_data() const { return it_; } - - string DebugString() const; - - private: - friend class UnicodeText; - friend class UnicodeTextUtils; - friend class UTF8StateTableProperty; - explicit const_iterator(const char* it) : it_(it) {} - - const char* it_; - }; - - const_iterator begin() const; - const_iterator end() const; - - class const_reverse_iterator : public std::reverse_iterator<const_iterator> { - public: - const_reverse_iterator(const_iterator it) : - std::reverse_iterator<const_iterator>(it) {} - const char* utf8_data() const { - const_iterator tmp_it = base(); - return (--tmp_it).utf8_data(); - } - int get_utf8(char* buf) const { - const_iterator tmp_it = base(); - return (--tmp_it).get_utf8(buf); - } - }; - const_reverse_iterator rbegin() const { - return const_reverse_iterator(end()); - } - const_reverse_iterator rend() const { - return const_reverse_iterator(begin()); - } - - // Substring searching. Returns the beginning of the first - // occurrence of "look", or end() if not found. - const_iterator find(const UnicodeText& look, const_iterator start_pos) const; - // Equivalent to find(look, begin()) - const_iterator find(const UnicodeText& look) const; - - // Returns whether this contains the character U+FFFD. This can - // occur, for example, if the input to Encodings::Decode() had byte - // sequences that were invalid in the source encoding. - bool HasReplacementChar() const; - - // UTF-8-specific methods - // - // Return the data, length, and capacity of UTF-8-encoded version of - // the text. Length and capacity are measured in bytes. - const char* utf8_data() const { return repr_.data_; } - int utf8_length() const { return repr_.size_; } - int utf8_capacity() const { return repr_.capacity_; } - - // Return the UTF-8 data as a string. - static string UTF8Substring(const const_iterator& first, - const const_iterator& last); - - // There are three methods for initializing a UnicodeText from UTF-8 - // data. They vary in details of memory management. In all cases, - // the data is tested for interchange-validity. If it is not - // interchange-valid, a LOG(WARNING) is issued, and each - // structurally invalid byte and each interchange-invalid codepoint - // is replaced with a space. - - // x.CopyUTF8(buf, len) copies buf into x. - UnicodeText& CopyUTF8(const char* utf8_buffer, int byte_length); - - // x.TakeOwnershipOfUTF8(buf, len, capacity). x takes ownership of - // buf. buf is not copied. - UnicodeText& TakeOwnershipOfUTF8(char* utf8_buffer, - int byte_length, - int byte_capacity); - - // x.PointToUTF8(buf,len) changes x so that it points to buf - // ("becomes an alias"). It does not take ownership or copy buf. - // If the buffer is not valid, this has the same effect as - // CopyUTF8(utf8_buffer, byte_length). - UnicodeText& PointToUTF8(const char* utf8_buffer, int byte_length); - - // Occasionally it is necessary to use functions that operate on the - // pointer returned by utf8_data(). MakeIterator(p) provides a way - // to get back to the UnicodeText level. It uses CHECK to ensure - // that p is a pointer within this object's UTF-8 data, and that it - // points to the beginning of a character. - const_iterator MakeIterator(const char* p) const; - - string DebugString() const; - - private: - friend class const_iterator; - friend class UnicodeTextUtils; - - class Repr { // A byte-string. - public: - char* data_; - int size_; - int capacity_; - bool ours_; // Do we own data_? - - Repr() : data_(NULL), size_(0), capacity_(0), ours_(true) {} - ~Repr() { if (ours_) delete[] data_; } - - void clear(); - void reserve(int capacity); - void resize(int size); - - void append(const char* bytes, int byte_length); - void Copy(const char* data, int size); - void TakeOwnershipOf(char* data, int size, int capacity); - void PointTo(const char* data, int size); - - string DebugString() const; - - private: - Repr& operator=(const Repr&); - Repr(const Repr& other); - }; - - Repr repr_; - - // UTF-8-specific private methods. - // These routines do not perform a validity check when compiled - // in opt mode. - // It is an error to call these methods with UTF-8 data that - // is not interchange-valid. - // - UnicodeText& UnsafeCopyUTF8(const char* utf8_buffer, int byte_length); - UnicodeText& UnsafeTakeOwnershipOfUTF8( - char* utf8_buffer, int byte_length, int byte_capacity); - UnicodeText& UnsafePointToUTF8(const char* utf8_buffer, int byte_length); - UnicodeText& UnsafeAppendUTF8(const char* utf8_buffer, int byte_length); - const_iterator UnsafeFind(const UnicodeText& look, - const_iterator start_pos) const; -}; - -bool operator==(const UnicodeText& lhs, const UnicodeText& rhs); - -inline bool operator!=(const UnicodeText& lhs, const UnicodeText& rhs) { - return !(lhs == rhs); -} - -// UnicodeTextRange is a pair of iterators, useful for specifying text -// segments. If the iterators are ==, the segment is empty. -typedef pair<UnicodeText::const_iterator, - UnicodeText::const_iterator> UnicodeTextRange; - -inline bool UnicodeTextRangeIsEmpty(const UnicodeTextRange& r) { - return r.first == r.second; -} - - -// *************************** Utilities ************************* - -// A factory function for creating a UnicodeText from a buffer of -// UTF-8 data. The new UnicodeText takes ownership of the buffer. (It -// is an "owner.") -// -// Each byte that is structurally invalid will be replaced with a -// space. Each codepoint that is interchange-invalid will also be -// replaced with a space, even if the codepoint was represented with a -// multibyte sequence in the UTF-8 data. -// -inline UnicodeText MakeUnicodeTextAcceptingOwnership( - char* utf8_buffer, int byte_length, int byte_capacity) { - return UnicodeText().TakeOwnershipOfUTF8( - utf8_buffer, byte_length, byte_capacity); -} - -// A factory function for creating a UnicodeText from a buffer of -// UTF-8 data. The new UnicodeText does not take ownership of the -// buffer. (It is an "alias.") -// -inline UnicodeText MakeUnicodeTextWithoutAcceptingOwnership( - const char* utf8_buffer, int byte_length) { - return UnicodeText().PointToUTF8(utf8_buffer, byte_length); -} - -// Create a UnicodeText from a UTF-8 string or buffer. -// -// If do_copy is true, then a copy of the string is made. The copy is -// owned by the resulting UnicodeText object and will be freed when -// the object is destroyed. This UnicodeText object is referred to -// as an "owner." -// -// If do_copy is false, then no copy is made. The resulting -// UnicodeText object does NOT take ownership of the string; in this -// case, the lifetime of the UnicodeText object must not exceed the -// lifetime of the string. This Unicodetext object is referred to as -// an "alias." This is the same as MakeUnicodeTextWithoutAcceptingOwnership. -// -// If the input string does not contain valid UTF-8, then a copy is -// made (as if do_copy were true) and coerced to valid UTF-8 by -// replacing each invalid byte with a space. -// -inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len, - bool do_copy) { - UnicodeText t; - if (do_copy) { - t.CopyUTF8(utf8_buf, len); - } else { - t.PointToUTF8(utf8_buf, len); - } - return t; -} - -inline UnicodeText UTF8ToUnicodeText(const string& utf_string, bool do_copy) { - return UTF8ToUnicodeText(utf_string.data(), utf_string.size(), do_copy); -} - -inline UnicodeText UTF8ToUnicodeText(const char* utf8_buf, int len) { - return UTF8ToUnicodeText(utf8_buf, len, true); -} -inline UnicodeText UTF8ToUnicodeText(const string& utf8_string) { - return UTF8ToUnicodeText(utf8_string, true); -} - -// Return a string containing the UTF-8 encoded version of all the -// Unicode characters in t. -inline string UnicodeTextToUTF8(const UnicodeText& t) { - return string(t.utf8_data(), t.utf8_length()); -} - -#endif // UTIL_UTF8_UNICODETEXT_H__ diff --git a/third_party/libphonenumber/cpp/src/utf/unilib.cc b/third_party/libphonenumber/cpp/src/utf/unilib.cc deleted file mode 100644 index 6d90954..0000000 --- a/third_party/libphonenumber/cpp/src/utf/unilib.cc +++ /dev/null @@ -1,64 +0,0 @@ -/** - * Copyright 2010 Google Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Author: Shawn Ligocki - -#include "utf/unilib.h" - -#include "base/basictypes.h" -#include "utf/utf.h" - -namespace UniLib { - -namespace { - -// MOE: start_strip -// MOE: end_strip -// Codepoints not allowed for interchange are: -// C0 (ASCII) controls: U+0000 to U+001F excluding Space (SP, U+0020), -// Horizontal Tab (HT, U+0009), Line-Feed (LF, U+000A), -// Form Feed (FF, U+000C) and Carriage-Return (CR, U+000D) -// C1 controls: U+007F to U+009F -// Surrogates: U+D800 to U+DFFF -// Non-characters: U+FDD0 to U+FDEF and U+xxFFFE to U+xxFFFF for all xx -inline bool IsInterchangeValidCodepoint(char32 c) { - return !((c >= 0x00 && c <= 0x08) || c == 0x0B || (c >= 0x0E && c <= 0x1F) || - (c >= 0x7F && c <= 0x9F) || - (c >= 0xD800 && c <= 0xDFFF) || - (c >= 0xFDD0 && c <= 0xFDEF) || (c&0xFFFE) == 0xFFFE); -} - -} // namespace - -int SpanInterchangeValid(const char* begin, int byte_length) { - char32 rune; - const char* p = begin; - const char* end = begin + byte_length; - while (p < end) { - int bytes_consumed = charntorune(&rune, p, end - p); - // We want to accept Runeerror == U+FFFD as a valid char, but it is used - // by chartorune to indicate error. Luckily, the real codepoint is size 3 - // while errors return bytes_consumed == 1. - if ((rune == Runeerror && bytes_consumed == 1) || - !IsInterchangeValidCodepoint(rune)) { - break; // Found - } - p += bytes_consumed; - } - return p - begin; -} - -} // namespace UniLib diff --git a/third_party/libphonenumber/cpp/src/utf/unilib.h b/third_party/libphonenumber/cpp/src/utf/unilib.h deleted file mode 100644 index 4cfc787..0000000 --- a/third_party/libphonenumber/cpp/src/utf/unilib.h +++ /dev/null @@ -1,95 +0,0 @@ -/** - * Copyright 2010 Google Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -// Routines to do manipulation of Unicode characters or text -// -// The StructurallyValid routines accept buffers of arbitrary bytes. -// For CoerceToStructurallyValid(), the input buffer and output buffers may -// point to exactly the same memory. -// -// In all other cases, the UTF-8 string must be structurally valid and -// have all codepoints in the range U+0000 to U+D7FF or U+E000 to U+10FFFF. -// Debug builds take a fatal error for invalid UTF-8 input. -// The input and output buffers may not overlap at all. -// -// The char32 routines are here only for convenience; they convert to UTF-8 -// internally and use the UTF-8 routines. - -#ifndef UTIL_UTF8_UNILIB_H__ -#define UTIL_UTF8_UNILIB_H__ - -#include <string> -#include "base/basictypes.h" - -namespace UniLib { - -// Returns true unless a surrogate code point -inline bool IsValidCodepoint(char32 c) { - // In the range [0, 0xD800) or [0xE000, 0x10FFFF] - return (static_cast<uint32>(c) < 0xD800) - || (c >= 0xE000 && c <= 0x10FFFF); -} - -// Table of UTF-8 character lengths, based on first byte -static const unsigned char kUTF8LenTbl[256] = { - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1, - 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2, - 3,3,3,3,3,3,3,3, 3,3,3,3,3,3,3,3, 4,4,4,4,4,4,4,4, 4,4,4,4,4,4,4,4 -}; - -// Return length of a single UTF-8 source character -inline int OneCharLen(const char* src) { - return kUTF8LenTbl[*reinterpret_cast<const uint8*>(src)]; -} - -// Return length of a single UTF-8 source character -inline int OneCharLen(const uint8* src) { - return kUTF8LenTbl[*src]; -} - -// Return true if this byte is a trailing UTF-8 byte (10xx xxxx) -inline bool IsTrailByte(char x) { - // return (x & 0xC0) == 0x80; - // Since trail bytes are always in [0x80, 0xBF], we can optimize: - return static_cast<signed char>(x) < -0x40; -} - -// Returns the length in bytes of the prefix of src that is all -// interchange valid UTF-8 -int SpanInterchangeValid(const char* src, int byte_length); -inline int SpanInterchangeValid(const std::string& src) { - return SpanInterchangeValid(src.data(), src.size()); -} - -// Returns true if the source is all interchange valid UTF-8 -// "Interchange valid" is a stronger than structurally valid -- -// no C0 or C1 control codes (other than CR LF HT FF) and no non-characters. -inline bool IsInterchangeValid(const char* src, int byte_length) { - return (byte_length == SpanInterchangeValid(src, byte_length)); -} -inline bool IsInterchangeValid(const std::string& src) { - return IsInterchangeValid(src.data(), src.size()); -} - -} // namespace UniLib - -#endif // UTIL_UTF8_PUBLIC_UNILIB_H_ diff --git a/third_party/libphonenumber/cpp/src/utf/utf.h b/third_party/libphonenumber/cpp/src/utf/utf.h deleted file mode 100644 index f4fd482..0000000 --- a/third_party/libphonenumber/cpp/src/utf/utf.h +++ /dev/null @@ -1,251 +0,0 @@ -/* - * The authors of this software are Rob Pike and Ken Thompson. - * Copyright (c) 1998-2002 by Lucent Technologies. - * Portions Copyright (c) 2009 The Go Authors. All rights reserved. - * Permission to use, copy, modify, and distribute this software for any - * purpose without fee is hereby granted, provided that this entire notice - * is included in all copies of any software which is or includes a copy - * or modification of this software and in all copies of the supporting - * documentation for such software. - * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED - * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY - * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY - * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. - */ - -#ifndef _UTFH_ -#define _UTFH_ 1 - -// stdint.h content doesn't seem to be used in this file and doesn't exist on -// Windows, therefore we comment it out here so that the code could be compiled -// on Windows. -//#include <stdint.h> - -typedef signed int Rune; /* Code-point values in Unicode 4.0 are 21 bits wide.*/ - -enum -{ - UTFmax = 4, /* maximum bytes per rune */ - Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ - Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0xFFFD, /* decoding error in UTF */ - Runemax = 0x10FFFF, /* maximum rune value */ -}; - -#ifdef __cplusplus -extern "C" { -#endif - -/* - * rune routines - */ - -/* - * These routines were written by Rob Pike and Ken Thompson - * and first appeared in Plan 9. - * SEE ALSO - * utf (7) - * tcs (1) -*/ - -// runetochar copies (encodes) one rune, pointed to by r, to at most -// UTFmax bytes starting at s and returns the number of bytes generated. - -int runetochar(char* s, const Rune* r); - - -// chartorune copies (decodes) at most UTFmax bytes starting at s to -// one rune, pointed to by r, and returns the number of bytes consumed. -// If the input is not exactly in UTF format, chartorune will set *r -// to Runeerror and return 1. -// -// Note: There is no special case for a "null-terminated" string. A -// string whose first byte has the value 0 is the UTF8 encoding of the -// Unicode value 0 (i.e., ASCII NULL). A byte value of 0 is illegal -// anywhere else in a UTF sequence. - -int chartorune(Rune* r, const char* s); - - -// charntorune is like chartorune, except that it will access at most -// n bytes of s. If the UTF sequence is incomplete within n bytes, -// charntorune will set *r to Runeerror and return 0. If it is complete -// but not in UTF format, it will set *r to Runeerror and return 1. -// -// Added 2004-09-24 by Wei-Hwa Huang - -int charntorune(Rune* r, const char* s, int n); - -// isvalidcharntorune(str, n, r, consumed) -// is a convenience function that calls "*consumed = charntorune(r, str, n)" -// and returns an int (logically boolean) indicating whether the first -// n bytes of str was a valid and complete UTF sequence. - -int isvalidcharntorune(const char* str, int n, Rune* r, int* consumed); - -// runelen returns the number of bytes required to convert r into UTF. - -int runelen(Rune r); - - -// runenlen returns the number of bytes required to convert the n -// runes pointed to by r into UTF. - -int runenlen(const Rune* r, int n); - - -// fullrune returns 1 if the string s of length n is long enough to be -// decoded by chartorune, and 0 otherwise. This does not guarantee -// that the string contains a legal UTF encoding. This routine is used -// by programs that obtain input one byte at a time and need to know -// when a full rune has arrived. - -int fullrune(const char* s, int n); - -// The following routines are analogous to the corresponding string -// routines with "utf" substituted for "str", and "rune" substituted -// for "chr". - -// utflen returns the number of runes that are represented by the UTF -// string s. (cf. strlen) - -int utflen(const char* s); - - -// utfnlen returns the number of complete runes that are represented -// by the first n bytes of the UTF string s. If the last few bytes of -// the string contain an incompletely coded rune, utfnlen will not -// count them; in this way, it differs from utflen, which includes -// every byte of the string. (cf. strnlen) - -int utfnlen(const char* s, long n); - - -// utfrune returns a pointer to the first occurrence of rune r in the -// UTF string s, or 0 if r does not occur in the string. The NULL -// byte terminating a string is considered to be part of the string s. -// (cf. strchr) - -const char* utfrune(const char* s, Rune r); - - -// utfrrune returns a pointer to the last occurrence of rune r in the -// UTF string s, or 0 if r does not occur in the string. The NULL -// byte terminating a string is considered to be part of the string s. -// (cf. strrchr) - -const char* utfrrune(const char* s, Rune r); - - -// utfutf returns a pointer to the first occurrence of the UTF string -// s2 as a UTF substring of s1, or 0 if there is none. If s2 is the -// null string, utfutf returns s1. (cf. strstr) - -const char* utfutf(const char* s1, const char* s2); - - -// utfecpy copies UTF sequences until a null sequence has been copied, -// but writes no sequences beyond es1. If any sequences are copied, -// s1 is terminated by a null sequence, and a pointer to that sequence -// is returned. Otherwise, the original s1 is returned. (cf. strecpy) - -char* utfecpy(char *s1, char *es1, const char *s2); - - - -// These functions are rune-string analogues of the corresponding -// functions in strcat (3). -// -// These routines first appeared in Plan 9. -// SEE ALSO -// memmove (3) -// rune (3) -// strcat (2) -// -// BUGS: The outcome of overlapping moves varies among implementations. - -Rune* runestrcat(Rune* s1, const Rune* s2); -Rune* runestrncat(Rune* s1, const Rune* s2, long n); - -const Rune* runestrchr(const Rune* s, Rune c); - -int runestrcmp(const Rune* s1, const Rune* s2); -int runestrncmp(const Rune* s1, const Rune* s2, long n); - -Rune* runestrcpy(Rune* s1, const Rune* s2); -Rune* runestrncpy(Rune* s1, const Rune* s2, long n); -Rune* runestrecpy(Rune* s1, Rune* es1, const Rune* s2); - -Rune* runestrdup(const Rune* s); - -const Rune* runestrrchr(const Rune* s, Rune c); -long runestrlen(const Rune* s); -const Rune* runestrstr(const Rune* s1, const Rune* s2); - - - -// The following routines test types and modify cases for Unicode -// characters. Unicode defines some characters as letters and -// specifies three cases: upper, lower, and title. Mappings among the -// cases are also defined, although they are not exhaustive: some -// upper case letters have no lower case mapping, and so on. Unicode -// also defines several character properties, a subset of which are -// checked by these routines. These routines are based on Unicode -// version 3.0.0. -// -// NOTE: The routines are implemented in C, so the boolean functions -// (e.g., isupperrune) return 0 for false and 1 for true. -// -// -// toupperrune, tolowerrune, and totitlerune are the Unicode case -// mappings. These routines return the character unchanged if it has -// no defined mapping. - -Rune toupperrune(Rune r); -Rune tolowerrune(Rune r); -Rune totitlerune(Rune r); - - -// isupperrune tests for upper case characters, including Unicode -// upper case letters and targets of the toupper mapping. islowerrune -// and istitlerune are defined analogously. - -int isupperrune(Rune r); -int islowerrune(Rune r); -int istitlerune(Rune r); - - -// isalpharune tests for Unicode letters; this includes ideographs in -// addition to alphabetic characters. - -int isalpharune(Rune r); - - -// isdigitrune tests for digits. Non-digit numbers, such as Roman -// numerals, are not included. - -int isdigitrune(Rune r); - - -// isideographicrune tests for ideographic characters and numbers, as -// defined by the Unicode standard. - -int isideographicrune(Rune r); - - -// isspacerune tests for whitespace characters, including "C" locale -// whitespace, Unicode defined whitespace, and the "zero-width -// non-break space" character. - -int isspacerune(Rune r); - - -// (The comments in this file were copied from the manpage files rune.3, -// isalpharune.3, and runestrcat.3. Some formatting changes were also made -// to conform to Google style. /JRM 11/11/05) - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/third_party/libphonenumber/cpp/src/utf/utfdef.h b/third_party/libphonenumber/cpp/src/utf/utfdef.h deleted file mode 100644 index adc6d95..0000000 --- a/third_party/libphonenumber/cpp/src/utf/utfdef.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * The authors of this software are Rob Pike and Ken Thompson. - * Copyright (c) 1998-2002 by Lucent Technologies. - * Permission to use, copy, modify, and distribute this software for any - * purpose without fee is hereby granted, provided that this entire notice - * is included in all copies of any software which is or includes a copy - * or modification of this software and in all copies of the supporting - * documentation for such software. - * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED - * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE ANY - * REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY - * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. - */ - -#define uchar _utfuchar -#define ushort _utfushort -#define uint _utfuint -#define ulong _utfulong -#define vlong _utfvlong -#define uvlong _utfuvlong - -typedef unsigned char uchar; -typedef unsigned short ushort; -typedef unsigned int uint; -typedef unsigned long ulong; - -#define nelem(x) (sizeof(x)/sizeof((x)[0])) -#define nil ((void*)0) |