diff options
author | avi@chromium.org <avi@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-06-08 06:05:47 +0000 |
---|---|---|
committer | avi@chromium.org <avi@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2013-06-08 06:05:47 +0000 |
commit | 8ad97adcaf4f453660f3d3057995ef7cd5730faa (patch) | |
tree | d549b8276831be0e10b132b01b878202e0187cc4 /base/strings | |
parent | 67c0e08dc5d8c31f7c071f2f47e1dac4134b95ae (diff) | |
download | chromium_src-8ad97adcaf4f453660f3d3057995ef7cd5730faa.zip chromium_src-8ad97adcaf4f453660f3d3057995ef7cd5730faa.tar.gz chromium_src-8ad97adcaf4f453660f3d3057995ef7cd5730faa.tar.bz2 |
Move string files in base/ to the string subdirectory.
BUG=247723
TEST=no change
Review URL: https://chromiumcodereview.appspot.com/16331011
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@205050 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'base/strings')
-rw-r--r-- | base/strings/string16.cc | 82 | ||||
-rw-r--r-- | base/strings/string16.h | 189 | ||||
-rw-r--r-- | base/strings/string16_unittest.cc | 54 | ||||
-rw-r--r-- | base/strings/string_util.cc | 1011 | ||||
-rw-r--r-- | base/strings/string_util.h | 576 | ||||
-rw-r--r-- | base/strings/string_util_constants.cc | 55 | ||||
-rw-r--r-- | base/strings/string_util_posix.h | 53 | ||||
-rw-r--r-- | base/strings/string_util_unittest.cc | 1191 | ||||
-rw-r--r-- | base/strings/string_util_win.h | 61 | ||||
-rw-r--r-- | base/strings/stringprintf.cc | 186 | ||||
-rw-r--r-- | base/strings/stringprintf.h | 62 | ||||
-rw-r--r-- | base/strings/stringprintf_unittest.cc | 188 |
12 files changed, 3708 insertions, 0 deletions
diff --git a/base/strings/string16.cc b/base/strings/string16.cc new file mode 100644 index 0000000..c802eef --- /dev/null +++ b/base/strings/string16.cc @@ -0,0 +1,82 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/string16.h" + +#if defined(WCHAR_T_IS_UTF16) + +#error This file should not be used on 2-byte wchar_t systems +// If this winds up being needed on 2-byte wchar_t systems, either the +// definitions below can be used, or the host system's wide character +// functions like wmemcmp can be wrapped. + +#elif defined(WCHAR_T_IS_UTF32) + +#include <ostream> + +#include "base/strings/utf_string_conversions.h" + +namespace base { + +int c16memcmp(const char16* s1, const char16* s2, size_t n) { + // We cannot call memcmp because that changes the semantics. + while (n-- > 0) { + if (*s1 != *s2) { + // We cannot use (*s1 - *s2) because char16 is unsigned. + return ((*s1 < *s2) ? 
-1 : 1); + } + ++s1; + ++s2; + } + return 0; +} + +size_t c16len(const char16* s) { + const char16 *s_orig = s; + while (*s) { + ++s; + } + return s - s_orig; +} + +const char16* c16memchr(const char16* s, char16 c, size_t n) { + while (n-- > 0) { + if (*s == c) { + return s; + } + ++s; + } + return 0; +} + +char16* c16memmove(char16* s1, const char16* s2, size_t n) { + return static_cast<char16*>(memmove(s1, s2, n * sizeof(char16))); +} + +char16* c16memcpy(char16* s1, const char16* s2, size_t n) { + return static_cast<char16*>(memcpy(s1, s2, n * sizeof(char16))); +} + +char16* c16memset(char16* s, char16 c, size_t n) { + char16 *s_orig = s; + while (n-- > 0) { + *s = c; + ++s; + } + return s_orig; +} + +std::ostream& operator<<(std::ostream& out, const string16& str) { + return out << UTF16ToUTF8(str); +} + +void PrintTo(const string16& str, std::ostream* out) { + *out << str; +} + +} // namespace base + +template class std::basic_string<char16, base::string16_char_traits>; + +#endif // WCHAR_T_IS_UTF32 diff --git a/base/strings/string16.h b/base/strings/string16.h new file mode 100644 index 0000000..fd98f1b --- /dev/null +++ b/base/strings/string16.h @@ -0,0 +1,189 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_STRINGS_STRING16_H_ +#define BASE_STRINGS_STRING16_H_ + +// WHAT: +// A version of std::basic_string that provides 2-byte characters even when +// wchar_t is not implemented as a 2-byte type. You can access this class as +// string16. We also define char16, which string16 is based upon. +// +// WHY: +// On Windows, wchar_t is 2 bytes, and it can conveniently handle UTF-16/UCS-2 +// data. Plenty of existing code operates on strings encoded as UTF-16. +// +// On many other platforms, sizeof(wchar_t) is 4 bytes by default. We can make +// it 2 bytes by using the GCC flag -fshort-wchar. 
But then std::wstring fails +// at run time, because it calls some functions (like wcslen) that come from +// the system's native C library -- which was built with a 4-byte wchar_t! +// It's wasteful to use 4-byte wchar_t strings to carry UTF-16 data, and it's +// entirely improper on those systems where the encoding of wchar_t is defined +// as UTF-32. +// +// Here, we define string16, which is similar to std::wstring but replaces all +// libc functions with custom, 2-byte-char compatible routines. It is capable +// of carrying UTF-16-encoded data. + +#include <stdio.h> +#include <string> + +#include "base/base_export.h" +#include "base/basictypes.h" + +#if defined(WCHAR_T_IS_UTF16) + +namespace base { + +typedef wchar_t char16; +typedef std::wstring string16; +typedef std::char_traits<wchar_t> string16_char_traits; + +} // namespace base + +#elif defined(WCHAR_T_IS_UTF32) + +namespace base { + +typedef uint16 char16; + +// char16 versions of the functions required by string16_char_traits; these +// are based on the wide character functions of similar names ("w" or "wcs" +// instead of "c16"). +BASE_EXPORT int c16memcmp(const char16* s1, const char16* s2, size_t n); +BASE_EXPORT size_t c16len(const char16* s); +BASE_EXPORT const char16* c16memchr(const char16* s, char16 c, size_t n); +BASE_EXPORT char16* c16memmove(char16* s1, const char16* s2, size_t n); +BASE_EXPORT char16* c16memcpy(char16* s1, const char16* s2, size_t n); +BASE_EXPORT char16* c16memset(char16* s, char16 c, size_t n); + +struct string16_char_traits { + typedef char16 char_type; + typedef int int_type; + + // int_type needs to be able to hold each possible value of char_type, and in + // addition, the distinct value of eof(). 
+ COMPILE_ASSERT(sizeof(int_type) > sizeof(char_type), unexpected_type_width); + + typedef std::streamoff off_type; + typedef mbstate_t state_type; + typedef std::fpos<state_type> pos_type; + + static void assign(char_type& c1, const char_type& c2) { + c1 = c2; + } + + static bool eq(const char_type& c1, const char_type& c2) { + return c1 == c2; + } + static bool lt(const char_type& c1, const char_type& c2) { + return c1 < c2; + } + + static int compare(const char_type* s1, const char_type* s2, size_t n) { + return c16memcmp(s1, s2, n); + } + + static size_t length(const char_type* s) { + return c16len(s); + } + + static const char_type* find(const char_type* s, size_t n, + const char_type& a) { + return c16memchr(s, a, n); + } + + static char_type* move(char_type* s1, const char_type* s2, int_type n) { + return c16memmove(s1, s2, n); + } + + static char_type* copy(char_type* s1, const char_type* s2, size_t n) { + return c16memcpy(s1, s2, n); + } + + static char_type* assign(char_type* s, size_t n, char_type a) { + return c16memset(s, a, n); + } + + static int_type not_eof(const int_type& c) { + return eq_int_type(c, eof()) ? 0 : c; + } + + static char_type to_char_type(const int_type& c) { + return char_type(c); + } + + static int_type to_int_type(const char_type& c) { + return int_type(c); + } + + static bool eq_int_type(const int_type& c1, const int_type& c2) { + return c1 == c2; + } + + static int_type eof() { + return static_cast<int_type>(EOF); + } +}; + +typedef std::basic_string<char16, base::string16_char_traits> string16; + +BASE_EXPORT extern std::ostream& operator<<(std::ostream& out, + const string16& str); + +// This is required by googletest to print a readable output on test failures. +BASE_EXPORT extern void PrintTo(const string16& str, std::ostream* out); + +} // namespace base + +// The string class will be explicitly instantiated only once, in string16.cc. 
+// +// std::basic_string<> in GNU libstdc++ contains a static data member, +// _S_empty_rep_storage, to represent empty strings. When an operation such +// as assignment or destruction is performed on a string, causing its existing +// data member to be invalidated, it must not be freed if this static data +// member is being used. Otherwise, it counts as an attempt to free static +// (and not allocated) data, which is a memory error. +// +// Generally, due to C++ template magic, _S_empty_rep_storage will be marked +// as a coalesced symbol, meaning that the linker will combine multiple +// instances into a single one when generating output. +// +// If a string class is used by multiple shared libraries, a problem occurs. +// Each library will get its own copy of _S_empty_rep_storage. When strings +// are passed across a library boundary for alteration or destruction, memory +// errors will result. GNU libstdc++ contains a configuration option, +// --enable-fully-dynamic-string (_GLIBCXX_FULLY_DYNAMIC_STRING), which +// disables the static data member optimization, but it's a good optimization +// and non-STL code is generally at the mercy of the system's STL +// configuration. Fully-dynamic strings are not the default for GNU +// libstdc++ itself or for the libstdc++ installations on the systems we care +// about, such as Mac OS X and relevant flavors of Linux. +// +// See also http://gcc.gnu.org/bugzilla/show_bug.cgi?id=24196 . +// +// To avoid problems, string classes need to be explicitly instantiated only +// once, in exactly one library. All other string users see it via an "extern" +// declaration. This is precisely how GNU libstdc++ handles +// std::basic_string<char> (string) and std::basic_string<wchar_t> (wstring). +// +// This also works around a Mac OS X linker bug in ld64-85.2.1 (Xcode 3.1.2), +// in which the linker does not fully coalesce symbols when dead code +// stripping is enabled. 
This bug causes the memory errors described above +// to occur even when a std::basic_string<> does not cross shared library +// boundaries, such as in statically-linked executables. +// +// TODO(mark): File this bug with Apple and update this note with a bug number. + +extern template +class BASE_EXPORT std::basic_string<base::char16, base::string16_char_traits>; + +#endif // WCHAR_T_IS_UTF32 + +// TODO(brettw) update users of string16 to use the namespace and remove +// this "using". +using base::char16; +using base::string16; + +#endif // BASE_STRINGS_STRING16_H_ diff --git a/base/strings/string16_unittest.cc b/base/strings/string16_unittest.cc new file mode 100644 index 0000000..d98b2a9 --- /dev/null +++ b/base/strings/string16_unittest.cc @@ -0,0 +1,54 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <sstream> + +#include "base/strings/string16.h" + +#include "base/strings/utf_string_conversions.h" +#include "testing/gtest/include/gtest/gtest.h" + +#if defined(WCHAR_T_IS_UTF32) + +// We define a custom operator<< for string16 so we can use it with logging. +// This tests that conversion. +TEST(String16Test, OutputStream) { + // Basic stream test. + { + std::ostringstream stream; + stream << "Empty '" << string16() << "' standard '" + << string16(ASCIIToUTF16("Hello, world")) << "'"; + EXPECT_STREQ("Empty '' standard 'Hello, world'", + stream.str().c_str()); + } + + // Interesting edge cases. + { + // These should each get converted to the invalid character: EF BF BD. + string16 initial_surrogate; + initial_surrogate.push_back(0xd800); + string16 final_surrogate; + final_surrogate.push_back(0xdc00); + + // Old italic A = U+10300, will get converted to: F0 90 8C 80 'z'. 
+ string16 surrogate_pair; + surrogate_pair.push_back(0xd800); + surrogate_pair.push_back(0xdf00); + surrogate_pair.push_back('z'); + + // Will get converted to the invalid char + 's': EF BF BD 's'. + string16 unterminated_surrogate; + unterminated_surrogate.push_back(0xd800); + unterminated_surrogate.push_back('s'); + + std::ostringstream stream; + stream << initial_surrogate << "," << final_surrogate << "," + << surrogate_pair << "," << unterminated_surrogate; + + EXPECT_STREQ("\xef\xbf\xbd,\xef\xbf\xbd,\xf0\x90\x8c\x80z,\xef\xbf\xbds", + stream.str().c_str()); + } +} + +#endif diff --git a/base/strings/string_util.cc b/base/strings/string_util.cc new file mode 100644 index 0000000..3ed7069 --- /dev/null +++ b/base/strings/string_util.cc @@ -0,0 +1,1011 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/string_util.h" + +#include <ctype.h> +#include <errno.h> +#include <math.h> +#include <stdarg.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <time.h> +#include <wchar.h> +#include <wctype.h> + +#include <algorithm> +#include <vector> + +#include "base/basictypes.h" +#include "base/logging.h" +#include "base/memory/singleton.h" +#include "base/strings/utf_string_conversion_utils.h" +#include "base/strings/utf_string_conversions.h" +#include "base/third_party/icu/icu_utf.h" +#include "build/build_config.h" + +namespace { + +// Force the singleton used by Empty[W]String[16] to be a unique type. This +// prevents other code that might accidentally use Singleton<string> from +// getting our internal one. 
// Reports whether the wprintf-style |format| avoids conversion specifications
// that this check treats as non-portable:
//   - 's' and 'c' without a preceding 'l' modifier, and
//   - 'S', 'C', 'F', 'D', 'O' and 'U' in any form.
// A format string that ends in the middle of a specification is reported as
// portable, since no non-portable specification was found in it.
bool IsWprintfFormatPortable(const wchar_t* format) {
  // Characters that terminate a conversion specification.
  static const wchar_t kConversionChars[] = L"diouxXeEfgGaAcspn%";
  // Characters that are rejected regardless of any modifier.
  static const wchar_t kAlwaysRejected[] = L"SCFDOU";

  const wchar_t* cursor = format;
  while (*cursor != L'\0') {
    if (*cursor != L'%') {
      ++cursor;
      continue;
    }

    // Walk the specification that follows the '%'.
    bool seen_l = false;
    for (;;) {
      const wchar_t ch = *++cursor;
      if (ch == L'\0') {
        // The string ended inside a specification; nothing non-portable was
        // seen, so accept it.
        return true;
      }
      if (ch == L'l') {
        // 'l' is the only modifier that makes 's' and 'c' acceptable.
        seen_l = true;
        continue;
      }
      if (wcschr(kAlwaysRejected, ch))
        return false;
      if ((ch == L's' || ch == L'c') && !seen_l)
        return false;
      if (wcschr(kConversionChars, ch))
        break;  // End of this specification; resume the outer scan.
    }
    ++cursor;
  }

  return true;
}
+ input.find_first_not_of(trim_chars) : 0; + const typename STR::size_type last_good_char = (positions & TRIM_TRAILING) ? + input.find_last_not_of(trim_chars) : last_char; + + // When the string was all whitespace, report that we stripped off whitespace + // from whichever position the caller was interested in. For empty input, we + // stripped no whitespace, but we still need to clear |output|. + if (input.empty() || + (first_good_char == STR::npos) || (last_good_char == STR::npos)) { + bool input_was_empty = input.empty(); // in case output == &input + output->clear(); + return input_was_empty ? TRIM_NONE : positions; + } + + // Trim the whitespace. + *output = + input.substr(first_good_char, last_good_char - first_good_char + 1); + + // Return where we trimmed from. + return static_cast<TrimPositions>( + ((first_good_char == 0) ? TRIM_NONE : TRIM_LEADING) | + ((last_good_char == last_char) ? TRIM_NONE : TRIM_TRAILING)); +} + +bool TrimString(const std::wstring& input, + const wchar_t trim_chars[], + std::wstring* output) { + return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; +} + +#if !defined(WCHAR_T_IS_UTF16) +bool TrimString(const string16& input, + const char16 trim_chars[], + string16* output) { + return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; +} +#endif + +bool TrimString(const std::string& input, + const char trim_chars[], + std::string* output) { + return TrimStringT(input, trim_chars, TRIM_ALL, output) != TRIM_NONE; +} + +void TruncateUTF8ToByteSize(const std::string& input, + const size_t byte_size, + std::string* output) { + DCHECK(output); + if (byte_size > input.length()) { + *output = input; + return; + } + DCHECK_LE(byte_size, static_cast<uint32>(kint32max)); + // Note: This cast is necessary because CBU8_NEXT uses int32s. 
+ int32 truncation_length = static_cast<int32>(byte_size); + int32 char_index = truncation_length - 1; + const char* data = input.data(); + + // Using CBU8, we will move backwards from the truncation point + // to the beginning of the string looking for a valid UTF8 + // character. Once a full UTF8 character is found, we will + // truncate the string to the end of that character. + while (char_index >= 0) { + int32 prev = char_index; + uint32 code_point = 0; + CBU8_NEXT(data, char_index, truncation_length, code_point); + if (!base::IsValidCharacter(code_point) || + !base::IsValidCodepoint(code_point)) { + char_index = prev - 1; + } else { + break; + } + } + + if (char_index >= 0 ) + *output = input.substr(0, char_index); + else + output->clear(); +} + +TrimPositions TrimWhitespace(const string16& input, + TrimPositions positions, + string16* output) { + return TrimStringT(input, kWhitespaceUTF16, positions, output); +} + +TrimPositions TrimWhitespaceASCII(const std::string& input, + TrimPositions positions, + std::string* output) { + return TrimStringT(input, kWhitespaceASCII, positions, output); +} + +// This function is only for backward-compatibility. +// To be removed when all callers are updated. +TrimPositions TrimWhitespace(const std::string& input, + TrimPositions positions, + std::string* output) { + return TrimWhitespaceASCII(input, positions, output); +} + +template<typename STR> +STR CollapseWhitespaceT(const STR& text, + bool trim_sequences_with_line_breaks) { + STR result; + result.resize(text.size()); + + // Set flags to pretend we're already in a trimmed whitespace sequence, so we + // will trim any leading whitespace. + bool in_whitespace = true; + bool already_trimmed = true; + + int chars_written = 0; + for (typename STR::const_iterator i(text.begin()); i != text.end(); ++i) { + if (IsWhitespace(*i)) { + if (!in_whitespace) { + // Reduce all whitespace sequences to a single space. 
// Returns true when every character of |input| also occurs in |characters|.
// An empty |input| trivially satisfies this, so it yields true.
template<typename STR>
static bool ContainsOnlyCharsT(const STR& input, const STR& characters) {
  // The string qualifies exactly when it has no character outside the set.
  return input.find_first_not_of(characters) == STR::npos;
}
// Converts |wide| to Latin-1 by copying each character into a single byte.
// Latin1 is just the low range of Unicode, so we can copy directly to convert.
//
// Returns true and stores the converted text in |latin1| when every character
// of |wide| is in the range 0-255. Otherwise returns false and leaves
// |latin1| empty.
bool WideToLatin1(const std::wstring& wide, std::string* latin1) {
  std::string output;
  output.resize(wide.size());
  latin1->clear();
  for (size_t i = 0; i < wide.size(); i++) {
    // Compare as unsigned: where wchar_t is a signed type, a negative value
    // would otherwise pass a plain "> 255" test and be truncated silently.
    if (static_cast<unsigned long>(wide[i]) > 255)
      return false;
    output[i] = static_cast<char>(wide[i]);
  }
  latin1->swap(output);
  return true;
}
false; + } + return *b == 0; +} + +// Front-ends for LowerCaseEqualsASCII. +bool LowerCaseEqualsASCII(const std::string& a, const char* b) { + return DoLowerCaseEqualsASCII(a.begin(), a.end(), b); +} + +bool LowerCaseEqualsASCII(const std::wstring& a, const char* b) { + return DoLowerCaseEqualsASCII(a.begin(), a.end(), b); +} + +#if !defined(WCHAR_T_IS_UTF16) +bool LowerCaseEqualsASCII(const string16& a, const char* b) { + return DoLowerCaseEqualsASCII(a.begin(), a.end(), b); +} +#endif + +bool LowerCaseEqualsASCII(std::string::const_iterator a_begin, + std::string::const_iterator a_end, + const char* b) { + return DoLowerCaseEqualsASCII(a_begin, a_end, b); +} + +bool LowerCaseEqualsASCII(std::wstring::const_iterator a_begin, + std::wstring::const_iterator a_end, + const char* b) { + return DoLowerCaseEqualsASCII(a_begin, a_end, b); +} + +#if !defined(WCHAR_T_IS_UTF16) +bool LowerCaseEqualsASCII(string16::const_iterator a_begin, + string16::const_iterator a_end, + const char* b) { + return DoLowerCaseEqualsASCII(a_begin, a_end, b); +} +#endif + +// TODO(port): Resolve wchar_t/iterator issues that require OS_ANDROID here. 
+#if !defined(OS_ANDROID) +bool LowerCaseEqualsASCII(const char* a_begin, + const char* a_end, + const char* b) { + return DoLowerCaseEqualsASCII(a_begin, a_end, b); +} + +bool LowerCaseEqualsASCII(const wchar_t* a_begin, + const wchar_t* a_end, + const char* b) { + return DoLowerCaseEqualsASCII(a_begin, a_end, b); +} + +#if !defined(WCHAR_T_IS_UTF16) +bool LowerCaseEqualsASCII(const char16* a_begin, + const char16* a_end, + const char* b) { + return DoLowerCaseEqualsASCII(a_begin, a_end, b); +} +#endif + +#endif // !defined(OS_ANDROID) + +bool EqualsASCII(const string16& a, const base::StringPiece& b) { + if (a.length() != b.length()) + return false; + return std::equal(b.begin(), b.end(), a.begin()); +} + +bool StartsWithASCII(const std::string& str, + const std::string& search, + bool case_sensitive) { + if (case_sensitive) + return str.compare(0, search.length(), search) == 0; + else + return base::strncasecmp(str.c_str(), search.c_str(), search.length()) == 0; +} + +template <typename STR> +bool StartsWithT(const STR& str, const STR& search, bool case_sensitive) { + if (case_sensitive) { + return str.compare(0, search.length(), search) == 0; + } else { + if (search.size() > str.size()) + return false; + return std::equal(search.begin(), search.end(), str.begin(), + base::CaseInsensitiveCompare<typename STR::value_type>()); + } +} + +bool StartsWith(const std::wstring& str, const std::wstring& search, + bool case_sensitive) { + return StartsWithT(str, search, case_sensitive); +} + +#if !defined(WCHAR_T_IS_UTF16) +bool StartsWith(const string16& str, const string16& search, + bool case_sensitive) { + return StartsWithT(str, search, case_sensitive); +} +#endif + +template <typename STR> +bool EndsWithT(const STR& str, const STR& search, bool case_sensitive) { + typename STR::size_type str_length = str.length(); + typename STR::size_type search_length = search.length(); + if (search_length > str_length) + return false; + if (case_sensitive) { + return 
// Replaces occurrences of |find_this| in |*str| with |replace_with|, starting
// the search at |start_offset|. Only the first occurrence is replaced unless
// |replace_all| is true.
//
// Does nothing when |start_offset| is npos or not less than the length of
// |*str|. |find_this| must not be empty; if it is, |*str| is left unchanged.
template<class StringType>
void DoReplaceSubstringsAfterOffset(StringType* str,
                                    typename StringType::size_type start_offset,
                                    const StringType& find_this,
                                    const StringType& replace_with,
                                    bool replace_all) {
  if ((start_offset == StringType::npos) || (start_offset >= str->length()))
    return;

  DCHECK(!find_this.empty());
  // An empty search string matches at every offset, so with |replace_all| the
  // loop below would insert |replace_with| forever. Bail out explicitly in
  // case the DCHECK above does not stop execution.
  if (find_this.empty())
    return;

  for (typename StringType::size_type offs(str->find(find_this, start_offset));
       offs != StringType::npos; offs = str->find(find_this, offs)) {
    str->replace(offs, find_this.length(), replace_with);
    // Continue after the text just inserted so that it is never rescanned.
    offs += replace_with.length();

    if (!replace_all)
      break;
  }
}
tokens) { + return TokenizeT(str, delimiters, tokens); +} +#endif + +size_t Tokenize(const std::string& str, + const std::string& delimiters, + std::vector<std::string>* tokens) { + return TokenizeT(str, delimiters, tokens); +} + +size_t Tokenize(const base::StringPiece& str, + const base::StringPiece& delimiters, + std::vector<base::StringPiece>* tokens) { + return TokenizeT(str, delimiters, tokens); +} + +template<typename STR> +static STR JoinStringT(const std::vector<STR>& parts, const STR& sep) { + if (parts.empty()) + return STR(); + + STR result(parts[0]); + typename std::vector<STR>::const_iterator iter = parts.begin(); + ++iter; + + for (; iter != parts.end(); ++iter) { + result += sep; + result += *iter; + } + + return result; +} + +std::string JoinString(const std::vector<std::string>& parts, char sep) { + return JoinStringT(parts, std::string(1, sep)); +} + +string16 JoinString(const std::vector<string16>& parts, char16 sep) { + return JoinStringT(parts, string16(1, sep)); +} + +std::string JoinString(const std::vector<std::string>& parts, + const std::string& separator) { + return JoinStringT(parts, separator); +} + +string16 JoinString(const std::vector<string16>& parts, + const string16& separator) { + return JoinStringT(parts, separator); +} + +template<class FormatStringType, class OutStringType> +OutStringType DoReplaceStringPlaceholders(const FormatStringType& format_string, + const std::vector<OutStringType>& subst, std::vector<size_t>* offsets) { + size_t substitutions = subst.size(); + + size_t sub_length = 0; + for (typename std::vector<OutStringType>::const_iterator iter = subst.begin(); + iter != subst.end(); ++iter) { + sub_length += iter->length(); + } + + OutStringType formatted; + formatted.reserve(format_string.length() + sub_length); + + std::vector<ReplacementOffset> r_offsets; + for (typename FormatStringType::const_iterator i = format_string.begin(); + i != format_string.end(); ++i) { + if ('$' == *i) { + if (i + 1 != 
format_string.end()) { + ++i; + DCHECK('$' == *i || '1' <= *i) << "Invalid placeholder: " << *i; + if ('$' == *i) { + while (i != format_string.end() && '$' == *i) { + formatted.push_back('$'); + ++i; + } + --i; + } else { + uintptr_t index = 0; + while (i != format_string.end() && '0' <= *i && *i <= '9') { + index *= 10; + index += *i - '0'; + ++i; + } + --i; + index -= 1; + if (offsets) { + ReplacementOffset r_offset(index, + static_cast<int>(formatted.size())); + r_offsets.insert(std::lower_bound(r_offsets.begin(), + r_offsets.end(), + r_offset, + &CompareParameter), + r_offset); + } + if (index < substitutions) + formatted.append(subst.at(index)); + } + } + } else { + formatted.push_back(*i); + } + } + if (offsets) { + for (std::vector<ReplacementOffset>::const_iterator i = r_offsets.begin(); + i != r_offsets.end(); ++i) { + offsets->push_back(i->offset); + } + } + return formatted; +} + +string16 ReplaceStringPlaceholders(const string16& format_string, + const std::vector<string16>& subst, + std::vector<size_t>* offsets) { + return DoReplaceStringPlaceholders(format_string, subst, offsets); +} + +std::string ReplaceStringPlaceholders(const base::StringPiece& format_string, + const std::vector<std::string>& subst, + std::vector<size_t>* offsets) { + return DoReplaceStringPlaceholders(format_string, subst, offsets); +} + +string16 ReplaceStringPlaceholders(const string16& format_string, + const string16& a, + size_t* offset) { + std::vector<size_t> offsets; + std::vector<string16> subst; + subst.push_back(a); + string16 result = ReplaceStringPlaceholders(format_string, subst, &offsets); + + DCHECK(offsets.size() == 1); + if (offset) { + *offset = offsets[0]; + } + return result; +} + +static bool IsWildcard(base_icu::UChar32 character) { + return character == '*' || character == '?'; +} + +// Move the strings pointers to the point where they start to differ. 
+template <typename CHAR, typename NEXT> +static void EatSameChars(const CHAR** pattern, const CHAR* pattern_end, + const CHAR** string, const CHAR* string_end, + NEXT next) { + const CHAR* escape = NULL; + while (*pattern != pattern_end && *string != string_end) { + if (!escape && IsWildcard(**pattern)) { + // We don't want to match wildcard here, except if it's escaped. + return; + } + + // Check if the escapement char is found. If so, skip it and move to the + // next character. + if (!escape && **pattern == '\\') { + escape = *pattern; + next(pattern, pattern_end); + continue; + } + + // Check if the chars match, if so, increment the ptrs. + const CHAR* pattern_next = *pattern; + const CHAR* string_next = *string; + base_icu::UChar32 pattern_char = next(&pattern_next, pattern_end); + if (pattern_char == next(&string_next, string_end) && + pattern_char != (base_icu::UChar32) CBU_SENTINEL) { + *pattern = pattern_next; + *string = string_next; + } else { + // Uh ho, it did not match, we are done. If the last char was an + // escapement, that means that it was an error to advance the ptr here, + // let's put it back where it was. This also mean that the MatchPattern + // function will return false because if we can't match an escape char + // here, then no one will. + if (escape) { + *pattern = escape; + } + return; + } + + escape = NULL; + } +} + +template <typename CHAR, typename NEXT> +static void EatWildcard(const CHAR** pattern, const CHAR* end, NEXT next) { + while (*pattern != end) { + if (!IsWildcard(**pattern)) + return; + next(pattern, end); + } +} + +template <typename CHAR, typename NEXT> +static bool MatchPatternT(const CHAR* eval, const CHAR* eval_end, + const CHAR* pattern, const CHAR* pattern_end, + int depth, + NEXT next) { + const int kMaxDepth = 16; + if (depth > kMaxDepth) + return false; + + // Eat all the matching chars. 
+ EatSameChars(&pattern, pattern_end, &eval, eval_end, next); + + // If the string is empty, then the pattern must be empty too, or contains + // only wildcards. + if (eval == eval_end) { + EatWildcard(&pattern, pattern_end, next); + return pattern == pattern_end; + } + + // Pattern is empty but not string, this is not a match. + if (pattern == pattern_end) + return false; + + // If this is a question mark, then we need to compare the rest with + // the current string or the string with one character eaten. + const CHAR* next_pattern = pattern; + next(&next_pattern, pattern_end); + if (pattern[0] == '?') { + if (MatchPatternT(eval, eval_end, next_pattern, pattern_end, + depth + 1, next)) + return true; + const CHAR* next_eval = eval; + next(&next_eval, eval_end); + if (MatchPatternT(next_eval, eval_end, next_pattern, pattern_end, + depth + 1, next)) + return true; + } + + // This is a *, try to match all the possible substrings with the remainder + // of the pattern. + if (pattern[0] == '*') { + // Collapse duplicate wild cards (********** into *) so that the + // method does not recurse unnecessarily. http://crbug.com/52839 + EatWildcard(&next_pattern, pattern_end, next); + + while (eval != eval_end) { + if (MatchPatternT(eval, eval_end, next_pattern, pattern_end, + depth + 1, next)) + return true; + eval++; + } + + // We reached the end of the string, let see if the pattern contains only + // wildcards. 
+ if (eval == eval_end) { + EatWildcard(&pattern, pattern_end, next); + if (pattern != pattern_end) + return false; + return true; + } + } + + return false; +} + +struct NextCharUTF8 { + base_icu::UChar32 operator()(const char** p, const char* end) { + base_icu::UChar32 c; + int offset = 0; + CBU8_NEXT(*p, offset, end - *p, c); + *p += offset; + return c; + } +}; + +struct NextCharUTF16 { + base_icu::UChar32 operator()(const char16** p, const char16* end) { + base_icu::UChar32 c; + int offset = 0; + CBU16_NEXT(*p, offset, end - *p, c); + *p += offset; + return c; + } +}; + +bool MatchPattern(const base::StringPiece& eval, + const base::StringPiece& pattern) { + return MatchPatternT(eval.data(), eval.data() + eval.size(), + pattern.data(), pattern.data() + pattern.size(), + 0, NextCharUTF8()); +} + +bool MatchPattern(const string16& eval, const string16& pattern) { + return MatchPatternT(eval.c_str(), eval.c_str() + eval.size(), + pattern.c_str(), pattern.c_str() + pattern.size(), + 0, NextCharUTF16()); +} + +// The following code is compatible with the OpenBSD lcpy interface. See: +// http://www.gratisoft.us/todd/papers/strlcpy.html +// ftp://ftp.openbsd.org/pub/OpenBSD/src/lib/libc/string/{wcs,str}lcpy.c + +namespace { + +template <typename CHAR> +size_t lcpyT(CHAR* dst, const CHAR* src, size_t dst_size) { + for (size_t i = 0; i < dst_size; ++i) { + if ((dst[i] = src[i]) == 0) // We hit and copied the terminating NULL. + return i; + } + + // We were left off at dst_size. We over copied 1 byte. Null terminate. + if (dst_size != 0) + dst[dst_size - 1] = 0; + + // Count the rest of the |src|, and return it's length in characters. 
+ while (src[dst_size]) ++dst_size; + return dst_size; +} + +} // namespace + +size_t base::strlcpy(char* dst, const char* src, size_t dst_size) { + return lcpyT<char>(dst, src, dst_size); +} +size_t base::wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size) { + return lcpyT<wchar_t>(dst, src, dst_size); +} diff --git a/base/strings/string_util.h b/base/strings/string_util.h new file mode 100644 index 0000000..7b4af7d --- /dev/null +++ b/base/strings/string_util.h @@ -0,0 +1,576 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. +// +// This file defines utility functions for working with strings. + +#ifndef BASE_STRINGS_STRING_UTIL_H_ +#define BASE_STRINGS_STRING_UTIL_H_ + +#include <ctype.h> +#include <stdarg.h> // va_list + +#include <string> +#include <vector> + +#include "base/base_export.h" +#include "base/basictypes.h" +#include "base/compiler_specific.h" +#include "base/string16.h" +#include "base/strings/string_piece.h" // For implicit conversions. + +// Safe standard library wrappers for all platforms. + +namespace base { + +// C standard-library functions like "strncasecmp" and "snprintf" that aren't +// cross-platform are provided as "base::strncasecmp", and their prototypes +// are listed below. These functions are then implemented as inline calls +// to the platform-specific equivalents in the platform-specific headers. + +// Compares the two strings s1 and s2 without regard to case using +// the current locale; returns 0 if they are equal, 1 if s1 > s2, and -1 if +// s2 > s1 according to a lexicographic comparison. +int strcasecmp(const char* s1, const char* s2); + +// Compares up to count characters of s1 and s2 without regard to case using +// the current locale; returns 0 if they are equal, 1 if s1 > s2, and -1 if +// s2 > s1 according to a lexicographic comparison. 
+int strncasecmp(const char* s1, const char* s2, size_t count); + +// Same as strncmp but for char16 strings. +int strncmp16(const char16* s1, const char16* s2, size_t count); + +// Wrapper for vsnprintf that always null-terminates and always returns the +// number of characters that would be in an untruncated formatted +// string, even when truncation occurs. +int vsnprintf(char* buffer, size_t size, const char* format, va_list arguments) + PRINTF_FORMAT(3, 0); + +// vswprintf always null-terminates, but when truncation occurs, it will either +// return -1 or the number of characters that would be in an untruncated +// formatted string. The actual return value depends on the underlying +// C library's vswprintf implementation. +int vswprintf(wchar_t* buffer, size_t size, + const wchar_t* format, va_list arguments) + WPRINTF_FORMAT(3, 0); + +// Some of these implementations need to be inlined. + +// We separate the declaration from the implementation of this inline +// function just so the PRINTF_FORMAT works. +inline int snprintf(char* buffer, size_t size, const char* format, ...) + PRINTF_FORMAT(3, 4); +inline int snprintf(char* buffer, size_t size, const char* format, ...) { + va_list arguments; + va_start(arguments, format); + int result = vsnprintf(buffer, size, format, arguments); + va_end(arguments); + return result; +} + +// We separate the declaration from the implementation of this inline +// function just so the WPRINTF_FORMAT works. +inline int swprintf(wchar_t* buffer, size_t size, const wchar_t* format, ...) + WPRINTF_FORMAT(3, 4); +inline int swprintf(wchar_t* buffer, size_t size, const wchar_t* format, ...) { + va_list arguments; + va_start(arguments, format); + int result = vswprintf(buffer, size, format, arguments); + va_end(arguments); + return result; +} + +// BSD-style safe and consistent string copy functions. +// Copies |src| to |dst|, where |dst_size| is the total allocated size of |dst|. 
+// Copies at most |dst_size|-1 characters, and always NULL terminates |dst|, as +// long as |dst_size| is not 0. Returns the length of |src| in characters. +// If the return value is >= dst_size, then the output was truncated. +// NOTE: All sizes are in number of characters, NOT in bytes. +BASE_EXPORT size_t strlcpy(char* dst, const char* src, size_t dst_size); +BASE_EXPORT size_t wcslcpy(wchar_t* dst, const wchar_t* src, size_t dst_size); + +// Scan a wprintf format string to determine whether it's portable across a +// variety of systems. This function only checks that the conversion +// specifiers used by the format string are supported and have the same meaning +// on a variety of systems. It doesn't check for other errors that might occur +// within a format string. +// +// Nonportable conversion specifiers for wprintf are: +// - 's' and 'c' without an 'l' length modifier. %s and %c operate on char +// data on all systems except Windows, which treat them as wchar_t data. +// Use %ls and %lc for wchar_t data instead. +// - 'S' and 'C', which operate on wchar_t data on all systems except Windows, +// which treat them as char data. Use %ls and %lc for wchar_t data +// instead. +// - 'F', which is not identified by Windows wprintf documentation. +// - 'D', 'O', and 'U', which are deprecated and not available on all systems. +// Use %ld, %lo, and %lu instead. +// +// Note that there is no portable conversion specifier for char data when +// working with wprintf. +// +// This function is intended to be called from base::vswprintf. +BASE_EXPORT bool IsWprintfFormatPortable(const wchar_t* format); + +// ASCII-specific tolower. The standard library's tolower is locale sensitive, +// so we don't want to use it here. +template <class Char> inline Char ToLowerASCII(Char c) { + return (c >= 'A' && c <= 'Z') ? (c + ('a' - 'A')) : c; +} + +// ASCII-specific toupper. The standard library's toupper is locale sensitive, +// so we don't want to use it here. 
+template <class Char> inline Char ToUpperASCII(Char c) { + return (c >= 'a' && c <= 'z') ? (c + ('A' - 'a')) : c; +} + +// Function objects to aid in comparing/searching strings. + +template<typename Char> struct CaseInsensitiveCompare { + public: + bool operator()(Char x, Char y) const { + // TODO(darin): Do we really want to do locale sensitive comparisons here? + // See http://crbug.com/24917 + return tolower(x) == tolower(y); + } +}; + +template<typename Char> struct CaseInsensitiveCompareASCII { + public: + bool operator()(Char x, Char y) const { + return ToLowerASCII(x) == ToLowerASCII(y); + } +}; + +} // namespace base + +#if defined(OS_WIN) +#include "base/string_util_win.h" +#elif defined(OS_POSIX) +#include "base/string_util_posix.h" +#else +#error Define string operations appropriately for your platform +#endif + +// These threadsafe functions return references to globally unique empty +// strings. +// +// DO NOT USE THESE AS A GENERAL-PURPOSE SUBSTITUTE FOR DEFAULT CONSTRUCTORS. +// There is only one case where you should use these: functions which need to +// return a string by reference (e.g. as a class member accessor), and don't +// have an empty string to use (e.g. in an error case). These should not be +// used as initializers, function arguments, or return values for functions +// which return by value or outparam. +BASE_EXPORT const std::string& EmptyString(); +BASE_EXPORT const std::wstring& EmptyWString(); +BASE_EXPORT const string16& EmptyString16(); + +BASE_EXPORT extern const wchar_t kWhitespaceWide[]; +BASE_EXPORT extern const char16 kWhitespaceUTF16[]; +BASE_EXPORT extern const char kWhitespaceASCII[]; + +BASE_EXPORT extern const char kUtf8ByteOrderMark[]; + +// Removes characters in |remove_chars| from anywhere in |input|. Returns true +// if any characters were removed. |remove_chars| must be null-terminated. +// NOTE: Safe to use the same variable for both |input| and |output|. 
+BASE_EXPORT bool RemoveChars(const string16& input, + const char16 remove_chars[], + string16* output); +BASE_EXPORT bool RemoveChars(const std::string& input, + const char remove_chars[], + std::string* output); + +// Replaces characters in |replace_chars| from anywhere in |input| with +// |replace_with|. Each character in |replace_chars| will be replaced with +// the |replace_with| string. Returns true if any characters were replaced. +// |replace_chars| must be null-terminated. +// NOTE: Safe to use the same variable for both |input| and |output|. +BASE_EXPORT bool ReplaceChars(const string16& input, + const char16 replace_chars[], + const string16& replace_with, + string16* output); +BASE_EXPORT bool ReplaceChars(const std::string& input, + const char replace_chars[], + const std::string& replace_with, + std::string* output); + +// Removes characters in |trim_chars| from the beginning and end of |input|. +// |trim_chars| must be null-terminated. +// NOTE: Safe to use the same variable for both |input| and |output|. +BASE_EXPORT bool TrimString(const std::wstring& input, + const wchar_t trim_chars[], + std::wstring* output); +BASE_EXPORT bool TrimString(const string16& input, + const char16 trim_chars[], + string16* output); +BASE_EXPORT bool TrimString(const std::string& input, + const char trim_chars[], + std::string* output); + +// Truncates a string to the nearest UTF-8 character that will leave +// the string less than or equal to the specified byte size. +BASE_EXPORT void TruncateUTF8ToByteSize(const std::string& input, + const size_t byte_size, + std::string* output); + +// Trims any whitespace from either end of the input string. Returns where +// whitespace was found. +// The non-wide version has two functions: +// * TrimWhitespaceASCII() +// This function is for ASCII strings and only looks for ASCII whitespace; +// Please choose the best one according to your usage. +// NOTE: Safe to use the same variable for both input and output. 
+enum TrimPositions { + TRIM_NONE = 0, + TRIM_LEADING = 1 << 0, + TRIM_TRAILING = 1 << 1, + TRIM_ALL = TRIM_LEADING | TRIM_TRAILING, +}; +BASE_EXPORT TrimPositions TrimWhitespace(const string16& input, + TrimPositions positions, + string16* output); +BASE_EXPORT TrimPositions TrimWhitespaceASCII(const std::string& input, + TrimPositions positions, + std::string* output); + +// Deprecated. This function is only for backward compatibility and calls +// TrimWhitespaceASCII(). +BASE_EXPORT TrimPositions TrimWhitespace(const std::string& input, + TrimPositions positions, + std::string* output); + +// Searches for CR or LF characters. Removes all contiguous whitespace +// strings that contain them. This is useful when trying to deal with text +// copied from terminals. +// Returns |text|, with the following three transformations: +// (1) Leading and trailing whitespace is trimmed. +// (2) If |trim_sequences_with_line_breaks| is true, any other whitespace +// sequences containing a CR or LF are trimmed. +// (3) All other whitespace sequences are converted to single spaces. +BASE_EXPORT std::wstring CollapseWhitespace( + const std::wstring& text, + bool trim_sequences_with_line_breaks); +BASE_EXPORT string16 CollapseWhitespace( + const string16& text, + bool trim_sequences_with_line_breaks); +BASE_EXPORT std::string CollapseWhitespaceASCII( + const std::string& text, + bool trim_sequences_with_line_breaks); + +// Returns true if the passed string is empty or contains only white-space +// characters. +BASE_EXPORT bool ContainsOnlyWhitespaceASCII(const std::string& str); +BASE_EXPORT bool ContainsOnlyWhitespace(const string16& str); + +// Returns true if |input| is empty or contains only characters found in +// |characters|. 
+BASE_EXPORT bool ContainsOnlyChars(const std::wstring& input, + const std::wstring& characters); +BASE_EXPORT bool ContainsOnlyChars(const string16& input, + const string16& characters); +BASE_EXPORT bool ContainsOnlyChars(const std::string& input, + const std::string& characters); + +// Converts to 7-bit ASCII by truncating. The result must be known to be ASCII +// beforehand. +BASE_EXPORT std::string WideToASCII(const std::wstring& wide); +BASE_EXPORT std::string UTF16ToASCII(const string16& utf16); + +// Converts the given wide string to the corresponding Latin1. This will fail +// (return false) if any characters are more than 255. +BASE_EXPORT bool WideToLatin1(const std::wstring& wide, std::string* latin1); + +// Returns true if the specified string matches the criteria. How can a wide +// string be 8-bit or UTF8? It contains only characters that are < 256 (in the +// first case) or characters that use only 8-bits and whose 8-bit +// representation looks like a UTF-8 string (the second case). +// +// Note that IsStringUTF8 checks not only if the input is structurally +// valid but also if it doesn't contain any non-character codepoint +// (e.g. U+FFFE). It's done on purpose because all the existing callers want +// to have the maximum 'discriminating' power from other encodings. If +// there's a use case for just checking the structural validity, we have to +// add a new function for that. +BASE_EXPORT bool IsStringUTF8(const std::string& str); +BASE_EXPORT bool IsStringASCII(const std::wstring& str); +BASE_EXPORT bool IsStringASCII(const base::StringPiece& str); +BASE_EXPORT bool IsStringASCII(const string16& str); + +// Converts the elements of the given string. This version uses a pointer to +// clearly differentiate it from the non-pointer variant. 
+template <class str> inline void StringToLowerASCII(str* s) { + for (typename str::iterator i = s->begin(); i != s->end(); ++i) + *i = base::ToLowerASCII(*i); +} + +template <class str> inline str StringToLowerASCII(const str& s) { + // for std::string and std::wstring + str output(s); + StringToLowerASCII(&output); + return output; +} + +// Converts the elements of the given string. This version uses a pointer to +// clearly differentiate it from the non-pointer variant. +template <class str> inline void StringToUpperASCII(str* s) { + for (typename str::iterator i = s->begin(); i != s->end(); ++i) + *i = base::ToUpperASCII(*i); +} + +template <class str> inline str StringToUpperASCII(const str& s) { + // for std::string and std::wstring + str output(s); + StringToUpperASCII(&output); + return output; +} + +// Compare the lower-case form of the given string against the given ASCII +// string. This is useful for doing checking if an input string matches some +// token, and it is optimized to avoid intermediate string copies. This API is +// borrowed from the equivalent APIs in Mozilla. +BASE_EXPORT bool LowerCaseEqualsASCII(const std::string& a, const char* b); +BASE_EXPORT bool LowerCaseEqualsASCII(const std::wstring& a, const char* b); +BASE_EXPORT bool LowerCaseEqualsASCII(const string16& a, const char* b); + +// Same thing, but with string iterators instead. 
+BASE_EXPORT bool LowerCaseEqualsASCII(std::string::const_iterator a_begin, + std::string::const_iterator a_end, + const char* b); +BASE_EXPORT bool LowerCaseEqualsASCII(std::wstring::const_iterator a_begin, + std::wstring::const_iterator a_end, + const char* b); +BASE_EXPORT bool LowerCaseEqualsASCII(string16::const_iterator a_begin, + string16::const_iterator a_end, + const char* b); +BASE_EXPORT bool LowerCaseEqualsASCII(const char* a_begin, + const char* a_end, + const char* b); +BASE_EXPORT bool LowerCaseEqualsASCII(const wchar_t* a_begin, + const wchar_t* a_end, + const char* b); +BASE_EXPORT bool LowerCaseEqualsASCII(const char16* a_begin, + const char16* a_end, + const char* b); + +// Performs a case-sensitive string compare. The behavior is undefined if both +// strings are not ASCII. +BASE_EXPORT bool EqualsASCII(const string16& a, const base::StringPiece& b); + +// Returns true if str starts with search, or false otherwise. +BASE_EXPORT bool StartsWithASCII(const std::string& str, + const std::string& search, + bool case_sensitive); +BASE_EXPORT bool StartsWith(const std::wstring& str, + const std::wstring& search, + bool case_sensitive); +BASE_EXPORT bool StartsWith(const string16& str, + const string16& search, + bool case_sensitive); + +// Returns true if str ends with search, or false otherwise. +BASE_EXPORT bool EndsWith(const std::string& str, + const std::string& search, + bool case_sensitive); +BASE_EXPORT bool EndsWith(const std::wstring& str, + const std::wstring& search, + bool case_sensitive); +BASE_EXPORT bool EndsWith(const string16& str, + const string16& search, + bool case_sensitive); + + +// Determines the type of ASCII character, independent of locale (the C +// library versions will change based on locale). 
+template <typename Char> +inline bool IsAsciiWhitespace(Char c) { + return c == ' ' || c == '\r' || c == '\n' || c == '\t'; +} +template <typename Char> +inline bool IsAsciiAlpha(Char c) { + return ((c >= 'A') && (c <= 'Z')) || ((c >= 'a') && (c <= 'z')); +} +template <typename Char> +inline bool IsAsciiDigit(Char c) { + return c >= '0' && c <= '9'; +} + +template <typename Char> +inline bool IsHexDigit(Char c) { + return (c >= '0' && c <= '9') || + (c >= 'A' && c <= 'F') || + (c >= 'a' && c <= 'f'); +} + +template <typename Char> +inline Char HexDigitToInt(Char c) { + DCHECK(IsHexDigit(c)); + if (c >= '0' && c <= '9') + return c - '0'; + if (c >= 'A' && c <= 'F') + return c - 'A' + 10; + if (c >= 'a' && c <= 'f') + return c - 'a' + 10; + return 0; +} + +// Returns true if it's a whitespace character. +inline bool IsWhitespace(wchar_t c) { + return wcschr(kWhitespaceWide, c) != NULL; +} + +// Return a byte string in human-readable format with a unit suffix. Not +// appropriate for use in any UI; use of FormatBytes and friends in ui/base is +// highly recommended instead. TODO(avi): Figure out how to get callers to use +// FormatBytes instead; remove this. +BASE_EXPORT string16 FormatBytesUnlocalized(int64 bytes); + +// Starting at |start_offset| (usually 0), replace the first instance of +// |find_this| with |replace_with|. +BASE_EXPORT void ReplaceFirstSubstringAfterOffset( + string16* str, + string16::size_type start_offset, + const string16& find_this, + const string16& replace_with); +BASE_EXPORT void ReplaceFirstSubstringAfterOffset( + std::string* str, + std::string::size_type start_offset, + const std::string& find_this, + const std::string& replace_with); + +// Starting at |start_offset| (usually 0), look through |str| and replace all +// instances of |find_this| with |replace_with|. 
+// +// This does entire substrings; use std::replace in <algorithm> for single +// characters, for example: +// std::replace(str.begin(), str.end(), 'a', 'b'); +BASE_EXPORT void ReplaceSubstringsAfterOffset( + string16* str, + string16::size_type start_offset, + const string16& find_this, + const string16& replace_with); +BASE_EXPORT void ReplaceSubstringsAfterOffset( + std::string* str, + std::string::size_type start_offset, + const std::string& find_this, + const std::string& replace_with); + +// Reserves enough memory in |str| to accommodate |length_with_null| characters, +// sets the size of |str| to |length_with_null - 1| characters, and returns a +// pointer to the underlying contiguous array of characters. This is typically +// used when calling a function that writes results into a character array, but +// the caller wants the data to be managed by a string-like object. It is +// convenient in that it can be used inline in the call, and fast in that it +// avoids copying the results of the call from a char* into a string. +// +// |length_with_null| must be at least 2, since otherwise the underlying string +// would have size 0, and trying to access &((*str)[0]) in that case can result +// in a number of problems. +// +// Internally, this takes linear time because the resize() call 0-fills the +// underlying array for potentially all +// (|length_with_null - 1| * sizeof(string_type::value_type)) bytes. Ideally we +// could avoid this aspect of the resize() call, as we expect the caller to +// immediately write over this memory, but there is no other way to set the size +// of the string, and not doing that will mean people who access |str| rather +// than str.c_str() will get back a string of whatever size |str| had on entry +// to this function (probably 0).
+template <class string_type> +inline typename string_type::value_type* WriteInto(string_type* str, + size_t length_with_null) { + DCHECK_GT(length_with_null, 1u); + str->reserve(length_with_null); + str->resize(length_with_null - 1); + return &((*str)[0]); +} + +//----------------------------------------------------------------------------- + +// Splits a string into its fields delimited by any of the characters in +// |delimiters|. Each field is added to the |tokens| vector. Returns the +// number of tokens found. +BASE_EXPORT size_t Tokenize(const std::wstring& str, + const std::wstring& delimiters, + std::vector<std::wstring>* tokens); +BASE_EXPORT size_t Tokenize(const string16& str, + const string16& delimiters, + std::vector<string16>* tokens); +BASE_EXPORT size_t Tokenize(const std::string& str, + const std::string& delimiters, + std::vector<std::string>* tokens); +BASE_EXPORT size_t Tokenize(const base::StringPiece& str, + const base::StringPiece& delimiters, + std::vector<base::StringPiece>* tokens); + +// Does the opposite of SplitString(). +BASE_EXPORT string16 JoinString(const std::vector<string16>& parts, char16 s); +BASE_EXPORT std::string JoinString( + const std::vector<std::string>& parts, char s); + +// Join |parts| using |separator|. +BASE_EXPORT std::string JoinString( + const std::vector<std::string>& parts, + const std::string& separator); +BASE_EXPORT string16 JoinString( + const std::vector<string16>& parts, + const string16& separator); + +// Replace $1-$2-$3..$9 in the format string with |a|-|b|-|c|..|i| respectively. +// Additionally, any number of consecutive '$' characters is replaced by that +// number less one. Eg $$->$, $$$->$$, etc. The offsets parameter here can be +// NULL. This only allows you to use up to nine replacements. 
+BASE_EXPORT string16 ReplaceStringPlaceholders( + const string16& format_string, + const std::vector<string16>& subst, + std::vector<size_t>* offsets); + +BASE_EXPORT std::string ReplaceStringPlaceholders( + const base::StringPiece& format_string, + const std::vector<std::string>& subst, + std::vector<size_t>* offsets); + +// Single-string shortcut for ReplaceStringHolders. |offset| may be NULL. +BASE_EXPORT string16 ReplaceStringPlaceholders(const string16& format_string, + const string16& a, + size_t* offset); + +// Returns true if the string passed in matches the pattern. The pattern +// string can contain wildcards like * and ? +// The backslash character (\) is an escape character for * and ? +// We limit the patterns to having a max of 16 * or ? characters. +// ? matches 0 or 1 character, while * matches 0 or more characters. +BASE_EXPORT bool MatchPattern(const base::StringPiece& string, + const base::StringPiece& pattern); +BASE_EXPORT bool MatchPattern(const string16& string, const string16& pattern); + +// Hack to convert any char-like type to its unsigned counterpart. +// For example, it will convert char, signed char and unsigned char to unsigned +// char. +template<typename T> +struct ToUnsigned { + typedef T Unsigned; +}; + +template<> +struct ToUnsigned<char> { + typedef unsigned char Unsigned; +}; +template<> +struct ToUnsigned<signed char> { + typedef unsigned char Unsigned; +}; +template<> +struct ToUnsigned<wchar_t> { +#if defined(WCHAR_T_IS_UTF16) + typedef unsigned short Unsigned; +#elif defined(WCHAR_T_IS_UTF32) + typedef uint32 Unsigned; +#endif +}; +template<> +struct ToUnsigned<short> { + typedef unsigned short Unsigned; +}; + +#endif // BASE_STRINGS_STRING_UTIL_H_ diff --git a/base/strings/string_util_constants.cc b/base/strings/string_util_constants.cc new file mode 100644 index 0000000..d92e40c --- /dev/null +++ b/base/strings/string_util_constants.cc @@ -0,0 +1,55 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. 
+// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/strings/string_util.h" + +#define WHITESPACE_UNICODE \ + 0x0009, /* <control-0009> to <control-000D> */ \ + 0x000A, \ + 0x000B, \ + 0x000C, \ + 0x000D, \ + 0x0020, /* Space */ \ + 0x0085, /* <control-0085> */ \ + 0x00A0, /* No-Break Space */ \ + 0x1680, /* Ogham Space Mark */ \ + 0x180E, /* Mongolian Vowel Separator */ \ + 0x2000, /* En Quad to Hair Space */ \ + 0x2001, \ + 0x2002, \ + 0x2003, \ + 0x2004, \ + 0x2005, \ + 0x2006, \ + 0x2007, \ + 0x2008, \ + 0x2009, \ + 0x200A, \ + 0x200C, /* Zero Width Non-Joiner */ \ + 0x2028, /* Line Separator */ \ + 0x2029, /* Paragraph Separator */ \ + 0x202F, /* Narrow No-Break Space */ \ + 0x205F, /* Medium Mathematical Space */ \ + 0x3000, /* Ideographic Space */ \ + 0 + +const wchar_t kWhitespaceWide[] = { + WHITESPACE_UNICODE +}; + +const char16 kWhitespaceUTF16[] = { + WHITESPACE_UNICODE +}; + +const char kWhitespaceASCII[] = { + 0x09, // <control-0009> to <control-000D> + 0x0A, + 0x0B, + 0x0C, + 0x0D, + 0x20, // Space + 0 +}; + +const char kUtf8ByteOrderMark[] = "\xEF\xBB\xBF"; diff --git a/base/strings/string_util_posix.h b/base/strings/string_util_posix.h new file mode 100644 index 0000000..34b14f1 --- /dev/null +++ b/base/strings/string_util_posix.h @@ -0,0 +1,53 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_STRINGS_STRING_UTIL_POSIX_H_ +#define BASE_STRINGS_STRING_UTIL_POSIX_H_ + +#include <stdarg.h> +#include <stdio.h> +#include <string.h> +#include <wchar.h> + +#include "base/logging.h" +#include "base/strings/string_util.h" + +namespace base { + +// Chromium code style is to not use malloc'd strings; this is only for use +// for interaction with APIs that require it. 
+inline char* strdup(const char* str) { + return ::strdup(str); +} + +inline int strcasecmp(const char* string1, const char* string2) { + return ::strcasecmp(string1, string2); +} + +inline int strncasecmp(const char* string1, const char* string2, size_t count) { + return ::strncasecmp(string1, string2, count); +} + +inline int vsnprintf(char* buffer, size_t size, + const char* format, va_list arguments) { + return ::vsnprintf(buffer, size, format, arguments); +} + +inline int strncmp16(const char16* s1, const char16* s2, size_t count) { +#if defined(WCHAR_T_IS_UTF16) + return ::wcsncmp(s1, s2, count); +#elif defined(WCHAR_T_IS_UTF32) + return c16memcmp(s1, s2, count); +#endif +} + +inline int vswprintf(wchar_t* buffer, size_t size, + const wchar_t* format, va_list arguments) { + DCHECK(IsWprintfFormatPortable(format)); + return ::vswprintf(buffer, size, format, arguments); +} + +} // namespace base + +#endif // BASE_STRINGS_STRING_UTIL_POSIX_H_ diff --git a/base/strings/string_util_unittest.cc b/base/strings/string_util_unittest.cc new file mode 100644 index 0000000..58b7620 --- /dev/null +++ b/base/strings/string_util_unittest.cc @@ -0,0 +1,1191 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "base/strings/string_util.h" + +#include <math.h> +#include <stdarg.h> + +#include <limits> +#include <sstream> + +#include "base/basictypes.h" +#include "base/strings/string16.h" +#include "base/strings/utf_string_conversions.h" +#include "testing/gmock/include/gmock/gmock.h" +#include "testing/gtest/include/gtest/gtest.h" + +using ::testing::ElementsAre; + +namespace base { + +static const struct trim_case { + const wchar_t* input; + const TrimPositions positions; + const wchar_t* output; + const TrimPositions return_value; +} trim_cases[] = { + {L" Google Video ", TRIM_LEADING, L"Google Video ", TRIM_LEADING}, + {L" Google Video ", TRIM_TRAILING, L" Google Video", TRIM_TRAILING}, + {L" Google Video ", TRIM_ALL, L"Google Video", TRIM_ALL}, + {L"Google Video", TRIM_ALL, L"Google Video", TRIM_NONE}, + {L"", TRIM_ALL, L"", TRIM_NONE}, + {L" ", TRIM_LEADING, L"", TRIM_LEADING}, + {L" ", TRIM_TRAILING, L"", TRIM_TRAILING}, + {L" ", TRIM_ALL, L"", TRIM_ALL}, + {L"\t\rTest String\n", TRIM_ALL, L"Test String", TRIM_ALL}, + {L"\x2002Test String\x00A0\x3000", TRIM_ALL, L"Test String", TRIM_ALL}, +}; + +static const struct trim_case_ascii { + const char* input; + const TrimPositions positions; + const char* output; + const TrimPositions return_value; +} trim_cases_ascii[] = { + {" Google Video ", TRIM_LEADING, "Google Video ", TRIM_LEADING}, + {" Google Video ", TRIM_TRAILING, " Google Video", TRIM_TRAILING}, + {" Google Video ", TRIM_ALL, "Google Video", TRIM_ALL}, + {"Google Video", TRIM_ALL, "Google Video", TRIM_NONE}, + {"", TRIM_ALL, "", TRIM_NONE}, + {" ", TRIM_LEADING, "", TRIM_LEADING}, + {" ", TRIM_TRAILING, "", TRIM_TRAILING}, + {" ", TRIM_ALL, "", TRIM_ALL}, + {"\t\rTest String\n", TRIM_ALL, "Test String", TRIM_ALL}, +}; + +namespace { + +// Helper used to test TruncateUTF8ToByteSize. 
+bool Truncated(const std::string& input, const size_t byte_size, + std::string* output) { + size_t prev = input.length(); + TruncateUTF8ToByteSize(input, byte_size, output); + return prev != output->length(); +} + +} // namespace + +TEST(StringUtilTest, TruncateUTF8ToByteSize) { + std::string output; + + // Empty strings and invalid byte_size arguments + EXPECT_FALSE(Truncated(std::string(), 0, &output)); + EXPECT_EQ(output, ""); + EXPECT_TRUE(Truncated("\xe1\x80\xbf", 0, &output)); + EXPECT_EQ(output, ""); + EXPECT_FALSE(Truncated("\xe1\x80\xbf", -1, &output)); + EXPECT_FALSE(Truncated("\xe1\x80\xbf", 4, &output)); + + // Testing the truncation of valid UTF8 correctly + EXPECT_TRUE(Truncated("abc", 2, &output)); + EXPECT_EQ(output, "ab"); + EXPECT_TRUE(Truncated("\xc2\x81\xc2\x81", 2, &output)); + EXPECT_EQ(output.compare("\xc2\x81"), 0); + EXPECT_TRUE(Truncated("\xc2\x81\xc2\x81", 3, &output)); + EXPECT_EQ(output.compare("\xc2\x81"), 0); + EXPECT_FALSE(Truncated("\xc2\x81\xc2\x81", 4, &output)); + EXPECT_EQ(output.compare("\xc2\x81\xc2\x81"), 0); + + { + const char array[] = "\x00\x00\xc2\x81\xc2\x81"; + const std::string array_string(array, arraysize(array)); + EXPECT_TRUE(Truncated(array_string, 4, &output)); + EXPECT_EQ(output.compare(std::string("\x00\x00\xc2\x81", 4)), 0); + } + + { + const char array[] = "\x00\xc2\x81\xc2\x81"; + const std::string array_string(array, arraysize(array)); + EXPECT_TRUE(Truncated(array_string, 4, &output)); + EXPECT_EQ(output.compare(std::string("\x00\xc2\x81", 3)), 0); + } + + // Testing invalid UTF8 + EXPECT_TRUE(Truncated("\xed\xa0\x80\xed\xbf\xbf", 6, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xed\xa0\x8f", 3, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xed\xbf\xbf", 3, &output)); + EXPECT_EQ(output.compare(""), 0); + + // Testing invalid UTF8 mixed with valid UTF8 + EXPECT_FALSE(Truncated("\xe1\x80\xbf", 3, &output)); + EXPECT_EQ(output.compare("\xe1\x80\xbf"), 
0); + EXPECT_FALSE(Truncated("\xf1\x80\xa0\xbf", 4, &output)); + EXPECT_EQ(output.compare("\xf1\x80\xa0\xbf"), 0); + EXPECT_FALSE(Truncated("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf", + 10, &output)); + EXPECT_EQ(output.compare("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf"), 0); + EXPECT_TRUE(Truncated("a\xc2\x81\xe1\x80\xbf\xf1""a""\x80\xa0", + 10, &output)); + EXPECT_EQ(output.compare("a\xc2\x81\xe1\x80\xbf\xf1""a"), 0); + EXPECT_FALSE(Truncated("\xef\xbb\xbf" "abc", 6, &output)); + EXPECT_EQ(output.compare("\xef\xbb\xbf" "abc"), 0); + + // Overlong sequences + EXPECT_TRUE(Truncated("\xc0\x80", 2, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xc1\x80\xc1\x81", 4, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xe0\x80\x80", 3, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xe0\x82\x80", 3, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xe0\x9f\xbf", 3, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xf0\x80\x80\x8D", 4, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xf0\x80\x82\x91", 4, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xf0\x80\xa0\x80", 4, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xf0\x8f\xbb\xbf", 4, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xf8\x80\x80\x80\xbf", 5, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xfc\x80\x80\x80\xa0\xa5", 6, &output)); + EXPECT_EQ(output.compare(""), 0); + + // Beyond U+10FFFF (the upper limit of Unicode codespace) + EXPECT_TRUE(Truncated("\xf4\x90\x80\x80", 4, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xf8\xa0\xbf\x80\xbf", 5, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xfc\x9c\xbf\x80\xbf\x80", 6, &output)); + EXPECT_EQ(output.compare(""), 0); + + // BOMs in UTF-16(BE|LE) and UTF-32(BE|LE) + 
EXPECT_TRUE(Truncated("\xfe\xff", 2, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xff\xfe", 2, &output)); + EXPECT_EQ(output.compare(""), 0); + + { + const char array[] = "\x00\x00\xfe\xff"; + const std::string array_string(array, arraysize(array)); + EXPECT_TRUE(Truncated(array_string, 4, &output)); + EXPECT_EQ(output.compare(std::string("\x00\x00", 2)), 0); + } + + // Variants on the previous test + { + const char array[] = "\xff\xfe\x00\x00"; + const std::string array_string(array, 4); + EXPECT_FALSE(Truncated(array_string, 4, &output)); + EXPECT_EQ(output.compare(std::string("\xff\xfe\x00\x00", 4)), 0); + } + { + const char array[] = "\xff\x00\x00\xfe"; + const std::string array_string(array, arraysize(array)); + EXPECT_TRUE(Truncated(array_string, 4, &output)); + EXPECT_EQ(output.compare(std::string("\xff\x00\x00", 3)), 0); + } + + // Non-characters : U+xxFFF[EF] where xx is 0x00 through 0x10 and <FDD0,FDEF> + EXPECT_TRUE(Truncated("\xef\xbf\xbe", 3, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xf0\x8f\xbf\xbe", 4, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xf3\xbf\xbf\xbf", 4, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xef\xb7\x90", 3, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_TRUE(Truncated("\xef\xb7\xaf", 3, &output)); + EXPECT_EQ(output.compare(""), 0); + + // Strings in legacy encodings that are valid in UTF-8, but + // are invalid as UTF-8 in real data. + EXPECT_TRUE(Truncated("caf\xe9", 4, &output)); + EXPECT_EQ(output.compare("caf"), 0); + EXPECT_TRUE(Truncated("\xb0\xa1\xb0\xa2", 4, &output)); + EXPECT_EQ(output.compare(""), 0); + EXPECT_FALSE(Truncated("\xa7\x41\xa6\x6e", 4, &output)); + EXPECT_EQ(output.compare("\xa7\x41\xa6\x6e"), 0); + EXPECT_TRUE(Truncated("\xa7\x41\xa6\x6e\xd9\xee\xe4\xee", 7, + &output)); + EXPECT_EQ(output.compare("\xa7\x41\xa6\x6e"), 0); + + // Testing using the same string as input and output. 
+ EXPECT_FALSE(Truncated(output, 4, &output)); + EXPECT_EQ(output.compare("\xa7\x41\xa6\x6e"), 0); + EXPECT_TRUE(Truncated(output, 3, &output)); + EXPECT_EQ(output.compare("\xa7\x41"), 0); + + // "abc" with U+201[CD] in windows-125[0-8] + EXPECT_TRUE(Truncated("\x93" "abc\x94", 5, &output)); + EXPECT_EQ(output.compare("\x93" "abc"), 0); + + // U+0639 U+064E U+0644 U+064E in ISO-8859-6 + EXPECT_TRUE(Truncated("\xd9\xee\xe4\xee", 4, &output)); + EXPECT_EQ(output.compare(""), 0); + + // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7 + EXPECT_TRUE(Truncated("\xe3\xe5\xe9\xdC", 4, &output)); + EXPECT_EQ(output.compare(""), 0); +} + +TEST(StringUtilTest, TrimWhitespace) { + string16 output; // Allow contents to carry over to next testcase + for (size_t i = 0; i < arraysize(trim_cases); ++i) { + const trim_case& value = trim_cases[i]; + EXPECT_EQ(value.return_value, + TrimWhitespace(WideToUTF16(value.input), value.positions, + &output)); + EXPECT_EQ(WideToUTF16(value.output), output); + } + + // Test that TrimWhitespace() can take the same string for input and output + output = ASCIIToUTF16(" This is a test \r\n"); + EXPECT_EQ(TRIM_ALL, TrimWhitespace(output, TRIM_ALL, &output)); + EXPECT_EQ(ASCIIToUTF16("This is a test"), output); + + // Once more, but with a string of whitespace + output = ASCIIToUTF16(" \r\n"); + EXPECT_EQ(TRIM_ALL, TrimWhitespace(output, TRIM_ALL, &output)); + EXPECT_EQ(string16(), output); + + std::string output_ascii; + for (size_t i = 0; i < arraysize(trim_cases_ascii); ++i) { + const trim_case_ascii& value = trim_cases_ascii[i]; + EXPECT_EQ(value.return_value, + TrimWhitespace(value.input, value.positions, &output_ascii)); + EXPECT_EQ(value.output, output_ascii); + } +} + +static const struct collapse_case { + const wchar_t* input; + const bool trim; + const wchar_t* output; +} collapse_cases[] = { + {L" Google Video ", false, L"Google Video"}, + {L"Google Video", false, L"Google Video"}, + {L"", false, L""}, + {L" ", false, L""}, + {L"\t\rTest 
String\n", false, L"Test String"}, + {L"\x2002Test String\x00A0\x3000", false, L"Test String"}, + {L" Test \n \t String ", false, L"Test String"}, + {L"\x2002Test\x1680 \x2028 \tString\x00A0\x3000", false, L"Test String"}, + {L" Test String", false, L"Test String"}, + {L"Test String ", false, L"Test String"}, + {L"Test String", false, L"Test String"}, + {L"", true, L""}, + {L"\n", true, L""}, + {L" \r ", true, L""}, + {L"\nFoo", true, L"Foo"}, + {L"\r Foo ", true, L"Foo"}, + {L" Foo bar ", true, L"Foo bar"}, + {L" \tFoo bar \n", true, L"Foo bar"}, + {L" a \r b\n c \r\n d \t\re \t f \n ", true, L"abcde f"}, +}; + +TEST(StringUtilTest, CollapseWhitespace) { + for (size_t i = 0; i < arraysize(collapse_cases); ++i) { + const collapse_case& value = collapse_cases[i]; + EXPECT_EQ(value.output, CollapseWhitespace(value.input, value.trim)); + } +} + +static const struct collapse_case_ascii { + const char* input; + const bool trim; + const char* output; +} collapse_cases_ascii[] = { + {" Google Video ", false, "Google Video"}, + {"Google Video", false, "Google Video"}, + {"", false, ""}, + {" ", false, ""}, + {"\t\rTest String\n", false, "Test String"}, + {" Test \n \t String ", false, "Test String"}, + {" Test String", false, "Test String"}, + {"Test String ", false, "Test String"}, + {"Test String", false, "Test String"}, + {"", true, ""}, + {"\n", true, ""}, + {" \r ", true, ""}, + {"\nFoo", true, "Foo"}, + {"\r Foo ", true, "Foo"}, + {" Foo bar ", true, "Foo bar"}, + {" \tFoo bar \n", true, "Foo bar"}, + {" a \r b\n c \r\n d \t\re \t f \n ", true, "abcde f"}, +}; + +TEST(StringUtilTest, CollapseWhitespaceASCII) { + for (size_t i = 0; i < arraysize(collapse_cases_ascii); ++i) { + const collapse_case_ascii& value = collapse_cases_ascii[i]; + EXPECT_EQ(value.output, CollapseWhitespaceASCII(value.input, value.trim)); + } +} + +TEST(StringUtilTest, ContainsOnlyWhitespaceASCII) { + EXPECT_TRUE(ContainsOnlyWhitespaceASCII(std::string())); + 
EXPECT_TRUE(ContainsOnlyWhitespaceASCII(" ")); + EXPECT_TRUE(ContainsOnlyWhitespaceASCII("\t")); + EXPECT_TRUE(ContainsOnlyWhitespaceASCII("\t \r \n ")); + EXPECT_FALSE(ContainsOnlyWhitespaceASCII("a")); + EXPECT_FALSE(ContainsOnlyWhitespaceASCII("\thello\r \n ")); +} + +TEST(StringUtilTest, ContainsOnlyWhitespace) { + EXPECT_TRUE(ContainsOnlyWhitespace(string16())); + EXPECT_TRUE(ContainsOnlyWhitespace(ASCIIToUTF16(" "))); + EXPECT_TRUE(ContainsOnlyWhitespace(ASCIIToUTF16("\t"))); + EXPECT_TRUE(ContainsOnlyWhitespace(ASCIIToUTF16("\t \r \n "))); + EXPECT_FALSE(ContainsOnlyWhitespace(ASCIIToUTF16("a"))); + EXPECT_FALSE(ContainsOnlyWhitespace(ASCIIToUTF16("\thello\r \n "))); +} + +TEST(StringUtilTest, IsStringUTF8) { + EXPECT_TRUE(IsStringUTF8("abc")); + EXPECT_TRUE(IsStringUTF8("\xc2\x81")); + EXPECT_TRUE(IsStringUTF8("\xe1\x80\xbf")); + EXPECT_TRUE(IsStringUTF8("\xf1\x80\xa0\xbf")); + EXPECT_TRUE(IsStringUTF8("a\xc2\x81\xe1\x80\xbf\xf1\x80\xa0\xbf")); + EXPECT_TRUE(IsStringUTF8("\xef\xbb\xbf" "abc")); // UTF-8 BOM + + // surrogate code points + EXPECT_FALSE(IsStringUTF8("\xed\xa0\x80\xed\xbf\xbf")); + EXPECT_FALSE(IsStringUTF8("\xed\xa0\x8f")); + EXPECT_FALSE(IsStringUTF8("\xed\xbf\xbf")); + + // overlong sequences + EXPECT_FALSE(IsStringUTF8("\xc0\x80")); // U+0000 + EXPECT_FALSE(IsStringUTF8("\xc1\x80\xc1\x81")); // "AB" + EXPECT_FALSE(IsStringUTF8("\xe0\x80\x80")); // U+0000 + EXPECT_FALSE(IsStringUTF8("\xe0\x82\x80")); // U+0080 + EXPECT_FALSE(IsStringUTF8("\xe0\x9f\xbf")); // U+07ff + EXPECT_FALSE(IsStringUTF8("\xf0\x80\x80\x8D")); // U+000D + EXPECT_FALSE(IsStringUTF8("\xf0\x80\x82\x91")); // U+0091 + EXPECT_FALSE(IsStringUTF8("\xf0\x80\xa0\x80")); // U+0800 + EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbb\xbf")); // U+FEFF (BOM) + EXPECT_FALSE(IsStringUTF8("\xf8\x80\x80\x80\xbf")); // U+003F + EXPECT_FALSE(IsStringUTF8("\xfc\x80\x80\x80\xa0\xa5")); // U+00A5 + + // Beyond U+10FFFF (the upper limit of Unicode codespace) + 
EXPECT_FALSE(IsStringUTF8("\xf4\x90\x80\x80")); // U+110000 + EXPECT_FALSE(IsStringUTF8("\xf8\xa0\xbf\x80\xbf")); // 5 bytes + EXPECT_FALSE(IsStringUTF8("\xfc\x9c\xbf\x80\xbf\x80")); // 6 bytes + + // BOMs in UTF-16(BE|LE) and UTF-32(BE|LE) + EXPECT_FALSE(IsStringUTF8("\xfe\xff")); + EXPECT_FALSE(IsStringUTF8("\xff\xfe")); + EXPECT_FALSE(IsStringUTF8(std::string("\x00\x00\xfe\xff", 4))); + EXPECT_FALSE(IsStringUTF8("\xff\xfe\x00\x00")); + + // Non-characters : U+xxFFF[EF] where xx is 0x00 through 0x10 and <FDD0,FDEF> + EXPECT_FALSE(IsStringUTF8("\xef\xbf\xbe")); // U+FFFE) + EXPECT_FALSE(IsStringUTF8("\xf0\x8f\xbf\xbe")); // U+1FFFE + EXPECT_FALSE(IsStringUTF8("\xf3\xbf\xbf\xbf")); // U+10FFFF + EXPECT_FALSE(IsStringUTF8("\xef\xb7\x90")); // U+FDD0 + EXPECT_FALSE(IsStringUTF8("\xef\xb7\xaf")); // U+FDEF + // Strings in legacy encodings. We can certainly make up strings + // in a legacy encoding that are valid in UTF-8, but in real data, + // most of them are invalid as UTF-8. + EXPECT_FALSE(IsStringUTF8("caf\xe9")); // cafe with U+00E9 in ISO-8859-1 + EXPECT_FALSE(IsStringUTF8("\xb0\xa1\xb0\xa2")); // U+AC00, U+AC001 in EUC-KR + EXPECT_FALSE(IsStringUTF8("\xa7\x41\xa6\x6e")); // U+4F60 U+597D in Big5 + // "abc" with U+201[CD] in windows-125[0-8] + EXPECT_FALSE(IsStringUTF8("\x93" "abc\x94")); + // U+0639 U+064E U+0644 U+064E in ISO-8859-6 + EXPECT_FALSE(IsStringUTF8("\xd9\xee\xe4\xee")); + // U+03B3 U+03B5 U+03B9 U+03AC in ISO-8859-7 + EXPECT_FALSE(IsStringUTF8("\xe3\xe5\xe9\xdC")); + + // Check that we support Embedded Nulls. The first uses the canonical UTF-8 + // representation, and the second uses a 2-byte sequence. The second version + // is invalid UTF-8 since UTF-8 states that the shortest encoding for a + // given codepoint must be used. 
+ static const char kEmbeddedNull[] = "embedded\0null"; + EXPECT_TRUE(IsStringUTF8( + std::string(kEmbeddedNull, sizeof(kEmbeddedNull)))); + EXPECT_FALSE(IsStringUTF8("embedded\xc0\x80U+0000")); +} + +TEST(StringUtilTest, ConvertASCII) { + static const char* char_cases[] = { + "Google Video", + "Hello, world\n", + "0123ABCDwxyz \a\b\t\r\n!+,.~" + }; + + static const wchar_t* const wchar_cases[] = { + L"Google Video", + L"Hello, world\n", + L"0123ABCDwxyz \a\b\t\r\n!+,.~" + }; + + for (size_t i = 0; i < arraysize(char_cases); ++i) { + EXPECT_TRUE(IsStringASCII(char_cases[i])); + std::wstring wide = ASCIIToWide(char_cases[i]); + EXPECT_EQ(wchar_cases[i], wide); + + EXPECT_TRUE(IsStringASCII(wchar_cases[i])); + std::string ascii = WideToASCII(wchar_cases[i]); + EXPECT_EQ(char_cases[i], ascii); + } + + EXPECT_FALSE(IsStringASCII("Google \x80Video")); + EXPECT_FALSE(IsStringASCII(L"Google \x80Video")); + + // Convert empty strings. + std::wstring wempty; + std::string empty; + EXPECT_EQ(empty, WideToASCII(wempty)); + EXPECT_EQ(wempty, ASCIIToWide(empty)); + + // Convert strings with an embedded NUL character. 
+ const char chars_with_nul[] = "test\0string"; + const int length_with_nul = arraysize(chars_with_nul) - 1; + std::string string_with_nul(chars_with_nul, length_with_nul); + std::wstring wide_with_nul = ASCIIToWide(string_with_nul); + EXPECT_EQ(static_cast<std::wstring::size_type>(length_with_nul), + wide_with_nul.length()); + std::string narrow_with_nul = WideToASCII(wide_with_nul); + EXPECT_EQ(static_cast<std::string::size_type>(length_with_nul), + narrow_with_nul.length()); + EXPECT_EQ(0, string_with_nul.compare(narrow_with_nul)); +} + +TEST(StringUtilTest, ToUpperASCII) { + EXPECT_EQ('C', ToUpperASCII('C')); + EXPECT_EQ('C', ToUpperASCII('c')); + EXPECT_EQ('2', ToUpperASCII('2')); + + EXPECT_EQ(L'C', ToUpperASCII(L'C')); + EXPECT_EQ(L'C', ToUpperASCII(L'c')); + EXPECT_EQ(L'2', ToUpperASCII(L'2')); + + std::string in_place_a("Cc2"); + StringToUpperASCII(&in_place_a); + EXPECT_EQ("CC2", in_place_a); + + std::wstring in_place_w(L"Cc2"); + StringToUpperASCII(&in_place_w); + EXPECT_EQ(L"CC2", in_place_w); + + std::string original_a("Cc2"); + std::string upper_a = StringToUpperASCII(original_a); + EXPECT_EQ("CC2", upper_a); + + std::wstring original_w(L"Cc2"); + std::wstring upper_w = StringToUpperASCII(original_w); + EXPECT_EQ(L"CC2", upper_w); +} + +TEST(StringUtilTest, LowerCaseEqualsASCII) { + static const struct { + const wchar_t* src_w; + const char* src_a; + const char* dst; + } lowercase_cases[] = { + { L"FoO", "FoO", "foo" }, + { L"foo", "foo", "foo" }, + { L"FOO", "FOO", "foo" }, + }; + + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(lowercase_cases); ++i) { + EXPECT_TRUE(LowerCaseEqualsASCII(lowercase_cases[i].src_w, + lowercase_cases[i].dst)); + EXPECT_TRUE(LowerCaseEqualsASCII(lowercase_cases[i].src_a, + lowercase_cases[i].dst)); + } +} + +TEST(StringUtilTest, FormatBytesUnlocalized) { + static const struct { + int64 bytes; + const char* expected; + } cases[] = { + // Expected behavior: we show one post-decimal digit when we have + // under two pre-decimal 
digits, except in cases where it makes no + // sense (zero or bytes). + // Since we switch units once we cross the 1000 mark, this keeps + // the display of file sizes or bytes consistently around three + // digits. + {0, "0 B"}, + {512, "512 B"}, + {1024*1024, "1.0 MB"}, + {1024*1024*1024, "1.0 GB"}, + {10LL*1024*1024*1024, "10.0 GB"}, + {99LL*1024*1024*1024, "99.0 GB"}, + {105LL*1024*1024*1024, "105 GB"}, + {105LL*1024*1024*1024 + 500LL*1024*1024, "105 GB"}, + {~(1LL<<63), "8192 PB"}, + + {99*1024 + 103, "99.1 kB"}, + {1024*1024 + 103, "1.0 MB"}, + {1024*1024 + 205 * 1024, "1.2 MB"}, + {1024*1024*1024 + (927 * 1024*1024), "1.9 GB"}, + {10LL*1024*1024*1024, "10.0 GB"}, + {100LL*1024*1024*1024, "100 GB"}, + }; + + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(cases); ++i) { + EXPECT_EQ(ASCIIToUTF16(cases[i].expected), + FormatBytesUnlocalized(cases[i].bytes)); + } +} +TEST(StringUtilTest, ReplaceSubstringsAfterOffset) { + static const struct { + const char* str; + string16::size_type start_offset; + const char* find_this; + const char* replace_with; + const char* expected; + } cases[] = { + {"aaa", 0, "a", "b", "bbb"}, + {"abb", 0, "ab", "a", "ab"}, + {"Removing some substrings inging", 0, "ing", "", "Remov some substrs "}, + {"Not found", 0, "x", "0", "Not found"}, + {"Not found again", 5, "x", "0", "Not found again"}, + {" Making it much longer ", 0, " ", "Four score and seven years ago", + "Four score and seven years agoMakingFour score and seven years agoit" + "Four score and seven years agomuchFour score and seven years agolonger" + "Four score and seven years ago"}, + {"Invalid offset", 9999, "t", "foobar", "Invalid offset"}, + {"Replace me only me once", 9, "me ", "", "Replace me only once"}, + {"abababab", 2, "ab", "c", "abccc"}, + }; + + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(cases); i++) { + string16 str = ASCIIToUTF16(cases[i].str); + ReplaceSubstringsAfterOffset(&str, cases[i].start_offset, + ASCIIToUTF16(cases[i].find_this), + 
ASCIIToUTF16(cases[i].replace_with)); + EXPECT_EQ(ASCIIToUTF16(cases[i].expected), str); + } +} + +TEST(StringUtilTest, ReplaceFirstSubstringAfterOffset) { + static const struct { + const char* str; + string16::size_type start_offset; + const char* find_this; + const char* replace_with; + const char* expected; + } cases[] = { + {"aaa", 0, "a", "b", "baa"}, + {"abb", 0, "ab", "a", "ab"}, + {"Removing some substrings inging", 0, "ing", "", + "Remov some substrings inging"}, + {"Not found", 0, "x", "0", "Not found"}, + {"Not found again", 5, "x", "0", "Not found again"}, + {" Making it much longer ", 0, " ", "Four score and seven years ago", + "Four score and seven years agoMaking it much longer "}, + {"Invalid offset", 9999, "t", "foobar", "Invalid offset"}, + {"Replace me only me once", 4, "me ", "", "Replace only me once"}, + {"abababab", 2, "ab", "c", "abcabab"}, + }; + + for (size_t i = 0; i < ARRAYSIZE_UNSAFE(cases); i++) { + string16 str = ASCIIToUTF16(cases[i].str); + ReplaceFirstSubstringAfterOffset(&str, cases[i].start_offset, + ASCIIToUTF16(cases[i].find_this), + ASCIIToUTF16(cases[i].replace_with)); + EXPECT_EQ(ASCIIToUTF16(cases[i].expected), str); + } +} + +TEST(StringUtilTest, HexDigitToInt) { + EXPECT_EQ(0, HexDigitToInt('0')); + EXPECT_EQ(1, HexDigitToInt('1')); + EXPECT_EQ(2, HexDigitToInt('2')); + EXPECT_EQ(3, HexDigitToInt('3')); + EXPECT_EQ(4, HexDigitToInt('4')); + EXPECT_EQ(5, HexDigitToInt('5')); + EXPECT_EQ(6, HexDigitToInt('6')); + EXPECT_EQ(7, HexDigitToInt('7')); + EXPECT_EQ(8, HexDigitToInt('8')); + EXPECT_EQ(9, HexDigitToInt('9')); + EXPECT_EQ(10, HexDigitToInt('A')); + EXPECT_EQ(11, HexDigitToInt('B')); + EXPECT_EQ(12, HexDigitToInt('C')); + EXPECT_EQ(13, HexDigitToInt('D')); + EXPECT_EQ(14, HexDigitToInt('E')); + EXPECT_EQ(15, HexDigitToInt('F')); + + // Verify the lower case as well. 
+ EXPECT_EQ(10, HexDigitToInt('a')); + EXPECT_EQ(11, HexDigitToInt('b')); + EXPECT_EQ(12, HexDigitToInt('c')); + EXPECT_EQ(13, HexDigitToInt('d')); + EXPECT_EQ(14, HexDigitToInt('e')); + EXPECT_EQ(15, HexDigitToInt('f')); +} + +// This checks where we can use the assignment operator for a va_list. We need +// a way to do this since Visual C doesn't support va_copy, but assignment on +// va_list is not guaranteed to be a copy. See StringAppendVT which uses this +// capability. +static void VariableArgsFunc(const char* format, ...) { + va_list org; + va_start(org, format); + + va_list dup; + GG_VA_COPY(dup, org); + int i1 = va_arg(org, int); + int j1 = va_arg(org, int); + char* s1 = va_arg(org, char*); + double d1 = va_arg(org, double); + va_end(org); + + int i2 = va_arg(dup, int); + int j2 = va_arg(dup, int); + char* s2 = va_arg(dup, char*); + double d2 = va_arg(dup, double); + + EXPECT_EQ(i1, i2); + EXPECT_EQ(j1, j2); + EXPECT_STREQ(s1, s2); + EXPECT_EQ(d1, d2); + + va_end(dup); +} + +TEST(StringUtilTest, VAList) { + VariableArgsFunc("%d %d %s %lf", 45, 92, "This is interesting", 9.21); +} + +// Test for Tokenize +template <typename STR> +void TokenizeTest() { + std::vector<STR> r; + size_t size; + + size = Tokenize(STR("This is a string"), STR(" "), &r); + EXPECT_EQ(4U, size); + ASSERT_EQ(4U, r.size()); + EXPECT_EQ(r[0], STR("This")); + EXPECT_EQ(r[1], STR("is")); + EXPECT_EQ(r[2], STR("a")); + EXPECT_EQ(r[3], STR("string")); + r.clear(); + + size = Tokenize(STR("one,two,three"), STR(","), &r); + EXPECT_EQ(3U, size); + ASSERT_EQ(3U, r.size()); + EXPECT_EQ(r[0], STR("one")); + EXPECT_EQ(r[1], STR("two")); + EXPECT_EQ(r[2], STR("three")); + r.clear(); + + size = Tokenize(STR("one,two:three;four"), STR(",:"), &r); + EXPECT_EQ(3U, size); + ASSERT_EQ(3U, r.size()); + EXPECT_EQ(r[0], STR("one")); + EXPECT_EQ(r[1], STR("two")); + EXPECT_EQ(r[2], STR("three;four")); + r.clear(); + + size = Tokenize(STR("one,two:three;four"), STR(";,:"), &r); + EXPECT_EQ(4U, size); + 
ASSERT_EQ(4U, r.size()); + EXPECT_EQ(r[0], STR("one")); + EXPECT_EQ(r[1], STR("two")); + EXPECT_EQ(r[2], STR("three")); + EXPECT_EQ(r[3], STR("four")); + r.clear(); + + size = Tokenize(STR("one, two, three"), STR(","), &r); + EXPECT_EQ(3U, size); + ASSERT_EQ(3U, r.size()); + EXPECT_EQ(r[0], STR("one")); + EXPECT_EQ(r[1], STR(" two")); + EXPECT_EQ(r[2], STR(" three")); + r.clear(); + + size = Tokenize(STR("one, two, three, "), STR(","), &r); + EXPECT_EQ(4U, size); + ASSERT_EQ(4U, r.size()); + EXPECT_EQ(r[0], STR("one")); + EXPECT_EQ(r[1], STR(" two")); + EXPECT_EQ(r[2], STR(" three")); + EXPECT_EQ(r[3], STR(" ")); + r.clear(); + + size = Tokenize(STR("one, two, three,"), STR(","), &r); + EXPECT_EQ(3U, size); + ASSERT_EQ(3U, r.size()); + EXPECT_EQ(r[0], STR("one")); + EXPECT_EQ(r[1], STR(" two")); + EXPECT_EQ(r[2], STR(" three")); + r.clear(); + + size = Tokenize(STR(), STR(","), &r); + EXPECT_EQ(0U, size); + ASSERT_EQ(0U, r.size()); + r.clear(); + + size = Tokenize(STR(","), STR(","), &r); + EXPECT_EQ(0U, size); + ASSERT_EQ(0U, r.size()); + r.clear(); + + size = Tokenize(STR(",;:."), STR(".:;,"), &r); + EXPECT_EQ(0U, size); + ASSERT_EQ(0U, r.size()); + r.clear(); + + size = Tokenize(STR("\t\ta\t"), STR("\t"), &r); + EXPECT_EQ(1U, size); + ASSERT_EQ(1U, r.size()); + EXPECT_EQ(r[0], STR("a")); + r.clear(); + + size = Tokenize(STR("\ta\t\nb\tcc"), STR("\n"), &r); + EXPECT_EQ(2U, size); + ASSERT_EQ(2U, r.size()); + EXPECT_EQ(r[0], STR("\ta\t")); + EXPECT_EQ(r[1], STR("b\tcc")); + r.clear(); +} + +TEST(StringUtilTest, TokenizeStdString) { + TokenizeTest<std::string>(); +} + +TEST(StringUtilTest, TokenizeStringPiece) { + TokenizeTest<base::StringPiece>(); +} + +// Test for JoinString +TEST(StringUtilTest, JoinString) { + std::vector<std::string> in; + EXPECT_EQ("", JoinString(in, ',')); + + in.push_back("a"); + EXPECT_EQ("a", JoinString(in, ',')); + + in.push_back("b"); + in.push_back("c"); + EXPECT_EQ("a,b,c", JoinString(in, ',')); + + in.push_back(std::string()); + 
EXPECT_EQ("a,b,c,", JoinString(in, ','));
  in.push_back(" ");
  EXPECT_EQ("a|b|c|| ", JoinString(in, '|'));
}

// Test for JoinString overloaded with std::string separator.
// |parts| is grown step by step (empty, one element, three elements, a
// trailing empty element, a trailing single-space element) and the joined
// result is checked after each step.
TEST(StringUtilTest, JoinStringWithString) {
  std::string separator(", ");
  std::vector<std::string> parts;
  EXPECT_EQ(std::string(), JoinString(parts, separator));

  parts.push_back("a");
  EXPECT_EQ("a", JoinString(parts, separator));

  parts.push_back("b");
  parts.push_back("c");
  EXPECT_EQ("a, b, c", JoinString(parts, separator));

  parts.push_back(std::string());
  EXPECT_EQ("a, b, c, ", JoinString(parts, separator));
  parts.push_back(" ");
  EXPECT_EQ("a|b|c|| ", JoinString(parts, "|"));
}

// Test for JoinString overloaded with string16 separator.
// Mirrors JoinStringWithString above, with every value converted through
// ASCIIToUTF16.
TEST(StringUtilTest, JoinStringWithString16) {
  string16 separator = ASCIIToUTF16(", ");
  std::vector<string16> parts;
  EXPECT_EQ(string16(), JoinString(parts, separator));

  parts.push_back(ASCIIToUTF16("a"));
  EXPECT_EQ(ASCIIToUTF16("a"), JoinString(parts, separator));

  parts.push_back(ASCIIToUTF16("b"));
  parts.push_back(ASCIIToUTF16("c"));
  EXPECT_EQ(ASCIIToUTF16("a, b, c"), JoinString(parts, separator));

  parts.push_back(ASCIIToUTF16(""));
  EXPECT_EQ(ASCIIToUTF16("a, b, c, "), JoinString(parts, separator));
  parts.push_back(ASCIIToUTF16(" "));
  EXPECT_EQ(ASCIIToUTF16("a|b|c|| "), JoinString(parts, ASCIIToUTF16("|")));
}

// Runs the same ten cases against StartsWithASCII (narrow) and StartsWith
// (wide). NOTE(review): judging by the "JavaScript" vs "javascript" cases the
// third argument selects case-sensitive matching -- confirm against the
// declaration. An empty prefix is expected to match; an empty subject is not.
TEST(StringUtilTest, StartsWith) {
  EXPECT_TRUE(StartsWithASCII("javascript:url", "javascript", true));
  EXPECT_FALSE(StartsWithASCII("JavaScript:url", "javascript", true));
  EXPECT_TRUE(StartsWithASCII("javascript:url", "javascript", false));
  EXPECT_TRUE(StartsWithASCII("JavaScript:url", "javascript", false));
  EXPECT_FALSE(StartsWithASCII("java", "javascript", true));
  EXPECT_FALSE(StartsWithASCII("java", "javascript", false));
  EXPECT_FALSE(StartsWithASCII(std::string(), "javascript", false));
  EXPECT_FALSE(StartsWithASCII(std::string(), "javascript", true));
  EXPECT_TRUE(StartsWithASCII("java", std::string(), false));
  EXPECT_TRUE(StartsWithASCII("java", std::string(), true));

  EXPECT_TRUE(StartsWith(L"javascript:url", L"javascript", true));
  EXPECT_FALSE(StartsWith(L"JavaScript:url", L"javascript", true));
  EXPECT_TRUE(StartsWith(L"javascript:url", L"javascript", false));
  EXPECT_TRUE(StartsWith(L"JavaScript:url", L"javascript", false));
  EXPECT_FALSE(StartsWith(L"java", L"javascript", true));
  EXPECT_FALSE(StartsWith(L"java", L"javascript", false));
  EXPECT_FALSE(StartsWith(std::wstring(), L"javascript", false));
  EXPECT_FALSE(StartsWith(std::wstring(), L"javascript", true));
  EXPECT_TRUE(StartsWith(L"java", std::wstring(), false));
  EXPECT_TRUE(StartsWith(L"java", std::wstring(), true));
}

// Wide-string EndsWith: suffix present / absent, suffix longer than the
// subject, suffix in the middle of the subject, and empty subject and/or
// suffix, each with the third argument set to both true and false.
TEST(StringUtilTest, EndsWith) {
  EXPECT_TRUE(EndsWith(L"Foo.plugin", L".plugin", true));
  EXPECT_FALSE(EndsWith(L"Foo.Plugin", L".plugin", true));
  EXPECT_TRUE(EndsWith(L"Foo.plugin", L".plugin", false));
  EXPECT_TRUE(EndsWith(L"Foo.Plugin", L".plugin", false));
  EXPECT_FALSE(EndsWith(L".plug", L".plugin", true));
  EXPECT_FALSE(EndsWith(L".plug", L".plugin", false));
  EXPECT_FALSE(EndsWith(L"Foo.plugin Bar", L".plugin", true));
  EXPECT_FALSE(EndsWith(L"Foo.plugin Bar", L".plugin", false));
  EXPECT_FALSE(EndsWith(std::wstring(), L".plugin", false));
  EXPECT_FALSE(EndsWith(std::wstring(), L".plugin", true));
  EXPECT_TRUE(EndsWith(L"Foo.plugin", std::wstring(), false));
  EXPECT_TRUE(EndsWith(L"Foo.plugin", std::wstring(), true));
  EXPECT_TRUE(EndsWith(L".plugin", L".plugin", false));
  EXPECT_TRUE(EndsWith(L".plugin", L".plugin", true));
  EXPECT_TRUE(EndsWith(std::wstring(), std::wstring(), false));
  EXPECT_TRUE(EndsWith(std::wstring(), std::wstring(), true));
}

// Checks the |offsets| out-parameter of ReplaceStringPlaceholders: offsets[i]
// is compared against the position in the output where the substitution for
// placeholder $(i+1) lands, regardless of the order the placeholders appear
// in the format string.
TEST(StringUtilTest, GetStringFWithOffsets) {
  std::vector<string16> subst;
  subst.push_back(ASCIIToUTF16("1"));
  subst.push_back(ASCIIToUTF16("2"));
  std::vector<size_t> offsets;

  ReplaceStringPlaceholders(ASCIIToUTF16("Hello, $1. Your number is $2."),
                            subst,
                            &offsets);
  EXPECT_EQ(2U, offsets.size());
  EXPECT_EQ(7U, offsets[0]);
  EXPECT_EQ(25U, offsets[1]);
  offsets.clear();

  ReplaceStringPlaceholders(ASCIIToUTF16("Hello, $2. Your number is $1."),
                            subst,
                            &offsets);
  EXPECT_EQ(2U, offsets.size());
  EXPECT_EQ(25U, offsets[0]);
  EXPECT_EQ(7U, offsets[1]);
  offsets.clear();
}

TEST(StringUtilTest, ReplaceStringPlaceholdersTooFew) {
  // Test whether ReplaceStringPlaceholders works as expected when there
  // are fewer substitutions than placeholders: $4..$6 have no matching entry
  // and are expected to be replaced by nothing.
  std::vector<string16> subst;
  subst.push_back(ASCIIToUTF16("9a"));
  subst.push_back(ASCIIToUTF16("8b"));
  subst.push_back(ASCIIToUTF16("7c"));

  string16 formatted =
      ReplaceStringPlaceholders(
          ASCIIToUTF16("$1a,$2b,$3c,$4d,$5e,$6f,$1g,$2h,$3i"), subst, NULL);

  EXPECT_EQ(formatted, ASCIIToUTF16("9aa,8bb,7cc,d,e,f,9ag,8bh,7ci"));
}

// Nine substitutions, nine single-digit placeholders, no offsets requested.
TEST(StringUtilTest, ReplaceStringPlaceholders) {
  std::vector<string16> subst;
  subst.push_back(ASCIIToUTF16("9a"));
  subst.push_back(ASCIIToUTF16("8b"));
  subst.push_back(ASCIIToUTF16("7c"));
  subst.push_back(ASCIIToUTF16("6d"));
  subst.push_back(ASCIIToUTF16("5e"));
  subst.push_back(ASCIIToUTF16("4f"));
  subst.push_back(ASCIIToUTF16("3g"));
  subst.push_back(ASCIIToUTF16("2h"));
  subst.push_back(ASCIIToUTF16("1i"));

  string16 formatted =
      ReplaceStringPlaceholders(
          ASCIIToUTF16("$1a,$2b,$3c,$4d,$5e,$6f,$7g,$8h,$9i"), subst, NULL);

  EXPECT_EQ(formatted, ASCIIToUTF16("9aa,8bb,7cc,6dd,5ee,4ff,3gg,2hh,1ii"));
}

// Fourteen substitutions: two-digit placeholders ($10..$14) must be read as a
// whole number, and a trailing "$1" must still resolve to the first entry.
TEST(StringUtilTest, ReplaceStringPlaceholdersMoreThan9Replacements) {
  std::vector<string16> subst;
  subst.push_back(ASCIIToUTF16("9a"));
  subst.push_back(ASCIIToUTF16("8b"));
  subst.push_back(ASCIIToUTF16("7c"));
  subst.push_back(ASCIIToUTF16("6d"));
  subst.push_back(ASCIIToUTF16("5e"));
  subst.push_back(ASCIIToUTF16("4f"));
  subst.push_back(ASCIIToUTF16("3g"));
  subst.push_back(ASCIIToUTF16("2h"));
  subst.push_back(ASCIIToUTF16("1i"));
  subst.push_back(ASCIIToUTF16("0j"));
  subst.push_back(ASCIIToUTF16("-1k"));
  subst.push_back(ASCIIToUTF16("-2l"));
  subst.push_back(ASCIIToUTF16("-3m"));
  subst.push_back(ASCIIToUTF16("-4n"));

  string16 formatted =
      ReplaceStringPlaceholders(
          ASCIIToUTF16("$1a,$2b,$3c,$4d,$5e,$6f,$7g,$8h,$9i,"
                       "$10j,$11k,$12l,$13m,$14n,$1"), subst, NULL);

  EXPECT_EQ(formatted, ASCIIToUTF16("9aa,8bb,7cc,6dd,5ee,4ff,3gg,2hh,"
                                    "1ii,0jj,-1kk,-2ll,-3mm,-4nn,9a"));
}

// Same nine-placeholder case as ReplaceStringPlaceholders, using the
// std::string overload.
TEST(StringUtilTest, StdStringReplaceStringPlaceholders) {
  std::vector<std::string> subst;
  subst.push_back("9a");
  subst.push_back("8b");
  subst.push_back("7c");
  subst.push_back("6d");
  subst.push_back("5e");
  subst.push_back("4f");
  subst.push_back("3g");
  subst.push_back("2h");
  subst.push_back("1i");

  std::string formatted =
      ReplaceStringPlaceholders(
          "$1a,$2b,$3c,$4d,$5e,$6f,$7g,$8h,$9i", subst, NULL);

  EXPECT_EQ(formatted, "9aa,8bb,7cc,6dd,5ee,4ff,3gg,2hh,1ii");
}

// A run of N dollar signs followed by a digit is expected to come out as
// N-1 literal dollar signs followed by that digit, with no substitution.
TEST(StringUtilTest, ReplaceStringPlaceholdersConsecutiveDollarSigns) {
  std::vector<std::string> subst;
  subst.push_back("a");
  subst.push_back("b");
  subst.push_back("c");
  EXPECT_EQ(ReplaceStringPlaceholders("$$1 $$$2 $$$$3", subst, NULL),
            "$1 $$2 $$$3");
}

// Wildcard matching with '*', '?' and backslash escapes, on UTF-8 and UTF-16
// inputs.
TEST(StringUtilTest, MatchPatternTest) {
  EXPECT_TRUE(MatchPattern("www.google.com", "*.com"));
  EXPECT_TRUE(MatchPattern("www.google.com", "*"));
  EXPECT_FALSE(MatchPattern("www.google.com", "www*.g*.org"));
  EXPECT_TRUE(MatchPattern("Hello", "H?l?o"));
  EXPECT_FALSE(MatchPattern("www.google.com", "http://*)"));
  EXPECT_FALSE(MatchPattern("www.msn.com", "*.COM"));
  EXPECT_TRUE(MatchPattern("Hello*1234", "He??o\\*1*"));
  EXPECT_FALSE(MatchPattern("", "*.*"));
  EXPECT_TRUE(MatchPattern("", "*"));
  EXPECT_TRUE(MatchPattern("", "?"));
  EXPECT_TRUE(MatchPattern("", ""));
  EXPECT_FALSE(MatchPattern("Hello", ""));
  EXPECT_TRUE(MatchPattern("Hello*", "Hello*"));
  // Stop after a certain recursion depth.
  EXPECT_FALSE(MatchPattern("123456789012345678", "?????????????????*"));

  // Test UTF8 matching.
  EXPECT_TRUE(MatchPattern("heart: \xe2\x99\xa0", "*\xe2\x99\xa0"));
  EXPECT_TRUE(MatchPattern("heart: \xe2\x99\xa0.", "heart: ?."));
  EXPECT_TRUE(MatchPattern("hearts: \xe2\x99\xa0\xe2\x99\xa0", "*"));
  // Invalid sequences should be handled as a single invalid character.
  EXPECT_TRUE(MatchPattern("invalid: \xef\xbf\xbe", "invalid: ?"));
  // If the pattern has invalid characters, it shouldn't match anything.
  EXPECT_FALSE(MatchPattern("\xf4\x90\x80\x80", "\xf4\x90\x80\x80"));

  // Test UTF16 character matching.
  EXPECT_TRUE(MatchPattern(UTF8ToUTF16("www.google.com"),
                           UTF8ToUTF16("*.com")));
  EXPECT_TRUE(MatchPattern(UTF8ToUTF16("Hello*1234"),
                           UTF8ToUTF16("He??o\\*1*")));

  // This test verifies that consecutive wildcards are collapsed into 1
  // wildcard (when this doesn't occur, MatchPattern reaches its maximum
  // recursion depth).
  EXPECT_TRUE(MatchPattern(UTF8ToUTF16("Hello"),
                           UTF8ToUTF16("He********************************o")));
}

// base::strlcpy / base::wcslcpy: every case expects the return value 7 (the
// length of the source "abcdefg") and checks what ended up in the
// destination, including its terminator.
TEST(StringUtilTest, LcpyTest) {
  // Test the normal case where we fit in our buffer.
  {
    char dst[10];
    wchar_t wdst[10];
    EXPECT_EQ(7U, base::strlcpy(dst, "abcdefg", arraysize(dst)));
    EXPECT_EQ(0, memcmp(dst, "abcdefg", 8));
    EXPECT_EQ(7U, base::wcslcpy(wdst, L"abcdefg", arraysize(wdst)));
    EXPECT_EQ(0, memcmp(wdst, L"abcdefg", sizeof(wchar_t) * 8));
  }

  // Test dst_size == 0, nothing should be written to |dst| and we should
  // have the equivalent of strlen(src).
  {
    char dst[2] = {1, 2};
    wchar_t wdst[2] = {1, 2};
    EXPECT_EQ(7U, base::strlcpy(dst, "abcdefg", 0));
    EXPECT_EQ(1, dst[0]);
    EXPECT_EQ(2, dst[1]);
    EXPECT_EQ(7U, base::wcslcpy(wdst, L"abcdefg", 0));
    // The expected literals switch signedness with wchar_t so that the
    // EXPECT_EQ operands have matching signedness on either kind of platform.
#if defined(WCHAR_T_IS_UNSIGNED)
    EXPECT_EQ(1U, wdst[0]);
    EXPECT_EQ(2U, wdst[1]);
#else
    EXPECT_EQ(1, wdst[0]);
    EXPECT_EQ(2, wdst[1]);
#endif
  }

  // Test the case where we _just_ completely fit including the null.
  {
    char dst[8];
    wchar_t wdst[8];
    EXPECT_EQ(7U, base::strlcpy(dst, "abcdefg", arraysize(dst)));
    EXPECT_EQ(0, memcmp(dst, "abcdefg", 8));
    EXPECT_EQ(7U, base::wcslcpy(wdst, L"abcdefg", arraysize(wdst)));
    EXPECT_EQ(0, memcmp(wdst, L"abcdefg", sizeof(wchar_t) * 8));
  }

  // Test the case where we are one smaller, so we can't fit the null.
  {
    char dst[7];
    wchar_t wdst[7];
    EXPECT_EQ(7U, base::strlcpy(dst, "abcdefg", arraysize(dst)));
    EXPECT_EQ(0, memcmp(dst, "abcdef", 7));
    EXPECT_EQ(7U, base::wcslcpy(wdst, L"abcdefg", arraysize(wdst)));
    EXPECT_EQ(0, memcmp(wdst, L"abcdef", sizeof(wchar_t) * 7));
  }

  // Test the case where we are just too small.
  {
    char dst[3];
    wchar_t wdst[3];
    EXPECT_EQ(7U, base::strlcpy(dst, "abcdefg", arraysize(dst)));
    EXPECT_EQ(0, memcmp(dst, "ab", 3));
    EXPECT_EQ(7U, base::wcslcpy(wdst, L"abcdefg", arraysize(wdst)));
    EXPECT_EQ(0, memcmp(wdst, L"ab", sizeof(wchar_t) * 3));
  }
}

// Table-driven check of base::IsWprintfFormatPortable: each row pairs a wide
// format string with the expected verdict.
TEST(StringUtilTest, WprintfFormatPortabilityTest) {
  static const struct {
    const wchar_t* input;
    bool portable;
  } cases[] = {
    { L"%ls", true },
    { L"%s", false },
    { L"%S", false },
    { L"%lS", false },
    { L"Hello, %s", false },
    { L"%lc", true },
    { L"%c", false },
    { L"%C", false },
    { L"%lC", false },
    { L"%ls %s", false },
    { L"%s %ls", false },
    { L"%s %ls %s", false },
    { L"%f", true },
    { L"%f %F", false },
    { L"%d %D", false },
    { L"%o %O", false },
    { L"%u %U", false },
    { L"%f %d %o %u", true },
    { L"%-8d (%02.1f%)", true },
    { L"% 10s", false },
    { L"% 10ls", true }
  };
  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(cases); ++i)
    EXPECT_EQ(cases[i].portable, base::IsWprintfFormatPortable(cases[i].input));
}

// RemoveChars is called with the same string as input and output; the return
// value is expected to be true only when something was removed.
TEST(StringUtilTest, RemoveChars) {
  const char* kRemoveChars = "-/+*";
  std::string input = "A-+bc/d!*";
  EXPECT_TRUE(RemoveChars(input, kRemoveChars, &input));
  EXPECT_EQ("Abcd!", input);

  // No characters match kRemoveChars.
  EXPECT_FALSE(RemoveChars(input, kRemoveChars, &input));
  EXPECT_EQ("Abcd!", input);

  // Empty string.
  input.clear();
  EXPECT_FALSE(RemoveChars(input, kRemoveChars, &input));
  EXPECT_EQ(std::string(), input);
}

// Table-driven check of ReplaceChars: every character of |input| that appears
// in |replace_chars| is expected to be replaced by the whole |replace_with|
// string; |result| is the expected return value.
TEST(StringUtilTest, ReplaceChars) {
  struct TestData {
    const char* input;
    const char* replace_chars;
    const char* replace_with;
    const char* output;
    bool result;
  } cases[] = {
    { "", "", "", "", false },
    { "test", "", "", "test", false },
    { "test", "", "!", "test", false },
    { "test", "z", "!", "test", false },
    { "test", "e", "!", "t!st", true },
    { "test", "e", "!?", "t!?st", true },
    { "test", "ez", "!", "t!st", true },
    { "test", "zed", "!?", "t!?st", true },
    { "test", "t", "!?", "!?es!?", true },
    { "test", "et", "!>", "!>!>s!>", true },
    { "test", "zest", "!", "!!!!", true },
    { "test", "szt", "!", "!e!!", true },
    { "test", "t", "test", "testestest", true },
  };

  for (size_t i = 0; i < ARRAYSIZE_UNSAFE(cases); ++i) {
    std::string output;
    bool result = ReplaceChars(cases[i].input,
                               cases[i].replace_chars,
                               cases[i].replace_with,
                               &output);
    EXPECT_EQ(cases[i].result, result);
    EXPECT_EQ(cases[i].output, output);
  }
}

TEST(StringUtilTest, ContainsOnlyChars) {
  // Providing an empty list of characters should return false but for the empty
  // string.
  EXPECT_TRUE(ContainsOnlyChars(std::string(), std::string()));
  EXPECT_FALSE(ContainsOnlyChars("Hello", std::string()));

  EXPECT_TRUE(ContainsOnlyChars(std::string(), "1234"));
  EXPECT_TRUE(ContainsOnlyChars("1", "1234"));
  EXPECT_TRUE(ContainsOnlyChars("1", "4321"));
  EXPECT_TRUE(ContainsOnlyChars("123", "4321"));
  EXPECT_FALSE(ContainsOnlyChars("123a", "4321"));
}

// Fixture providing a helper that copies up to |num_chars| characters of a
// fixed source string through WriteInto() and checks the resulting contents
// and size.
class WriteIntoTest : public testing::Test {
 protected:
  static void WritesCorrectly(size_t num_chars) {
    std::string buffer;
    char kOriginal[] = "supercali";
    // WriteInto is asked for num_chars + 1 so there is room for a terminator.
    strncpy(WriteInto(&buffer, num_chars + 1), kOriginal, num_chars);
    // Using std::string(buffer.c_str()) instead of |buffer| truncates the
    // string at the first \0.
    EXPECT_EQ(std::string(kOriginal,
                          std::min(num_chars, arraysize(kOriginal) - 1)),
              std::string(buffer.c_str()));
    EXPECT_EQ(num_chars, buffer.size());
  }
};

TEST_F(WriteIntoTest, WriteInto) {
  // Validate that WriteInto reserves enough space and
  // sizes a string correctly.
  WritesCorrectly(1);
  WritesCorrectly(2);
  WritesCorrectly(5000);

  // Validate that WriteInto doesn't modify other strings
  // when using a Copy-on-Write implementation.
  const char kLive[] = "live";
  const char kDead[] = "dead";
  const std::string live = kLive;
  std::string dead = live;
  strncpy(WriteInto(&dead, 5), kDead, 4);
  EXPECT_EQ(kDead, dead);
  EXPECT_EQ(4u, dead.size());
  EXPECT_EQ(kLive, live);
  EXPECT_EQ(4u, live.size());
}

}  // namespace base
diff --git a/base/strings/string_util_win.h b/base/strings/string_util_win.h
new file mode 100644
index 0000000..602ba27
--- /dev/null
+++ b/base/strings/string_util_win.h
@@ -0,0 +1,61 @@
// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
+ +#ifndef BASE_STRINGS_STRING_UTIL_WIN_H_ +#define BASE_STRINGS_STRING_UTIL_WIN_H_ + +#include <stdarg.h> +#include <stdio.h> +#include <string.h> +#include <wchar.h> + +#include "base/logging.h" + +namespace base { + +// Chromium code style is to not use malloc'd strings; this is only for use +// for interaction with APIs that require it. +inline char* strdup(const char* str) { + return _strdup(str); +} + +inline int strcasecmp(const char* s1, const char* s2) { + return _stricmp(s1, s2); +} + +inline int strncasecmp(const char* s1, const char* s2, size_t count) { + return _strnicmp(s1, s2, count); +} + +inline int strncmp16(const char16* s1, const char16* s2, size_t count) { + return ::wcsncmp(s1, s2, count); +} + +inline int vsnprintf(char* buffer, size_t size, + const char* format, va_list arguments) { + int length = _vsprintf_p(buffer, size, format, arguments); + if (length < 0) { + if (size > 0) + buffer[0] = 0; + return _vscprintf_p(format, arguments); + } + return length; +} + +inline int vswprintf(wchar_t* buffer, size_t size, + const wchar_t* format, va_list arguments) { + DCHECK(IsWprintfFormatPortable(format)); + + int length = _vswprintf_p(buffer, size, format, arguments); + if (length < 0) { + if (size > 0) + buffer[0] = 0; + return _vscwprintf_p(format, arguments); + } + return length; +} + +} // namespace base + +#endif // BASE_STRINGS_STRING_UTIL_WIN_H_ diff --git a/base/strings/stringprintf.cc b/base/strings/stringprintf.cc new file mode 100644 index 0000000..fe23daa --- /dev/null +++ b/base/strings/stringprintf.cc @@ -0,0 +1,186 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "base/strings/stringprintf.h" + +#include <errno.h> + +#include "base/scoped_clear_errno.h" +#include "base/strings/string_util.h" +#include "base/strings/utf_string_conversions.h" + +namespace base { + +namespace { + +// Overloaded wrappers around vsnprintf and vswprintf. The buf_size parameter +// is the size of the buffer. These return the number of characters in the +// formatted string excluding the NUL terminator. If the buffer is not +// large enough to accommodate the formatted string without truncation, they +// return the number of characters that would be in the fully-formatted string +// (vsnprintf, and vswprintf on Windows), or -1 (vswprintf on POSIX platforms). +inline int vsnprintfT(char* buffer, + size_t buf_size, + const char* format, + va_list argptr) { + return base::vsnprintf(buffer, buf_size, format, argptr); +} + +#if !defined(OS_ANDROID) +inline int vsnprintfT(wchar_t* buffer, + size_t buf_size, + const wchar_t* format, + va_list argptr) { + return base::vswprintf(buffer, buf_size, format, argptr); +} +#endif + +// Templatized backend for StringPrintF/StringAppendF. This does not finalize +// the va_list, the caller is expected to do that. +template <class StringType> +static void StringAppendVT(StringType* dst, + const typename StringType::value_type* format, + va_list ap) { + // First try with a small fixed size buffer. + // This buffer size should be kept in sync with StringUtilTest.GrowBoundary + // and StringUtilTest.StringPrintfBounds. + typename StringType::value_type stack_buf[1024]; + + va_list ap_copy; + GG_VA_COPY(ap_copy, ap); + +#if !defined(OS_WIN) + ScopedClearErrno clear_errno; +#endif + int result = vsnprintfT(stack_buf, arraysize(stack_buf), format, ap_copy); + va_end(ap_copy); + + if (result >= 0 && result < static_cast<int>(arraysize(stack_buf))) { + // It fit. + dst->append(stack_buf, result); + return; + } + + // Repeatedly increase buffer size until it fits. 
+ int mem_length = arraysize(stack_buf); + while (true) { + if (result < 0) { +#if !defined(OS_WIN) + // On Windows, vsnprintfT always returns the number of characters in a + // fully-formatted string, so if we reach this point, something else is + // wrong and no amount of buffer-doubling is going to fix it. + if (errno != 0 && errno != EOVERFLOW) +#endif + { + // If an error other than overflow occurred, it's never going to work. + DLOG(WARNING) << "Unable to printf the requested string due to error."; + return; + } + // Try doubling the buffer size. + mem_length *= 2; + } else { + // We need exactly "result + 1" characters. + mem_length = result + 1; + } + + if (mem_length > 32 * 1024 * 1024) { + // That should be plenty, don't try anything larger. This protects + // against huge allocations when using vsnprintfT implementations that + // return -1 for reasons other than overflow without setting errno. + DLOG(WARNING) << "Unable to printf the requested string due to size."; + return; + } + + std::vector<typename StringType::value_type> mem_buf(mem_length); + + // NOTE: You can only use a va_list once. Since we're in a while loop, we + // need to make a new copy each time so we don't use up the original. + GG_VA_COPY(ap_copy, ap); + result = vsnprintfT(&mem_buf[0], mem_length, format, ap_copy); + va_end(ap_copy); + + if ((result >= 0) && (result < mem_length)) { + // It fit. + dst->append(&mem_buf[0], result); + return; + } + } +} + +} // namespace + +std::string StringPrintf(const char* format, ...) { + va_list ap; + va_start(ap, format); + std::string result; + StringAppendV(&result, format, ap); + va_end(ap); + return result; +} + +#if !defined(OS_ANDROID) +std::wstring StringPrintf(const wchar_t* format, ...) 
{ + va_list ap; + va_start(ap, format); + std::wstring result; + StringAppendV(&result, format, ap); + va_end(ap); + return result; +} +#endif + +std::string StringPrintV(const char* format, va_list ap) { + std::string result; + StringAppendV(&result, format, ap); + return result; +} + +const std::string& SStringPrintf(std::string* dst, const char* format, ...) { + va_list ap; + va_start(ap, format); + dst->clear(); + StringAppendV(dst, format, ap); + va_end(ap); + return *dst; +} + +#if !defined(OS_ANDROID) +const std::wstring& SStringPrintf(std::wstring* dst, + const wchar_t* format, ...) { + va_list ap; + va_start(ap, format); + dst->clear(); + StringAppendV(dst, format, ap); + va_end(ap); + return *dst; +} +#endif + +void StringAppendF(std::string* dst, const char* format, ...) { + va_list ap; + va_start(ap, format); + StringAppendV(dst, format, ap); + va_end(ap); +} + +#if !defined(OS_ANDROID) +void StringAppendF(std::wstring* dst, const wchar_t* format, ...) { + va_list ap; + va_start(ap, format); + StringAppendV(dst, format, ap); + va_end(ap); +} +#endif + +void StringAppendV(std::string* dst, const char* format, va_list ap) { + StringAppendVT(dst, format, ap); +} + +#if !defined(OS_ANDROID) +void StringAppendV(std::wstring* dst, const wchar_t* format, va_list ap) { + StringAppendVT(dst, format, ap); +} +#endif + +} // namespace base diff --git a/base/strings/stringprintf.h b/base/strings/stringprintf.h new file mode 100644 index 0000000..3c0e399 --- /dev/null +++ b/base/strings/stringprintf.h @@ -0,0 +1,62 @@ +// Copyright 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef BASE_STRINGS_STRINGPRINTF_H_ +#define BASE_STRINGS_STRINGPRINTF_H_ + +#include <stdarg.h> // va_list + +#include <string> + +#include "base/base_export.h" +#include "base/compiler_specific.h" + +namespace base { + +// Return a C++ string given printf-like input. 
BASE_EXPORT std::string StringPrintf(const char* format, ...)
    PRINTF_FORMAT(1, 2);
// OS_ANDROID's libc does not support wchar_t, so several overloads are omitted.
#if !defined(OS_ANDROID)
BASE_EXPORT std::wstring StringPrintf(const wchar_t* format, ...)
    WPRINTF_FORMAT(1, 2);
#endif

// Return a C++ string given vprintf-like input. |ap| is not finalized by this
// call; the caller remains responsible for va_end.
BASE_EXPORT std::string StringPrintV(const char* format, va_list ap)
    PRINTF_FORMAT(1, 0);

// Store result into a supplied string and return it. Any previous contents of
// |dst| are discarded first.
BASE_EXPORT const std::string& SStringPrintf(std::string* dst,
                                             const char* format, ...)
    PRINTF_FORMAT(2, 3);
#if !defined(OS_ANDROID)
BASE_EXPORT const std::wstring& SStringPrintf(std::wstring* dst,
                                              const wchar_t* format, ...)
    WPRINTF_FORMAT(2, 3);
#endif

// Append result to a supplied string. Existing contents of |dst| are kept.
BASE_EXPORT void StringAppendF(std::string* dst, const char* format, ...)
    PRINTF_FORMAT(2, 3);
#if !defined(OS_ANDROID)
// TODO(evanm): this is only used in a few places in the code;
// replace with string16 version.
BASE_EXPORT void StringAppendF(std::wstring* dst, const wchar_t* format, ...)
    WPRINTF_FORMAT(2, 3);
#endif

// Lower-level routine that takes a va_list and appends to a specified
// string.  All other routines are just convenience wrappers around it.
// |ap| is not finalized by this call; the caller remains responsible for
// va_end.
BASE_EXPORT void StringAppendV(std::string* dst, const char* format, va_list ap)
    PRINTF_FORMAT(2, 0);
#if !defined(OS_ANDROID)
BASE_EXPORT void StringAppendV(std::wstring* dst,
                               const wchar_t* format, va_list ap)
    WPRINTF_FORMAT(2, 0);
#endif

}  // namespace base

#endif  // BASE_STRINGS_STRINGPRINTF_H_
diff --git a/base/strings/stringprintf_unittest.cc b/base/strings/stringprintf_unittest.cc
new file mode 100644
index 0000000..a1bf2da
--- /dev/null
+++ b/base/strings/stringprintf_unittest.cc
@@ -0,0 +1,188 @@
// Copyright 2013 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
+ +#include "base/strings/stringprintf.h" + +#include <errno.h> + +#include "base/basictypes.h" +#include "testing/gtest/include/gtest/gtest.h" + +namespace base { + +namespace { + +// A helper for the StringAppendV test that follows. +// +// Just forwards its args to StringAppendV. +static void StringAppendVTestHelper(std::string* out, const char* format, ...) { + va_list ap; + va_start(ap, format); + StringAppendV(out, format, ap); + va_end(ap); +} + +} // namespace + +TEST(StringPrintfTest, StringPrintfEmpty) { + EXPECT_EQ("", StringPrintf("%s", "")); +} + +TEST(StringPrintfTest, StringPrintfMisc) { + EXPECT_EQ("123hello w", StringPrintf("%3d%2s %1c", 123, "hello", 'w')); +#if !defined(OS_ANDROID) + EXPECT_EQ(L"123hello w", StringPrintf(L"%3d%2ls %1lc", 123, L"hello", 'w')); +#endif +} + +TEST(StringPrintfTest, StringAppendfEmptyString) { + std::string value("Hello"); + StringAppendF(&value, "%s", ""); + EXPECT_EQ("Hello", value); + +#if !defined(OS_ANDROID) + std::wstring valuew(L"Hello"); + StringAppendF(&valuew, L"%ls", L""); + EXPECT_EQ(L"Hello", valuew); +#endif +} + +TEST(StringPrintfTest, StringAppendfString) { + std::string value("Hello"); + StringAppendF(&value, " %s", "World"); + EXPECT_EQ("Hello World", value); + +#if !defined(OS_ANDROID) + std::wstring valuew(L"Hello"); + StringAppendF(&valuew, L" %ls", L"World"); + EXPECT_EQ(L"Hello World", valuew); +#endif +} + +TEST(StringPrintfTest, StringAppendfInt) { + std::string value("Hello"); + StringAppendF(&value, " %d", 123); + EXPECT_EQ("Hello 123", value); + +#if !defined(OS_ANDROID) + std::wstring valuew(L"Hello"); + StringAppendF(&valuew, L" %d", 123); + EXPECT_EQ(L"Hello 123", valuew); +#endif +} + +// Make sure that lengths exactly around the initial buffer size are handled +// correctly. 
+TEST(StringPrintfTest, StringPrintfBounds) { + const int kSrcLen = 1026; + char src[kSrcLen]; + for (size_t i = 0; i < arraysize(src); i++) + src[i] = 'A'; + + wchar_t srcw[kSrcLen]; + for (size_t i = 0; i < arraysize(srcw); i++) + srcw[i] = 'A'; + + for (int i = 1; i < 3; i++) { + src[kSrcLen - i] = 0; + std::string out; + SStringPrintf(&out, "%s", src); + EXPECT_STREQ(src, out.c_str()); + +#if !defined(OS_ANDROID) + srcw[kSrcLen - i] = 0; + std::wstring outw; + SStringPrintf(&outw, L"%ls", srcw); + EXPECT_STREQ(srcw, outw.c_str()); +#endif + } +} + +// Test very large sprintfs that will cause the buffer to grow. +TEST(StringPrintfTest, Grow) { + char src[1026]; + for (size_t i = 0; i < arraysize(src); i++) + src[i] = 'A'; + src[1025] = 0; + + const char* fmt = "%sB%sB%sB%sB%sB%sB%s"; + + std::string out; + SStringPrintf(&out, fmt, src, src, src, src, src, src, src); + + const int kRefSize = 320000; + char* ref = new char[kRefSize]; +#if defined(OS_WIN) + sprintf_s(ref, kRefSize, fmt, src, src, src, src, src, src, src); +#elif defined(OS_POSIX) + snprintf(ref, kRefSize, fmt, src, src, src, src, src, src, src); +#endif + + EXPECT_STREQ(ref, out.c_str()); + delete[] ref; +} + +TEST(StringPrintfTest, StringAppendV) { + std::string out; + StringAppendVTestHelper(&out, "%d foo %s", 1, "bar"); + EXPECT_EQ("1 foo bar", out); +} + +// Test the boundary condition for the size of the string_util's +// internal buffer. +TEST(StringPrintfTest, GrowBoundary) { + const int string_util_buf_len = 1024; + // Our buffer should be one larger than the size of StringAppendVT's stack + // buffer. + const int buf_len = string_util_buf_len + 1; + char src[buf_len + 1]; // Need extra one for NULL-terminator. + for (int i = 0; i < buf_len; ++i) + src[i] = 'a'; + src[buf_len] = 0; + + std::string out; + SStringPrintf(&out, "%s", src); + + EXPECT_STREQ(src, out.c_str()); +} + +// TODO(evanm): what's the proper cross-platform test here? 
+#if defined(OS_WIN) +// sprintf in Visual Studio fails when given U+FFFF. This tests that the +// failure case is gracefuly handled. +TEST(StringPrintfTest, Invalid) { + wchar_t invalid[2]; + invalid[0] = 0xffff; + invalid[1] = 0; + + std::wstring out; + SStringPrintf(&out, L"%ls", invalid); + EXPECT_STREQ(L"", out.c_str()); +} +#endif + +// Test that the positional parameters work. +TEST(StringPrintfTest, PositionalParameters) { + std::string out; + SStringPrintf(&out, "%1$s %1$s", "test"); + EXPECT_STREQ("test test", out.c_str()); + +#if defined(OS_WIN) + std::wstring wout; + SStringPrintf(&wout, L"%1$ls %1$ls", L"test"); + EXPECT_STREQ(L"test test", wout.c_str()); +#endif +} + +// Test that StringPrintf and StringAppendV do not change errno. +TEST(StringPrintfTest, StringPrintfErrno) { + errno = 1; + EXPECT_EQ("", StringPrintf("%s", "")); + EXPECT_EQ(1, errno); + std::string out; + StringAppendVTestHelper(&out, "%d foo %s", 1, "bar"); + EXPECT_EQ(1, errno); +} + +} // namespace base |