// Copyright 2014 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. // Create a state machine for validating UTF-8. The algorithm in brief: // 1. Convert the complete unicode range of code points, except for the // surrogate code points, to an ordered array of sequences of bytes in // UTF-8. // 2. Convert individual bytes to ranges, starting from the right of each byte // sequence. For each range, ensure the bytes on the left and the ranges // on the right are the identical. // 3. Convert the resulting list of ranges into a state machine, collapsing // identical states. // 4. Convert the state machine to an array of bytes. // 5. Output as a C++ file. // // To use: // $ ninja -C out/Release build_utf8_validator_tables // $ out/Release/build_utf8_validator_tables // --output=base/i18n/utf8_validator_tables.cc // $ git add base/i18n/utf8_validator_tables.cc // // Because the table is not expected to ever change, it is checked into the // repository rather than being regenerated at build time. // // This code uses type uint8_t throughout to represent bytes, to avoid // signed/unsigned char confusion. #include #include #include #include #include #include #include #include #include #include "base/command_line.h" #include "base/files/file_path.h" #include "base/files/file_util.h" #include "base/logging.h" #include "base/macros.h" #include "base/numerics/safe_conversions.h" #include "base/strings/stringprintf.h" #include "third_party/icu/source/common/unicode/utf8.h" namespace { const char kHelpText[] = "Usage: build_utf8_validator_tables [ --help ] [ --output= ]\n"; const char kProlog[] = "// Copyright 2013 The Chromium Authors. All rights reserved.\n" "// Use of this source code is governed by a BSD-style license that can " "be\n" "// found in the LICENSE file.\n" "\n" "// This file is auto-generated by build_utf8_validator_tables.\n" "// DO NOT EDIT.\n" "\n" "#include \"base/i18n/utf8_validator_tables.h\"\n" "\n" "namespace base {\n" "namespace internal {\n" "\n" "const uint8_t kUtf8ValidatorTables[] = {\n"; const char kEpilog[] = "};\n" "\n" "const size_t kUtf8ValidatorTablesSize = arraysize(kUtf8ValidatorTables);\n" "\n" "} // namespace internal\n" "} // namespace base\n"; // Ranges are inclusive at both ends--they represent [from, to] class Range { public: // Ranges always start with just one byte. explicit Range(uint8_t value) : from_(value), to_(value) {} // Range objects are copyable and assignable to be used in STL // containers. Since they only contain non-pointer POD types, the default copy // constructor, assignment operator and destructor will work. // Add a byte to the range. We intentionally only support adding a byte at the // end, since that is the only operation the code needs. void AddByte(uint8_t to) { CHECK(to == to_ + 1); to_ = to; } uint8_t from() const { return from_; } uint8_t to() const { return to_; } bool operator<(const Range& rhs) const { return (from() < rhs.from() || (from() == rhs.from() && to() < rhs.to())); } bool operator==(const Range& rhs) const { return from() == rhs.from() && to() == rhs.to(); } private: uint8_t from_; uint8_t to_; }; // A vector of Ranges is like a simple regular expression--it corresponds to // a set of strings of the same length that have bytes in each position in // the appropriate range. typedef std::vector StringSet; // A UTF-8 "character" is represented by a sequence of bytes. typedef std::vector Character; // In the second stage of the algorithm, we want to convert a large list of // Characters into a small list of StringSets. struct Pair { Character character; StringSet set; }; typedef std::vector PairVector; // A class to print a table of numbers in the same style as clang-format. class TablePrinter { public: explicit TablePrinter(FILE* stream) : stream_(stream), values_on_this_line_(0), current_offset_(0) {} void PrintValue(uint8_t value) { if (values_on_this_line_ == 0) { fputs(" ", stream_); } else if (values_on_this_line_ == kMaxValuesPerLine) { fprintf(stream_, " // 0x%02x\n ", current_offset_); values_on_this_line_ = 0; } fprintf(stream_, " 0x%02x,", static_cast(value)); ++values_on_this_line_; ++current_offset_; } void NewLine() { while (values_on_this_line_ < kMaxValuesPerLine) { fputs(" ", stream_); ++values_on_this_line_; } fprintf(stream_, " // 0x%02x\n", current_offset_); values_on_this_line_ = 0; } private: // stdio stream. Not owned. FILE* stream_; // Number of values so far printed on this line. int values_on_this_line_; // Total values printed so far. int current_offset_; static const int kMaxValuesPerLine = 8; DISALLOW_COPY_AND_ASSIGN(TablePrinter); }; // Start by filling a PairVector with characters. The resulting vector goes from // "\x00" to "\xf4\x8f\xbf\xbf". PairVector InitializeCharacters() { PairVector vector; for (int i = 0; i <= 0x10FFFF; ++i) { if (i >= 0xD800 && i < 0xE000) { // Surrogate codepoints are not permitted. Non-character code points are // explicitly permitted. continue; } uint8_t bytes[4]; unsigned int offset = 0; UBool is_error = false; U8_APPEND(bytes, offset, arraysize(bytes), i, is_error); DCHECK(!is_error); DCHECK_GT(offset, 0u); DCHECK_LE(offset, arraysize(bytes)); Pair pair = {Character(bytes, bytes + offset), StringSet()}; vector.push_back(pair); } return vector; } // Construct a new Pair from |character| and the concatenation of |new_range| // and |existing_set|, and append it to |pairs|. void ConstructPairAndAppend(const Character& character, const Range& new_range, const StringSet& existing_set, PairVector* pairs) { Pair new_pair = {character, StringSet(1, new_range)}; new_pair.set.insert( new_pair.set.end(), existing_set.begin(), existing_set.end()); pairs->push_back(new_pair); } // Each pass over the PairVector strips one byte off the right-hand-side of the // characters and adds a range to the set on the right. For example, the first // pass converts the range from "\xe0\xa0\x80" to "\xe0\xa0\xbf" to ("\xe0\xa0", // [\x80-\xbf]), then the second pass converts the range from ("\xe0\xa0", // [\x80-\xbf]) to ("\xe0\xbf", [\x80-\xbf]) to ("\xe0", // [\xa0-\xbf][\x80-\xbf]). void MoveRightMostCharToSet(PairVector* pairs) { PairVector new_pairs; PairVector::const_iterator it = pairs->begin(); while (it != pairs->end() && it->character.empty()) { new_pairs.push_back(*it); ++it; } CHECK(it != pairs->end()); Character unconverted_bytes(it->character.begin(), it->character.end() - 1); Range new_range(it->character.back()); StringSet converted = it->set; ++it; while (it != pairs->end()) { const Pair& current_pair = *it++; if (current_pair.character.size() == unconverted_bytes.size() + 1 && std::equal(unconverted_bytes.begin(), unconverted_bytes.end(), current_pair.character.begin()) && converted == current_pair.set) { // The particular set of UTF-8 codepoints we are validating guarantees // that each byte range will be contiguous. This would not necessarily be // true for an arbitrary set of UTF-8 codepoints. DCHECK_EQ(new_range.to() + 1, current_pair.character.back()); new_range.AddByte(current_pair.character.back()); continue; } ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs); unconverted_bytes = Character(current_pair.character.begin(), current_pair.character.end() - 1); new_range = Range(current_pair.character.back()); converted = current_pair.set; } ConstructPairAndAppend(unconverted_bytes, new_range, converted, &new_pairs); new_pairs.swap(*pairs); } void MoveAllCharsToSets(PairVector* pairs) { // Since each pass of the function moves one character, and UTF-8 sequences // are at most 4 characters long, this simply runs the algorithm four times. for (int i = 0; i < 4; ++i) { MoveRightMostCharToSet(pairs); } #if DCHECK_IS_ON() for (PairVector::const_iterator it = pairs->begin(); it != pairs->end(); ++it) { DCHECK(it->character.empty()); } #endif } // Logs the generated string sets in regular-expression style, ie. [\x00-\x7f], // [\xc2-\xdf][\x80-\xbf], etc. This can be a useful sanity-check that the // algorithm is working. Use the command-line option // --vmodule=build_utf8_validator_tables=1 to see this output. void LogStringSets(const PairVector& pairs) { for (PairVector::const_iterator pair_it = pairs.begin(); pair_it != pairs.end(); ++pair_it) { std::string set_as_string; for (StringSet::const_iterator set_it = pair_it->set.begin(); set_it != pair_it->set.end(); ++set_it) { set_as_string += base::StringPrintf("[\\x%02x-\\x%02x]", static_cast(set_it->from()), static_cast(set_it->to())); } VLOG(1) << set_as_string; } } // A single state in the state machine is represented by a sorted vector of // start bytes and target states. All input bytes in the range between the start // byte and the next entry in the vector (or 0xFF) result in a transition to the // target state. struct StateRange { uint8_t from; uint8_t target_state; }; typedef std::vector State; // Generates a state where all bytes go to state 1 (invalid). This is also used // as an initialiser for other states (since bytes from outside the desired // range are invalid). State GenerateInvalidState() { const StateRange range = {0, 1}; return State(1, range); } // A map from a state (ie. a set of strings which will match from this state) to // a number (which is an index into the array of states). typedef std::map StateMap; // Create a new state corresponding to |set|, add it |states| and |state_map| // and return the index it was given in |states|. uint8_t MakeState(const StringSet& set, std::vector* states, StateMap* state_map) { DCHECK(!set.empty()); const Range& range = set.front(); const StringSet rest(set.begin() + 1, set.end()); const StateMap::const_iterator where = state_map->find(rest); const uint8_t target_state = where == state_map->end() ? MakeState(rest, states, state_map) : where->second; DCHECK_LT(0, range.from()); DCHECK_LT(range.to(), 0xFF); const StateRange new_state_initializer[] = { {0, 1}, {range.from(), target_state}, {static_cast(range.to() + 1), 1}}; states->push_back( State(new_state_initializer, new_state_initializer + arraysize(new_state_initializer))); const uint8_t new_state_number = base::checked_cast(states->size() - 1); CHECK(state_map->insert(std::make_pair(set, new_state_number)).second); return new_state_number; } std::vector GenerateStates(const PairVector& pairs) { // States 0 and 1 are the initial/valid state and invalid state, respectively. std::vector states(2, GenerateInvalidState()); StateMap state_map; state_map.insert(std::make_pair(StringSet(), 0)); for (PairVector::const_iterator it = pairs.begin(); it != pairs.end(); ++it) { DCHECK(it->character.empty()); DCHECK(!it->set.empty()); const Range& range = it->set.front(); const StringSet rest(it->set.begin() + 1, it->set.end()); const StateMap::const_iterator where = state_map.find(rest); const uint8_t target_state = where == state_map.end() ? MakeState(rest, &states, &state_map) : where->second; if (states[0].back().from == range.from()) { DCHECK_EQ(1, states[0].back().target_state); states[0].back().target_state = target_state; DCHECK_LT(range.to(), 0xFF); const StateRange new_range = {static_cast(range.to() + 1), 1}; states[0].push_back(new_range); } else { DCHECK_LT(range.to(), 0xFF); const StateRange new_range_initializer[] = { {range.from(), target_state}, {static_cast(range.to() + 1), 1}}; states[0] .insert(states[0].end(), new_range_initializer, new_range_initializer + arraysize(new_range_initializer)); } } return states; } // Output the generated states as a C++ table. Two tricks are used to compact // the table: each state in the table starts with a shift value which indicates // how many bits we can discard from the right-hand-side of the byte before // doing the table lookup. Secondly, only the state-transitions for bytes // with the top-bit set are included in the table; bytes without the top-bit set // are just ASCII and are handled directly by the code. void PrintStates(const std::vector& states, FILE* stream) { // First calculate the start-offset of each state. This allows the state // machine to jump directly to the correct offset, avoiding an extra // indirection. State 0 starts at offset 0. std::vector state_offset(1, 0); std::vector shifts; uint8_t pos = 0; for (std::vector::const_iterator state_it = states.begin(); state_it != states.end(); ++state_it) { // We want to set |shift| to the (0-based) index of the least-significant // set bit in any of the ranges for this state, since this tells us how many // bits we can discard and still determine what range a byte lies in. Sadly // it appears that ffs() is not portable, so we do it clumsily. uint8_t shift = 7; for (State::const_iterator range_it = state_it->begin(); range_it != state_it->end(); ++range_it) { while (shift > 0 && range_it->from % (1 << shift) != 0) { --shift; } } shifts.push_back(shift); pos += 1 + (1 << (7 - shift)); state_offset.push_back(pos); } DCHECK_EQ(129, state_offset[1]); fputs(kProlog, stream); TablePrinter table_printer(stream); for (uint8_t state_index = 0; state_index < states.size(); ++state_index) { const uint8_t shift = shifts[state_index]; uint8_t next_range = 0; uint8_t target_state = 1; fprintf(stream, " // State %d, offset 0x%02x\n", static_cast(state_index), static_cast(state_offset[state_index])); table_printer.PrintValue(shift); for (int i = 0; i < 0x100; i += (1 << shift)) { if (next_range < states[state_index].size() && states[state_index][next_range].from == i) { target_state = states[state_index][next_range].target_state; ++next_range; } if (i >= 0x80) { table_printer.PrintValue(state_offset[target_state]); } } table_printer.NewLine(); } fputs(kEpilog, stream); } } // namespace int main(int argc, char* argv[]) { base::CommandLine::Init(argc, argv); logging::LoggingSettings settings; settings.logging_dest = logging::LOG_TO_SYSTEM_DEBUG_LOG; logging::InitLogging(settings); if (base::CommandLine::ForCurrentProcess()->HasSwitch("help")) { fwrite(kHelpText, 1, arraysize(kHelpText), stdout); exit(EXIT_SUCCESS); } base::FilePath filename = base::CommandLine::ForCurrentProcess()->GetSwitchValuePath("output"); FILE* output = stdout; if (!filename.empty()) { output = base::OpenFile(filename, "wb"); if (!output) PLOG(FATAL) << "Couldn't open '" << filename.AsUTF8Unsafe() << "' for writing"; } // Step 1: Enumerate the characters PairVector pairs = InitializeCharacters(); // Step 2: Convert to sets. MoveAllCharsToSets(&pairs); if (VLOG_IS_ON(1)) { LogStringSets(pairs); } // Step 3: Generate states. std::vector states = GenerateStates(pairs); // Step 4/5: Print output PrintStates(states, output); if (!filename.empty()) { if (!base::CloseFile(output)) PLOG(FATAL) << "Couldn't finish writing '" << filename.AsUTF8Unsafe() << "'"; } return EXIT_SUCCESS; }