// Copyright (c) 2013 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. #include "tools/gn/string_utils.h" #include #include #include "base/strings/string_number_conversions.h" #include "tools/gn/err.h" #include "tools/gn/input_file.h" #include "tools/gn/parser.h" #include "tools/gn/scope.h" #include "tools/gn/token.h" #include "tools/gn/tokenizer.h" #include "tools/gn/value.h" namespace { // Constructs an Err indicating a range inside a string. We assume that the // token has quotes around it that are not counted by the offset. Err ErrInsideStringToken(const Token& token, size_t offset, size_t size, const std::string& msg, const std::string& help = std::string()) { // The "+1" is skipping over the " at the beginning of the token. int int_offset = static_cast(offset); Location begin_loc(token.location().file(), token.location().line_number(), token.location().column_number() + int_offset + 1, token.location().byte() + int_offset + 1); Location end_loc( token.location().file(), token.location().line_number(), token.location().column_number() + int_offset + 1 + static_cast(size), token.location().byte() + int_offset + 1 + static_cast(size)); return Err(LocationRange(begin_loc, end_loc), msg, help); } // Notes about expression interpolation. This is based loosly on Dart but is // slightly less flexible. In Dart, seeing the ${ in a string is something // the toplevel parser knows about, and it will recurse into the block // treating it as a first-class {...} block. So even things like this work: // "hello ${"foo}"*2+"bar"}" => "hello foo}foo}bar" // (you can see it did not get confused by the nested strings or the nested "}" // inside the block). // // This is cool but complicates the parser for almost no benefit for this // non-general-purpose programming language. The main reason expressions are // supported here at all are to support "${scope.variable}" and "${list[0]}", // neither of which have any of these edge-cases. // // In this simplified approach, we search for the terminating '}' and execute // the result. This means we can't support any expressions with embedded '}' // or '"'. To keep people from getting confusing about what's supported and // what's not, only identifier and accessor expressions are allowed (neither // of these run into any of these edge-cases). bool AppendInterpolatedExpression(Scope* scope, const Token& token, const char* input, size_t begin_offset, size_t end_offset, std::string* output, Err* err) { SourceFile empty_source_file; // Prevent most vexing parse. InputFile input_file(empty_source_file); input_file.SetContents( std::string(&input[begin_offset], end_offset - begin_offset)); // Tokenize. std::vector tokens = Tokenizer::Tokenize(&input_file, err); if (err->has_error()) { // The error will point into our temporary buffer, rewrite it to refer // to the original token. This will make the location information less // precise, but generally there won't be complicated things in string // interpolations. *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset, err->message(), err->help_text()); return false; } // Parse. scoped_ptr node = Parser::ParseExpression(tokens, err); if (err->has_error()) { // Rewrite error as above. *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset, err->message(), err->help_text()); return false; } if (!(node->AsIdentifier() || node->AsAccessor())) { *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset, "Invalid string interpolation.", "The thing inside the ${} must be an identifier ${foo},\n" "a scope access ${foo.bar}, or a list access ${foo[0]}."); return false; } // Evaluate. Value result = node->Execute(scope, err); if (err->has_error()) { // Rewrite error as above. *err = ErrInsideStringToken(token, begin_offset, end_offset - begin_offset, err->message(), err->help_text()); return false; } output->append(result.ToString(false)); return true; } bool AppendInterpolatedIdentifier(Scope* scope, const Token& token, const char* input, size_t begin_offset, size_t end_offset, std::string* output, Err* err) { base::StringPiece identifier(&input[begin_offset], end_offset - begin_offset); const Value* value = scope->GetValue(identifier, true); if (!value) { // We assume the input points inside the token. *err = ErrInsideStringToken( token, identifier.data() - token.value().data() - 1, identifier.size(), "Undefined identifier in string expansion.", std::string("\"") + identifier + "\" is not currently in scope."); return false; } output->append(value->ToString(false)); return true; } // Handles string interpolations: $identifier and ${expression} // // |*i| is the index into |input| after the $. This will be updated to point to // the last character consumed on success. The token is the original string // to blame on failure. // // On failure, returns false and sets the error. On success, appends the // result of the interpolation to |*output|. bool AppendStringInterpolation(Scope* scope, const Token& token, const char* input, size_t size, size_t* i, std::string* output, Err* err) { size_t dollars_index = *i - 1; if (input[*i] == '{') { // Bracketed expression. (*i)++; size_t begin_offset = *i; // Find the closing } and check for non-identifier chars. Don't need to // bother checking for the more-restricted first character of an identifier // since the {} unambiguously denotes the range, and identifiers with // invalid names just won't be found later. bool has_non_ident_chars = false; while (*i < size && input[*i] != '}') { has_non_ident_chars |= Tokenizer::IsIdentifierContinuingChar(input[*i]); (*i)++; } if (*i == size) { *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index, "Unterminated ${..."); return false; } // In the common case, the thing inside the {} will actually be a // simple identifier. Avoid all the complicated parsing of accessors // in this case. if (!has_non_ident_chars) { return AppendInterpolatedIdentifier(scope, token, input, begin_offset, *i, output, err); } return AppendInterpolatedExpression(scope, token, input, begin_offset, *i, output, err); } // Simple identifier. // The first char of an identifier is more restricted. if (!Tokenizer::IsIdentifierFirstChar(input[*i])) { *err = ErrInsideStringToken( token, dollars_index, *i - dollars_index + 1, "$ not followed by an identifier char.", "It you want a literal $ use \"\\$\"."); return false; } size_t begin_offset = *i; (*i)++; // Find the first non-identifier char following the string. while (*i < size && Tokenizer::IsIdentifierContinuingChar(input[*i])) (*i)++; size_t end_offset = *i; (*i)--; // Back up to mark the last character consumed. return AppendInterpolatedIdentifier(scope, token, input, begin_offset, end_offset, output, err); } // Handles a hex literal: $0xFF // // |*i| is the index into |input| after the $. This will be updated to point to // the last character consumed on success. The token is the original string // to blame on failure. // // On failure, returns false and sets the error. On success, appends the // char with the given hex value to |*output|. bool AppendHexByte(Scope* scope, const Token& token, const char* input, size_t size, size_t* i, std::string* output, Err* err) { size_t dollars_index = *i - 1; // "$0" is already known to exist. if (*i + 3 >= size || input[*i + 1] != 'x' || !std::isxdigit(input[*i + 2]) || !std::isxdigit(input[*i + 3])) { *err = ErrInsideStringToken( token, dollars_index, *i - dollars_index + 1, "Invalid hex character. Hex values must look like 0xFF."); return false; } int value = 0; if (!base::HexStringToInt(base::StringPiece(&input[*i + 2], 2), &value)) { *err = ErrInsideStringToken(token, dollars_index, *i - dollars_index + 1, "Could not convert hex value."); return false; } *i += 3; output->push_back(value); return true; } } // namespace bool ExpandStringLiteral(Scope* scope, const Token& literal, Value* result, Err* err) { DCHECK(literal.type() == Token::STRING); DCHECK(literal.value().size() > 1); // Should include quotes. DCHECK(result->type() == Value::STRING); // Should be already set. // The token includes the surrounding quotes, so strip those off. const char* input = &literal.value().data()[1]; size_t size = literal.value().size() - 2; std::string& output = result->string_value(); output.reserve(size); for (size_t i = 0; i < size; i++) { if (input[i] == '\\') { if (i < size - 1) { switch (input[i + 1]) { case '\\': case '"': case '$': output.push_back(input[i + 1]); i++; continue; default: // Everything else has no meaning: pass the literal. break; } } output.push_back(input[i]); } else if (input[i] == '$') { i++; if (i == size) { *err = ErrInsideStringToken(literal, i - 1, 1, "$ at end of string.", "I was expecting an identifier, 0xFF, or {...} after the $."); return false; } if (input[i] == '0') { if (!AppendHexByte(scope, literal, input, size, &i, &output, err)) return false; } else if (!AppendStringInterpolation(scope, literal, input, size, &i, &output, err)) return false; } else { output.push_back(input[i]); } } return true; } size_t EditDistance(const base::StringPiece& s1, const base::StringPiece& s2, size_t max_edit_distance) { // The algorithm implemented below is the "classic" // dynamic-programming algorithm for computing the Levenshtein // distance, which is described here: // // http://en.wikipedia.org/wiki/Levenshtein_distance // // Although the algorithm is typically described using an m x n // array, only one row plus one element are used at a time, so this // implementation just keeps one vector for the row. To update one entry, // only the entries to the left, top, and top-left are needed. The left // entry is in row[x-1], the top entry is what's in row[x] from the last // iteration, and the top-left entry is stored in previous. size_t m = s1.size(); size_t n = s2.size(); std::vector row(n + 1); for (size_t i = 1; i <= n; ++i) row[i] = i; for (size_t y = 1; y <= m; ++y) { row[0] = y; size_t best_this_row = row[0]; size_t previous = y - 1; for (size_t x = 1; x <= n; ++x) { size_t old_row = row[x]; row[x] = std::min(previous + (s1[y - 1] == s2[x - 1] ? 0u : 1u), std::min(row[x - 1], row[x]) + 1u); previous = old_row; best_this_row = std::min(best_this_row, row[x]); } if (max_edit_distance && best_this_row > max_edit_distance) return max_edit_distance + 1; } return row[n]; } base::StringPiece SpellcheckString( const base::StringPiece& text, const std::vector& words) { const size_t kMaxValidEditDistance = 3u; size_t min_distance = kMaxValidEditDistance + 1u; base::StringPiece result; for (base::StringPiece word : words) { size_t distance = EditDistance(word, text, kMaxValidEditDistance); if (distance < min_distance) { min_distance = distance; result = word; } } return result; }