diff options
Diffstat (limited to 'tools/gn/tokenizer.cc')
-rw-r--r-- | tools/gn/tokenizer.cc | 309 |
1 files changed, 309 insertions, 0 deletions
diff --git a/tools/gn/tokenizer.cc b/tools/gn/tokenizer.cc new file mode 100644 index 0000000..971f56b --- /dev/null +++ b/tools/gn/tokenizer.cc @@ -0,0 +1,309 @@ +// Copyright (c) 2013 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "tools/gn/tokenizer.h" + +#include "base/logging.h" +#include "tools/gn/input_file.h" + +namespace { + +bool IsNumberChar(char c) { + return c == '-' || (c >= '0' && c <= '9'); +} + +bool CouldBeTwoCharOperatorBegin(char c) { + return c == '<' || c == '>' || c == '!' || c == '=' || c == '-' || + c == '+' || c == '|' || c == '&'; +} + +bool CouldBeTwoCharOperatorEnd(char c) { + return c == '=' || c == '|' || c == '&'; +} + +bool CouldBeOneCharOperator(char c) { + return c == '=' || c == '<' || c == '>' || c == '+' || c == '!' || + c == ':' || c == '|' || c == '&' || c == '-'; +} + +bool CouldBeOperator(char c) { + return CouldBeOneCharOperator(c) || CouldBeTwoCharOperatorBegin(c); +} + +bool IsSeparatorChar(char c) { + return c == ','; +} + +bool IsScoperChar(char c) { + return c == '(' || c == ')' || c == '[' || c == ']' || c == '{' || c == '}'; +} + +} // namespace + +Tokenizer::Tokenizer(const InputFile* input_file, Err* err) + : input_file_(input_file), + input_(input_file->contents()), + err_(err), + cur_(0), + line_number_(1), + char_in_line_(1) { +} + +Tokenizer::~Tokenizer() { +} + +// static +std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) { + Tokenizer t(input_file, err); + return t.Run(); +} + +std::vector<Token> Tokenizer::Run() { + std::vector<Token> tokens; + while (!done()) { + AdvanceToNextToken(); + if (done()) + break; + Location location = GetCurrentLocation(); + + Token::Type type = ClassifyCurrent(); + if (type == Token::INVALID) { + *err_ = GetErrorForInvalidToken(location); + break; + } + size_t token_begin = cur_; + AdvanceToEndOfToken(location, type); + if (has_error()) + break; + size_t token_end = cur_; + + // TODO(brettw) This just strips comments from the token stream. This + // is probably wrong, they should be removed at a later stage so we can + // do things like rewrite the file. But this makes the parser simpler and + // is OK for now. + if (type != Token::COMMENT) { + tokens.push_back(Token( + location, + type, + base::StringPiece(&input_.data()[token_begin], + token_end - token_begin))); + } + } + if (err_->has_error()) + tokens.clear(); + return tokens; +} + +// static +size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) { + int cur_line = 1; + size_t cur_byte = 0; + + DCHECK(n > 0); + + if (n == 1) + return 0; + + while (cur_byte < buf.size()) { + if (IsNewline(buf, cur_byte)) { + cur_line++; + if (cur_line == n) + return cur_byte + 1; + } + cur_byte++; + } + return -1; +} + +// static +bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) { + DCHECK(offset < buffer.size()); + // We may need more logic here to handle different line ending styles. + return buffer[offset] == '\n'; +} + + +void Tokenizer::AdvanceToNextToken() { + while (!at_end() && IsCurrentWhitespace()) + Advance(); +} + +Token::Type Tokenizer::ClassifyCurrent() const { + DCHECK(!at_end()); + char next_char = cur_char(); + if (next_char >= '0' && next_char <= '9') + return Token::INTEGER; + if (next_char == '"') + return Token::STRING; + + // Note: '-' handled specially below. + if (next_char != '-' && CouldBeOperator(next_char)) + return Token::OPERATOR; + + if (IsIdentifierFirstChar(next_char)) + return Token::IDENTIFIER; + + if (IsScoperChar(next_char)) + return Token::SCOPER; + + if (IsSeparatorChar(next_char)) + return Token::SEPARATOR; + + if (next_char == '#') + return Token::COMMENT; + + // For the case of '-' differentiate between a negative number and anything + // else. + if (next_char == '-') { + if (!CanIncrement()) + return Token::OPERATOR; // Just the minus before end of file. + char following_char = input_[cur_ + 1]; + if (following_char >= '0' && following_char <= '9') + return Token::INTEGER; + return Token::OPERATOR; + } + + return Token::INVALID; +} + +void Tokenizer::AdvanceToEndOfToken(const Location& location, + Token::Type type) { + switch (type) { + case Token::INTEGER: + do { + Advance(); + } while (!at_end() && IsNumberChar(cur_char())); + if (!at_end()) { + // Require the char after a number to be some kind of space, scope, + // or operator. + char c = cur_char(); + if (!IsCurrentWhitespace() && !CouldBeOperator(c) && + !IsScoperChar(c) && !IsSeparatorChar(c)) { + *err_ = Err(GetCurrentLocation(), + "This is not a valid number.", + "Learn to count."); + // Highlight the number. + err_->AppendRange(LocationRange(location, GetCurrentLocation())); + } + } + break; + + case Token::STRING: { + char initial = cur_char(); + Advance(); // Advance past initial " + for (;;) { + if (at_end()) { + *err_ = Err(LocationRange(location, + Location(input_file_, line_number_, char_in_line_)), + "Unterminated string literal.", + "Don't leave me hanging like this!"); + break; + } + if (IsCurrentStringTerminator(initial)) { + Advance(); // Skip past last " + break; + } else if (cur_char() == '\n') { + *err_ = Err(LocationRange(location, + GetCurrentLocation()), + "Newline in string constant."); + } + Advance(); + } + break; + } + + case Token::OPERATOR: + // Some operators are two characters, some are one. + if (CouldBeTwoCharOperatorBegin(cur_char())) { + if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1])) + Advance(); + } + Advance(); + break; + + case Token::IDENTIFIER: + while (!at_end() && IsIdentifierContinuingChar(cur_char())) + Advance(); + break; + + case Token::SCOPER: + case Token::SEPARATOR: + Advance(); // All are one char. + break; + + case Token::COMMENT: + // Eat to EOL. + while (!at_end() && !IsCurrentNewline()) + Advance(); + break; + + case Token::INVALID: + *err_ = Err(location, "Everything is all messed up", + "Please insert system disk in drive A: and press any key."); + NOTREACHED(); + return; + } +} + +bool Tokenizer::IsCurrentWhitespace() const { + DCHECK(!at_end()); + char c = input_[cur_]; + // Note that tab (0x09) is illegal. + return c == 0x0A || c == 0x0B || c == 0x0C || c == 0x0D || c == 0x20; +} + +bool Tokenizer::IsCurrentStringTerminator(char quote_char) const { + DCHECK(!at_end()); + if (cur_char() != quote_char) + return false; + + // Check for escaping. \" is not a string terminator, but \\" is. Count + // the number of preceeding backslashes. + int num_backslashes = 0; + for (int i = static_cast<int>(cur_) - 1; i >= 0 && input_[i] == '\\'; i--) + num_backslashes++; + + // Even backslashes mean that they were escaping each other and don't count + // as escaping this quote. + return (num_backslashes % 2) == 0; +} + +bool Tokenizer::IsCurrentNewline() const { + return IsNewline(input_, cur_); +} + +void Tokenizer::Advance() { + DCHECK(cur_ < input_.size()); + if (IsCurrentNewline()) { + line_number_++; + char_in_line_ = 1; + } else { + char_in_line_++; + } + cur_++; +} + +Location Tokenizer::GetCurrentLocation() const { + return Location(input_file_, line_number_, char_in_line_); +} + +Err Tokenizer::GetErrorForInvalidToken(const Location& location) const { + std::string help; + if (cur_char() == ';') { + // Semicolon. + help = "Semicolons are not needed, delete this one."; + } else if (cur_char() == '\t') { + // Tab. + help = "You got a tab character in here. Tabs are evil. " + "Convert to spaces."; + } else if (cur_char() == '/' && cur_ + 1 < input_.size() && + (input_[cur_ + 1] == '/' || input_[cur_ + 1] == '*')) { + // Different types of comments. + help = "Comments should start with # instead"; + } else { + help = "I have no idea what this is."; + } + + return Err(location, "Invalid token.", help); +} |