1 files changed, 309 insertions, 0 deletions
diff --git a/tools/gn/tokenizer.cc b/tools/gn/tokenizer.cc
new file mode 100644
index 0000000..971f56b
--- /dev/null
+++ b/tools/gn/tokenizer.cc
@@ -0,0 +1,309 @@
+// Copyright (c) 2013 The Chromium Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+#include "tools/gn/tokenizer.h"
+
+#include "base/logging.h"
+#include "tools/gn/input_file.h"
+
+namespace {
+
+// Returns true if |c| is an ASCII decimal digit, i.e. a character that may
+// continue an integer literal after its first character has been consumed.
+//
+// A '-' is deliberately not accepted. A minus sign is only meaningful as the
+// first character of a number, and the integer scanner consumes that first
+// character before consulting this helper. Accepting '-' here made input such
+// as "1-2" or "5-" scan as one single integer token.
+bool IsNumberChar(char c) {
+  return c >= '0' && c <= '9';
+}
+
+// Returns true if |c| is one of the characters that may open a two-character
+// operator: < > ! = - + | &
+bool CouldBeTwoCharOperatorBegin(char c) {
+  switch (c) {
+    case '<':
+    case '>':
+    case '!':
+    case '=':
+    case '-':
+    case '+':
+    case '|':
+    case '&':
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Returns true if |c| is one of the characters that may close a two-character
+// operator: = | &
+bool CouldBeTwoCharOperatorEnd(char c) {
+  switch (c) {
+    case '=':
+    case '|':
+    case '&':
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Returns true if |c| on its own may form an operator: = < > + ! : | & -
+bool CouldBeOneCharOperator(char c) {
+  switch (c) {
+    case '=':
+    case '<':
+    case '>':
+    case '+':
+    case '!':
+    case ':':
+    case '|':
+    case '&':
+    case '-':
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Returns true if |c| may start an operator of either length, i.e. it is
+// accepted by CouldBeOneCharOperator() or by CouldBeTwoCharOperatorBegin().
+bool CouldBeOperator(char c) {
+  if (CouldBeOneCharOperator(c))
+    return true;
+  return CouldBeTwoCharOperatorBegin(c);
+}
+
+// Returns true if |c| is a separator. The comma is the only separator.
+bool IsSeparatorChar(char c) {
+  const bool is_comma = (c == ',');
+  return is_comma;
+}
+
+// Returns true if |c| is an opening or closing parenthesis, square bracket or
+// curly brace.
+bool IsScoperChar(char c) {
+  switch (c) {
+    case '(':
+    case ')':
+    case '[':
+    case ']':
+    case '{':
+    case '}':
+      return true;
+    default:
+      return false;
+  }
+}
+
+}  // namespace
+
+// Sets up a tokenizer positioned at the very start of |input_file|'s contents.
+//
+// |input_file| is dereferenced here without a null check, so it must be
+// non-null. |err| is stored and later written through when tokenizing fails.
+// NOTE(review): both pointers are kept as members, so they presumably have to
+// outlive the tokenizer -- confirm against the header.
+Tokenizer::Tokenizer(const InputFile* input_file, Err* err)
+    : input_file_(input_file),
+      input_(input_file->contents()),
+      err_(err),
+      cur_(0),             // Offset of the current character within input_.
+      line_number_(1),     // Line numbers are 1-based.
+      char_in_line_(1) {   // Positions within a line are 1-based too.
+}
+
+// The destructor body is intentionally empty.
+Tokenizer::~Tokenizer() {}
+
+// static
+// Convenience entry point: builds a temporary Tokenizer over |input_file| and
+// returns whatever Run() produces. Failures are reported through |err|.
+std::vector<Token> Tokenizer::Tokenize(const InputFile* input_file, Err* err) {
+  return Tokenizer(input_file, err).Run();
+}
+
+// Scans the whole input and returns the tokens found, in source order.
+//
+// Comments are recognized but dropped from the result. If any error is
+// recorded in *err_, scanning stops and an empty vector is returned.
+std::vector<Token> Tokenizer::Run() {
+  std::vector<Token> result;
+  while (!done()) {
+    // Skip whitespace; stop if nothing but whitespace was left.
+    AdvanceToNextToken();
+    if (done())
+      break;
+
+    const Location token_start = GetCurrentLocation();
+    const Token::Type token_type = ClassifyCurrent();
+    if (token_type == Token::INVALID) {
+      *err_ = GetErrorForInvalidToken(token_start);
+      break;
+    }
+
+    // Remember where the token begins, then move the cursor past its end.
+    const size_t begin_offset = cur_;
+    AdvanceToEndOfToken(token_start, token_type);
+    if (has_error())
+      break;
+
+    // TODO(brettw) Comments are simply stripped from the token stream here.
+    // That is probably wrong: they should be removed at a later stage so that
+    // things like rewriting the file become possible. It keeps the parser
+    // simpler, though, and is OK for now.
+    if (token_type == Token::COMMENT)
+      continue;
+
+    const size_t token_length = cur_ - begin_offset;
+    result.push_back(Token(
+        token_start,
+        token_type,
+        base::StringPiece(input_.data() + begin_offset, token_length)));
+  }
+
+  // A partially tokenized file is of no use to the caller.
+  if (err_->has_error())
+    result.clear();
+  return result;
+}
+
+// static
+// Returns the byte offset in |buf| at which line |n| begins. Lines are
+// 1-based, so line 1 always begins at offset 0; any later line begins one
+// byte past the newline that ends the previous line (which may be equal to
+// buf.size() when that newline is the last byte).
+//
+// If |buf| has fewer than |n| lines, static_cast<size_t>(-1) is returned.
+size_t Tokenizer::ByteOffsetOfNthLine(const base::StringPiece& buf, int n) {
+  DCHECK(n > 0);
+
+  if (n == 1)
+    return 0;
+
+  int line = 1;
+  for (size_t offset = 0; offset < buf.size(); ++offset) {
+    // Every newline starts a new line; stop once the requested one is reached.
+    if (IsNewline(buf, offset) && ++line == n)
+      return offset + 1;
+  }
+  return static_cast<size_t>(-1);
+}
+
+// static
+// Returns true if the byte at |offset| in |buffer| is a line feed ('\n').
+// |offset| must be smaller than buffer.size().
+bool Tokenizer::IsNewline(const base::StringPiece& buffer, size_t offset) {
+  DCHECK(offset < buffer.size());
+  // Only '\n' is recognized; other line ending styles may need more logic.
+  const char candidate = buffer[offset];
+  return candidate == '\n';
+}
+
+
+// Skips over whitespace so that the cursor ends up on the first character of
+// the next token, or at the end of the input if only whitespace remained.
+void Tokenizer::AdvanceToNextToken() {
+  while (!at_end()) {
+    if (!IsCurrentWhitespace())
+      return;
+    Advance();
+  }
+}
+
+// Decides what kind of token starts at the current character, looking at most
+// one character ahead. Returns Token::INVALID if the character cannot start
+// any token. Must not be called at the end of the input.
+Token::Type Tokenizer::ClassifyCurrent() const {
+  DCHECK(!at_end());
+  const char c = cur_char();
+  const bool is_minus = (c == '-');
+
+  if (c >= '0' && c <= '9')
+    return Token::INTEGER;
+  if (c == '"')
+    return Token::STRING;
+
+  // '-' is an operator character as well, but it may also begin a negative
+  // number, so it is resolved separately at the bottom.
+  if (!is_minus && CouldBeOperator(c))
+    return Token::OPERATOR;
+
+  if (IsIdentifierFirstChar(c))
+    return Token::IDENTIFIER;
+  if (IsScoperChar(c))
+    return Token::SCOPER;
+  if (IsSeparatorChar(c))
+    return Token::SEPARATOR;
+  if (c == '#')
+    return Token::COMMENT;
+
+  if (!is_minus)
+    return Token::INVALID;
+
+  // A minus directly followed by a digit starts a negative number. In every
+  // other case, including a minus that is the last character of the input, it
+  // is an operator.
+  if (CanIncrement()) {
+    const char after = input_[cur_ + 1];
+    if (after >= '0' && after <= '9')
+      return Token::INTEGER;
+  }
+  return Token::OPERATOR;
+}
+
+// Moves the cursor from the first character of a token to one past its last
+// character.
+//
+// |location| is the position where the token starts; it is only used to build
+// the highlighted range of error messages. |type| is the classification of
+// the token and selects the scanning rule.
+//
+// On malformed input *err_ is set and scanning of the token stops right
+// there, so the first problem found is the one that gets reported.
+void Tokenizer::AdvanceToEndOfToken(const Location& location,
+                                    Token::Type type) {
+  switch (type) {
+    case Token::INTEGER:
+      // The first character (a digit or a leading '-') is consumed
+      // unconditionally; after that, continue for as long as IsNumberChar()
+      // accepts the current character.
+      do {
+        Advance();
+      } while (!at_end() && IsNumberChar(cur_char()));
+      if (!at_end()) {
+        // Require the char after a number to be some kind of space, scope,
+        // separator or operator.
+        char c = cur_char();
+        if (!IsCurrentWhitespace() && !CouldBeOperator(c) &&
+            !IsScoperChar(c) && !IsSeparatorChar(c)) {
+          *err_ = Err(GetCurrentLocation(),
+              "This is not a valid number.",
+              "Learn to count.");
+          // Highlight the number.
+          err_->AppendRange(LocationRange(location, GetCurrentLocation()));
+        }
+      }
+      break;
+
+    case Token::STRING: {
+      const char quote = cur_char();
+      Advance();  // Step over the opening quote.
+      for (;;) {
+        if (at_end()) {
+          *err_ = Err(LocationRange(location, GetCurrentLocation()),
+                      "Unterminated string literal.",
+                      "Don't leave me hanging like this!");
+          break;
+        }
+        if (IsCurrentStringTerminator(quote)) {
+          Advance();  // Step over the closing quote.
+          break;
+        }
+        if (IsCurrentNewline()) {
+          // Stop at the first newline. Scanning on would let a later newline
+          // or the end of the input overwrite this error with a less precise
+          // one.
+          *err_ = Err(LocationRange(location, GetCurrentLocation()),
+                      "Newline in string constant.");
+          break;
+        }
+        Advance();
+      }
+      break;
+    }
+
+    case Token::OPERATOR:
+      // Some operators are two characters, some are one. Consume an extra
+      // character when the current and the following one form a possible
+      // two-character operator.
+      if (CouldBeTwoCharOperatorBegin(cur_char())) {
+        if (CanIncrement() && CouldBeTwoCharOperatorEnd(input_[cur_ + 1]))
+          Advance();
+      }
+      Advance();
+      break;
+
+    case Token::IDENTIFIER:
+      while (!at_end() && IsIdentifierContinuingChar(cur_char()))
+        Advance();
+      break;
+
+    case Token::SCOPER:
+    case Token::SEPARATOR:
+      Advance();  // All are one char.
+      break;
+
+    case Token::COMMENT:
+      // Eat to EOL. The newline itself is not consumed.
+      while (!at_end() && !IsCurrentNewline())
+        Advance();
+      break;
+
+    case Token::INVALID:
+      // Callers are expected to filter out invalid tokens beforehand.
+      *err_ = Err(location, "Everything is all messed up",
+                  "Please insert system disk in drive A: and press any key.");
+      NOTREACHED();
+      return;
+  }
+}
+
+// Returns true if the current character counts as whitespace. Must not be
+// called at the end of the input.
+bool Tokenizer::IsCurrentWhitespace() const {
+  DCHECK(!at_end());
+  // Note that tab (0x09) is illegal and therefore not listed.
+  switch (input_[cur_]) {
+    case 0x0A:
+    case 0x0B:
+    case 0x0C:
+    case 0x0D:
+    case 0x20:
+      return true;
+    default:
+      return false;
+  }
+}
+
+// Returns true if the current character is an unescaped |quote_char|, i.e.
+// one that ends a string literal. Must not be called at the end of the input.
+bool Tokenizer::IsCurrentStringTerminator(char quote_char) const {
+  DCHECK(!at_end());
+  if (cur_char() != quote_char)
+    return false;
+
+  // Check for escaping. \" is not a string terminator, but \\" is. Count the
+  // number of preceding backslashes. The scan uses size_t throughout so that
+  // the offset is never narrowed to int.
+  size_t num_backslashes = 0;
+  for (size_t pos = cur_; pos > 0 && input_[pos - 1] == '\\'; --pos)
+    ++num_backslashes;
+
+  // Even backslashes mean that they were escaping each other and don't count
+  // as escaping this quote.
+  return (num_backslashes % 2) == 0;
+}
+
+// Returns true if the current character is a newline as defined by
+// IsNewline().
+bool Tokenizer::IsCurrentNewline() const {
+  const bool at_newline = IsNewline(input_, cur_);
+  return at_newline;
+}
+
+// Moves the cursor one character forward and keeps the line and column
+// bookkeeping in sync. Must not be called at the end of the input.
+void Tokenizer::Advance() {
+  DCHECK(cur_ < input_.size());
+
+  // Inspect the character being left before the cursor moves off it.
+  const bool leaving_newline = IsCurrentNewline();
+  cur_++;
+
+  if (!leaving_newline) {
+    char_in_line_++;
+    return;
+  }
+
+  // Stepping over a newline starts the next line at its first column.
+  line_number_++;
+  char_in_line_ = 1;
+}
+
+// Returns the cursor position as a Location made of the input file, the
+// current line number and the current character position within that line.
+Location Tokenizer::GetCurrentLocation() const {
+  Location here(input_file_, line_number_, char_in_line_);
+  return here;
+}
+
+// Builds the "Invalid token." error for the current character, which does not
+// start any token. |location| is the position attached to the error. The help
+// text depends on the offending character so that common mistakes (a
+// semicolon, a tab, a C-style comment) get a specific hint.
+Err Tokenizer::GetErrorForInvalidToken(const Location& location) const {
+  const char bad_char = cur_char();
+
+  // Start from the generic hint and refine it for recognizable mistakes.
+  std::string help = "I have no idea what this is.";
+  switch (bad_char) {
+    case ';':
+      help = "Semicolons are not needed, delete this one.";
+      break;
+
+    case '\t':
+      help = "You got a tab character in here. Tabs are evil. "
+             "Convert to spaces.";
+      break;
+
+    case '/':
+      // "//" and "/*" look like comments from other languages.
+      if (cur_ + 1 < input_.size()) {
+        const char after = input_[cur_ + 1];
+        if (after == '/' || after == '*')
+          help = "Comments should start with # instead";
+      }
+      break;
+
+    default:
+      break;
+  }
+
+  return Err(location, "Invalid token.", help);
+}