Move importer files into an importer subdirectory.

Also delete title chomper no one uses it. B=2205 Review URL: http://codereview.chromium.org/3035 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@2154 0039d316-1c4b-4281-b951-d872f2087c98
author: beng@google.com <beng@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2008-09-13 00:56:27 +0000
committer: beng@google.com <beng@google.com@0039d316-1c4b-4281-b951-d872f2087c98> 2008-09-13 00:56:27 +0000
commit: 6c9851a40d3f6280dc322c2392d9cfcf8fba1b2d (patch)
tree: 91072da4d7f80596bcc437e82685cf7de7944dfe /chrome/browser/importer/mork_reader.cc
parent: 231d5a36e476d013a91ca742bb8a0a2973cfee54 (diff)
download: chromium_src-6c9851a40d3f6280dc322c2392d9cfcf8fba1b2d.zip
chromium_src-6c9851a40d3f6280dc322c2392d9cfcf8fba1b2d.tar.gz
chromium_src-6c9851a40d3f6280dc322c2392d9cfcf8fba1b2d.tar.bz2
1 files changed, 581 insertions, 0 deletions
diff --git a/chrome/browser/importer/mork_reader.cc b/chrome/browser/importer/mork_reader.cc
new file mode 100644
index 0000000..3ae4b6c
--- /dev/null
+++ b/chrome/browser/importer/mork_reader.cc
@@ -0,0 +1,581 @@
+/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* ***** BEGIN LICENSE BLOCK *****
+ * Version: MPL 1.1/GPL 2.0/LGPL 2.1
+ *
+ * The contents of this file are subject to the Mozilla Public License Version
+ * 1.1 (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ * http://www.mozilla.org/MPL/
+ *
+ * Software distributed under the License is distributed on an "AS IS" basis,
+ * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
+ * for the specific language governing rights and limitations under the
+ * License.
+ *
+ * The Original Code is the Mork Reader.
+ *
+ * The Initial Developer of the Original Code is
+ * Google Inc.
+ * Portions created by the Initial Developer are Copyright (C) 2006
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *   Brian Ryner <bryner@brianryner.com> (original author)
+ *
+ * Alternatively, the contents of this file may be used under the terms of
+ * either the GNU General Public License Version 2 or later (the "GPL"), or
+ * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
+ * in which case the provisions of the GPL or the LGPL are applicable instead
+ * of those above. If you wish to allow use of your version of this file only
+ * under the terms of either the GPL or the LGPL, and not to allow others to
+ * use your version of this file under the terms of the MPL, indicate your
+ * decision by deleting the provisions above and replace them with the notice
+ * and other provisions required by the GPL or the LGPL. If you do not delete
+ * the provisions above, a recipient may use your version of this file under
+ * the terms of any one of the MPL, the GPL or the LGPL.
+ *
+ * ***** END LICENSE BLOCK ***** */
+
+// Source:
+// http://mxr.mozilla.org/firefox/source/db/morkreader/nsMorkReader.cpp
+// This file has been converted to google style.
+
+#include "chrome/browser/importer/mork_reader.h"
+
+#include <algorithm>
+
+#include "base/logging.h"
+#include "base/string_util.h"
+#include "chrome/browser/history/history_types.h"
+#include "chrome/browser/importer/firefox_importer_utils.h"
+
+namespace {
+
+// Convert a hex character (0-9, A-F) to its corresponding byte value.
+// Returns -1 if the character is invalid.
+inline int HexCharToInt(char c) {
+  if ('0' <= c && c <= '9')
+    return c - '0';
+  if ('A' <= c && c <= 'F')
+    return c - 'A' + 10;
+  return -1;
+}
+
+// Unescape a Mork value.  Mork uses $xx escaping to encode non-ASCII
+// characters.  Additionally, '$' and '\' are backslash-escaped.
+// The result of the unescape is in returned.
+std::string MorkUnescape(const std::string& input) {
+  // We optimize for speed over space here -- size the result buffer to
+  // the size of the source, which is an upper bound on the size of the
+  // unescaped string.
+  std::string result;
+  size_t input_length = input.size();
+  result.reserve(input_length);
+
+  for (size_t i = 0; i < input_length; i++) {
+    char c = input[i];
+    if (c == '\\') {
+      // Escaped literal, slip the backslash, append the next character.
+      i++;
+      if (i < input_length)
+        result.push_back(input[i]);
+    } else if (c == '$') {
+      // Dollar sign denotes a hex character.
+      if (i < input_length - 2) {
+        // Would be nice to use ToInteger() here, but it currently
+        // requires a null-terminated string.
+        int first = HexCharToInt(input[++i]);
+        int second = HexCharToInt(input[++i]);
+        if (first >= 0 && second >= 0)
+          result.push_back((first << 4) | second);
+      }
+    } else {
+      // Regular character, just append.
+      result.push_back(input[i]);
+    }
+  }
+  return result;
+}
+
+}  // namespace
+
+MorkReader::MorkReader() {
+}
+
+MorkReader::~MorkReader() {
+  // Need to delete all the pointers to vectors we have in the table.
+  for (RowMap::iterator i = table_.begin(); i != table_.end(); ++i)
+    delete i->second;
+}
+
+bool MorkReader::Read(const std::wstring& filename) {
+  stream_.open(filename.c_str());
+  if (!stream_.is_open())
+    return false;
+
+  std::string line;
+  if (!ReadLine(&line) ||
+      line.compare("// <!-- <mdb:mork:z v=\"1.4\"/> -->") != 0)
+    return false;  // Unexpected file format.
+
+  IndexMap column_map;
+  while (ReadLine(&line)) {
+    // Trim off leading spaces
+    size_t idx = 0;
+    size_t len = line.size();
+    while (idx < len && line[idx] == ' ')
+      ++idx;
+    if (idx >= len)
+      continue;
+
+    // Look at the line to figure out what section type this is
+    if (StartsWithASCII(&line[idx], "< <(a=c)>", true)) {
+      // Column map.  We begin by creating a hash of column id to column name.
+      StringMap column_name_map;
+      ParseMap(line, idx, &column_name_map);
+
+      // Now that we have the list of columns, we put them into a flat array.
+      // Rows will have value arrays of the same size, with indexes that
+      // correspond to the columns array.  As we insert each column into the
+      // array, we also make an entry in columnMap so that we can look up the
+      // index given the column id.
+      columns_.reserve(column_name_map.size());
+
+      for (StringMap::const_iterator i = column_name_map.begin();
+           i != column_name_map.end(); ++i) {
+        column_map[i->first] = static_cast<int>(columns_.size());
+        MorkColumn col(i->first, i->second);
+        columns_.push_back(col);
+      }
+    } else if (StartsWithASCII(&line[idx], "<(", true)) {
+      // Value map.
+      ParseMap(line, idx, &value_map_);
+    } else if (line[idx] == '{' || line[idx] == '[') {
+      // Table / table row.
+      ParseTable(line, idx, &column_map);
+    } else {
+      // Don't know, hopefully don't care.
+    }
+  }
+  return true;
+}
+
+// Parses a key/value map of the form
+// <(k1=v1)(k2=v2)...>
+bool MorkReader::ParseMap(const std::string& first_line,
+                          size_t start_index,
+                          StringMap* map) {
+  // If the first line is the a=c line (column map), just skip over it.
+  std::string line(first_line);
+  if (StartsWithASCII(line, "< <(a=c)>", true))
+    ReadLine(&line);
+
+  std::string key;
+  do {
+    size_t idx = start_index;
+    size_t len = line.size();
+    size_t token_start;
+
+    while (idx < len) {
+      switch (line[idx++]) {
+        case '(':
+          // Beginning of a key/value pair.
+          if (!key.empty()) {
+            DLOG(WARNING) << "unterminated key/value pair?";
+            key.clear();
+          }
+
+          token_start = idx;
+          while (idx < len && line[idx] != '=')
+            ++idx;
+          key.assign(&line[token_start], idx - token_start);
+          break;
+
+        case '=': {
+          // Beginning of the value.
+          if (key.empty()) {
+            DLOG(WARNING) << "stray value";
+            break;
+          }
+
+          token_start = idx;
+          while (idx < len && line[idx] != ')') {
+            if (line[idx] == '\\')
+              ++idx;  // Skip escaped ')' characters.
+            ++idx;
+          }
+          size_t token_end = std::min(idx, len);
+          ++idx;
+
+          std::string value = MorkUnescape(
+              std::string(&line[token_start], token_end - token_start));
+          (*map)[key] = value;
+          key.clear();
+          break;
+        }
+        case '>':
+          // End of the map.
+          DLOG_IF(WARNING, key.empty()) <<
+              "map terminates inside of key/value pair";
+          return true;
+      }
+    }
+
+    // We should start reading the next line at the beginning.
+    start_index = 0;
+  } while (ReadLine(&line));
+
+  // We ran out of lines and the map never terminated.  This probably indicates
+  // a parsing error.
+  DLOG(WARNING) << "didn't find end of key/value map";
+  return false;
+}
+
+// Parses a table row of the form [123(^45^67)..]
+// (row id 123 has the value with id 67 for the column with id 45).
+// A '^' prefix for a column or value references an entry in the column or
+// value map.  '=' is used as the separator when the value is a literal.
+void MorkReader::ParseTable(const std::string& first_line,
+                            size_t start_index,
+                            const IndexMap* column_map) {
+  std::string line(first_line);
+
+  // Column index of the cell we're parsing, minus one if invalid.
+  int column_index = -1;
+
+  // Points to the current row we're parsing inside of the |table_|, will be
+  // NULL if we're not inside a row.
+  ColumnDataList* current_row = NULL;
+
+  bool in_meta_row = false;
+
+  do {
+    size_t idx = start_index;
+    size_t len = line.size();
+
+    while (idx < len) {
+      switch (line[idx++]) {
+        case '{':
+          // This marks the beginning of a table section.  There's a lot of
+          // junk before the first row that looks like cell values but isn't.
+          // Skip to the first '['.
+          while (idx < len && line[idx] != '[') {
+            if (line[idx] == '{') {
+              in_meta_row = true;  // The meta row is enclosed in { }
+            } else if (line[idx] == '}') {
+              in_meta_row = false;
+            }
+            ++idx;
+          }
+          break;
+
+        case '[': {
+          // Start of a new row.  Consume the row id, up to the first '('.
+          // Row edits also have a table namespace, separated from the row id
+          // by a colon.  We don't make use of the namespace, but we need to
+          // make sure not to consider it part of the row id.
+          if (current_row) {
+            DLOG(WARNING) << "unterminated row?";
+            current_row = NULL;
+          }
+
+          // Check for a '-' at the start of the id.  This signifies that
+          // if the row already exists, we should delete all columns from it
+          // before adding the new values.
+          bool cut_columns;
+          if (idx < len && line[idx] == '-') {
+            cut_columns = true;
+            ++idx;
+          } else {
+            cut_columns = false;
+          }
+
+          // Locate the range of the ID.
+          size_t token_start = idx;  // Index of the first char of the token.
+          while (idx < len &&
+                 line[idx] != '(' &&
+                 line[idx] != ']' &&
+                 line[idx] != ':') {
+            ++idx;
+          }
+          size_t token_end = idx;  // Index of the char following the token.
+          while (idx < len && line[idx] != '(' && line[idx] != ']') {
+            ++idx;
+          }
+
+          if (in_meta_row) {
+            // Need to create the meta row.
+            meta_row_.resize(columns_.size());
+            current_row = &meta_row_;
+          } else {
+            // Find or create the regular row for this.
+            IDString row_id(&line[token_start], token_end - token_start);
+            RowMap::iterator found_row = table_.find(row_id);
+            if (found_row == table_.end()) {
+              // We don't already have this row, create a new one for it.
+              current_row = new ColumnDataList(columns_.size());
+              table_[row_id] = current_row;
+            } else {
+              // The row already exists and we're adding/replacing things.
+              current_row = found_row->second;
+            }
+          }
+          if (cut_columns) {
+            for (size_t i = 0; i < current_row->size(); ++i)
+              (*current_row)[i].clear();
+          }
+          break;
+        }
+
+        case ']':
+          // We're done with the row.
+          current_row = NULL;
+          in_meta_row = false;
+          break;
+
+        case '(': {
+          if (!current_row) {
+            DLOG(WARNING) << "cell value outside of row";
+            break;
+          }
+
+          bool column_is_atom;
+          if (line[idx] == '^') {
+            column_is_atom = true;
+            ++idx;  // This is not part of the column id, advance past it.
+          } else {
+            column_is_atom = false;
+          }
+          size_t token_start = idx;
+          while (idx < len && line[idx] != '^' && line[idx] != '=') {
+            if (line[idx] == '\\')
+              ++idx;  // Skip escaped characters.
+            ++idx;
+          }
+
+          size_t token_end = std::min(idx, len);
+
+          IDString column;
+          if (column_is_atom)
+            column.assign(&line[token_start], token_end - token_start);
+          else
+            column = MorkUnescape(line.substr(token_start,
+                                              token_end - token_start));
+
+          IndexMap::const_iterator found_column = column_map->find(column);
+          if (found_column == column_map->end()) {
+            DLOG(WARNING) << "Column not in column map, discarding it";
+            column_index = -1;
+          } else {
+            column_index = found_column->second;
+          }
+          break;
+        }
+
+        case '=':
+        case '^': {
+          if (column_index == -1) {
+            DLOG(WARNING) << "stray ^ or = marker";
+            break;
+          }
+
+          bool value_is_atom = (line[idx - 1] == '^');
+          size_t token_start = idx - 1;  // Include the '=' or '^' marker.
+          while (idx < len && line[idx] != ')') {
+            if (line[idx] == '\\')
+              ++idx;  // Skip escaped characters.
+            ++idx;
+          }
+          size_t token_end = std::min(idx, len);
+          ++idx;
+
+          if (value_is_atom) {
+            (*current_row)[column_index].assign(&line[token_start],
+                                                token_end - token_start);
+          } else {
+            (*current_row)[column_index] =
+                MorkUnescape(line.substr(token_start, token_end - token_start));
+          }
+          column_index = -1;
+        }
+        break;
+      }
+    }
+
+    // Start parsing the next line at the beginning.
+    start_index = 0;
+  } while (current_row && ReadLine(&line));
+}
+
+bool MorkReader::ReadLine(std::string* line) {
+  line->resize(256);
+  std::getline(stream_, *line);
+  if (stream_.eof() || stream_.bad())
+    return false;
+
+  while (!line->empty() &&  (*line)[line->size() - 1] == '\\') {
+    // There is a continuation for this line.  Read it and append.
+    std::string new_line;
+    std::getline(stream_, new_line);
+    if (stream_.eof())
+      return false;
+    line->erase(line->size() - 1);
+    line->append(new_line);
+  }
+
+  return true;
+}
+
+void MorkReader::NormalizeValue(std::string* value) const {
+  if (value->empty())
+    return;
+  MorkReader::StringMap::const_iterator i;
+  switch (value->at(0)) {
+    case '^':
+      // Hex ID, lookup the name for it in the |value_map_|.
+      i = value_map_.find(value->substr(1));
+      if (i == value_map_.end())
+        value->clear();
+      else
+        *value = i->second;
+      break;
+    case '=':
+      // Just use the literal after the equals sign.
+      value->erase(value->begin());
+      break;
+    default:
+      // Anything else is invalid.
+      value->clear();
+      break;
+  }
+}
+
+// Source:
+// http://mxr.mozilla.org/firefox/source/toolkit/components/places/src/nsMorkHistoryImporter.cpp
+
+// Columns for entry (non-meta) history rows
+enum {
+  kURLColumn,
+  kNameColumn,
+  kVisitCountColumn,
+  kHiddenColumn,
+  kTypedColumn,
+  kLastVisitColumn,
+  kColumnCount  // Keep me last.
+};
+
+static const char * const gColumnNames[] = {
+  "URL", "Name", "VisitCount", "Hidden", "Typed", "LastVisitDate"
+};
+
+struct TableReadClosure {
+  explicit TableReadClosure(const MorkReader& r)
+      : reader(r),
+        swap_bytes(false),
+        byte_order_column(-1) {
+    for (int i = 0; i < kColumnCount; ++i)
+      column_indexes[i] = -1;
+  }
+
+  // Backpointers to the reader and history we're operating on.
+  const MorkReader& reader;
+
+  // Whether we need to swap bytes (file format is other-endian).
+  bool swap_bytes;
+
+  // Indexes of the columns that we care about.
+  int column_indexes[kColumnCount];
+  int byte_order_column;
+};
+
+void AddToHistory(MorkReader::ColumnDataList* column_values,
+                  const TableReadClosure& data,
+                  std::vector<history::URLRow>* rows) {
+  std::string values[kColumnCount];
+
+  for (size_t i = 0; i < kColumnCount; ++i) {
+    if (data.column_indexes[i] != -1) {
+      values[i] = column_values->at(data.column_indexes[i]);
+      data.reader.NormalizeValue(&values[i]);
+      // Do not import hidden records.
+      if (i == kHiddenColumn && values[i] == "1")
+        return;
+    }
+  }
+
+  GURL url(values[kURLColumn]);
+
+  if (CanImportURL(url)) {
+    history::URLRow row(url);
+
+    // title is really a UTF-16 string at this point
+    std::wstring title;
+    if (data.swap_bytes) {
+      CodepageToWide(values[kNameColumn], "UTF-16BE",
+                     OnStringUtilConversionError::SKIP, &title);
+    } else {
+      CodepageToWide(values[kNameColumn], "UTF-16LE",
+                     OnStringUtilConversionError::SKIP, &title);
+    }
+    row.set_title(title);
+
+    int count = atoi(values[kVisitCountColumn].c_str());
+    if (count == 0)
+      count = 1;
+    row.set_visit_count(count);
+
+    time_t date = StringToInt64(values[kLastVisitColumn]);
+    if (date != 0)
+      row.set_last_visit(Time::FromTimeT(date/1000000));
+
+    bool is_typed = (values[kTypedColumn] == "1");
+    if (is_typed)
+      row.set_typed_count(1);
+
+    rows->push_back(row);
+  }
+}
+
+// It sets up the file stream and loops over the lines in the file to
+// parse them, then adds the resulting row set to history.
+void ImportHistoryFromFirefox2(std::wstring file, MessageLoop* loop,
+                               ProfileWriter* writer) {
+  MorkReader reader;
+  reader.Read(file);
+
+  // Gather up the column ids so we don't need to find them on each row
+  TableReadClosure data(reader);
+  const MorkReader::MorkColumnList& columns = reader.columns();
+  for (size_t i = 0; i < columns.size(); ++i) {
+    for (int j = 0; j < kColumnCount; ++j)
+      if (columns[i].name == gColumnNames[j]) {
+        data.column_indexes[j] = static_cast<int>(i);
+        break;
+      }
+    if (columns[i].name == "ByteOrder")
+      data.byte_order_column = static_cast<int>(i);
+  }
+
+  // Determine the byte order from the table's meta-row.
+  const MorkReader::ColumnDataList& meta_row = reader.meta_row();
+  if (!meta_row.empty() && data.byte_order_column != -1) {
+    std::string byte_order = meta_row[data.byte_order_column];
+    if (!byte_order.empty()) {
+      // Note whether the file uses a non-native byte ordering.
+      // If it does, we'll have to swap bytes for PRUnichar values.
+      // "BE" and "LE" are the only recognized values, anything
+      // else is garbage and the file will be treated as native-endian
+      // (no swapping).
+      std::string byte_order_value(byte_order);
+      reader.NormalizeValue(&byte_order_value);
+      data.swap_bytes = (byte_order_value == "BE");
+    }
+  }
+
+  std::vector<history::URLRow> rows;
+  for (MorkReader::iterator i = reader.begin(); i != reader.end(); ++i)
+    AddToHistory(i->second, data, &rows);
+  if (!rows.empty())
+    loop->PostTask(FROM_HERE, NewRunnableMethod(writer,
+                   &ProfileWriter::AddHistoryPage, rows));
+}
author	beng@google.com <beng@google.com@0039d316-1c4b-4281-b951-d872f2087c98>	2008-09-13 00:56:27 +0000
committer	beng@google.com <beng@google.com@0039d316-1c4b-4281-b951-d872f2087c98>	2008-09-13 00:56:27 +0000
commit	6c9851a40d3f6280dc322c2392d9cfcf8fba1b2d (patch)
tree	91072da4d7f80596bcc437e82685cf7de7944dfe /chrome/browser/importer/mork_reader.cc
parent	231d5a36e476d013a91ca742bb8a0a2973cfee54 (diff)
download	chromium_src-6c9851a40d3f6280dc322c2392d9cfcf8fba1b2d.zip chromium_src-6c9851a40d3f6280dc322c2392d9cfcf8fba1b2d.tar.gz chromium_src-6c9851a40d3f6280dc322c2392d9cfcf8fba1b2d.tar.bz2