diff options
author | beng@google.com <beng@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-09-13 00:56:27 +0000 |
---|---|---|
committer | beng@google.com <beng@google.com@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-09-13 00:56:27 +0000 |
commit | 6c9851a40d3f6280dc322c2392d9cfcf8fba1b2d (patch) | |
tree | 91072da4d7f80596bcc437e82685cf7de7944dfe /chrome/browser/importer/mork_reader.cc | |
parent | 231d5a36e476d013a91ca742bb8a0a2973cfee54 (diff) | |
download | chromium_src-6c9851a40d3f6280dc322c2392d9cfcf8fba1b2d.zip chromium_src-6c9851a40d3f6280dc322c2392d9cfcf8fba1b2d.tar.gz chromium_src-6c9851a40d3f6280dc322c2392d9cfcf8fba1b2d.tar.bz2 |
Move importer files into an importer subdirectory.
Also delete title chomper no one uses it.
B=2205
Review URL: http://codereview.chromium.org/3035
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@2154 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/browser/importer/mork_reader.cc')
-rw-r--r-- | chrome/browser/importer/mork_reader.cc | 581 |
1 files changed, 581 insertions, 0 deletions
diff --git a/chrome/browser/importer/mork_reader.cc b/chrome/browser/importer/mork_reader.cc new file mode 100644 index 0000000..3ae4b6c --- /dev/null +++ b/chrome/browser/importer/mork_reader.cc @@ -0,0 +1,581 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* ***** BEGIN LICENSE BLOCK ***** + * Version: MPL 1.1/GPL 2.0/LGPL 2.1 + * + * The contents of this file are subject to the Mozilla Public License Version + * 1.1 (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * http://www.mozilla.org/MPL/ + * + * Software distributed under the License is distributed on an "AS IS" basis, + * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License + * for the specific language governing rights and limitations under the + * License. + * + * The Original Code is the Mork Reader. + * + * The Initial Developer of the Original Code is + * Google Inc. + * Portions created by the Initial Developer are Copyright (C) 2006 + * the Initial Developer. All Rights Reserved. + * + * Contributor(s): + * Brian Ryner <bryner@brianryner.com> (original author) + * + * Alternatively, the contents of this file may be used under the terms of + * either the GNU General Public License Version 2 or later (the "GPL"), or + * the GNU Lesser General Public License Version 2.1 or later (the "LGPL"), + * in which case the provisions of the GPL or the LGPL are applicable instead + * of those above. If you wish to allow use of your version of this file only + * under the terms of either the GPL or the LGPL, and not to allow others to + * use your version of this file under the terms of the MPL, indicate your + * decision by deleting the provisions above and replace them with the notice + * and other provisions required by the GPL or the LGPL. If you do not delete + * the provisions above, a recipient may use your version of this file under + * the terms of any one of the MPL, the GPL or the LGPL. + * + * ***** END LICENSE BLOCK ***** */ + +// Source: +// http://mxr.mozilla.org/firefox/source/db/morkreader/nsMorkReader.cpp +// This file has been converted to google style. + +#include "chrome/browser/importer/mork_reader.h" + +#include <algorithm> + +#include "base/logging.h" +#include "base/string_util.h" +#include "chrome/browser/history/history_types.h" +#include "chrome/browser/importer/firefox_importer_utils.h" + +namespace { + +// Convert a hex character (0-9, A-F) to its corresponding byte value. +// Returns -1 if the character is invalid. +inline int HexCharToInt(char c) { + if ('0' <= c && c <= '9') + return c - '0'; + if ('A' <= c && c <= 'F') + return c - 'A' + 10; + return -1; +} + +// Unescape a Mork value. Mork uses $xx escaping to encode non-ASCII +// characters. Additionally, '$' and '\' are backslash-escaped. +// The result of the unescape is in returned. +std::string MorkUnescape(const std::string& input) { + // We optimize for speed over space here -- size the result buffer to + // the size of the source, which is an upper bound on the size of the + // unescaped string. + std::string result; + size_t input_length = input.size(); + result.reserve(input_length); + + for (size_t i = 0; i < input_length; i++) { + char c = input[i]; + if (c == '\\') { + // Escaped literal, slip the backslash, append the next character. + i++; + if (i < input_length) + result.push_back(input[i]); + } else if (c == '$') { + // Dollar sign denotes a hex character. + if (i < input_length - 2) { + // Would be nice to use ToInteger() here, but it currently + // requires a null-terminated string. + int first = HexCharToInt(input[++i]); + int second = HexCharToInt(input[++i]); + if (first >= 0 && second >= 0) + result.push_back((first << 4) | second); + } + } else { + // Regular character, just append. + result.push_back(input[i]); + } + } + return result; +} + +} // namespace + +MorkReader::MorkReader() { +} + +MorkReader::~MorkReader() { + // Need to delete all the pointers to vectors we have in the table. + for (RowMap::iterator i = table_.begin(); i != table_.end(); ++i) + delete i->second; +} + +bool MorkReader::Read(const std::wstring& filename) { + stream_.open(filename.c_str()); + if (!stream_.is_open()) + return false; + + std::string line; + if (!ReadLine(&line) || + line.compare("// <!-- <mdb:mork:z v=\"1.4\"/> -->") != 0) + return false; // Unexpected file format. + + IndexMap column_map; + while (ReadLine(&line)) { + // Trim off leading spaces + size_t idx = 0; + size_t len = line.size(); + while (idx < len && line[idx] == ' ') + ++idx; + if (idx >= len) + continue; + + // Look at the line to figure out what section type this is + if (StartsWithASCII(&line[idx], "< <(a=c)>", true)) { + // Column map. We begin by creating a hash of column id to column name. + StringMap column_name_map; + ParseMap(line, idx, &column_name_map); + + // Now that we have the list of columns, we put them into a flat array. + // Rows will have value arrays of the same size, with indexes that + // correspond to the columns array. As we insert each column into the + // array, we also make an entry in columnMap so that we can look up the + // index given the column id. + columns_.reserve(column_name_map.size()); + + for (StringMap::const_iterator i = column_name_map.begin(); + i != column_name_map.end(); ++i) { + column_map[i->first] = static_cast<int>(columns_.size()); + MorkColumn col(i->first, i->second); + columns_.push_back(col); + } + } else if (StartsWithASCII(&line[idx], "<(", true)) { + // Value map. + ParseMap(line, idx, &value_map_); + } else if (line[idx] == '{' || line[idx] == '[') { + // Table / table row. + ParseTable(line, idx, &column_map); + } else { + // Don't know, hopefully don't care. + } + } + return true; +} + +// Parses a key/value map of the form +// <(k1=v1)(k2=v2)...> +bool MorkReader::ParseMap(const std::string& first_line, + size_t start_index, + StringMap* map) { + // If the first line is the a=c line (column map), just skip over it. + std::string line(first_line); + if (StartsWithASCII(line, "< <(a=c)>", true)) + ReadLine(&line); + + std::string key; + do { + size_t idx = start_index; + size_t len = line.size(); + size_t token_start; + + while (idx < len) { + switch (line[idx++]) { + case '(': + // Beginning of a key/value pair. + if (!key.empty()) { + DLOG(WARNING) << "unterminated key/value pair?"; + key.clear(); + } + + token_start = idx; + while (idx < len && line[idx] != '=') + ++idx; + key.assign(&line[token_start], idx - token_start); + break; + + case '=': { + // Beginning of the value. + if (key.empty()) { + DLOG(WARNING) << "stray value"; + break; + } + + token_start = idx; + while (idx < len && line[idx] != ')') { + if (line[idx] == '\\') + ++idx; // Skip escaped ')' characters. + ++idx; + } + size_t token_end = std::min(idx, len); + ++idx; + + std::string value = MorkUnescape( + std::string(&line[token_start], token_end - token_start)); + (*map)[key] = value; + key.clear(); + break; + } + case '>': + // End of the map. + DLOG_IF(WARNING, key.empty()) << + "map terminates inside of key/value pair"; + return true; + } + } + + // We should start reading the next line at the beginning. + start_index = 0; + } while (ReadLine(&line)); + + // We ran out of lines and the map never terminated. This probably indicates + // a parsing error. + DLOG(WARNING) << "didn't find end of key/value map"; + return false; +} + +// Parses a table row of the form [123(^45^67)..] +// (row id 123 has the value with id 67 for the column with id 45). +// A '^' prefix for a column or value references an entry in the column or +// value map. '=' is used as the separator when the value is a literal. +void MorkReader::ParseTable(const std::string& first_line, + size_t start_index, + const IndexMap* column_map) { + std::string line(first_line); + + // Column index of the cell we're parsing, minus one if invalid. + int column_index = -1; + + // Points to the current row we're parsing inside of the |table_|, will be + // NULL if we're not inside a row. + ColumnDataList* current_row = NULL; + + bool in_meta_row = false; + + do { + size_t idx = start_index; + size_t len = line.size(); + + while (idx < len) { + switch (line[idx++]) { + case '{': + // This marks the beginning of a table section. There's a lot of + // junk before the first row that looks like cell values but isn't. + // Skip to the first '['. + while (idx < len && line[idx] != '[') { + if (line[idx] == '{') { + in_meta_row = true; // The meta row is enclosed in { } + } else if (line[idx] == '}') { + in_meta_row = false; + } + ++idx; + } + break; + + case '[': { + // Start of a new row. Consume the row id, up to the first '('. + // Row edits also have a table namespace, separated from the row id + // by a colon. We don't make use of the namespace, but we need to + // make sure not to consider it part of the row id. + if (current_row) { + DLOG(WARNING) << "unterminated row?"; + current_row = NULL; + } + + // Check for a '-' at the start of the id. This signifies that + // if the row already exists, we should delete all columns from it + // before adding the new values. + bool cut_columns; + if (idx < len && line[idx] == '-') { + cut_columns = true; + ++idx; + } else { + cut_columns = false; + } + + // Locate the range of the ID. + size_t token_start = idx; // Index of the first char of the token. + while (idx < len && + line[idx] != '(' && + line[idx] != ']' && + line[idx] != ':') { + ++idx; + } + size_t token_end = idx; // Index of the char following the token. + while (idx < len && line[idx] != '(' && line[idx] != ']') { + ++idx; + } + + if (in_meta_row) { + // Need to create the meta row. + meta_row_.resize(columns_.size()); + current_row = &meta_row_; + } else { + // Find or create the regular row for this. + IDString row_id(&line[token_start], token_end - token_start); + RowMap::iterator found_row = table_.find(row_id); + if (found_row == table_.end()) { + // We don't already have this row, create a new one for it. + current_row = new ColumnDataList(columns_.size()); + table_[row_id] = current_row; + } else { + // The row already exists and we're adding/replacing things. + current_row = found_row->second; + } + } + if (cut_columns) { + for (size_t i = 0; i < current_row->size(); ++i) + (*current_row)[i].clear(); + } + break; + } + + case ']': + // We're done with the row. + current_row = NULL; + in_meta_row = false; + break; + + case '(': { + if (!current_row) { + DLOG(WARNING) << "cell value outside of row"; + break; + } + + bool column_is_atom; + if (line[idx] == '^') { + column_is_atom = true; + ++idx; // This is not part of the column id, advance past it. + } else { + column_is_atom = false; + } + size_t token_start = idx; + while (idx < len && line[idx] != '^' && line[idx] != '=') { + if (line[idx] == '\\') + ++idx; // Skip escaped characters. + ++idx; + } + + size_t token_end = std::min(idx, len); + + IDString column; + if (column_is_atom) + column.assign(&line[token_start], token_end - token_start); + else + column = MorkUnescape(line.substr(token_start, + token_end - token_start)); + + IndexMap::const_iterator found_column = column_map->find(column); + if (found_column == column_map->end()) { + DLOG(WARNING) << "Column not in column map, discarding it"; + column_index = -1; + } else { + column_index = found_column->second; + } + break; + } + + case '=': + case '^': { + if (column_index == -1) { + DLOG(WARNING) << "stray ^ or = marker"; + break; + } + + bool value_is_atom = (line[idx - 1] == '^'); + size_t token_start = idx - 1; // Include the '=' or '^' marker. + while (idx < len && line[idx] != ')') { + if (line[idx] == '\\') + ++idx; // Skip escaped characters. + ++idx; + } + size_t token_end = std::min(idx, len); + ++idx; + + if (value_is_atom) { + (*current_row)[column_index].assign(&line[token_start], + token_end - token_start); + } else { + (*current_row)[column_index] = + MorkUnescape(line.substr(token_start, token_end - token_start)); + } + column_index = -1; + } + break; + } + } + + // Start parsing the next line at the beginning. + start_index = 0; + } while (current_row && ReadLine(&line)); +} + +bool MorkReader::ReadLine(std::string* line) { + line->resize(256); + std::getline(stream_, *line); + if (stream_.eof() || stream_.bad()) + return false; + + while (!line->empty() && (*line)[line->size() - 1] == '\\') { + // There is a continuation for this line. Read it and append. + std::string new_line; + std::getline(stream_, new_line); + if (stream_.eof()) + return false; + line->erase(line->size() - 1); + line->append(new_line); + } + + return true; +} + +void MorkReader::NormalizeValue(std::string* value) const { + if (value->empty()) + return; + MorkReader::StringMap::const_iterator i; + switch (value->at(0)) { + case '^': + // Hex ID, lookup the name for it in the |value_map_|. + i = value_map_.find(value->substr(1)); + if (i == value_map_.end()) + value->clear(); + else + *value = i->second; + break; + case '=': + // Just use the literal after the equals sign. + value->erase(value->begin()); + break; + default: + // Anything else is invalid. + value->clear(); + break; + } +} + +// Source: +// http://mxr.mozilla.org/firefox/source/toolkit/components/places/src/nsMorkHistoryImporter.cpp + +// Columns for entry (non-meta) history rows +enum { + kURLColumn, + kNameColumn, + kVisitCountColumn, + kHiddenColumn, + kTypedColumn, + kLastVisitColumn, + kColumnCount // Keep me last. +}; + +static const char * const gColumnNames[] = { + "URL", "Name", "VisitCount", "Hidden", "Typed", "LastVisitDate" +}; + +struct TableReadClosure { + explicit TableReadClosure(const MorkReader& r) + : reader(r), + swap_bytes(false), + byte_order_column(-1) { + for (int i = 0; i < kColumnCount; ++i) + column_indexes[i] = -1; + } + + // Backpointers to the reader and history we're operating on. + const MorkReader& reader; + + // Whether we need to swap bytes (file format is other-endian). + bool swap_bytes; + + // Indexes of the columns that we care about. + int column_indexes[kColumnCount]; + int byte_order_column; +}; + +void AddToHistory(MorkReader::ColumnDataList* column_values, + const TableReadClosure& data, + std::vector<history::URLRow>* rows) { + std::string values[kColumnCount]; + + for (size_t i = 0; i < kColumnCount; ++i) { + if (data.column_indexes[i] != -1) { + values[i] = column_values->at(data.column_indexes[i]); + data.reader.NormalizeValue(&values[i]); + // Do not import hidden records. + if (i == kHiddenColumn && values[i] == "1") + return; + } + } + + GURL url(values[kURLColumn]); + + if (CanImportURL(url)) { + history::URLRow row(url); + + // title is really a UTF-16 string at this point + std::wstring title; + if (data.swap_bytes) { + CodepageToWide(values[kNameColumn], "UTF-16BE", + OnStringUtilConversionError::SKIP, &title); + } else { + CodepageToWide(values[kNameColumn], "UTF-16LE", + OnStringUtilConversionError::SKIP, &title); + } + row.set_title(title); + + int count = atoi(values[kVisitCountColumn].c_str()); + if (count == 0) + count = 1; + row.set_visit_count(count); + + time_t date = StringToInt64(values[kLastVisitColumn]); + if (date != 0) + row.set_last_visit(Time::FromTimeT(date/1000000)); + + bool is_typed = (values[kTypedColumn] == "1"); + if (is_typed) + row.set_typed_count(1); + + rows->push_back(row); + } +} + +// It sets up the file stream and loops over the lines in the file to +// parse them, then adds the resulting row set to history. +void ImportHistoryFromFirefox2(std::wstring file, MessageLoop* loop, + ProfileWriter* writer) { + MorkReader reader; + reader.Read(file); + + // Gather up the column ids so we don't need to find them on each row + TableReadClosure data(reader); + const MorkReader::MorkColumnList& columns = reader.columns(); + for (size_t i = 0; i < columns.size(); ++i) { + for (int j = 0; j < kColumnCount; ++j) + if (columns[i].name == gColumnNames[j]) { + data.column_indexes[j] = static_cast<int>(i); + break; + } + if (columns[i].name == "ByteOrder") + data.byte_order_column = static_cast<int>(i); + } + + // Determine the byte order from the table's meta-row. + const MorkReader::ColumnDataList& meta_row = reader.meta_row(); + if (!meta_row.empty() && data.byte_order_column != -1) { + std::string byte_order = meta_row[data.byte_order_column]; + if (!byte_order.empty()) { + // Note whether the file uses a non-native byte ordering. + // If it does, we'll have to swap bytes for PRUnichar values. + // "BE" and "LE" are the only recognized values, anything + // else is garbage and the file will be treated as native-endian + // (no swapping). + std::string byte_order_value(byte_order); + reader.NormalizeValue(&byte_order_value); + data.swap_bytes = (byte_order_value == "BE"); + } + } + + std::vector<history::URLRow> rows; + for (MorkReader::iterator i = reader.begin(); i != reader.end(); ++i) + AddToHistory(i->second, data, &rows); + if (!rows.empty()) + loop->PostTask(FROM_HERE, NewRunnableMethod(writer, + &ProfileWriter::AddHistoryPage, rows)); +} |