diff options
author | initial.commit <initial.commit@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-07-26 23:55:29 +0000 |
---|---|---|
committer | initial.commit <initial.commit@0039d316-1c4b-4281-b951-d872f2087c98> | 2008-07-26 23:55:29 +0000 |
commit | 09911bf300f1a419907a9412154760efd0b7abc3 (patch) | |
tree | f131325fb4e2ad12c6d3504ab75b16dd92facfed /chrome/browser/history/snippet.cc | |
parent | 586acc5fe142f498261f52c66862fa417c3d52d2 (diff) | |
download | chromium_src-09911bf300f1a419907a9412154760efd0b7abc3.zip chromium_src-09911bf300f1a419907a9412154760efd0b7abc3.tar.gz chromium_src-09911bf300f1a419907a9412154760efd0b7abc3.tar.bz2 |
Add chrome to the repository.
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@15 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'chrome/browser/history/snippet.cc')
-rw-r--r-- | chrome/browser/history/snippet.cc | 297 |
1 files changed, 297 insertions, 0 deletions
diff --git a/chrome/browser/history/snippet.cc b/chrome/browser/history/snippet.cc new file mode 100644 index 0000000..ab7cb40 --- /dev/null +++ b/chrome/browser/history/snippet.cc @@ -0,0 +1,297 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +#include "chrome/browser/history/snippet.h" + +#include <algorithm> + +#include "base/logging.h" +#include "base/scoped_ptr.h" +#include "base/string_util.h" +#include "unicode/brkiter.h" +#include "unicode/utext.h" +#include "unicode/utf8.h" + +namespace { + +bool PairFirstLessThan(const std::pair<int,int>& a, + const std::pair<int,int>& b) { + return a.first < b.first; +} + +// Combines all pairs after offset in match_positions that are contained +// or touch the pair at offset. +void CoalescePositionsFrom(size_t offset, + Snippet::MatchPositions* match_positions) { + DCHECK(offset < match_positions->size()); + std::pair<int,int>& pair((*match_positions)[offset]); + ++offset; + while (offset < match_positions->size() && + pair.second >= (*match_positions)[offset].first) { + pair.second = std::max(pair.second, (*match_positions)[offset].second); + match_positions->erase(match_positions->begin() + offset); + } +} + +// Makes sure there is a pair in match_positions that contains the specified +// range. This keeps the pairs ordered in match_positions by first, and makes +// sure none of the pairs in match_positions touch each other. +void AddMatch(int start, int end, Snippet::MatchPositions* match_positions) { + DCHECK(start < end && match_positions); + std::pair<int,int> pair(start, end); + if (match_positions->empty()) { + match_positions->push_back(pair); + return; + } + // There's at least one match. Find the position of the new match, + // potentially extending pairs around it. + Snippet::MatchPositions::iterator i = + std::lower_bound(match_positions->begin(), match_positions->end(), + pair, &PairFirstLessThan); + if (i != match_positions->end() && i->first == start) { + // Match not at the end and there is already a pair with the same + // start. + if (end > i->second) { + // New pair extends beyond existing pair. Extend existing pair and + // coalesce matches after it. + i->second = end; + CoalescePositionsFrom(i - match_positions->begin(), match_positions); + } // else case, new pair completely contained in existing pair, nothing + // to do. + } else if (i == match_positions->begin()) { + // Match at the beginning and the first pair doesn't have the same + // start. Insert new pair and coalesce matches after it. + match_positions->insert(i, pair); + CoalescePositionsFrom(0, match_positions); + } else { + // Not at the beginning (but may be at the end). + --i; + if (start <= i->second && end > i->second) { + // Previous element contains match. Extend it and coalesce. + i->second = end; + CoalescePositionsFrom(i - match_positions->begin(), match_positions); + } else if (end > i->second) { + // Region doesn't touch previous element. See if region touches current + // element. + ++i; + if (i == match_positions->end() || end < i->first) { + match_positions->insert(i, pair); + } else { + i->first = start; + i->second = end; + CoalescePositionsFrom(i - match_positions->begin(), match_positions); + } + } + } +} + +// Converts an index in a utf8 string into the index in the corresponding wide +// string and returns the wide index. This is intended to be called in a loop +// iterating through a utf8 string. +// +// utf8_string: the utf8 string. +// utf8_length: length of the utf8 string. +// offset: the utf8 offset to convert. +// utf8_pos: current offset in the utf8 string. This is modified and on return +// matches offset. +// wide_pos: current index in the wide string. This is the same as the return +// value. +int AdvanceAndReturnWidePos(const char* utf8_string, + int utf8_length, + int offset, + int* utf8_pos, + int* wide_pos) { + DCHECK(offset >= *utf8_pos && offset <= utf8_length); + + UChar32 wide_char; + while (*utf8_pos < offset) { + U8_NEXT(utf8_string, *utf8_pos, utf8_length, wide_char); + *wide_pos += (wide_char <= 0xFFFF) ? 1 : 2; + } + return *wide_pos; +} + +// Given a character break iterator over a UTF-8 string, set the iterator +// position to |*utf8_pos| and move by |count| characters. |count| can +// be either positive or negative. +void MoveByNGraphemes(BreakIterator* bi, int count, int* utf8_pos) { + // Ignore the return value. A side effect of the current position + // being set at or following |*utf8_pos| is exploited here. + // It's simpler than calling following(n) and then previous(). + // isBoundary() is not very fast, but should be good enough for the + // snippet generation. If not, revisit the way we scan in ComputeSnippet. + bi->isBoundary(*utf8_pos); + bi->next(count); + *utf8_pos = static_cast<int>(bi->current()); +} + +// The amount of context to include for a given hit. Note that it's counted +// in terms of graphemes rather than bytes. +const int kSnippetContext = 50; + +// Returns true if next match falls within a snippet window +// from the previous match. The window size is counted in terms +// of graphemes rather than bytes in UTF-8. +bool IsNextMatchWithinSnippetWindow(BreakIterator* bi, + int previous_match_end, + int next_match_start) { + // If it's within a window in terms of bytes, it's certain + // that it's within a window in terms of graphemes as well. + if (next_match_start < previous_match_end + kSnippetContext) + return true; + bi->isBoundary(previous_match_end); + // An alternative to this is to call |bi->next()| at most + // kSnippetContext times, compare |bi->current()| with |next_match_start| + // after each call and return early if possible. There are other + // heuristics to speed things up if necessary, but it's not likely that + // we need to bother. + bi->next(kSnippetContext); + int64_t current = bi->current(); + return (next_match_start < current || current == BreakIterator::DONE); +} + +} // namespace + +// static +void Snippet::ExtractMatchPositions(const std::string& offsets_str, + const std::string& column_num, + MatchPositions* match_positions) { + DCHECK(match_positions); + if (offsets_str.empty()) + return; + std::vector<std::string> offsets; + SplitString(offsets_str, ' ', &offsets); + // SQLite offsets are sets of four integers: + // column, query term, match offset, match length + // Matches within a string are marked by (start, end) pairs. + for (size_t i = 0; i < offsets.size() - 3; i += 4) { + if (offsets[i] != column_num) + continue; + const int start = atoi(offsets[i+2].c_str()); + const int end = start + atoi(offsets[i+3].c_str()); + AddMatch(start, end, match_positions); + } +} + +// static +void Snippet::ConvertMatchPositionsToWide( + const std::string& utf8_string, + Snippet::MatchPositions* match_positions) { + DCHECK(match_positions); + int utf8_pos = 0; + int wide_pos = 0; + const char* utf8_cstring = utf8_string.c_str(); + const int utf8_length = static_cast<int>(utf8_string.size()); + for (Snippet::MatchPositions::iterator i = match_positions->begin(); + i != match_positions->end(); ++i) { + i->first = AdvanceAndReturnWidePos(utf8_cstring, utf8_length, + i->first, &utf8_pos, &wide_pos); + i->second = + AdvanceAndReturnWidePos(utf8_cstring, utf8_length, i->second, &utf8_pos, + &wide_pos); + } +} + +void Snippet::ComputeSnippet(const MatchPositions& match_positions, + const std::string& document) { + // The length of snippets we try to produce. + // We can generate longer snippets but stop once we cross kSnippetMaxLength. + const size_t kSnippetMaxLength = 200; + + + const std::wstring kEllipsis = L" ... "; + + // Grab the size as an int to cut down on casts later. + const int document_size = static_cast<int>(document.size()); + + UText* document_utext = NULL; + UErrorCode status = U_ZERO_ERROR; + document_utext = utext_openUTF8(document_utext, document.data(), + document_size, &status); + // Locale does not matter because there's no per-locale customization + // for character iterator. + scoped_ptr<BreakIterator> bi( + BreakIterator::createCharacterInstance(Locale::getDefault(), status)); + bi->setText(document_utext, status); + DCHECK(U_SUCCESS(status)); + + // We build the snippet by iterating through the matches and then grabbing + // context around each match. If matches are near enough each other (within + // kSnippetContext), we skip the "..." between them. + std::wstring snippet; + int start = 0; + for (size_t i = 0; i < match_positions.size(); ++i) { + // Some shorter names for the current match. + const int match_start = match_positions[i].first; + const int match_end = match_positions[i].second; + + // Add the context, if any, to show before the match. + int context_start = match_start; + MoveByNGraphemes(bi.get(), -kSnippetContext, &context_start); + start = std::max(start, context_start); + if (start < match_start) { + if (start > 0) + snippet += kEllipsis; + snippet += UTF8ToWide(document.substr(start, match_start - start)); + } + + // Add the match. + matches_.push_back(std::make_pair(static_cast<int>(snippet.size()), 0)); + snippet += UTF8ToWide(document.substr(match_start, + match_end - match_start)); + matches_.back().second = static_cast<int>(snippet.size()); + + // Compute the context, if any, to show after the match. + int end; + // Check if the next match falls within our snippet window. + if (i + 1 < match_positions.size() && + IsNextMatchWithinSnippetWindow(bi.get(), match_end, + match_positions[i + 1].first)) { + // Yes, it's within the window. Make the end context extend just up + // to the next match. + end = match_positions[i + 1].first; + snippet += UTF8ToWide(document.substr(match_end, end - match_end)); + } else { + // No, there's either no next match or the next match is too far away. + end = match_end; + MoveByNGraphemes(bi.get(), kSnippetContext, &end); + snippet += UTF8ToWide(document.substr(match_end, end - match_end)); + if (end < document_size) + snippet += kEllipsis; + } + start = end; + + // Stop here if we have enough snippet computed. + if (snippet.size() >= kSnippetMaxLength) + break; + } + + utext_close(document_utext); + swap(text_, snippet); +} |