components/omnibox/browser/in_memory_url_index_types.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171

// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COMPONENTS_OMNIBOX_BROWSER_IN_MEMORY_URL_INDEX_TYPES_H_
#define COMPONENTS_OMNIBOX_BROWSER_IN_MEMORY_URL_INDEX_TYPES_H_

#include <map>
#include <set>
#include <vector>

#include "base/strings/string16.h"
#include "components/history/core/browser/history_types.h"
#include "url/gurl.h"

// Matches within URL and Title Strings ----------------------------------------

// Specifies where an omnibox term occurs within a string. Used for specifying
// highlights in AutocompleteMatches (ACMatchClassifications) and to assist in
// scoring a result.
struct TermMatch {
  TermMatch() : term_num(0), offset(0), length(0) {}
  TermMatch(int term_num, size_t offset, size_t length)
      : term_num(term_num),
        offset(offset),
        length(length) {}

  int term_num;  // The index of the term in the original search string.
  size_t offset;  // The starting offset of the substring match.
  size_t length;  // The length of the substring match.
};
typedef std::vector<TermMatch> TermMatches;

// Returns a TermMatches which has an entry for each occurrence of the
// string |term| found in the string |cleaned_string|. Use
// CleanUpUrlForMatching() or CleanUpUrlTitleMatching() before passing
// |cleaned_string| to this function. The function marks each match
// with |term_num| so that the resulting TermMatches can be merged
// with other TermMatches for other terms. Note that only the first
// 2,048 characters of |string| are considered during the match
// operation.
TermMatches MatchTermInString(const base::string16& term,
                              const base::string16& cleaned_string,
                              int term_num);

// Sorts and removes overlapping substring matches from |matches| and
// returns the cleaned up matches.
TermMatches SortAndDeoverlapMatches(const TermMatches& matches);

// Extracts and returns the offsets from |matches|.  This includes both
// the offsets corresponding to the beginning of a match and the offsets
// corresponding to the end of a match (i.e., offset+length for that match).
std::vector<size_t> OffsetsFromTermMatches(const TermMatches& matches);

// Replaces the offsets and lengths in |matches| with those given in |offsets|.
// |offsets| gives beginning and ending offsets for each match; this function
// translates (beginning, ending) offset into (beginning offset, length of
// match).  It deletes any matches for which an endpoint is npos and returns
// the updated list of matches.
TermMatches ReplaceOffsetsInTermMatches(const TermMatches& matches,
                                        const std::vector<size_t>& offsets);

// Convenience Types -----------------------------------------------------------

typedef std::vector<base::string16> String16Vector;
typedef std::set<base::string16> String16Set;
typedef std::set<base::char16> Char16Set;
typedef std::vector<base::char16> Char16Vector;

// A vector that contains the offsets at which each word starts within a string.
typedef std::vector<size_t> WordStarts;

// Utility Functions -----------------------------------------------------------

// Breaks the string |cleaned_uni_string| down into individual words.
// Use CleanUpUrlForMatching() or CleanUpUrlTitleMatching() before
// passing |cleaned_uni_string| to this function. If |word_starts| is
// not NULL then clears and pushes the offsets within
// |cleaned_uni_string| at which each word starts onto
// |word_starts|. These offsets are collected only up to the first
// kMaxSignificantChars of |cleaned_uni_string|.
String16Set String16SetFromString16(const base::string16& cleaned_uni_string,
                                    WordStarts* word_starts);

// Breaks the |cleaned_uni_string| string down into individual words
// and return a vector with the individual words in their original
// order.  Use CleanUpUrlForMatching() or CleanUpUrlTitleMatching()
// before passing |cleaned_uni_string| to this function.  If
// |break_on_space| is false then the resulting list will contain only
// words containing alpha-numeric characters. If |break_on_space| is
// true then the resulting list will contain strings broken at
// whitespace. (|break_on_space| indicates that the
// BreakIterator::BREAK_SPACE (equivalent to BREAK_LINE) approach is
// to be used. For a complete description of this algorithm refer to
// the comments in base/i18n/break_iterator.h.) If |word_starts| is
// not NULL then clears and pushes the word starts onto |word_starts|.
//
// Example:
//   Given: |cleaned_uni_string|: "http://www.google.com/ harry the rabbit."
//   With |break_on_space| false the returned list will contain:
//    "http", "www", "google", "com", "harry", "the", "rabbit"
//   With |break_on_space| true the returned list will contain:
//    "http://", "www.google.com/", "harry", "the", "rabbit."
String16Vector String16VectorFromString16(
    const base::string16& cleaned_uni_string,
    bool break_on_space,
    WordStarts* word_starts);

// Breaks the |uni_word| string down into its individual characters.
// Note that this is temporarily intended to work on a single word, but
// _will_ work on a string of words, perhaps with unexpected results.
// TODO(mrossetti): Lots of optimizations possible here for not restarting
// a search if the user is just typing along. Also, change this to uniString
// and properly handle substring matches, scoring and sorting the results
// by score. Also, provide the metrics for where the matches occur so that
// the UI can highlight the matched sections.
Char16Set Char16SetFromString16(const base::string16& uni_word);

// Support for InMemoryURLIndex Private Data -----------------------------------

// An index into a list of all of the words we have indexed.
typedef size_t WordID;

// A map allowing a WordID to be determined given a word.
typedef std::map<base::string16, WordID> WordMap;

// A map from character to the word_ids of words containing that character.
typedef std::set<WordID> WordIDSet;  // An index into the WordList.
typedef std::map<base::char16, WordIDSet> CharWordIDMap;

// A map from word (by word_id) to history items containing that word.
typedef history::URLID HistoryID;
typedef std::set<HistoryID> HistoryIDSet;
typedef std::vector<HistoryID> HistoryIDVector;
typedef std::map<WordID, HistoryIDSet> WordIDHistoryMap;
typedef std::map<HistoryID, WordIDSet> HistoryIDWordMap;


// Information used in scoring a particular URL.
typedef std::vector<history::VisitInfo> VisitInfoVector;
struct HistoryInfoMapValue {
  HistoryInfoMapValue();
  ~HistoryInfoMapValue();

  // This field is always populated.
  history::URLRow url_row;

  // This field gets filled in asynchronously after a visit.  As such,
  // it's almost always correct.  If it's wrong, it's likely to either
  // be empty (if this URL was recently added to the index) or
  // slightly out-of-date (one visit behind).
  VisitInfoVector visits;
};

// A map from history_id to the history's URL and title.
typedef std::map<HistoryID, HistoryInfoMapValue> HistoryInfoMap;

// A map from history_id to URL and page title word start metrics.
struct RowWordStarts {
  RowWordStarts();
  ~RowWordStarts();

  // Clears both url_word_starts_ and title_word_starts_.
  void Clear();

  WordStarts url_word_starts_;
  WordStarts title_word_starts_;
};
typedef std::map<HistoryID, RowWordStarts> WordStartsMap;

#endif  // COMPONENTS_OMNIBOX_BROWSER_IN_MEMORY_URL_INDEX_TYPES_H_