summaryrefslogtreecommitdiffstats
path: root/components/query_parser/query_parser.h
blob: a6e2d229ea1a783ad380d4a06c71c552e1f739eb (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
// Copyright 2014 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COMPONENTS_QUERY_PARSER_QUERY_PARSER_H_
#define COMPONENTS_QUERY_PARSER_QUERY_PARSER_H_

#include <stddef.h>

#include <vector>

#include "base/macros.h"
#include "base/strings/string16.h"
#include "components/query_parser/snippet.h"

namespace query_parser {

class QueryNodeList;

// Used by HasMatchIn.
struct QueryWord {
  // The work to match against.
  base::string16 word;

  // The starting position of the word in the original text.
  size_t position;
};

enum class MatchingAlgorithm {
  // Only words long enough are considered for prefix search. Shorter words are
  // considered for exact matches.
  DEFAULT,
  // All words are considered for a prefix search.
  ALWAYS_PREFIX_SEARCH,
};

typedef std::vector<query_parser::QueryWord> QueryWordVector;

// QueryNode is used by QueryParser to represent the elements that constitute a
// query. While QueryNode is exposed by way of ParseQuery, it really isn't meant
// for external usage.
class QueryNode {
 public:
  virtual ~QueryNode() {}

  // Serialize ourselves out to a string that can be passed to SQLite. Returns
  // the number of words in this node.
  virtual int AppendToSQLiteQuery(base::string16* query) const = 0;

  // Return true if this is a QueryNodeWord, false if it's a QueryNodeList.
  virtual bool IsWord() const = 0;

  // Returns true if this node matches |word|. If |exact| is true, the string
  // must exactly match. Otherwise, this uses a starts with comparison.
  virtual bool Matches(const base::string16& word, bool exact) const = 0;

  // Returns true if this node matches at least one of the words in |words|. An
  // entry is added to |match_positions| for all matching words giving the
  // matching regions.
  virtual bool HasMatchIn(const QueryWordVector& words,
                          Snippet::MatchPositions* match_positions) const = 0;

  // Returns true if this node matches at least one of the words in |words|.
  virtual bool HasMatchIn(const QueryWordVector& words) const = 0;

  // Appends the words that make up this node in |words|.
  virtual void AppendWords(std::vector<base::string16>* words) const = 0;
};

typedef std::vector<query_parser::QueryNode*> QueryNodeStarVector;

// This class is used to parse queries entered into the history search into more
// normalized queries that can be passed to the SQLite backend.
class QueryParser {
 public:
  QueryParser();

  // For CJK ideographs and Korean Hangul, even a single character
  // can be useful in prefix matching, but that may give us too many
  // false positives. Moreover, the current ICU word breaker gives us
  // back every single Chinese character as a word so that there's no
  // point doing anything for them and we only adjust the minimum length
  // to 2 for Korean Hangul while using 3 for others. This is a temporary
  // hack until we have a segmentation support.
  static bool IsWordLongEnoughForPrefixSearch(
      const base::string16& word,
      MatchingAlgorithm matching_algorithm);

  // Parse a query into a SQLite query. The resulting query is placed in
  // |sqlite_query| and the number of words is returned.
  int ParseQuery(const base::string16& query,
                 MatchingAlgorithm matching_algorithm,
                 base::string16* sqlite_query);

  // Parses |query|, returning the words that make up it. Any words in quotes
  // are put in |words| without the quotes. For example, the query text
  // "foo bar" results in two entries being added to words, one for foo and one
  // for bar.
  void ParseQueryWords(const base::string16& query,
                       MatchingAlgorithm matching_algorithm,
                       std::vector<base::string16>* words);

  // Parses |query|, returning the nodes that constitute the valid words in the
  // query. This is intended for later usage with DoesQueryMatch. Ownership of
  // the nodes passes to the caller.
  void ParseQueryNodes(const base::string16& query,
                       MatchingAlgorithm matching_algorithm,
                       QueryNodeStarVector* nodes);

  // Returns true if the string text matches the query nodes created by a call
  // to ParseQuery. If the query does match, each of the matching positions in
  // the text is added to |match_positions|.
  bool DoesQueryMatch(const base::string16& text,
                      const QueryNodeStarVector& nodes,
                      Snippet::MatchPositions* match_positions);

  // Returns true if all of the |words| match the query |nodes| created by a
  // call to ParseQuery.
  bool DoesQueryMatch(const QueryWordVector& words,
                      const QueryNodeStarVector& nodes);

  // Extracts the words from |text|, placing each word into |words|.
  void ExtractQueryWords(const base::string16& text,
                         QueryWordVector* words);

  // Sorts the match positions in |matches| by their first index, then
  // coalesces any match positions that intersect each other.
  static void SortAndCoalesceMatchPositions(Snippet::MatchPositions* matches);

 private:
  // Does the work of parsing |query|; creates nodes in |root| as appropriate.
  // This is invoked from both of the ParseQuery methods.
  bool ParseQueryImpl(const base::string16& query,
                      MatchingAlgorithm matching_algorithm,
                      QueryNodeList* root);

  DISALLOW_COPY_AND_ASSIGN(QueryParser);
};

}  // namespace query_parser

#endif  // COMPONENTS_QUERY_PARSER_QUERY_PARSER_H_