summaryrefslogtreecommitdiffstats
path: root/components/feedback/anonymizer_tool.cc
blob: 713ceb6a207e475e357dcdc34324708c0c8bb075 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
// Copyright 2015 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "components/feedback/anonymizer_tool.h"

#include <base/strings/string_number_conversions.h>
#include <base/strings/string_util.h>
#include <base/strings/stringprintf.h>

#include "third_party/re2/src/re2/re2.h"

using re2::RE2;

namespace feedback {

namespace {

// The |kCustomPatterns| array defines patterns to match and anonymize. Each
// pattern needs to define three capturing parentheses groups:
//
// - a group for the pattern before the identifier to be anonymized;
// - a group for the identifier to be anonymized;
// - a group for the pattern after the identifier to be anonymized.
//
// Every matched identifier (in the context of the whole pattern) is anonymized
// by replacing it with an incremental instance identifier. Every different
// pattern defines a separate instance identifier space. See the unit test for
// AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples.
//
// Useful regular expression syntax:
//
// +? is a non-greedy (lazy) +.
// \b matches a word boundary.
// (?i) turns on case insensitivity for the remainder of the regex.
// (?-s) turns off "dot matches newline" for the remainder of the regex.
// (?:regex) denotes non-capturing parentheses group.
//
// The table is read-only: both the pattern text and the pointer slots are
// const. Entry i is paired with identifier space |custom_patterns_[i]| by
// AnonymizerTool::AnonymizeCustomPatterns().
const char* const kCustomPatterns[] = {
    "(\\bCell ID: ')([0-9a-fA-F]+)(')",                  // ModemManager
    "(\\bLocation area code: ')([0-9a-fA-F]+)(')",       // ModemManager
    "(?i-s)(\\bssid[= ]')(.+)(')",                       // wpa_supplicant
    "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()",  // wpa_supplicant
    "(?-s)(\\[SSID=)(.+?)(\\])",                         // shill
};

}  // namespace

// Sizes |custom_patterns_| to the number of entries in |kCustomPatterns|, so
// that AnonymizeCustomPatterns() can index it in step with the pattern table.
AnonymizerTool::AnonymizerTool()
    : custom_patterns_(arraysize(kCustomPatterns)) {}

// Nothing to release explicitly; members clean up after themselves.
AnonymizerTool::~AnonymizerTool() = default;

// Returns a copy of |input| with MAC addresses anonymized first and the
// custom patterns anonymized second. The intermediate string is a temporary
// handed straight to the by-value parameter of AnonymizeCustomPatterns().
std::string AnonymizerTool::Anonymize(const std::string& input) {
  return AnonymizeCustomPatterns(AnonymizeMACAddresses(input));
}

std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) {
  // This regular expression finds the next MAC address. It splits the data into
  // a section preceding the MAC address, an OUI (Organizationally Unique
  // Identifier) part and a NIC (Network Interface Controller) specific part.

  RE2::Options options;
  // set_multiline of pcre is not supported by RE2, yet.
  options.set_dot_nl(true);  // Dot matches a new line.
  RE2 mac_re(
      "(.*?)("
      "[0-9a-fA-F][0-9a-fA-F]:"
      "[0-9a-fA-F][0-9a-fA-F]:"
      "[0-9a-fA-F][0-9a-fA-F]):("
      "[0-9a-fA-F][0-9a-fA-F]:"
      "[0-9a-fA-F][0-9a-fA-F]:"
      "[0-9a-fA-F][0-9a-fA-F])",
      options);

  std::string result;
  result.reserve(input.size());

  // Keep consuming, building up a result string as we go.
  re2::StringPiece text(input);
  std::string pre_mac, oui, nic;
  while (re2::RE2::Consume(&text, mac_re, RE2::Arg(&pre_mac), RE2::Arg(&oui),
                           RE2::Arg(&nic))) {
    // Look up the MAC address in the hash.
    oui = base::ToLowerASCII(oui);
    nic = base::ToLowerASCII(nic);
    std::string mac = oui + ":" + nic;
    std::string replacement_mac = mac_addresses_[mac];
    if (replacement_mac.empty()) {
      // If not found, build up a replacement MAC address by generating a new
      // NIC part.
      int mac_id = mac_addresses_.size();
      replacement_mac = base::StringPrintf(
          "%s:%02x:%02x:%02x", oui.c_str(), (mac_id & 0x00ff0000) >> 16,
          (mac_id & 0x0000ff00) >> 8, (mac_id & 0x000000ff));
      mac_addresses_[mac] = replacement_mac;
    }

    result += pre_mac;
    result += replacement_mac;
  }

  text.AppendToString(&result);
  return result;
}

// Runs |input| through every entry of |kCustomPatterns| in table order, each
// pass working on the output of the previous one. The pattern at position i
// uses |custom_patterns_[i]| as its identifier space.
std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) {
  size_t space_index = 0;
  for (const char* pattern : kCustomPatterns) {
    input =
        AnonymizeCustomPattern(input, pattern, &custom_patterns_[space_index]);
    ++space_index;
  }
  return input;
}

// static
std::string AnonymizerTool::AnonymizeCustomPattern(
    const std::string& input,
    const std::string& pattern,
    std::map<std::string, std::string>* identifier_space) {
  RE2::Options options;
  // set_multiline of pcre is not supported by RE2, yet.
  options.set_dot_nl(true);  // Dot matches a new line.
  RE2 re("(.*?)" + pattern, options);
  DCHECK_EQ(4, re.NumberOfCapturingGroups());

  std::string result;
  result.reserve(input.size());

  // Keep consuming, building up a result string as we go.
  re2::StringPiece text(input);
  std::string pre_match, pre_matched_id, matched_id, post_matched_id;
  while (RE2::Consume(&text, re, RE2::Arg(&pre_match),
                      RE2::Arg(&pre_matched_id), RE2::Arg(&matched_id),
                      RE2::Arg(&post_matched_id))) {
    std::string replacement_id = (*identifier_space)[matched_id];
    if (replacement_id.empty()) {
      replacement_id = base::IntToString(identifier_space->size());
      (*identifier_space)[matched_id] = replacement_id;
    }

    result += pre_match;
    result += pre_matched_id;
    result += replacement_id;
    result += post_matched_id;
  }
  text.AppendToString(&result);
  return result;
}

}  // namespace feedback