diff options
Diffstat (limited to 'components/feedback')
-rw-r--r-- | components/feedback/BUILD.gn | 4 | ||||
-rw-r--r-- | components/feedback/DEPS | 3 | ||||
-rw-r--r-- | components/feedback/OWNERS | 2 | ||||
-rw-r--r-- | components/feedback/anonymizer_tool.cc | 151 | ||||
-rw-r--r-- | components/feedback/anonymizer_tool.h | 52 | ||||
-rw-r--r-- | components/feedback/anonymizer_tool_unittest.cc | 109 | ||||
-rw-r--r-- | components/feedback/feedback_common_unittest.cc | 53 |
7 files changed, 347 insertions, 27 deletions
diff --git a/components/feedback/BUILD.gn b/components/feedback/BUILD.gn index f126e05..ec4e9f19 100644 --- a/components/feedback/BUILD.gn +++ b/components/feedback/BUILD.gn @@ -4,6 +4,8 @@ source_set("feedback") { sources = [ + "anonymizer_tool.cc", + "anonymizer_tool.h", "feedback_common.cc", "feedback_common.h", "feedback_data.cc", @@ -35,6 +37,7 @@ source_set("feedback") { "//content/public/browser", "//content/public/common", "//net", + "//third_party/re2", "//third_party/zlib:zip", ] } @@ -42,6 +45,7 @@ source_set("feedback") { source_set("unit_tests") { testonly = true sources = [ + "anonymizer_tool_unittest.cc", "feedback_common_unittest.cc", "feedback_data_unittest.cc", "feedback_uploader_chrome_unittest.cc", diff --git a/components/feedback/DEPS b/components/feedback/DEPS index c88ff4e..3a9dc0b20 100644 --- a/components/feedback/DEPS +++ b/components/feedback/DEPS @@ -8,5 +8,6 @@ include_rules = [ "+content/public/test", "+net/base", "+net/url_request", - "+third_party/zlib/google/zip.h", + "+third_party/re2", + "+third_party/zlib/google", ] diff --git a/components/feedback/OWNERS b/components/feedback/OWNERS index 5654d44..5b2bc16 100644 --- a/components/feedback/OWNERS +++ b/components/feedback/OWNERS @@ -2,3 +2,5 @@ achaulk@chromium.org bsimonnet@chromium.org rkc@chromium.org zork@chromium.org + +per-file anonymizer_tool*=battre@chromium.org diff --git a/components/feedback/anonymizer_tool.cc b/components/feedback/anonymizer_tool.cc new file mode 100644 index 0000000..713ceb6 --- /dev/null +++ b/components/feedback/anonymizer_tool.cc @@ -0,0 +1,151 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. 
+ +#include "components/feedback/anonymizer_tool.h" + +#include <base/strings/string_number_conversions.h> +#include <base/strings/string_util.h> +#include <base/strings/stringprintf.h> + +#include "third_party/re2/src/re2/re2.h" + +using re2::RE2; + +namespace feedback { + +namespace { + +// The |kCustomPatterns| array defines patterns to match and anonymize. Each +// pattern needs to define three capturing parentheses groups: +// +// - a group for the pattern before the identifier to be anonymized; +// - a group for the identifier to be anonymized; +// - a group for the pattern after the identifier to be anonymized. +// +// Every matched identifier (in the context of the whole pattern) is anonymized +// by replacing it with an incremental instance identifier. Every different +// pattern defines a separate instance identifier space. See the unit test for +// AnonymizerTool::AnonymizeCustomPattern for pattern anonymization examples. +// +// Useful regular expression syntax: +// +// +? is a non-greedy (lazy) +. +// \b matches a word boundary. +// (?i) turns on case insensitivity for the remainder of the regex. +// (?-s) turns off "dot matches newline" for the remainder of the regex. +// (?:regex) denotes a non-capturing parentheses group. 
+const char* kCustomPatterns[] = { + "(\\bCell ID: ')([0-9a-fA-F]+)(')", // ModemManager + "(\\bLocation area code: ')([0-9a-fA-F]+)(')", // ModemManager + "(?i-s)(\\bssid[= ]')(.+)(')", // wpa_supplicant + "(?-s)(\\bSSID - hexdump\\(len=[0-9]+\\): )(.+)()", // wpa_supplicant + "(?-s)(\\[SSID=)(.+?)(\\])", // shill +}; + +} // namespace + +AnonymizerTool::AnonymizerTool() + : custom_patterns_(arraysize(kCustomPatterns)) {} + +AnonymizerTool::~AnonymizerTool() {} + +std::string AnonymizerTool::Anonymize(const std::string& input) { + std::string anonymized = AnonymizeMACAddresses(input); + anonymized = AnonymizeCustomPatterns(std::move(anonymized)); + return anonymized; +} + +std::string AnonymizerTool::AnonymizeMACAddresses(const std::string& input) { + // This regular expression finds the next MAC address. It splits the data into + // a section preceding the MAC address, an OUI (Organizationally Unique + // Identifier) part and a NIC (Network Interface Controller) specific part. + + RE2::Options options; + // set_multiline of pcre is not supported by RE2, yet. + options.set_dot_nl(true); // Dot matches a new line. + RE2 mac_re( + "(.*?)(" + "[0-9a-fA-F][0-9a-fA-F]:" + "[0-9a-fA-F][0-9a-fA-F]:" + "[0-9a-fA-F][0-9a-fA-F]):(" + "[0-9a-fA-F][0-9a-fA-F]:" + "[0-9a-fA-F][0-9a-fA-F]:" + "[0-9a-fA-F][0-9a-fA-F])", + options); + + std::string result; + result.reserve(input.size()); + + // Keep consuming, building up a result string as we go. + re2::StringPiece text(input); + std::string pre_mac, oui, nic; + while (re2::RE2::Consume(&text, mac_re, RE2::Arg(&pre_mac), RE2::Arg(&oui), + RE2::Arg(&nic))) { + // Look up the MAC address in the hash. + oui = base::ToLowerASCII(oui); + nic = base::ToLowerASCII(nic); + std::string mac = oui + ":" + nic; + std::string replacement_mac = mac_addresses_[mac]; + if (replacement_mac.empty()) { + // If not found, build up a replacement MAC address by generating a new + // NIC part. 
+ int mac_id = mac_addresses_.size(); + replacement_mac = base::StringPrintf( + "%s:%02x:%02x:%02x", oui.c_str(), (mac_id & 0x00ff0000) >> 16, + (mac_id & 0x0000ff00) >> 8, (mac_id & 0x000000ff)); + mac_addresses_[mac] = replacement_mac; + } + + result += pre_mac; + result += replacement_mac; + } + + text.AppendToString(&result); + return result; +} + +std::string AnonymizerTool::AnonymizeCustomPatterns(std::string input) { + for (size_t i = 0; i < arraysize(kCustomPatterns); i++) { + input = + AnonymizeCustomPattern(input, kCustomPatterns[i], &custom_patterns_[i]); + } + return input; +} + +// static +std::string AnonymizerTool::AnonymizeCustomPattern( + const std::string& input, + const std::string& pattern, + std::map<std::string, std::string>* identifier_space) { + RE2::Options options; + // set_multiline of pcre is not supported by RE2, yet. + options.set_dot_nl(true); // Dot matches a new line. + RE2 re("(.*?)" + pattern, options); + DCHECK_EQ(4, re.NumberOfCapturingGroups()); + + std::string result; + result.reserve(input.size()); + + // Keep consuming, building up a result string as we go. 
+ re2::StringPiece text(input); + std::string pre_match, pre_matched_id, matched_id, post_matched_id; + while (RE2::Consume(&text, re, RE2::Arg(&pre_match), + RE2::Arg(&pre_matched_id), RE2::Arg(&matched_id), + RE2::Arg(&post_matched_id))) { + std::string replacement_id = (*identifier_space)[matched_id]; + if (replacement_id.empty()) { + replacement_id = base::IntToString(identifier_space->size()); + (*identifier_space)[matched_id] = replacement_id; + } + + result += pre_match; + result += pre_matched_id; + result += replacement_id; + result += post_matched_id; + } + text.AppendToString(&result); + return result; +} + +} // namespace feedback diff --git a/components/feedback/anonymizer_tool.h b/components/feedback/anonymizer_tool.h new file mode 100644 index 0000000..54a690f --- /dev/null +++ b/components/feedback/anonymizer_tool.h @@ -0,0 +1,52 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COMPONENTS_FEEDBACK_ANONYMIZER_TOOL_H_ +#define COMPONENTS_FEEDBACK_ANONYMIZER_TOOL_H_ + +#include <map> +#include <string> +#include <vector> + +#include <base/macros.h> + +namespace feedback { + +class AnonymizerTool { + public: + AnonymizerTool(); + ~AnonymizerTool(); + + // Returns an anonymized version of |input|. PII-sensitive data (such as MAC + // addresses) in |input| is replaced with unique identifiers. + std::string Anonymize(const std::string& input); + + private: + friend class AnonymizerToolTest; + + std::string AnonymizeMACAddresses(const std::string& input); + std::string AnonymizeCustomPatterns(std::string input); + static std::string AnonymizeCustomPattern( + const std::string& input, + const std::string& pattern, + std::map<std::string, std::string>* identifier_space); + + // Map of MAC addresses discovered in anonymized strings to anonymized + // representations. 
11:22:33:44:55:66 gets anonymized to 11:22:33:00:00:01, + // where the first three bytes represent the manufacturer. The last three + // bytes are used to distinguish different MAC addresses and are incremented + // for each newly discovered MAC address. + std::map<std::string, std::string> mac_addresses_; + + // Like mac addresses, identifiers in custom patterns are anonymized. + // custom_patterns_[i] contains a map of original identifier to anonymized + // identifier for custom pattern number i. + std::vector<std::map<std::string, std::string>> custom_patterns_; + + DISALLOW_COPY_AND_ASSIGN(AnonymizerTool); +}; + +} // namespace feedback + +#endif // COMPONENTS_FEEDBACK_ANONYMIZER_TOOL_H_ diff --git a/components/feedback/anonymizer_tool_unittest.cc b/components/feedback/anonymizer_tool_unittest.cc new file mode 100644 index 0000000..68f35a8 --- /dev/null +++ b/components/feedback/anonymizer_tool_unittest.cc @@ -0,0 +1,109 @@ +// Copyright 2015 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "components/feedback/anonymizer_tool.h" + +#include <gtest/gtest.h> + +namespace feedback { + +class AnonymizerToolTest : public testing::Test { + protected: + std::string AnonymizeMACAddresses(const std::string& input) { + return anonymizer_.AnonymizeMACAddresses(input); + } + + std::string AnonymizeCustomPatterns(const std::string& input) { + return anonymizer_.AnonymizeCustomPatterns(input); + } + + static std::string AnonymizeCustomPattern( + const std::string& input, + const std::string& pattern, + std::map<std::string, std::string>* space) { + return AnonymizerTool::AnonymizeCustomPattern(input, pattern, space); + } + + AnonymizerTool anonymizer_; +}; + +TEST_F(AnonymizerToolTest, Anonymize) { + EXPECT_EQ("", anonymizer_.Anonymize("")); + EXPECT_EQ("foo\nbar\n", anonymizer_.Anonymize("foo\nbar\n")); + + // Make sure MAC address anonymization is invoked. 
+ EXPECT_EQ("02:46:8a:00:00:01", anonymizer_.Anonymize("02:46:8a:ce:13:57")); + + // Make sure custom pattern anonymization is invoked. + EXPECT_EQ("Cell ID: '1'", AnonymizeCustomPatterns("Cell ID: 'A1B2'")); +} + +TEST_F(AnonymizerToolTest, AnonymizeMACAddresses) { + EXPECT_EQ("", AnonymizeMACAddresses("")); + EXPECT_EQ("foo\nbar\n", AnonymizeMACAddresses("foo\nbar\n")); + EXPECT_EQ("11:22:33:44:55", AnonymizeMACAddresses("11:22:33:44:55")); + EXPECT_EQ("aa:bb:cc:00:00:01", AnonymizeMACAddresses("aa:bb:cc:dd:ee:ff")); + EXPECT_EQ( + "BSSID: aa:bb:cc:00:00:01 in the middle\n" + "bb:cc:dd:00:00:02 start of line\n" + "end of line aa:bb:cc:00:00:01\n" + "no match across lines aa:bb:cc:\n" + "dd:ee:ff two on the same line:\n" + "x bb:cc:dd:00:00:02 cc:dd:ee:00:00:03 x\n", + AnonymizeMACAddresses("BSSID: aa:bb:cc:dd:ee:ff in the middle\n" + "bb:cc:dd:ee:ff:00 start of line\n" + "end of line aa:bb:cc:dd:ee:ff\n" + "no match across lines aa:bb:cc:\n" + "dd:ee:ff two on the same line:\n" + "x bb:cc:dd:ee:ff:00 cc:dd:ee:ff:00:11 x\n")); + EXPECT_EQ("Remember bb:cc:dd:00:00:02?", + AnonymizeMACAddresses("Remember bB:Cc:DD:ee:ff:00?")); +} + +TEST_F(AnonymizerToolTest, AnonymizeCustomPatterns) { + EXPECT_EQ("", AnonymizeCustomPatterns("")); + + EXPECT_EQ("Cell ID: '1'", AnonymizeCustomPatterns("Cell ID: 'A1B2'")); + EXPECT_EQ("Cell ID: '2'", AnonymizeCustomPatterns("Cell ID: 'C1D2'")); + EXPECT_EQ("foo Cell ID: '1' bar", + AnonymizeCustomPatterns("foo Cell ID: 'A1B2' bar")); + + EXPECT_EQ("foo Location area code: '1' bar", + AnonymizeCustomPatterns("foo Location area code: 'A1B2' bar")); + + EXPECT_EQ("foo\na SSID='1' b\n'", + AnonymizeCustomPatterns("foo\na SSID='Joe's' b\n'")); + EXPECT_EQ("ssid '2'", AnonymizeCustomPatterns("ssid 'My AP'")); + EXPECT_EQ("bssid 'aa:bb'", AnonymizeCustomPatterns("bssid 'aa:bb'")); + + EXPECT_EQ("Scan SSID - hexdump(len=6): 1\nfoo", + AnonymizeCustomPatterns( + "Scan SSID - hexdump(len=6): 47 6f 6f 67 6c 65\nfoo")); + + EXPECT_EQ( + "a\nb 
[SSID=1] [SSID=2] [SSID=foo\nbar] b", + AnonymizeCustomPatterns("a\nb [SSID=foo] [SSID=bar] [SSID=foo\nbar] b")); +} + +TEST_F(AnonymizerToolTest, AnonymizeCustomPattern) { + const char kPattern[] = "(\\b(?i)id:? ')(\\d+)(')"; + std::map<std::string, std::string> space; + EXPECT_EQ("", AnonymizeCustomPattern("", kPattern, &space)); + EXPECT_EQ("foo\nbar\n", + AnonymizeCustomPattern("foo\nbar\n", kPattern, &space)); + EXPECT_EQ("id '1'", AnonymizeCustomPattern("id '2345'", kPattern, &space)); + EXPECT_EQ("id '2'", AnonymizeCustomPattern("id '1234'", kPattern, &space)); + EXPECT_EQ("id: '2'", AnonymizeCustomPattern("id: '1234'", kPattern, &space)); + EXPECT_EQ("ID: '1'", AnonymizeCustomPattern("ID: '2345'", kPattern, &space)); + EXPECT_EQ("x1 id '1' 1x id '2'\nid '1'\n", + AnonymizeCustomPattern("x1 id '2345' 1x id '1234'\nid '2345'\n", + kPattern, &space)); + space.clear(); + EXPECT_EQ("id '1'", AnonymizeCustomPattern("id '1234'", kPattern, &space)); + + space.clear(); + EXPECT_EQ("x1z", AnonymizeCustomPattern("xyz", "()(y+)()", &space)); +} + +} // namespace feedback diff --git a/components/feedback/feedback_common_unittest.cc b/components/feedback/feedback_common_unittest.cc index d3fb950..61895ab 100644 --- a/components/feedback/feedback_common_unittest.cc +++ b/components/feedback/feedback_common_unittest.cc @@ -25,55 +25,56 @@ const char kLogsAttachmentName[] = "system_logs.zip"; class FeedbackCommonTest : public testing::Test { protected: FeedbackCommonTest() { - feedback = scoped_refptr<FeedbackCommon>(new FeedbackCommon()); + feedback_ = scoped_refptr<FeedbackCommon>(new FeedbackCommon()); } ~FeedbackCommonTest() override {} - scoped_refptr<FeedbackCommon> feedback; - userfeedback::ExtensionSubmit report; + scoped_refptr<FeedbackCommon> feedback_; + userfeedback::ExtensionSubmit report_; }; TEST_F(FeedbackCommonTest, TestBasicData) { // Test that basic data can be set and propagates to the request. 
- feedback->set_category_tag(kOne); - feedback->set_description(kTwo); - feedback->set_page_url(kThree); - feedback->set_user_email(kFour); - feedback->PrepareReport(&report); + feedback_->set_category_tag(kOne); + feedback_->set_description(kTwo); + feedback_->set_page_url(kThree); + feedback_->set_user_email(kFour); + feedback_->PrepareReport(&report_); - EXPECT_EQ(kOne, report.bucket()); - EXPECT_EQ(kTwo, report.common_data().description()); - EXPECT_EQ(kThree, report.web_data().url()); - EXPECT_EQ(kFour, report.common_data().user_email()); + EXPECT_EQ(kOne, report_.bucket()); + EXPECT_EQ(kTwo, report_.common_data().description()); + EXPECT_EQ(kThree, report_.web_data().url()); + EXPECT_EQ(kFour, report_.common_data().user_email()); } TEST_F(FeedbackCommonTest, TestAddLogs) { - feedback->AddLog(kOne, kTwo); - feedback->AddLog(kThree, kFour); + feedback_->AddLog(kOne, kTwo); + feedback_->AddLog(kThree, kFour); - EXPECT_EQ(2U, feedback->sys_info()->size()); + EXPECT_EQ(2U, feedback_->sys_info()->size()); } TEST_F(FeedbackCommonTest, TestCompressionThreshold) { // Add a large and small log, verify that only the small log gets // included in the report. - feedback->AddLog(kOne, kTwo); - feedback->AddLog(kThree, kLongLog); - feedback->PrepareReport(&report); + feedback_->AddLog(kOne, kTwo); + feedback_->AddLog(kThree, kLongLog); + feedback_->PrepareReport(&report_); - EXPECT_EQ(1, report.web_data().product_specific_data_size()); - EXPECT_EQ(kOne, report.web_data().product_specific_data(0).key()); + EXPECT_EQ(1, report_.web_data().product_specific_data_size()); + EXPECT_EQ(kOne, report_.web_data().product_specific_data(0).key()); } TEST_F(FeedbackCommonTest, TestCompression) { // Add a large and small log, verify that an attachment has been // added with the right name. 
- feedback->AddLog(kOne, kTwo); - feedback->AddLog(kThree, kLongLog); - feedback->CompressLogs(); - feedback->PrepareReport(&report); + feedback_->AddLog(kOne, kTwo); + feedback_->AddLog(kThree, kLongLog); + feedback_->CompressLogs(); + feedback_->PrepareReport(&report_); - EXPECT_EQ(1, report.product_specific_binary_data_size()); - EXPECT_EQ(kLogsAttachmentName, report.product_specific_binary_data(0).name()); + EXPECT_EQ(1, report_.product_specific_binary_data_size()); + EXPECT_EQ(kLogsAttachmentName, + report_.product_specific_binary_data(0).name()); } |