# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Utilities for scanning source files to determine code authorship.
"""

import itertools


def FindFiles(input_api, root_dir, start_paths_list, excluded_dirs_list):
  """Similar to UNIX utility find(1), searches for files in the directories.
  Automatically leaves out only source code files and excludes third_party
  directories.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    start_paths_list: The list of paths to start search from. Each path can
      be a file or a directory.
    excluded_dirs_list: The list of directories to skip.

  Returns:
    The list of source code files found, relative to |root_dir|.
  """
  excluded_dirs_list = [d for d in excluded_dirs_list if not 'third_party' in d]
  # Using a common pattern for third-partyies makes the ignore regexp shorter
  excluded_dirs_list.append('third_party')

  EXTRA_EXCLUDED_DIRS = [
    # VCS dirs
    '.git',
    '.svn',
    # Build output
    'out/Debug',
    'out/Release',
    # 'Copyright' appears in license agreements
    'chrome/app/resources',
    # Quickoffice js files from internal src used on buildbots.
    # crbug.com/350472.
    'chrome/browser/resources/chromeos/quickoffice',
    # This is a test output directory
    'chrome/tools/test/reference_build',
    # blink style copy right headers.
    'content/shell/renderer/test_runner',
    # blink style copy right headers.
    'content/shell/tools/plugin',
    # This is tests directory, doesn't exist in the snapshot
    'content/test/data',
    # This is a tests directory that doesn't exist in the shipped product.
    'gin/test',
    # This is a test output directory
    'data/dom_perf',
    # This is a tests directory that doesn't exist in the shipped product.
    'tools/perf/page_sets',
    'tools/perf/page_sets/tough_animation_cases',
    # Histogram tools, doesn't exist in the snapshot
    'tools/histograms',
    # Swarming tools, doesn't exist in the snapshot
    'tools/swarming_client',
    # ARM sysroot, doesn't exist in the snapshot
    'chrome/installer/linux/debian_wheezy_arm-sysroot',
    # Old location (TODO(sbc): Remove this once it no longer exists on any
    # bots)
    'arm-sysroot',
    # Data is not part of open source chromium, but are included on some bots.
    'data',
    # This is not part of open source chromium, but are included on some bots.
    'skia/tools/clusterfuzz-data',
  ]
  excluded_dirs_list.extend(EXTRA_EXCLUDED_DIRS)

  # A directory is excluded if any blacklisted dir appears anywhere in its
  # path, hence the '/'-wrapping.
  dirs_blacklist = ['/' + d + '/' for d in excluded_dirs_list]
  def IsBlacklistedDir(d):
    for item in dirs_blacklist:
      if item in d:
        return True
    return False

  # Only files with these extensions are considered source code.
  files_whitelist_re = input_api.re.compile(
    r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
    r'|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
    r'|tex|mli?)$')
  files = []

  base_path_len = len(root_dir)
  for path in start_paths_list:
    full_path = input_api.os_path.join(root_dir, path)
    if input_api.os_path.isfile(full_path):
      if files_whitelist_re.search(path) and \
          not IsBlacklistedDir(full_path[base_path_len:]):  # Keep '/' prefix.
        files.append(path)
    else:
      for dirpath, dirnames, filenames in input_api.os_walk(full_path):
        # Remove excluded subdirs for faster scanning.
        for item in dirnames[:]:
          if IsBlacklistedDir(
              input_api.os_path.join(dirpath, item)[base_path_len + 1:]):
            dirnames.remove(item)
        for filename in filenames:
          filepath = \
            input_api.os_path.join(dirpath, filename)[base_path_len + 1:]
          if files_whitelist_re.search(filepath) and \
              not IsBlacklistedDir(filepath):
            files.append(filepath)
  return files


class _GeneratedFilesDetector(object):
  """Detects generated files by looking for tell-tale phrases in the header."""

  GENERATED_FILE = 'GENERATED FILE'
  NO_COPYRIGHT = '*No copyright*'

  def __init__(self, input_api):
    # Python docstrings often quote the phrases we search for, so strip
    # triple-quoted strings before matching.
    self.python_multiline_string_double_re = \
      input_api.re.compile(r'"""[^"]*(?:"""|$)', flags=input_api.re.MULTILINE)
    self.python_multiline_string_single_re = \
      input_api.re.compile(r"'''[^']*(?:'''|$)", flags=input_api.re.MULTILINE)
    self.automatically_generated_re = input_api.re.compile(
      r'(All changes made in this file will be lost'
      r'|DO NOT (EDIT|delete this file)'
      r'|Generated (at|automatically|data)'
      r'|Automatically generated'
      r'|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=input_api.re.IGNORECASE)

  def IsGeneratedFile(self, header):
    """Returns a truthy value iff |header| looks like a generated file's."""
    header = header.upper()
    if '"""' in header:
      header = self.python_multiline_string_double_re.sub('', header)
    if "'''" in header:
      header = self.python_multiline_string_single_re.sub('', header)
    # First do simple strings lookup to save time.
    if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
      return True
    if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \
        'GENERATED' in header:
      return self.automatically_generated_re.search(header)
    return False


class _CopyrightsScanner(object):
  """Extracts copyright authorship strings from lines of source code."""

  @staticmethod
  def StaticInit(input_api):
    # Matches C string literals, so their contents can be ignored.
    _CopyrightsScanner._c_comment_re = \
      input_api.re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
    _CopyrightsScanner._copyright_indicator = \
      r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
    _CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile(
      r'(?:\W|^)' + _CopyrightsScanner._copyright_indicator + \
      r'(?::\s*|\s+)(\w.*)$', input_api.re.IGNORECASE)
    # Phrases like 'copyright information' or 'copyright notice' are not
    # authorship statements.
    _CopyrightsScanner._copyright_disindicator_re = input_api.re.compile(
      r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', input_api.re.IGNORECASE)

  def __init__(self, input_api):
    self.max_line_numbers_proximity = 3
    self.last_a_item_line_number = -200
    self.last_b_item_line_number = -100
    self.re = input_api.re

  def _CloseLineNumbers(self, a, b):
    return 0 <= a - b <= self.max_line_numbers_proximity

  def MatchLine(self, line_number, line):
    """Returns the prettified authorship string if |line| has one, else None.

    Args:
      line_number: 1-based number of |line| within the file being scanned.
      line: The line contents.
    """
    if '"' in line:
      line = _CopyrightsScanner._c_comment_re.sub('', line)
    upcase_line = line.upper()
    # Record '(a)' and '(b)' last occurences in C++ comments.
    # This is to filter out '(c)' used as a list item inside C++ comments.
    # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
    cpp_comment_idx = upcase_line.find('//')
    if cpp_comment_idx != -1:
      if upcase_line.find('(A)') > cpp_comment_idx:
        self.last_a_item_line_number = line_number
      if upcase_line.find('(B)') > cpp_comment_idx:
        self.last_b_item_line_number = line_number
    # Fast bailout, uses the same patterns as _copyright_indicator regexp.
    if not 'COPYRIGHT' in upcase_line and not 'COPR.' in upcase_line \
        and not '\xc2\xa9' in upcase_line:
      c_item_index = upcase_line.find('(C)')
      if c_item_index == -1:
        return None
      # A '(c)' shortly after '(a)' and '(b)' in a comment is a list item,
      # not a copyright sign.
      if c_item_index > cpp_comment_idx and \
          self._CloseLineNumbers(line_number,
                                 self.last_b_item_line_number) and \
          self._CloseLineNumbers(self.last_b_item_line_number,
                                 self.last_a_item_line_number):
        return None
    copyr = None
    m = _CopyrightsScanner._full_copyright_indicator_re.search(line)
    if m and \
        not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):
      copyr = m.group(0)
      # Prettify the authorship string.
      copyr = self.re.sub(r'([,.])?\s*$/', '', copyr)
      copyr = self.re.sub(
        _CopyrightsScanner._copyright_indicator, '', copyr,
        flags=self.re.IGNORECASE)
      copyr = self.re.sub(r'^\s+', '', copyr)
      copyr = self.re.sub(r'\s{2,}', ' ', copyr)
      copyr = self.re.sub(r'\\@', '@', copyr)
    return copyr


def FindCopyrights(input_api, root_dir, files_to_scan):
  """Determines code authorship, and finds generated files.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.

  Returns:
    The list of copyrights associated with each of the files given.
    If the certain file is generated, the corresponding list consists of
    a single entry -- 'GENERATED_FILE' string. If the file has no
    copyright info, the corresponding list contains 'NO_COPYRIGHT' string.
  """
  generated_files_detector = _GeneratedFilesDetector(input_api)
  _CopyrightsScanner.StaticInit(input_api)
  copyrights = []
  for file_name in files_to_scan:
    linenum = 0
    header = []
    file_copyrights = []
    scanner = _CopyrightsScanner(input_api)
    contents = input_api.ReadFile(
      input_api.os_path.join(root_dir, file_name), 'r')
    for l in contents.split('\n'):
      linenum += 1
      # Only the first 25 lines are considered when detecting generated files.
      if linenum <= 25:
        header.append(l)
      c = scanner.MatchLine(linenum, l)
      if c:
        file_copyrights.append(c)
    if generated_files_detector.IsGeneratedFile('\n'.join(header)):
      copyrights.append([_GeneratedFilesDetector.GENERATED_FILE])
    elif file_copyrights:
      copyrights.append(file_copyrights)
    else:
      copyrights.append([_GeneratedFilesDetector.NO_COPYRIGHT])
  return copyrights


def FindCopyrightViolations(input_api, root_dir, files_to_scan):
  """Looks for files that do not belong exclusively to the Chromium Authors.

  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.

  Returns:
    The list of file names that contain non-Chromium copyrights.
  """
  copyrights = FindCopyrights(input_api, root_dir, files_to_scan)
  offending_files = []
  allowed_copyrights_re = input_api.re.compile(
    r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
    'All rights reserved.*)$')
  # Note: was itertools.izip; zip behaves identically here on Py2 and Py3.
  for f, cs in zip(files_to_scan, copyrights):
    if cs[0] == _GeneratedFilesDetector.GENERATED_FILE or \
        cs[0] == _GeneratedFilesDetector.NO_COPYRIGHT:
      continue
    for c in cs:
      if not allowed_copyrights_re.match(c):
        offending_files.append(input_api.os_path.normpath(f))
        break
  return offending_files


def _GetWhitelistFileName(input_api):
  """Returns the repo-relative path of the 3rd party code whitelist file."""
  return input_api.os_path.join(
    'android_webview', 'tools', 'third_party_files_whitelist.txt')


def _ProcessWhitelistedFilesList(input_api, lines):
  """Extracts file names from whitelist lines, skipping comments/blanks."""
  whitelisted_files = []
  for line in lines:
    match = input_api.re.match(r'([^#\s]+)', line)
    if match:
      whitelisted_files.append(match.group(1))
  return whitelisted_files


def LoadWhitelistedFilesList(input_api):
  """Loads and parses the 3rd party code whitelist file.

  input_api: InputAPI of presubmit scripts.

  Returns:
    The list of files.
  """
  full_file_name = input_api.os_path.join(
    input_api.change.RepositoryRoot(), _GetWhitelistFileName(input_api))
  file_data = input_api.ReadFile(full_file_name, 'rb')
  return _ProcessWhitelistedFilesList(input_api, file_data.splitlines())


def AnalyzeScanResults(input_api, whitelisted_files, offending_files):
  """Compares whitelist contents with the results of file scanning.

  input_api: InputAPI of presubmit scripts.
  whitelisted_files: Whitelisted files list.
  offending_files: Files that contain 3rd party code.

  Returns:
    A triplet of "unknown", "missing", and "stale" file lists.
    "Unknown" are files that contain 3rd party code but not whitelisted.
    "Missing" are files that are whitelisted but doesn't really exist.
    "Stale" are files that are whitelisted unnecessarily.
  """
  unknown = set(offending_files) - set(whitelisted_files)
  missing = [f for f in whitelisted_files if not input_api.os_path.isfile(f)]
  stale = set(whitelisted_files) - set(offending_files) - set(missing)
  return (list(unknown), missing, list(stale))


def _GetDeletedContents(affected_file):
  """Returns a list of all deleted lines.

  AffectedFile class from presubmit_support is lacking this functionality.
  """
  deleted_lines = []
  for line in affected_file.GenerateScmDiff().splitlines():
    # '-' marks a removed line; '--' marks the diff's file header.
    if line.startswith('-') and not line.startswith('--'):
      deleted_lines.append(line[1:])
  return deleted_lines


def _DoScanAtPresubmit(input_api, whitelisted_files, files_to_check):
  """Scans |files_to_check| and compares the violations to the whitelist."""
  # We pass empty 'known third-party' dirs list here. Since this is a patch
  # for the Chromium's src tree, it must contain properly licensed Chromium
  # code. Any third-party code must be put into a directory named
  # 'third_party', and such dirs are automatically excluded by FindFiles.
  files_to_scan = FindFiles(
    input_api, input_api.change.RepositoryRoot(), files_to_check, [])
  offending_files = FindCopyrightViolations(
    input_api, input_api.change.RepositoryRoot(), files_to_scan)
  return AnalyzeScanResults(
    input_api, whitelisted_files, offending_files)


def ScanAtPresubmit(input_api, output_api):
  """Invoked at change presubmit time. Verifies that updated non third-party
  code doesn't contain external copyrighted code.

  input_api: InputAPI of presubmit scripts.
  output_api: OutputAPI of presubmit scripts.
  """
  files_to_check = set([])
  deleted_files = set([])
  whitelist_contents_changed = False
  for f in input_api.AffectedFiles():
    if f.LocalPath() == _GetWhitelistFileName(input_api):
      whitelist_contents_changed = True
      deleted_files |= set(_ProcessWhitelistedFilesList(
        input_api, _GetDeletedContents(f)))
      continue
    if f.Action() != 'D':
      files_to_check.add(f.LocalPath())
    else:
      deleted_files.add(f.LocalPath())
  whitelisted_files = set(LoadWhitelistedFilesList(input_api))
  if not whitelist_contents_changed:
    whitelisted_files &= files_to_check | deleted_files
  else:
    # Need to re-check the entire contents of the whitelist file.
    # Also add files removed from the whitelist. If the file has indeed been
    # deleted, the scanner will not complain.
    files_to_check |= whitelisted_files | deleted_files

  (unknown_files, missing_files, stale_files) = _DoScanAtPresubmit(
    input_api, list(whitelisted_files), list(files_to_check))
  results = []
  if unknown_files:
    results.append(output_api.PresubmitError(
      'The following files contain a third-party license but are not in '
      'a listed third-party directory and are not whitelisted. You must '
      'add the following files to the whitelist file '
      '%s:' % _GetWhitelistFileName(input_api),
      sorted(unknown_files)))
  if missing_files:
    results.append(output_api.PresubmitPromptWarning(
      'The following files are whitelisted in %s, '
      'but do not exist or not files:' % _GetWhitelistFileName(input_api),
      sorted(missing_files)))
  if stale_files:
    results.append(output_api.PresubmitPromptWarning(
      'The following files are whitelisted unnecessarily. You must '
      'remove the following files from the whitelist file '
      '%s:' % _GetWhitelistFileName(input_api),
      sorted(stale_files)))
  return results