#!/usr/bin/python # Copyright 2014 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. """Checks third-party licenses for the purposes of the Android WebView build. The Android tree includes a snapshot of Chromium in order to power the system WebView. This tool checks that all code uses open-source licenses compatible with Android, and that we meet the requirements of those licenses. It can also be used to generate an Android NOTICE file for the third-party code. It makes use of src/tools/licenses.py and the README.chromium files on which it depends. It also makes use of a data file, third_party_files_whitelist.txt, which whitelists individual files which contain third-party code but which aren't in a third-party directory with a README.chromium file. """ import imp import json import multiprocessing import optparse import os import re import sys import textwrap REPOSITORY_ROOT = os.path.abspath(os.path.join( os.path.dirname(__file__), '..', '..')) # Import third_party/PRESUBMIT.py via imp to avoid importing a random # PRESUBMIT.py from $PATH, also make sure we don't generate a .pyc file. sys.dont_write_bytecode = True third_party = \ imp.load_source('PRESUBMIT', \ os.path.join(REPOSITORY_ROOT, 'third_party', 'PRESUBMIT.py')) sys.path.append(os.path.join(REPOSITORY_ROOT, 'third_party')) import jinja2 sys.path.append(os.path.join(REPOSITORY_ROOT, 'tools')) from copyright_scanner import copyright_scanner import licenses class InputApi(object): def __init__(self): self.os_path = os.path self.os_walk = os.walk self.re = re self.ReadFile = _ReadFile self.change = InputApiChange() class InputApiChange(object): def __init__(self): self.RepositoryRoot = lambda: REPOSITORY_ROOT class ScanResult(object): Ok, Warnings, Errors = range(3) # Needs to be a top-level function for multiprocessing def _FindCopyrightViolations(files_to_scan_as_string): return copyright_scanner.FindCopyrightViolations( InputApi(), REPOSITORY_ROOT, files_to_scan_as_string) def _ShardList(l, shard_len): return [l[i:i + shard_len] for i in range(0, len(l), shard_len)] def _CheckLicenseHeaders(excluded_dirs_list, whitelisted_files): """Checks that all files which are not in a listed third-party directory, and which do not use the standard Chromium license, are whitelisted. Args: excluded_dirs_list: The list of directories to exclude from scanning. whitelisted_files: The whitelist of files. Returns: ScanResult.Ok if all files with non-standard license headers are whitelisted and the whitelist contains no stale entries; ScanResult.Warnings if there are stale entries; ScanResult.Errors if new non-whitelisted entries found. """ input_api = InputApi() files_to_scan = copyright_scanner.FindFiles( input_api, REPOSITORY_ROOT, ['.'], excluded_dirs_list) sharded_files_to_scan = _ShardList(files_to_scan, 2000) pool = multiprocessing.Pool() offending_files_chunks = pool.map_async( _FindCopyrightViolations, sharded_files_to_scan).get(999999) pool.close() pool.join() # Flatten out the result offending_files = \ [item for sublist in offending_files_chunks for item in sublist] (unknown, missing, stale) = copyright_scanner.AnalyzeScanResults( input_api, whitelisted_files, offending_files) if unknown: print 'The following files contain a third-party license but are not in ' \ 'a listed third-party directory and are not whitelisted. You must ' \ 'add the following files to the whitelist.\n' \ '(Note that if the code you are adding does not actually contain ' \ 'any third-party code, it may contain the word "copyright", which ' \ 'should be masked out, e.g. by writing it as "copy-right")\n%s' % \ '\n'.join(sorted(unknown)) if missing: print 'The following files are whitelisted, but do not exist.\n%s' % \ '\n'.join(sorted(missing)) if stale: print 'The following files are whitelisted unnecessarily. You must ' \ 'remove the following files from the whitelist.\n%s' % \ '\n'.join(sorted(stale)) if unknown: code = ScanResult.Errors elif stale or missing: code = ScanResult.Warnings else: code = ScanResult.Ok problem_paths = sorted(set(unknown + missing + stale)) return (code, problem_paths) def _ReadFile(full_path, mode='rU'): """Reads a file from disk. This emulates presubmit InputApi.ReadFile func. Args: full_path: The path of the file to read. Returns: The contents of the file as a string. """ with open(full_path, mode) as f: return f.read() def _Scan(): """Checks that license meta-data is present for all third-party code and that all non third-party code doesn't contain external copyrighted code. Returns: ScanResult.Ok if everything is in order; ScanResult.Warnings if there are non-fatal problems (e.g. stale whitelist entries) ScanResult.Errors otherwise. """ third_party_dirs = licenses.FindThirdPartyDirsWithFiles(REPOSITORY_ROOT) problem_paths = [] # First, check designated third-party directories using src/tools/licenses.py. all_licenses_valid = True for path in sorted(third_party_dirs): try: licenses.ParseDir(path, REPOSITORY_ROOT) except licenses.LicenseError, e: print 'Got LicenseError "%s" while scanning %s' % (e, path) problem_paths.append(path) all_licenses_valid = False # Second, check for non-standard license text. whitelisted_files = copyright_scanner.LoadWhitelistedFilesList(InputApi()) licenses_check, more_problem_paths = _CheckLicenseHeaders( third_party_dirs, whitelisted_files) problem_paths.extend(more_problem_paths) return (licenses_check if all_licenses_valid else ScanResult.Errors, problem_paths) class TemplateEntryGenerator(object): def __init__(self): self._generate_licenses_file_list_only = False self._toc_index = 0 def SetGenerateLicensesFileListOnly(self, generate_licenses_file_list_only): self._generate_licenses_file_list_only = generate_licenses_file_list_only def _ReadFileGuessEncoding(self, name): if self._generate_licenses_file_list_only: return '' contents = '' with open(name, 'rb') as input_file: contents = input_file.read() try: return contents.decode('utf8') except UnicodeDecodeError: pass # If it's not UTF-8, it must be CP-1252. Fail otherwise. return contents.decode('cp1252') def MetadataToTemplateEntry(self, metadata): self._toc_index += 1 return { 'name': metadata['Name'], 'url': metadata['URL'], 'license_file': metadata['License File'], 'license': self._ReadFileGuessEncoding(metadata['License File']), 'toc_href': 'entry' + str(self._toc_index), } def GenerateNoticeFile(generate_licenses_file_list_only=False): """Generates the contents of an Android NOTICE file for the third-party code. This is used by the snapshot tool. Returns: The contents of the NOTICE file. """ generator = TemplateEntryGenerator() generator.SetGenerateLicensesFileListOnly(generate_licenses_file_list_only) # Start from Chromium's LICENSE file entries = [generator.MetadataToTemplateEntry({ 'Name': 'The Chromium Project', 'URL': 'http://www.chromium.org', 'License File': os.path.join(REPOSITORY_ROOT, 'LICENSE') }) ] third_party_dirs = licenses.FindThirdPartyDirsWithFiles(REPOSITORY_ROOT) # We provide attribution for all third-party directories. # TODO(mnaganov): Limit this to only code used by the WebView binary. for directory in sorted(third_party_dirs): try: metadata = licenses.ParseDir(directory, REPOSITORY_ROOT, require_license_file=False) except licenses.LicenseError: # Since this code is called during project files generation, # we don't want to break the it. But we assume that release # WebView apks are built using checkouts that pass # 'webview_licenses.py scan' check, thus they don't contain # projects with non-compatible licenses. continue license_file = metadata['License File'] if license_file and license_file != licenses.NOT_SHIPPED: entries.append(generator.MetadataToTemplateEntry(metadata)) if generate_licenses_file_list_only: return [entry['license_file'] for entry in entries] else: env = jinja2.Environment( loader=jinja2.FileSystemLoader(os.path.dirname(__file__)), extensions=['jinja2.ext.autoescape']) template = env.get_template('licenses_notice.tmpl') return template.render({ 'entries': entries }).encode('utf8') def main(): class FormatterWithNewLines(optparse.IndentedHelpFormatter): def format_description(self, description): paras = description.split('\n') formatted_paras = [textwrap.fill(para, self.width) for para in paras] return '\n'.join(formatted_paras) + '\n' parser = optparse.OptionParser(formatter=FormatterWithNewLines(), usage='%prog [options]') parser.add_option('--json', help='Path to JSON output file') parser.description = (__doc__ + '\nCommands:\n' ' scan Check licenses.\n' ' notice_deps Generate the list of dependencies for ' 'Android NOTICE file.\n' ' notice [file] Generate Android NOTICE file on ' 'stdout or into |file|.\n' ' display_copyrights Display autorship on the files' ' using names provided via stdin.\n') (options, args) = parser.parse_args() if len(args) < 1: parser.print_help() return ScanResult.Errors if args[0] == 'scan': scan_result, problem_paths = _Scan() if scan_result == ScanResult.Ok: print 'OK!' if options.json: with open(options.json, 'w') as f: json.dump(problem_paths, f) return scan_result elif args[0] == 'notice_deps': # 'set' is used to eliminate duplicate references to the same license file. print ' '.join( sorted(set(GenerateNoticeFile(generate_licenses_file_list_only=True)))) return ScanResult.Ok elif args[0] == 'gn_notice_deps': # generate list for gn. # 'set' is used to eliminate duplicate references to the same license file. gn_file_list = ['"' + f + '"' for f in sorted(set(GenerateNoticeFile(generate_licenses_file_list_only=True)))] print '[%s] ' % ','.join(gn_file_list) return ScanResult.Ok elif args[0] == 'notice': notice_file_contents = GenerateNoticeFile() if len(args) == 1: print notice_file_contents else: with open(args[1], 'w') as output_file: output_file.write(notice_file_contents) return ScanResult.Ok elif args[0] == 'display_copyrights': files = sys.stdin.read().splitlines() for f, c in \ zip(files, copyright_scanner.FindCopyrights(InputApi(), '.', files)): print f, '\t', ' / '.join(sorted(c)) return ScanResult.Ok parser.print_help() return ScanResult.Errors if __name__ == '__main__': sys.exit(main())