path: root/tools/copyright_scanner/copyright_scanner.py
# Copyright 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Utilities for scanning source files to determine code authorship.
"""

import itertools

def ForwardSlashesToOsPathSeps(input_api, path):
  """Converts forward slashes ('/') in the input path to OS-specific
  path separators. Used when the paths come from outside and are using
  UNIX path separators. Only works for relative paths!
  Args:
    input_api: InputAPI, as in presubmit scripts.
    path: The path to convert.
  Returns:
    Converted path.
  """
  return input_api.os_path.join(*path.split('/'))
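
# A quick illustration of the conversion above (a sketch, not used by the
# scanner itself; assumes input_api.os_path is the standard os.path module,
# as it is in presubmit scripts):
#
#   ForwardSlashesToOsPathSeps(input_api, 'tools/copyright_scanner/a.py')
#   # -> 'tools\\copyright_scanner\\a.py' on Windows,
#   #    'tools/copyright_scanner/a.py' on POSIX systems.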

def FindFiles(input_api, root_dir, start_paths_list, excluded_dirs_list):
  """Similar to UNIX utility find(1), searches for files in the directories.
  Automatically leaves out only source code files and excludes third_party
  directories.
  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    start_paths_list: The list of paths to start search from. Each path can
      be a file or a directory.
    excluded_dirs_list: The list of directories to skip.
  Returns:
    The list of source code files found, relative to |root_dir|.
  """
  excluded_dirs_list = [d for d in excluded_dirs_list if not 'third_party' in d]
  # Using a common pattern for third_party dirs makes the ignore regexp shorter
  excluded_dirs_list.append('third_party')

  path_join = input_api.os_path.join
  EXTRA_EXCLUDED_DIRS = [
    # VCS dirs
    path_join('.git'),
    path_join('.svn'),
    # Build output
    path_join('out', 'Debug'),
    path_join('out', 'Release'),
    # 'Copyright' appears in license agreements
    path_join('chrome', 'app', 'resources'),
    # Quickoffice js files from internal src used on buildbots.
    # crbug.com/350472.
    path_join('chrome', 'browser', 'resources', 'chromeos', 'quickoffice'),
    # This is a test output directory
    path_join('chrome', 'tools', 'test', 'reference_build'),
    # Blink-style copyright headers.
    path_join('content', 'shell', 'renderer', 'test_runner'),
    # Blink-style copyright headers.
    path_join('content', 'shell', 'tools', 'plugin'),
    # This is a tests directory that doesn't exist in the snapshot.
    path_join('content', 'test', 'data'),
    # This is a tests directory that doesn't exist in the shipped product.
    path_join('gin', 'test'),
    # This is a test output directory
    path_join('data', 'dom_perf'),
    # This is a tests directory that doesn't exist in the shipped product.
    path_join('tools', 'perf', 'page_sets'),
    path_join('tools', 'perf', 'page_sets', 'tough_animation_cases'),
    # Histogram tools; don't exist in the snapshot.
    path_join('tools', 'histograms'),
    # Swarming tools; don't exist in the snapshot.
    path_join('tools', 'swarming_client'),
    # Don't check downloaded goma client binaries.
    path_join('build', 'goma', 'client'),
    # Ignore sysroots.
    path_join('build', 'linux', 'debian_wheezy_amd64-sysroot'),
    path_join('build', 'linux', 'debian_wheezy_arm-sysroot'),
    path_join('build', 'linux', 'debian_wheezy_mips-sysroot'),
    path_join('build', 'linux', 'debian_wheezy_i386-sysroot'),
    # Old location (TODO(sbc): Remove this once it no longer exists on any bots)
    path_join('chrome', 'installer', 'linux', 'debian_wheezy_arm-sysroot'),
    # Data is not part of open source chromium, but is included on some bots.
    path_join('data'),
    # This is not part of open source chromium, but is included on some bots.
    path_join('skia', 'tools', 'clusterfuzz-data'),
    # Not shipped; relates only to Chrome for Android, not to WebView.
    path_join('clank'),
    # Internal-only repository.
    path_join('remoting', 'android', 'internal'),
  ]
  excluded_dirs_list.extend(EXTRA_EXCLUDED_DIRS)

  # Surround the directory names with OS path separators.
  dirs_blacklist = [path_join('.', d, '')[1:] for d in excluded_dirs_list if d]
  def IsBlacklistedDir(d):
    for item in dirs_blacklist:
      if item in d:
        return True
    return False

  files_whitelist_re = input_api.re.compile(
    r'\.(asm|c(c|pp|xx)?|h(h|pp|xx)?|p(l|m)|xs|sh|php|py(|x)'
    '|rb|idl|java|el|sc(i|e)|cs|pas|inc|js|pac|html|dtd|xsl|mod|mm?'
    '|tex|mli?)$')
  files = []

  base_path_len = len(root_dir)
  for path in start_paths_list:
    full_path = path_join(root_dir, path)
    if input_api.os_path.isfile(full_path):
      if files_whitelist_re.search(path) and \
          not IsBlacklistedDir(full_path[base_path_len:]):  # Keep '/' prefix.
        files.append(path)
    else:
      for dirpath, dirnames, filenames in input_api.os_walk(full_path):
        # Remove excluded subdirs for faster scanning.
        for item in dirnames[:]:
          if IsBlacklistedDir(
              path_join(dirpath, item)[base_path_len + 1:]):
            dirnames.remove(item)
        for filename in filenames:
          filepath = \
              path_join(dirpath, filename)[base_path_len + 1:]
          if files_whitelist_re.search(filepath) and \
              not IsBlacklistedDir(filepath):
            files.append(filepath)
  return files
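
# Illustrative call (a sketch with hypothetical paths, not part of the
# presubmit flow):
#
#   files = FindFiles(input_api, '/path/to/src', ['chrome', 'net'], [])
#   # 'files' then holds root-relative paths such as 'chrome/app/foo.cc';
#   # anything under third_party/, .git/, out/Debug, out/Release, etc. is
#   # skipped, and only extensions matching files_whitelist_re are kept.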


class _GeneratedFilesDetector(object):
  GENERATED_FILE = 'GENERATED FILE'
  NO_COPYRIGHT = '*No copyright*'

  def __init__(self, input_api):
    self.python_multiline_string_double_re = \
      input_api.re.compile(r'"""[^"]*(?:"""|$)', flags=input_api.re.MULTILINE)
    self.python_multiline_string_single_re = \
      input_api.re.compile(r"'''[^']*(?:'''|$)", flags=input_api.re.MULTILINE)
    self.automatically_generated_re = input_api.re.compile(
      r'(All changes made in this file will be lost'
      r'|DO NOT (EDIT|delete this file)'
      r'|Generated (at|automatically|data)'
      r'|Automatically generated'
      r'|\Wgenerated\s+(?:\w+\s+)*file\W)', flags=input_api.re.IGNORECASE)

  def IsGeneratedFile(self, header):
    header = header.upper()
    if '"""' in header:
      header = self.python_multiline_string_double_re.sub('', header)
    if "'''" in header:
      header = self.python_multiline_string_single_re.sub('', header)
    # First do a simple substring lookup to save time.
    if 'ALL CHANGES MADE IN THIS FILE WILL BE LOST' in header:
      return True
    if 'DO NOT EDIT' in header or 'DO NOT DELETE' in header or \
        'GENERATED' in header:
      return self.automatically_generated_re.search(header)
    return False
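
# Illustrative checks against hand-written headers (not from real files):
#
#   detector = _GeneratedFilesDetector(input_api)
#   detector.IsGeneratedFile('// DO NOT EDIT. Generated by some_compiler.')
#   # -> truthy: the fast 'DO NOT EDIT' substring check hands off to
#   #    automatically_generated_re, which matches.
#   detector.IsGeneratedFile('// Copyright 2014 The Chromium Authors.')
#   # -> False: none of the generated-file markers are present.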


class _CopyrightsScanner(object):
  @staticmethod
  def StaticInit(input_api):
    _CopyrightsScanner._c_comment_re = \
      input_api.re.compile(r'''"[^"\\]*(?:\\.[^"\\]*)*"''')
    _CopyrightsScanner._copyright_indicator = \
      r'(?:copyright|copr\.|\xc2\xa9|\(c\))'
    _CopyrightsScanner._full_copyright_indicator_re = input_api.re.compile(
      r'(?:\W|^)' + _CopyrightsScanner._copyright_indicator + \
      r'(?::\s*|\s+)(\w.*)$', input_api.re.IGNORECASE)
    _CopyrightsScanner._copyright_disindicator_re = input_api.re.compile(
      r'\s*\b(?:info(?:rmation)?|notice|and|or)\b', input_api.re.IGNORECASE)

  def __init__(self, input_api):
    self.max_line_numbers_proximity = 3
    self.last_a_item_line_number = -200
    self.last_b_item_line_number = -100
    self.re = input_api.re

  def _CloseLineNumbers(self, a, b):
    return 0 <= a - b <= self.max_line_numbers_proximity

  def MatchLine(self, line_number, line):
    if '"' in line:
      line = _CopyrightsScanner._c_comment_re.sub('', line)
    upcase_line = line.upper()
    # Record the last occurrences of '(a)' and '(b)' in C++ comments.
    # This is to filter out '(c)' used as a list item inside C++ comments.
    # E.g. "// blah-blah (a) blah\n// blah-blah (b) and (c) blah"
    cpp_comment_idx = upcase_line.find('//')
    if cpp_comment_idx != -1:
      if upcase_line.find('(A)') > cpp_comment_idx:
        self.last_a_item_line_number = line_number
      if upcase_line.find('(B)') > cpp_comment_idx:
        self.last_b_item_line_number = line_number
    # Fast bailout, uses the same patterns as _copyright_indicator regexp.
    if not 'COPYRIGHT' in upcase_line and not 'COPR.' in upcase_line \
        and not '\xc2\xa9' in upcase_line:
      c_item_index = upcase_line.find('(C)')
      if c_item_index == -1:
        return None
      if c_item_index > cpp_comment_idx and \
          self._CloseLineNumbers(line_number,
                                 self.last_b_item_line_number) and \
          self._CloseLineNumbers(self.last_b_item_line_number,
                                 self.last_a_item_line_number):
        return None
    copyr = None
    m = _CopyrightsScanner._full_copyright_indicator_re.search(line)
    if m and \
        not _CopyrightsScanner._copyright_disindicator_re.match(m.group(1)):
      copyr = m.group(0)
      # Prettify the authorship string.
      copyr = self.re.sub(r'([,.])?\s*$', '', copyr)
      copyr = self.re.sub(
        _CopyrightsScanner._copyright_indicator, '', copyr, \
        flags=self.re.IGNORECASE)
      copyr = self.re.sub(r'^\s+', '', copyr)
      copyr = self.re.sub(r'\s{2,}', ' ', copyr)
      copyr = self.re.sub(r'\\@', '@', copyr)
    return copyr
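
# Illustrative single-line scan (a sketch with a made-up copyright line):
#
#   _CopyrightsScanner.StaticInit(input_api)
#   scanner = _CopyrightsScanner(input_api)
#   scanner.MatchLine(1, '// Copyright (c) 2012 Acme Corp. All rights reserved.')
#   # -> roughly '2012 Acme Corp. All rights reserved': the copyright
#   #    indicator itself is stripped and whitespace is normalized.
#   scanner.MatchLine(2, '// See the copyright notice in LICENSE.')
#   # -> None: 'notice' right after the indicator is treated as a
#   #    dis-indicator, so the line is not counted as an authorship claim.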


def FindCopyrights(input_api, root_dir, files_to_scan):
  """Determines code autorship, and finds generated files.
  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    The list of copyrights associated with each of the given files.
    If a file is generated, the corresponding list consists of a single
    entry -- the GENERATED_FILE string. If a file has no copyright info,
    the corresponding list contains the NO_COPYRIGHT string.
  """
  generated_files_detector = _GeneratedFilesDetector(input_api)
  _CopyrightsScanner.StaticInit(input_api)
  copyrights = []
  for file_name in files_to_scan:
    linenum = 0
    header = []
    file_copyrights = []
    scanner = _CopyrightsScanner(input_api)
    contents = input_api.ReadFile(
      input_api.os_path.join(root_dir, file_name), 'r')
    for l in contents.split('\n'):
      linenum += 1
      if linenum <= 25:
        header.append(l)
      c = scanner.MatchLine(linenum, l)
      if c:
        file_copyrights.append(c)
    if generated_files_detector.IsGeneratedFile('\n'.join(header)):
      copyrights.append([_GeneratedFilesDetector.GENERATED_FILE])
    elif file_copyrights:
      copyrights.append(file_copyrights)
    else:
      copyrights.append([_GeneratedFilesDetector.NO_COPYRIGHT])
  return copyrights
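
# Illustrative result shape (hypothetical file names and headers):
#
#   FindCopyrights(input_api, root_dir, ['a.cc', 'gen.cc', 'empty.cc'])
#   # -> [['2014 The Chromium Authors. All rights reserved'],
#   #     ['GENERATED FILE'],
#   #     ['*No copyright*']]
#   # i.e. one list per input file, in the same order as files_to_scan.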


def FindCopyrightViolations(input_api, root_dir, files_to_scan):
  """Looks for files that are not belong exlusively to the Chromium Authors.
  Args:
    input_api: InputAPI, as in presubmit scripts.
    root_dir: The root directory, to which all other paths are relative.
    files_to_scan: The list of file names to scan.
  Returns:
    The list of file names that contain non-Chromium copyrights.
  """
  copyrights = FindCopyrights(input_api, root_dir, files_to_scan)
  offending_files = []
  allowed_copyrights_re = input_api.re.compile(
    r'^(?:20[0-9][0-9](?:-20[0-9][0-9])? The Chromium Authors\. '
    'All rights reserved.*)$')
  for f, cs in itertools.izip(files_to_scan, copyrights):
    if cs[0] == _GeneratedFilesDetector.GENERATED_FILE or \
       cs[0] == _GeneratedFilesDetector.NO_COPYRIGHT:
      continue
    for c in cs:
      if not allowed_copyrights_re.match(c):
        offending_files.append(input_api.os_path.normpath(f))
        break
  return offending_files


def _GetWhitelistFileName(input_api):
  return input_api.os_path.join(
    'tools', 'copyright_scanner', 'third_party_files_whitelist.txt')

def _ProcessWhitelistedFilesList(input_api, lines):
  whitelisted_files = []
  for line in lines:
    match = input_api.re.match(r'([^#\s]+)', line)
    if match:
      whitelisted_files.append(
        ForwardSlashesToOsPathSeps(input_api, match.group(1)))
  return whitelisted_files


def LoadWhitelistedFilesList(input_api):
  """Loads and parses the 3rd party code whitelist file.
  Args:
    input_api: InputAPI of presubmit scripts.
  Returns:
    The list of files.
  """
  full_file_name = input_api.os_path.join(
    input_api.change.RepositoryRoot(), _GetWhitelistFileName(input_api))
  file_data = input_api.ReadFile(full_file_name, 'rb')
  return _ProcessWhitelistedFilesList(input_api, file_data.splitlines())


def AnalyzeScanResults(input_api, whitelisted_files, offending_files):
  """Compares whitelist contents with the results of file scanning.
    input_api: InputAPI of presubmit scripts.
    whitelisted_files: Whitelisted files list.
    offending_files: Files that contain 3rd party code.
  Returns:
    A triplet of "unknown", "missing", and "stale" file lists.
    "Unknown" are files that contain 3rd party code but not whitelisted.
    "Missing" are files that are whitelisted but doesn't really exist.
    "Stale" are files that are whitelisted unnecessarily.
  """
  unknown = set(offending_files) - set(whitelisted_files)
  missing = [f for f in whitelisted_files if not input_api.os_path.isfile(
    input_api.os_path.join(input_api.change.RepositoryRoot(), f))]
  stale = set(whitelisted_files) - set(offending_files) - set(missing)
  return (list(unknown), missing, list(stale))
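
# Illustrative set arithmetic (hypothetical file names; assume 'gone.cc' does
# not exist on disk while the other whitelisted files do):
#
#   AnalyzeScanResults(input_api,
#                      whitelisted_files=['a.cc', 'gone.cc', 'ok.cc'],
#                      offending_files=['a.cc', 'new.cc'])
#   # -> unknown = ['new.cc']   (offending but not whitelisted)
#   #    missing = ['gone.cc']  (whitelisted but absent on disk)
#   #    stale   = ['ok.cc']    (whitelisted, present, but no longer offending)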


def _GetDeletedContents(affected_file):
  """Returns a list of all deleted lines.
  The AffectedFile class from presubmit_support lacks this functionality.
  """
  deleted_lines = []
  for line in affected_file.GenerateScmDiff().splitlines():
    if line.startswith('-') and not line.startswith('--'):
      deleted_lines.append(line[1:])
  return deleted_lines
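
# Illustrative behaviour on a hand-written diff fragment: for an SCM diff
# containing the lines
#
#   --- a/foo.cc
#   +++ b/foo.cc
#   -old line
#   +new line
#
# the function returns ['old line']; the '--- a/foo.cc' header is skipped
# because of the startswith('--') check.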

def _DoScanAtPresubmit(input_api, whitelisted_files, files_to_check):
  # We pass an empty 'known third-party' dirs list here. Since this is a patch
  # to Chromium's src tree, it must contain properly licensed Chromium code.
  # Any third-party code must be placed in a directory named 'third_party',
  # and such dirs are automatically excluded by FindFiles.
  files_to_scan = FindFiles(
    input_api, input_api.change.RepositoryRoot(), files_to_check, [])
  offending_files = FindCopyrightViolations(
    input_api, input_api.change.RepositoryRoot(), files_to_scan)
  return AnalyzeScanResults(
    input_api, whitelisted_files, offending_files)

def ScanAtPresubmit(input_api, output_api):
  """Invoked at change presubmit time. Verifies that updated non third-party
  code doesn't contain external copyrighted code.
    input_api: InputAPI of presubmit scripts.
    output_api: OutputAPI of presubmit scripts.
  """
  files_to_check = set([])
  deleted_files = set([])
  whitelist_contents_changed = False
  for f in input_api.AffectedFiles():
    if f.LocalPath() == _GetWhitelistFileName(input_api):
      whitelist_contents_changed = True
      deleted_files |= set(_ProcessWhitelistedFilesList(
        input_api, _GetDeletedContents(f)))
      continue
    if f.Action() != 'D':
      files_to_check.add(f.LocalPath())
    else:
      deleted_files.add(f.LocalPath())
  whitelisted_files = set(LoadWhitelistedFilesList(input_api))
  if not whitelist_contents_changed:
    whitelisted_files &= files_to_check | deleted_files
  else:
    # Need to re-check the entire contents of the whitelist file.
    # Also add files removed from the whitelist. If the file has indeed been
    # deleted, the scanner will not complain.
    files_to_check |= whitelisted_files | deleted_files

  (unknown_files, missing_files, stale_files) = _DoScanAtPresubmit(
    input_api, list(whitelisted_files), list(files_to_check))
  results = []
  if unknown_files:
    results.append(output_api.PresubmitError(
        'The following files contain a third-party license but are not in ' \
        'a listed third-party directory and are not whitelisted. You must ' \
        'add the following files to the whitelist file %s\n' \
        '(Note that if the code you are adding does not actually contain ' \
        'any third-party code, it may contain the word "copyright", which ' \
        'should be masked out, e.g. by writing it as "copy-right"):' \
        '' % _GetWhitelistFileName(input_api),
        sorted(unknown_files)))
  if missing_files:
    results.append(output_api.PresubmitPromptWarning(
        'The following files are whitelisted in %s, ' \
        'but do not exist or are not files:' % _GetWhitelistFileName(input_api),
        sorted(missing_files)))
  if stale_files:
    results.append(output_api.PresubmitPromptWarning(
        'The following files are whitelisted unnecessarily. You must ' \
        'remove the following files from the whitelist file ' \
        '%s:' % _GetWhitelistFileName(input_api),
        sorted(stale_files)))
  return results