#!/usr/bin/env python
# coding=utf-8
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Runs strace or dtrace on a test and processes the logs to extract the
dependencies from the source tree.

Automatically extracts directories where all the files are used to make the
dependencies list more compact.
"""

import codecs
import csv
import logging
import optparse
import os
import posixpath
import re
import subprocess
import sys

## OS-specific imports

if sys.platform == 'win32':
  from ctypes.wintypes import create_unicode_buffer
  from ctypes.wintypes import windll, FormatError  # pylint: disable=E0611
  from ctypes.wintypes import GetLastError  # pylint: disable=E0611
elif sys.platform == 'darwin':
  import Carbon.File  # pylint: disable=F0401

BASE_DIR = os.path.dirname(os.path.abspath(__file__))
ROOT_DIR = os.path.dirname(os.path.dirname(BASE_DIR))

KEY_TRACKED = 'isolate_dependency_tracked'
KEY_UNTRACKED = 'isolate_dependency_untracked'


## OS-specific functions

if sys.platform == 'win32':
  def QueryDosDevice(drive_letter):
    """Returns the Windows 'native' path for a DOS drive letter."""
    assert re.match(r'^[a-zA-Z]:$', drive_letter), drive_letter
    # Guesswork. QueryDosDeviceW never returns the required number of bytes.
    chars = 1024
    drive_letter = unicode(drive_letter)
    p = create_unicode_buffer(chars)
    if 0 == windll.kernel32.QueryDosDeviceW(drive_letter, p, chars):
      err = GetLastError()
      if err:
        # pylint: disable=E0602
        raise WindowsError(
            err,
            'QueryDosDevice(%s): %s (%d)' % (
              str(drive_letter), FormatError(err), err))
    return p.value


  def GetShortPathName(long_path):
    """Returns the Windows short path equivalent for a 'long' path."""
    long_path = unicode(long_path)
    # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
    # not enforced.
    if os.path.isabs(long_path) and not long_path.startswith('\\\\?\\'):
      long_path = '\\\\?\\' + long_path
    chars = windll.kernel32.GetShortPathNameW(long_path, None, 0)
    if chars:
      p = create_unicode_buffer(chars)
      if windll.kernel32.GetShortPathNameW(long_path, p, chars):
        return p.value

    err = GetLastError()
    if err:
      # pylint: disable=E0602
      raise WindowsError(
          err,
          'GetShortPathName(%s): %s (%d)' % (
            str(long_path), FormatError(err), err))


  def GetLongPathName(short_path):
    """Returns the Windows long path equivalent for a 'short' path."""
    short_path = unicode(short_path)
    # Adds '\\\\?\\' when given an absolute path so the MAX_PATH (260) limit is
    # not enforced.
    if os.path.isabs(short_path) and not short_path.startswith('\\\\?\\'):
      short_path = '\\\\?\\' + short_path
    chars = windll.kernel32.GetLongPathNameW(short_path, None, 0)
    if chars:
      p = create_unicode_buffer(chars)
      if windll.kernel32.GetLongPathNameW(short_path, p, chars):
        return p.value

    err = GetLastError()
    if err:
      # pylint: disable=E0602
      raise WindowsError(
          err,
          'GetLongPathName(%s): %s (%d)' % (
            str(short_path), FormatError(err), err))


  def get_current_encoding():
    """Returns the 'ANSI' code page associated to the process."""
    return 'cp%d' % int(windll.kernel32.GetACP())


  class DosDriveMap(object):
    """Maps \Device\HarddiskVolumeN to N: on Windows."""
    # Keep one global cache.
    _MAPPING = {}

    def __init__(self):
      if not self._MAPPING:
        # This is related to UNC resolver on windows. Ignore that.
        self._MAPPING['\\Device\\Mup'] = None

        for letter in (chr(l) for l in xrange(ord('C'), ord('Z')+1)):
          try:
            letter = '%s:' % letter
            mapped = QueryDosDevice(letter)
            # It can happen. Assert until we see it happen in the wild. In
            # practice, prefer the lower drive letter.
            assert mapped not in self._MAPPING
            if mapped not in self._MAPPING:
              self._MAPPING[mapped] = letter
          except WindowsError:  # pylint: disable=E0602
            pass

    def to_dos(self, path):
      """Converts a native NT path to DOS path."""
      m = re.match(r'(^\\Device\\[a-zA-Z0-9]+)(\\.*)?$', path)
      assert m, path
      if not m.group(1) in self._MAPPING:
        # Unmapped partitions may be accessed by windows for the
        # fun of it while the test is running. Discard these.
        return None
      drive = self._MAPPING[m.group(1)]
      if not drive or not m.group(2):
        return drive
      return drive + m.group(2)


def get_native_path_case(root, relative_path):
  """Returns the native path case."""
  if sys.platform == 'win32':
    # Windows used to have an option to turn on case sensitivity on non Win32
    # subsystem but that's out of scope here and isn't supported anymore.
    # First process root.
    if root:
      root = GetLongPathName(GetShortPathName(root)) + os.path.sep
    path = os.path.join(root, relative_path) if root else relative_path
    # Go figure why GetShortPathName() is needed.
    return GetLongPathName(GetShortPathName(path))[len(root):]
  elif sys.platform == 'darwin':
    # Technically, it's only HFS+ on OSX that is case insensitive. It's
    # the default setting on HFS+ but can be changed.
    root_ref, _ = Carbon.File.FSPathMakeRef(root)
    rel_ref, _ = Carbon.File.FSPathMakeRef(os.path.join(root, relative_path))
    return rel_ref.FSRefMakePath()[len(root_ref.FSRefMakePath())+1:]
  else:
    # Give up on cygwin, as GetLongPathName() can't be called.
    return relative_path


def get_flavor():
  """Returns the system default flavor. Copied from gyp/pylib/gyp/common.py."""
  flavors = {
    'cygwin': 'win',
    'win32': 'win',
    'darwin': 'mac',
    'sunos5': 'solaris',
    'freebsd7': 'freebsd',
    'freebsd8': 'freebsd',
  }
  return flavors.get(sys.platform, 'linux')


def isEnabledFor(level):
  return logging.getLogger().isEnabledFor(level)


def fix_python_path(cmd):
  """Returns the fixed command line to call the right python executable."""
  out = cmd[:]
  if out[0] == 'python':
    out[0] = sys.executable
  elif out[0].endswith('.py'):
    out.insert(0, sys.executable)
  return out


def posix_relpath(path, root):
  """posix.relpath() that keeps trailing slash."""
  out = posixpath.relpath(path, root)
  if path.endswith('/'):
    out += '/'
  return out


def cleanup_path(x):
  """Cleans up a relative path. Converts any os.path.sep to '/' on Windows."""
  if x:
    x = x.rstrip(os.path.sep).replace(os.path.sep, '/')
  if x == '.':
    x = ''
  if x:
    x += '/'
  return x


class Strace(object):
  """strace implies linux."""
  IGNORED = (
    '/bin',
    '/dev',
    '/etc',
    '/lib',
    '/proc',
    '/sys',
    '/tmp',
    '/usr',
    '/var',
  )

  class Context(object):
    """Processes a strace log line and keeps the list of existent and non
    existent files accessed.

    Ignores directories.
    """
    # This is the most common format. pid function(args) = result
    RE_HEADER = re.compile(r'^(\d+)\s+([^\(]+)\((.+?)\)\s+= (.+)$')
    # An interrupted function call, only grab the minimal header.
    RE_UNFINISHED = re.compile(r'^(\d+)\s+([^\(]+).*$')
    UNFINISHED = ' <unfinished ...>'
    # A resumed function call.
    RE_RESUMED = re.compile(r'^(\d+)\s+<\.\.\. ([^ ]+) resumed> (.+)$')
    # A process received a signal.
    RE_SIGNAL = re.compile(r'^\d+\s+--- SIG[A-Z]+ .+ ---')
    # A process didn't handle a signal.
    RE_KILLED = re.compile(r'^(\d+)\s+\+\+\+ killed by ([A-Z]+) \+\+\+$')
    # A call was canceled.
    RE_UNAVAILABLE = re.compile(r'\)\s+= \? <unavailable>$')
    # Arguments parsing.
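    # Illustrative argument strings (hypothetical, not from a real log) that
    # the regexes below are meant to match:
    #   chdir:  "/home/user/src"
    #   execve: "/usr/bin/python", ["python", "foo.py"], [/* 13 vars */]
    #   open:   "/etc/ld.so.cache", O_RDONLY
    #   rename: "/tmp/a", "/tmp/b"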
    RE_CHDIR = re.compile(r'^\"(.+?)\"$')
    RE_EXECVE = re.compile(r'^\"(.+?)\", \[.+?\], \[.+?\]$')
    RE_OPEN2 = re.compile(r'^\"(.*?)\", ([A-Z\_\|]+)$')
    RE_OPEN3 = re.compile(r'^\"(.*?)\", ([A-Z\_\|]+), (\d+)$')
    RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$')

    def __init__(self, blacklist):
      self._cwd = {}
      self.blacklist = blacklist
      self.files = set()
      self.non_existent = set()
      # Key is a tuple(pid, function name)
      self._pending_calls = {}
      self._line_number = 0

    @classmethod
    def traces(cls):
      prefix = 'handle_'
      return [i[len(prefix):] for i in dir(cls) if i.startswith(prefix)]

    def on_line(self, line):
      self._line_number += 1
      line = line.strip()
      if self.RE_SIGNAL.match(line):
        # Ignore signals.
        return

      m = self.RE_KILLED.match(line)
      if m:
        self.handle_exit_group(int(m.group(1)), m.group(2), None, None)
        return

      if line.endswith(self.UNFINISHED):
        line = line[:-len(self.UNFINISHED)]
        m = self.RE_UNFINISHED.match(line)
        assert m, '%d: %s' % (self._line_number, line)
        self._pending_calls[(m.group(1), m.group(2))] = line
        return

      m = self.RE_UNAVAILABLE.match(line)
      if m:
        # This usually means a process was killed and a pending call was
        # canceled.
        # TODO(maruel): Look up the last exit_group() trace just above and make
        # sure any self._pending_calls[(pid, anything)] is properly flushed.
        return

      m = self.RE_RESUMED.match(line)
      if m:
        pending = self._pending_calls.pop((m.group(1), m.group(2)))
        # Reconstruct the line.
        line = pending + m.group(3)

      m = self.RE_HEADER.match(line)
      assert m, (self._line_number, line)
      if m.group(2) == '????':
        return
      handler = getattr(self, 'handle_%s' % m.group(2), None)
      assert handler, (self._line_number, line)
      try:
        return handler(
            int(m.group(1)),
            m.group(2),
            m.group(3),
            m.group(4))
      except Exception:
        print >> sys.stderr, (self._line_number, line)
        raise

    def handle_chdir(self, pid, _function, args, result):
      """Updates cwd."""
      if result.startswith('0'):
        cwd = self.RE_CHDIR.match(args).group(1)
        if not cwd.startswith('/'):
          cwd2 = os.path.join(self._cwd[pid], cwd)
          logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2))
          self._cwd[pid] = cwd2
        else:
          logging.debug('handle_chdir(%d, %s)' % (pid, cwd))
          self._cwd[pid] = cwd
      else:
        assert False, 'Unexpected fail: %s' % result

    def handle_clone(self, pid, _function, _args, result):
      """Transfers cwd."""
      if result == '? ERESTARTNOINTR (To be restarted)':
        return
      self._cwd[int(result)] = self._cwd[pid]

    def handle_close(self, pid, _function, _args, _result):
      pass

    def handle_execve(self, pid, _function, args, result):
      self._handle_file(pid, self.RE_EXECVE.match(args).group(1), result)

    def handle_exit_group(self, pid, _function, _args, _result):
      """Removes cwd."""
      del self._cwd[pid]

    @staticmethod
    def handle_fork(_pid, _function, args, result):
      assert False, (args, result)

    def handle_open(self, pid, _function, args, result):
      args = (self.RE_OPEN3.match(args) or self.RE_OPEN2.match(args)).groups()
      if 'O_DIRECTORY' in args[1]:
        return
      self._handle_file(pid, args[0], result)

    def handle_rename(self, pid, _function, args, result):
      args = self.RE_RENAME.match(args).groups()
      self._handle_file(pid, args[0], result)
      self._handle_file(pid, args[1], result)

    @staticmethod
    def handle_stat64(_pid, _function, args, result):
      assert False, (args, result)

    @staticmethod
    def handle_vfork(_pid, _function, args, result):
      assert False, (args, result)

    def _handle_file(self, pid, filepath, result):
      assert pid in self._cwd
      if result.startswith('-1'):
        return
      old_filepath = filepath
      if not filepath.startswith('/'):
        filepath = os.path.join(self._cwd[pid], filepath)
      if self.blacklist(filepath):
        return
      if old_filepath != filepath:
        logging.debug(
            '_handle_file(%d, %s) -> %s' % (pid, old_filepath, filepath))
      else:
        logging.debug('_handle_file(%d, %s)' % (pid, filepath))
      if filepath not in self.files and filepath not in self.non_existent:
        if os.path.isfile(filepath):
          self.files.add(filepath)
        else:
          self.non_existent.add(filepath)

  @staticmethod
  def clean_trace(logname):
    """Deletes the old log."""
    if os.path.isfile(logname):
      os.remove(logname)

  @classmethod
  def gen_trace(cls, cmd, cwd, logname, output):
    """Runs strace on an executable."""
    logging.info('gen_trace(%s, %s, %s, %s)' % (cmd, cwd, logname, output))
    stdout = stderr = None
    if output:
      stdout = subprocess.PIPE
      stderr = subprocess.STDOUT
    traces = ','.join(cls.Context.traces())
    trace_cmd = ['strace', '-f', '-e', 'trace=%s' % traces, '-o', logname]
    child = subprocess.Popen(
        trace_cmd + cmd,
        cwd=cwd,
        stdin=subprocess.PIPE,
        stdout=stdout,
        stderr=stderr)
    out = child.communicate()[0]
    # Once it's done, inject a chdir() call to cwd to be able to reconstruct
    # the full paths.
    # TODO(maruel): cwd should be saved at each process creation, so forks need
    # to be traced properly.
    if os.path.isfile(logname):
      with open(logname) as f:
        content = f.read()
      with open(logname, 'w') as f:
        pid = content.split(' ', 1)[0]
        f.write('%s chdir("%s") = 0\n' % (pid, cwd))
        f.write(content)
    return child.returncode, out

  @classmethod
  def parse_log(cls, filename, blacklist):
    """Processes a strace log and returns the files opened and the files that
    do not exist.

    It does not track directories.

    Most of the time, files that do not exist are temporary test files that
    should be put in /tmp instead. See http://crbug.com/116251
    """
    logging.info('parse_log(%s, %s)' % (filename, blacklist))
    context = cls.Context(blacklist)
    for line in open(filename):
      context.on_line(line)
    # Resolve any symlink we hit.
    return (
        set(os.path.realpath(f) for f in context.files),
        set(os.path.realpath(f) for f in context.non_existent))


class Dtrace(object):
  """Uses DTrace framework through dtrace. Requires root access.

  Implies Mac OSX.

  dtruss can't be used because it has compatibility issues with python.

  Also, the pid->cwd handling needs to be done manually since OSX has no way
  to get the absolute path of the 'cwd' dtrace variable from the probe.
  Also, OSX doesn't populate curpsinfo->pr_psargs properly, see
  https://discussions.apple.com/thread/1980539.
  """
  IGNORED = (
    '/.vol',
    '/Library',
    '/System',
    '/dev',
    '/etc',
    '/private/var',
    '/tmp',
    '/usr',
    '/var',
  )

  # pylint: disable=C0301
  # To understand the following code, you'll want to take a look at:
  # http://developers.sun.com/solaris/articles/dtrace_quickref/dtrace_quickref.html
  # https://wikis.oracle.com/display/DTrace/Variables
  # http://docs.oracle.com/cd/E19205-01/820-4221/
  #
  # The list of valid probes can be retrieved with:
  # sudo dtrace -l -P syscall | less
  D_CODE = """
      proc:::start /trackedpid[ppid]/ {
        trackedpid[pid] = 1;
        current_processes += 1;
        printf("%d %d:%d %s_%s(\\"%s\\", %d) = 0\\n",
               logindex, ppid, pid, probeprov, probename, execname,
               current_processes);
        logindex++;
      }
      proc:::exit /trackedpid[pid] && current_processes == 1/ {
        trackedpid[pid] = 0;
        current_processes -= 1;
        printf("%d %d:%d %s_%s(\\"%s\\", %d) = 0\\n",
               logindex, ppid, pid, probeprov, probename, execname,
               current_processes);
        logindex++;
        exit(0);
      }
      proc:::exit /trackedpid[pid]/ {
        trackedpid[pid] = 0;
        current_processes -= 1;
        printf("%d %d:%d %s_%s(\\"%s\\", %d) = 0\\n",
               logindex, ppid, pid, probeprov, probename, execname,
               current_processes);
        logindex++;
      }

      /* Finally what we care about! */
      syscall::open*:entry /trackedpid[pid]/ {
        self->arg0 = arg0;
        self->arg1 = arg1;
        self->arg2 = arg2;
      }
      syscall::open*:return /trackedpid[pid] && errno == 0/ {
        printf("%d %d:%d %s(\\"%s\\", %d, %d) = %d\\n",
               logindex, ppid, pid, probefunc, copyinstr(self->arg0),
               self->arg1, self->arg2, errno);
        logindex++;
        self->arg0 = 0;
        self->arg1 = 0;
        self->arg2 = 0;
      }

      syscall::rename:entry /trackedpid[pid]/ {
        self->arg0 = arg0;
        self->arg1 = arg1;
      }
      syscall::rename:return /trackedpid[pid]/ {
        printf("%d %d:%d %s(\\"%s\\", \\"%s\\") = %d\\n",
               logindex, ppid, pid, probefunc, copyinstr(self->arg0),
               copyinstr(self->arg1), errno);
        logindex++;
        self->arg0 = 0;
        self->arg1 = 0;
      }

      /* Track chdir, it's painful because it is only receiving relative path */
      syscall::chdir:entry /trackedpid[pid]/ {
        self->arg0 = arg0;
      }
      syscall::chdir:return /trackedpid[pid] && errno == 0/ {
        printf("%d %d:%d %s(\\"%s\\") = %d\\n",
               logindex, ppid, pid, probefunc, copyinstr(self->arg0), errno);
        logindex++;
        self->arg0 = 0;
      }
      /* TODO(maruel): *stat* functions and friends
        syscall::access:return,
        syscall::chdir:return,
        syscall::chflags:return,
        syscall::chown:return,
        syscall::chroot:return,
        syscall::getattrlist:return,
        syscall::getxattr:return,
        syscall::lchown:return,
        syscall::lstat64:return,
        syscall::lstat:return,
        syscall::mkdir:return,
        syscall::pathconf:return,
        syscall::readlink:return,
        syscall::removexattr:return,
        syscall::setxattr:return,
        syscall::stat64:return,
        syscall::stat:return,
        syscall::truncate:return,
        syscall::unlink:return,
        syscall::utimes:return,
      */
      """

  @classmethod
  def code(cls, pid, cwd):
    """Sets up the D code to implement child process tracking.

    Injects a fake chdir() trace to simplify parsing. The reason is that the
    child process is already running at that point so:
    - no proc_start() is logged for it.
    - there is no way to figure out the absolute path of cwd in kernel on OSX.

    Since the child process is already started, initialize current_processes
    to 1.
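
    For illustration, with pid 123 and cwd '/tmp/foo' (hypothetical values),
    the injected line in the log is '0 1:123 chdir("/tmp/foo") = 0', which
    Context.handle_chdir() then parses like any real chdir() trace.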
""" pid = str(pid) cwd = os.path.realpath(cwd).replace('\\', '\\\\').replace('%', '%%') return ( 'dtrace:::BEGIN {\n' ' current_processes = 1;\n' ' logindex = 0;\n' ' trackedpid[') + pid + ('] = 1;\n' ' printf("%d %d:%d chdir(\\"' + cwd + '\\") = 0\\n",\n' ' logindex, 1, ' + pid + ');\n' ' logindex++;\n' ' printf("%d %d:%d %s_%s() = 0\\n",\n' ' logindex, ppid, pid, probeprov, probename);\n' ' logindex++;\n' '}\n') + cls.D_CODE class Context(object): """Processes a dtrace log line and keeps the list of existent and non existent files accessed. Ignores directories. """ # This is the most common format. index pid function(args) = result RE_HEADER = re.compile(r'^\d+ (\d+):(\d+) ([a-zA-Z_\-]+)\((.*?)\) = (.+)$') # Arguments parsing. RE_CHDIR = re.compile(r'^\"(.+?)\"$') RE_OPEN = re.compile(r'^\"(.+?)\", (\d+), (\d+)$') RE_RENAME = re.compile(r'^\"(.+?)\", \"(.+?)\"$') O_DIRECTORY = 0x100000 def __init__(self, blacklist): # TODO(maruel): Handling chdir() and cwd in general on OSX is tricky # because OSX only keeps relative directory names. In addition, cwd is a # process local variable so forks need to be properly traced and cwd # saved. self._cwd = {} self.blacklist = blacklist self.files = set() self.non_existent = set() def on_line(self, line): m = self.RE_HEADER.match(line) assert m, line fn = getattr( self, 'handle_%s' % m.group(3).replace('-', '_'), self._handle_ignored) return fn( int(m.group(1)), int(m.group(2)), m.group(3), m.group(4), m.group(5)) def handle_dtrace_BEGIN(self, _ppid, _pid, _function, args, _result): pass def handle_proc_start(self, ppid, pid, _function, _args, result): """Transfers cwd.""" assert result == '0' self._cwd[pid] = self._cwd[ppid] def handle_proc_exit(self, _ppid, pid, _function, _args, _result): """Removes cwd.""" del self._cwd[pid] def handle_chdir(self, _ppid, pid, _function, args, result): """Updates cwd.""" if result.startswith('0'): cwd = self.RE_CHDIR.match(args).group(1) if not cwd.startswith('/'): cwd2 = os.path.join(self._cwd[pid], cwd) logging.debug('handle_chdir(%d, %s) -> %s' % (pid, cwd, cwd2)) self._cwd[pid] = cwd2 else: logging.debug('handle_chdir(%d, %s)' % (pid, cwd)) self._cwd[pid] = cwd else: assert False, 'Unexecpected fail: %s' % result def handle_open_nocancel(self, ppid, pid, function, args, result): return self.handle_open(ppid, pid, function, args, result) def handle_open(self, _ppid, pid, _function, args, result): args = self.RE_OPEN.match(args).groups() flag = int(args[1]) if self.O_DIRECTORY & flag == self.O_DIRECTORY: # Ignore directories. return self._handle_file(pid, args[0], result) def handle_rename(self, _ppid, pid, _function, args, result): args = self.RE_RENAME.match(args).groups() self._handle_file(pid, args[0], result) self._handle_file(pid, args[1], result) def _handle_file(self, pid, filepath, result): if result.startswith(('-1', '2')): return orig_filepath = filepath if not filepath.startswith('/'): filepath = os.path.join(self._cwd[pid], filepath) filepath = os.path.normpath(filepath) if self.blacklist(filepath): return # Sadly, still need to filter out directories here; # saw open_nocancel(".", 0, 0) = 0 lines. 
      if (filepath not in self.files and
          filepath not in self.non_existent and
          not os.path.isdir(filepath)):
        if orig_filepath:
          logging.debug(
              '_handle_file(%d, %s) -> %s' % (pid, orig_filepath, filepath))
        else:
          logging.debug('_handle_file(%d, %s)' % (pid, filepath))
        if os.path.isfile(filepath):
          self.files.add(filepath)
        else:
          self.non_existent.add(filepath)

    @staticmethod
    def _handle_ignored(_ppid, pid, function, args, result):
      logging.debug('%d %s(%s) = %s' % (pid, function, args, result))

  @staticmethod
  def clean_trace(logname):
    """Deletes the old log."""
    if os.path.isfile(logname):
      os.remove(logname)

  @classmethod
  def gen_trace(cls, cmd, cwd, logname, output):
    """Runs dtrace on an executable."""
    logging.info('gen_trace(%s, %s, %s, %s)' % (cmd, cwd, logname, output))
    logging.info('Running: %s' % cmd)
    signal = 'Go!'
    logging.debug('Our pid: %d' % os.getpid())

    # Part 1: start the child process.
    stdout = stderr = None
    if output:
      stdout = subprocess.PIPE
      stderr = subprocess.STDOUT
    child_cmd = [
      sys.executable, os.path.join(BASE_DIR, 'trace_child_process.py'),
    ]
    child = subprocess.Popen(
        child_cmd + cmd,
        stdin=subprocess.PIPE,
        stdout=stdout,
        stderr=stderr,
        cwd=cwd)
    logging.debug('Started child pid: %d' % child.pid)

    # Part 2: start dtrace process.
    # Note: do not use the -p flag. It's useless if the initial process quits
    # too fast, resulting in missing traces from the grand-children. The D
    # code manages the dtrace lifetime itself.
    trace_cmd = [
      'sudo',
      'dtrace',
      '-x', 'dynvarsize=4m',
      '-x', 'evaltime=exec',
      '-n', cls.code(child.pid, cwd),
      '-o', '/dev/stderr',
      '-q',
    ]
    with open(logname, 'w') as logfile:
      dtrace = subprocess.Popen(
          trace_cmd, stdout=logfile, stderr=subprocess.STDOUT)
    logging.debug('Started dtrace pid: %d' % dtrace.pid)

    # Part 3: Read until one line is printed, which signifies dtrace is up and
    # ready.
    with open(logname, 'r') as logfile:
      while 'dtrace_BEGIN' not in logfile.readline():
        if dtrace.poll() is not None:
          break

    try:
      # Part 4: We can now tell our child to go.
      # TODO(maruel): Another pipe than stdin could be used instead. This
      # would be more consistent with the other tracing methods.
      out = child.communicate(signal)[0]

      dtrace.wait()
      if dtrace.returncode != 0:
        print 'dtrace failure: %d' % dtrace.returncode
        with open(logname) as logfile:
          print ''.join(logfile.readlines()[-100:])
        # Find a better way.
        os.remove(logname)
      else:
        # Sort the log right away to simplify our life. There isn't much
        # advantage in keeping it out of order.
        cls._sort_log(logname)
    except KeyboardInterrupt:
      # Still sort when testing.
      cls._sort_log(logname)
      raise

    return dtrace.returncode or child.returncode, out

  @classmethod
  def parse_log(cls, filename, blacklist):
    """Processes a dtrace log and returns the files opened and the files that
    do not exist.

    It does not track directories.

    Most of the time, files that do not exist are temporary test files that
    should be put in /tmp instead. See http://crbug.com/116251
    """
    logging.info('parse_log(%s, %s)' % (filename, blacklist))
    context = cls.Context(blacklist)
    for line in open(filename, 'rb'):
      context.on_line(line)
    # Resolve any symlink we hit.
    return (
        set(os.path.realpath(f) for f in context.files),
        set(os.path.realpath(f) for f in context.non_existent))

  @staticmethod
  def _sort_log(logname):
    """Sorts the log back in order when each call occurred.

    dtrace doesn't save the buffer in strict order since it keeps one buffer
    per CPU.
""" with open(logname, 'rb') as logfile: lines = [f for f in logfile.readlines() if f.strip()] lines = sorted(lines, key=lambda l: int(l.split(' ', 1)[0])) with open(logname, 'wb') as logfile: logfile.write(''.join(lines)) class LogmanTrace(object): """Uses the native Windows ETW based tracing functionality to trace a child process. """ class Context(object): """Processes a ETW log line and keeps the list of existent and non existent files accessed. Ignores directories. """ EVENT_NAME = 0 TYPE = 1 PID = 9 CHILD_PID = 20 PARENT_PID = 21 FILE_PATH = 25 PROC_NAME = 26 CMD_LINE = 27 def __init__(self, blacklist): self.blacklist = blacklist self.files = set() self.non_existent = set() self._processes = set() self._drive_map = DosDriveMap() self._first_line = False def on_csv_line(self, line): """Processes a CSV Event line.""" # So much white space! line = [i.strip() for i in line] if not self._first_line: assert line == [ u'Event Name', u'Type', u'Event ID', u'Version', u'Channel', u'Level', # 5 u'Opcode', u'Task', u'Keyword', u'PID', u'TID', # 10 u'Processor Number', u'Instance ID', u'Parent Instance ID', u'Activity ID', u'Related Activity ID', # 15 u'Clock-Time', u'Kernel(ms)', u'User(ms)', u'User Data', ] self._first_line = True return # As you can see, the CSV is full of useful non-redundant information: # Event ID assert line[2] == '0' # Version assert line[3] in ('2', '3'), line[3] # Channel assert line[4] == '0' # Level assert line[5] == '0' # Task assert line[7] == '0' # Keyword assert line[8] == '0x0000000000000000' # Instance ID assert line[12] == '' # Parent Instance ID assert line[13] == '' # Activity ID assert line[14] == '{00000000-0000-0000-0000-000000000000}' # Related Activity ID assert line[15] == '' if line[0].startswith('{'): # Skip GUIDs. return # Convert the PID in-place from hex. line[self.PID] = int(line[self.PID], 16) # By Opcode handler = getattr( self, 'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]), None) if not handler: # Try to get an universal fallback handler = getattr(self, 'handle_%s_Any' % line[self.EVENT_NAME], None) if handler: handler(line) else: assert False, '%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]) def handle_EventTrace_Any(self, line): pass def handle_FileIo_Create(self, line): m = re.match(r'^\"(.+)\"$', line[self.FILE_PATH]) self._handle_file(self._drive_map.to_dos(m.group(1))) def handle_FileIo_Rename(self, line): # TODO(maruel): Handle? pass def handle_FileIo_Any(self, line): pass def handle_Image_DCStart(self, line): # TODO(maruel): Handle? pass def handle_Image_Load(self, line): # TODO(maruel): Handle? pass def handle_Image_Any(self, line): # TODO(maruel): Handle? pass def handle_Process_Any(self, line): pass def handle_Process_DCStart(self, line): """Gives historic information about the process tree. Use it to extract the pid of the trace_inputs.py parent process that started logman.exe. """ ppid = int(line[self.PARENT_PID], 16) if line[self.PROC_NAME] == '"logman.exe"': # logman's parent is us. self._processes.add(ppid) logging.info('Found logman\'s parent at %d' % ppid) def handle_Process_End(self, line): # Look if it is logman terminating, if so, grab the parent's process pid # and inject cwd. 
      if line[self.PID] in self._processes:
        logging.info('Terminated: %d' % line[self.PID])
        self._processes.remove(line[self.PID])

    def handle_Process_Start(self, line):
      """Handles a new child process started by PID."""
      ppid = line[self.PID]
      pid = int(line[self.CHILD_PID], 16)
      if ppid in self._processes:
        if line[self.PROC_NAME] == '"logman.exe"':
          # Skip the shutdown call.
          return
        self._processes.add(pid)
        logging.info(
            'New child: %d -> %d %s' % (ppid, pid, line[self.PROC_NAME]))

    def handle_SystemConfig_Any(self, line):
      pass

    def _handle_file(self, filename):
      """Handles a file that was touched.

      Interestingly enough, the file is always with an absolute path.
      """
      if (not filename or
          self.blacklist(filename) or
          os.path.isdir(filename) or
          filename in self.files or
          filename in self.non_existent):
        return
      logging.debug('_handle_file(%s)' % filename)
      if os.path.isfile(filename):
        self.files.add(filename)
      else:
        self.non_existent.add(filename)

  def __init__(self):
    # Most ignores need to be determined at runtime.
    self.IGNORED = set([os.path.dirname(sys.executable)])
    # Add many directories from environment variables.
    vars_to_ignore = (
      'APPDATA',
      'LOCALAPPDATA',
      'ProgramData',
      'ProgramFiles',
      'ProgramFiles(x86)',
      'ProgramW6432',
      'SystemRoot',
      'TEMP',
      'TMP',
    )
    for i in vars_to_ignore:
      if os.environ.get(i):
        self.IGNORED.add(os.environ[i])

    # Also add their short path name equivalents.
    for i in list(self.IGNORED):
      self.IGNORED.add(GetShortPathName(i))

    # Add this one last since it has no short path name equivalent.
    self.IGNORED.add('\\systemroot')
    self.IGNORED = tuple(sorted(self.IGNORED))

  @staticmethod
  def clean_trace(logname):
    """Deletes the old log."""
    if os.path.isfile(logname):
      os.remove(logname)
    if os.path.isfile(logname + '.etl'):
      os.remove(logname + '.etl')

  @classmethod
  def gen_trace(cls, cmd, cwd, logname, output):
    logging.info('gen_trace(%s, %s, %s, %s)' % (cmd, cwd, logname, output))
    # Use "logman -?" for help.

    etl = logname + '.etl'

    stdout = stderr = None
    if output:
      stdout = subprocess.PIPE
      stderr = subprocess.STDOUT

    # 1. Start the log collection. Requires administrative access. logman.exe
    # is synchronous so no need for a "warmup" call.
    # 'Windows Kernel Trace' is *localized* so use its GUID instead.
    # The GUID constant name is SystemTraceControlGuid. Lovely.
    cmd_start = [
      'logman.exe',
      'start',
      'NT Kernel Logger',
      '-p', '{9e814aad-3204-11d2-9a82-006008a86939}',
      '(process,img,file,fileio)',
      '-o', etl,
      '-ets',  # Send directly to kernel
    ]
    logging.debug('Running: %s' % cmd_start)
    subprocess.check_call(
        cmd_start,
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT)

    # 2. Run the child process.
    logging.debug('Running: %s' % cmd)
    try:
      child = subprocess.Popen(
          cmd, cwd=cwd, stdin=subprocess.PIPE, stdout=stdout, stderr=stderr)
      out = child.communicate()[0]
    finally:
      # 3. Stop the log collection.
      cmd_stop = [
        'logman.exe',
        'stop',
        'NT Kernel Logger',
        '-ets',  # Send directly to kernel
      ]
      logging.debug('Running: %s' % cmd_stop)
      subprocess.check_call(
          cmd_stop,
          stdin=subprocess.PIPE,
          stdout=subprocess.PIPE,
          stderr=subprocess.STDOUT)

    # 4. Convert the traces to text representation.
    # Use "tracerpt -?" for help.
    LOCALE_INVARIANT = 0x7F
    windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
    cmd_convert = [
      'tracerpt.exe',
      '-l', etl,
      '-o', logname,
      '-gmt',  # Use UTC
      '-y',  # No prompt
    ]

    # Normally, 'csv' is sufficient. If complex scripts are used (like eastern
    # languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
    logformat = 'csv'

    if logformat == 'csv':
      # tracerpt localizes the 'Type' column, for major brainfuck
      # entertainment. I can't imagine any sane reason to do that.
      cmd_convert.extend(['-of', 'CSV'])
    elif logformat == 'csv_utf16':
      # This causes it to use UTF-16, which doubles the log size but ensures
      # the log is readable for non-ASCII characters.
      cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
    elif logformat == 'xml':
      cmd_convert.extend(['-of', 'XML'])
    else:
      assert False, logformat

    logging.debug('Running: %s' % cmd_convert)
    subprocess.check_call(
        cmd_convert, stdin=subprocess.PIPE, stdout=stdout, stderr=stderr)

    return child.returncode, out

  @classmethod
  def parse_log(cls, filename, blacklist):
    logging.info('parse_log(%s, %s)' % (filename, blacklist))

    # Auto-detect the log format.
    with open(filename, 'rb') as f:
      hdr = f.read(2)
      assert len(hdr) == 2
      if hdr == '<E':
        # The XML export starts with <Events>.
        logformat = 'xml'
      elif hdr == '\xFF\xFE':
        # utf-16 BOM.
        logformat = 'csv_utf16'
      else:
        logformat = 'csv'

    context = cls.Context(blacklist)

    if logformat == 'csv_utf16':
      def utf_8_encoder(unicode_csv_data):
        """Encodes the unicode object as utf-8 encoded str instance"""
        for line in unicode_csv_data:
          yield line.encode('utf-8')

      def unicode_csv_reader(unicode_csv_data, **kwargs):
        """Encodes temporarily as UTF-8 since csv module doesn't do unicode."""
        csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
        for row in csv_reader:
          # Decode str utf-8 instances back to unicode instances, cell by cell:
          yield [cell.decode('utf-8') for cell in row]

      # The CSV file is UTF-16 so use codecs.open() to load the file into the
      # python internal unicode format (utf-8). Then explicitly re-encode as
      # utf8 as str instances so csv can parse it fine. Then decode the utf-8
      # str back into python unicode instances. This sounds about right.
      for line in unicode_csv_reader(codecs.open(filename, 'r', 'utf-16')):
        # line is a list of unicode objects
        context.on_csv_line(line)

    elif logformat == 'csv':
      def ansi_csv_reader(ansi_csv_data, **kwargs):
        """Loads an 'ANSI' code page and returns unicode() objects."""
        assert sys.getfilesystemencoding() == 'mbcs'
        encoding = get_current_encoding()
        for row in csv.reader(ansi_csv_data, **kwargs):
          # Decode str 'ansi' instances to unicode instances, cell by cell:
          yield [cell.decode(encoding) for cell in row]

      # The fastest and smallest format but only supports 'ANSI' file paths.
      # E.g. the filenames are encoded in the 'current' encoding.
      for line in ansi_csv_reader(open(filename)):
        # line is a list of unicode objects
        context.on_csv_line(line)

    else:
      raise NotImplementedError('Implement %s' % logformat)

    return (
        set(os.path.realpath(f) for f in context.files),
        set(os.path.realpath(f) for f in context.non_existent))


def relevant_files(files, root):
  """Trims the list of files to keep the expected files and unexpected files.

  Unexpected files are files that are not based inside the |root| directory.
  """
  expected = []
  unexpected = []
  for f in files:
    if f.startswith(root):
      f = f[len(root):]
      assert f
      expected.append(f)
    else:
      unexpected.append(f)
  return sorted(set(expected)), sorted(set(unexpected))


def extract_directories(files, root):
  """Detects if all the files in a directory were loaded and if so, replace
  the individual files by the directory entry.
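
  For example (hypothetical paths), if the trace references 'base/a.h' and
  'base/b.h' and those are the only entries in 'base/' on disk, both are
  collapsed into a single 'base/' directory entry.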
""" directories = set(os.path.dirname(f) for f in files) files = set(files) for directory in sorted(directories, reverse=True): actual = set( os.path.join(directory, f) for f in os.listdir(os.path.join(root, directory)) if not f.endswith(('.svn', '.pyc')) ) if not (actual - files): files -= actual files.add(directory + os.path.sep) return sorted(files) def pretty_print(variables, stdout): """Outputs a gyp compatible list from the decoded variables. Similar to pprint.print() but with NIH syndrome. """ # Order the dictionary keys by these keys in priority. ORDER = ( 'variables', 'condition', 'command', 'relative_cwd', 'read_only', KEY_TRACKED, KEY_UNTRACKED) def sorting_key(x): """Gives priority to 'most important' keys before the others.""" if x in ORDER: return str(ORDER.index(x)) return x def loop_list(indent, items): for item in items: if isinstance(item, basestring): stdout.write('%s\'%s\',\n' % (indent, item)) elif isinstance(item, dict): stdout.write('%s{\n' % indent) loop_dict(indent + ' ', item) stdout.write('%s},\n' % indent) elif isinstance(item, list): # A list inside a list will write the first item embedded. stdout.write('%s[' % indent) for index, i in enumerate(item): if isinstance(i, basestring): stdout.write( '\'%s\', ' % i.replace('\\', '\\\\').replace('\'', '\\\'')) elif isinstance(i, dict): stdout.write('{\n') loop_dict(indent + ' ', i) if index != len(item) - 1: x = ', ' else: x = '' stdout.write('%s}%s' % (indent, x)) else: assert False stdout.write('],\n') else: assert False def loop_dict(indent, items): for key in sorted(items, key=sorting_key): item = items[key] stdout.write("%s'%s': " % (indent, key)) if isinstance(item, dict): stdout.write('{\n') loop_dict(indent + ' ', item) stdout.write(indent + '},\n') elif isinstance(item, list): stdout.write('[\n') loop_list(indent + ' ', item) stdout.write(indent + '],\n') elif isinstance(item, basestring): stdout.write( '\'%s\',\n' % item.replace('\\', '\\\\').replace('\'', '\\\'')) elif item in (True, False, None): stdout.write('%s\n' % item) else: assert False, item stdout.write('{\n') loop_dict(' ', variables) stdout.write('}\n') def get_api(): flavor = get_flavor() if flavor == 'linux': return Strace() elif flavor == 'mac': return Dtrace() elif sys.platform == 'win32': return LogmanTrace() else: print >> sys.stderr, 'Unsupported platform %s' % sys.platform sys.exit(1) def get_blacklist(api): """Returns a function to filter unimportant files normally ignored.""" git_path = os.path.sep + '.git' + os.path.sep svn_path = os.path.sep + '.svn' + os.path.sep return lambda f: ( f.startswith(api.IGNORED) or f.endswith('.pyc') or git_path in f or svn_path in f) def generate_dict(files, cwd_dir, product_dir): """Converts the list of files into a .isolate dictionary. Arguments: - files: list of files to generate a dictionary out of. - cwd_dir: directory to base all the files from, relative to root_dir. - product_dir: directory to replace with <(PRODUCT_DIR), relative to root_dir. """ cwd_dir = cleanup_path(cwd_dir) product_dir = cleanup_path(product_dir) def fix(f): """Bases the file on the most restrictive variable.""" logging.debug('fix(%s)' % f) # Important, GYP stores the files with / and not \. f = f.replace(os.path.sep, '/') if product_dir and f.startswith(product_dir): return '<(PRODUCT_DIR)/%s' % f[len(product_dir):] else: # cwd_dir is usually the directory containing the gyp file. It may be # empty if the whole directory containing the gyp file is needed. 
      return posix_relpath(f, cwd_dir) or './'

  corrected = [fix(f) for f in files]
  tracked = [f for f in corrected if not f.endswith('/') and ' ' not in f]
  untracked = [f for f in corrected if f.endswith('/') or ' ' in f]
  variables = {}
  if tracked:
    variables[KEY_TRACKED] = tracked
  if untracked:
    variables[KEY_UNTRACKED] = untracked
  return variables


def trace(logfile, cmd, cwd, api, output):
  """Traces an executable. Returns (returncode, output) from api.

  Arguments:
  - logfile: file to write to.
  - cmd: command to run.
  - cwd: current directory to start the process in.
  - api: a tracing api instance.
  - output: if True, returns output, otherwise prints it at the console.
  """
  cmd = fix_python_path(cmd)
  assert os.path.isabs(cmd[0]), cmd[0]
  api.clean_trace(logfile)
  return api.gen_trace(cmd, cwd, logfile, output)


def load_trace(logfile, root_dir, api):
  """Loads a trace file and returns the processed file lists.

  Arguments:
  - logfile: file to load.
  - root_dir: root directory to use to determine if a file is relevant to the
    trace or not.
  - api: a tracing api instance.
  """
  files, non_existent = api.parse_log(logfile, get_blacklist(api))
  expected, unexpected = relevant_files(
      files, root_dir.rstrip(os.path.sep) + os.path.sep)
  # In case the file system is case insensitive.
  expected = sorted(set(get_native_path_case(root_dir, f) for f in expected))
  simplified = extract_directories(expected, root_dir)
  return files, expected, unexpected, non_existent, simplified


def trace_inputs(logfile, cmd, root_dir, cwd_dir, product_dir, force_trace):
  """Tries to load the logs if available. If not, trace the test.

  Symlinks are not processed at all.

  Arguments:
  - logfile: Absolute path to the OS-specific trace.
  - cmd: Command list to run.
  - root_dir: Base directory where the files we care about live.
  - cwd_dir: Cwd to use to start the process, relative to the root_dir
    directory.
  - product_dir: Directory containing the executables built by the build
    process, relative to the root_dir directory. It is used to properly
    replace paths with <(PRODUCT_DIR) for gyp output.
  - force_trace: Will force to trace unconditionally even if a trace already
    exists.
  """
  logging.debug(
      'trace_inputs(%s, %s, %s, %s, %s, %s)' % (
        logfile, cmd, root_dir, cwd_dir, product_dir, force_trace))

  def print_if(txt):
    if cwd_dir is None:
      print txt

  # It is important to have unambiguous paths.
  assert os.path.isabs(root_dir), root_dir
  assert os.path.isabs(logfile), logfile
  assert not cwd_dir or not os.path.isabs(cwd_dir), cwd_dir
  assert not product_dir or not os.path.isabs(product_dir), product_dir

  api = get_api()
  # Resolve any symlink.
  root_dir = os.path.realpath(root_dir)
  if not os.path.isfile(logfile) or force_trace:
    print_if('Tracing... %s' % cmd)
    # Use the proper relative directory.
    cwd = root_dir if not cwd_dir else os.path.join(root_dir, cwd_dir)
    silent = not isEnabledFor(logging.WARNING)
    returncode, _ = trace(logfile, cmd, cwd, api, silent)
    if returncode and not force_trace:
      return returncode

  print_if('Loading traces... %s' % logfile)
  files, expected, unexpected, non_existent, simplified = load_trace(
      logfile, root_dir, api)

  print_if('Total: %d' % len(files))
  print_if('Non existent: %d' % len(non_existent))
  for f in non_existent:
    print_if('  %s' % f)
  if unexpected:
    print_if('Unexpected: %d' % len(unexpected))
    for f in unexpected:
      print_if('  %s' % f)
  print_if('Interesting: %d reduced to %d' % (len(expected), len(simplified)))
  for f in simplified:
    print_if('  %s' % f)

  if cwd_dir is not None:
    value = {
      'conditions': [
        ['OS=="%s"' % get_flavor(), {
          'variables': generate_dict(simplified, cwd_dir, product_dir),
        }],
      ],
    }
    pretty_print(value, sys.stdout)
  return 0


def main():
  parser = optparse.OptionParser(
      usage='%prog <options> [cmd line...]')
  parser.allow_interspersed_args = False
  parser.add_option(
      '-v', '--verbose', action='count', default=0, help='Use multiple times')
  parser.add_option('-l', '--log', help='Log file')
  parser.add_option(
      '-c', '--cwd',
      help='Signal to start the process from this relative directory. When '
           'specified, outputs the input files in a way compatible with '
           'gyp processing. Should be set to the relative path containing the '
           'gyp file, e.g. \'chrome\' or \'net\'')
  parser.add_option(
      '-p', '--product-dir', default='out/Release',
      help='Directory for PRODUCT_DIR. Default: %default')
  parser.add_option(
      '--root-dir', default=ROOT_DIR,
      help='Root directory to base everything off. Default: %default')
  parser.add_option(
      '-f', '--force', action='store_true', default=False,
      help='Force to retrace the file')

  options, args = parser.parse_args()
  level = [logging.ERROR, logging.INFO, logging.DEBUG][min(2, options.verbose)]
  logging.basicConfig(
      level=level,
      format='%(levelname)5s %(module)15s(%(lineno)3d):%(message)s')

  if not options.log:
    parser.error('Must supply a log file with -l')
  if not args:
    if not os.path.isfile(options.log) or options.force:
      parser.error('Must supply a command to run')
  else:
    args[0] = os.path.abspath(args[0])

  if options.root_dir:
    options.root_dir = os.path.abspath(options.root_dir)

  return trace_inputs(
      os.path.abspath(options.log),
      args,
      options.root_dir,
      options.cwd,
      options.product_dir,
      options.force)


if __name__ == '__main__':
  sys.exit(main())
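
# Illustrative invocation (paths and test binary are hypothetical), tracing a
# test and emitting a gyp-compatible snippet for the 'chrome' directory:
#   ./trace_inputs.py --log /tmp/browser_tests.log --cwd chrome \
#       --product-dir out/Release out/Release/browser_tests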