author     maruel@chromium.org <maruel@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>  2012-06-07 00:37:20 +0000
committer  maruel@chromium.org <maruel@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>  2012-06-07 00:37:20 +0000
commit     b0d7147e8fec8b92e355a7042843b7f7ccea46ec (patch)
tree       5203d0592309aef27cdaafdd2b4ca5f679e84c71 /tools
parent     278a34d225295fbfa5d51c66e29357dd11a70ff0 (diff)
Convert to json first to speed up CSV processing.
The data is eventually going to be processed several times for parallel
tracing so it is worth doing a first step just to clean up the junk in the
traces.

R=cmp@chromium.org
NOTRY=true
BUG=
TEST=

Review URL: https://chromiumcodereview.appspot.com/10546026

git-svn-id: svn://svn.chromium.org/chrome/trunk/src@140905 0039d316-1c4b-4281-b951-d872f2087c98
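In essence, the change parses the verbose tracerpt CSV once, validates and
trims it, and caches the result as JSON so the later parallel passes only
pay for a json.load(). A minimal standalone sketch of that idea
(csv_to_json and keep_columns are hypothetical names for illustration; the
real filtering lives in LogmanTrace.process_log in the diff below):

import csv
import json

def csv_to_json(csv_path, json_path, keep_columns):
    """Parses the verbose CSV trace once, keeps only the useful columns
    and writes a compact JSON list that later passes can load cheaply."""
    with open(csv_path) as f:
        # tracerpt pads cells with whitespace; strip it up front.
        rows = [[cell.strip() for cell in row] for row in csv.reader(f)]
    # Drop the header row and every column callers do not care about.
    trimmed = [[row[i] for i in keep_columns] for row in rows[1:]]
    with open(json_path, 'w') as f:
        json.dump(trimmed, f)

Each subsequent pass then does a single json.load() instead of re-parsing
and re-validating the raw CSV, which is the speed-up the commit message
refers to.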
Diffstat (limited to 'tools')
-rwxr-xr-x  tools/isolate/trace_inputs.py  309
1 file changed, 142 insertions, 167 deletions
diff --git a/tools/isolate/trace_inputs.py b/tools/isolate/trace_inputs.py
index 9ad033f..3d5d92d 100755
--- a/tools/isolate/trace_inputs.py
+++ b/tools/isolate/trace_inputs.py
@@ -1750,22 +1750,31 @@ class LogmanTrace(ApiBase):
u'User(ms)', # pretty much useless.
u'User Data', # Extra arguments that are event-specific.
]
+ # Only the useful headers common to all entries are listed here. Any column
+ # at 19 or higher is dependent on the specific event.
+ EVENT_NAME = 0
+ TYPE = 1
+ PID = 9
+ TID = 10
+ PROCESSOR_ID = 11
+ TIMESTAMP = 16
+ NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
+ USER_DATA = 19
+
class Context(ApiBase.Context):
"""Processes a ETW log line and keeps the list of existent and non
existent files accessed.
Ignores directories.
"""
- # Only the useful headers common to all entries are listed there. Any column
- # at 19 or higher is dependent on the specific event.
+ # These indexes are for the stripped version in json.
EVENT_NAME = 0
TYPE = 1
- PID = 9
- TID = 10
- PROCESSOR_ID = 11
- TIMESTAMP = 16
- NULL_GUID = '{00000000-0000-0000-0000-000000000000}'
- USER_DATA = 19
+ PID = 2
+ TID = 3
+ PROCESSOR_ID = 4
+ TIMESTAMP = 5
+ USER_DATA = 6
class Process(ApiBase.Context.Process):
def __init__(self, *args):
@@ -1786,49 +1795,16 @@ class LogmanTrace(ApiBase):
self._initial_pid = None
self._line_number = 0
- def on_csv_line(self, raw_line):
- """Processes a CSV Event line."""
- # So much white space!
- line = [i.strip() for i in raw_line]
+ def on_line(self, line):
+ """Processes a json Event line."""
self._line_number += 1
try:
- if self._line_number == 1:
- if line != LogmanTrace.EXPECTED_HEADER:
- raise TracingFailure(
- 'Found malformed header: %s' % ' '.join(line),
- None, None, None)
- return
-
- # As you can see, the CSV is full of useful non-redundant information:
- if (line[2] != '0' or # Event ID
- line[3] not in ('2', '3') or # Version
- line[4] != '0' or # Channel
- line[5] != '0' or # Level
- line[7] != '0' or # Task
- line[8] != '0x0000000000000000' or # Keyword
- line[12] != '' or # Instance ID
- line[13] != '' or # Parent Instance ID
- line[14] != self.NULL_GUID or # Activity ID
- line[15] != ''): # Related Activity ID
- raise TracingFailure(
- 'Found unexpected values in line: %s' % ' '.join(line),
- None, None, None)
- if line[0].startswith('{'):
- # Skip GUIDs.
- return
-
- # Convert the PID in-place from hex.
- line[self.PID] = int(line[self.PID], 16)
-
# By Opcode
handler = getattr(
self,
'handle_%s_%s' % (line[self.EVENT_NAME], line[self.TYPE]),
None)
if not handler:
- # Try to get an universal fallback for the event class.
- handler = getattr(self, 'handle_%s_Any' % line[self.EVENT_NAME], None)
- if not handler:
raise TracingFailure(
'Unexpected event %s_%s' % (
line[self.EVENT_NAME], line[self.TYPE]),
@@ -1900,9 +1876,6 @@ class LogmanTrace(ApiBase):
'size from %s') % (line[EVENTS_LOST], line[BUFFER_SIZE]),
None, None, None)
- def handle_EventTrace_Any(self, line):
- pass
-
def handle_FileIo_Cleanup(self, line):
"""General wisdom: if a file is closed, it's because it was opened.
@@ -1966,12 +1939,6 @@ class LogmanTrace(ApiBase):
# TODO(maruel): Handle?
pass
- def handle_FileIo_Any(self, line):
- pass
-
- def handle_Process_Any(self, line):
- pass
-
def handle_Process_End(self, line):
pid = line[self.PID]
if pid in self.processes:
@@ -2051,32 +2018,35 @@ class LogmanTrace(ApiBase):
Do not use self.PID here since a process' initial thread is created by
the parent process.
"""
- PROCESS_ID = 19
- TTHREAD_ID = 20
- #STACK_BASE = 21
- #STACK_LIMIT = 22
- #USER_STACK_BASE = 23
- #USER_STACK_LIMIT = 24
- #AFFINITY = 25
- #WIN32_START_ADDR = 26
- #TEB_BASE = 27
- #SUB_PROCESS_TAG = 28
- #BASE_PRIORITY = 29
- #PAGE_PRIORITY = 30
- #IO_PRIORITY = 31
- #THREAD_FLAGS = 32
+ PROCESS_ID = self.USER_DATA
+ TTHREAD_ID = self.USER_DATA + 1
+ #STACK_BASE = self.USER_DATA + 2
+ #STACK_LIMIT = self.USER_DATA + 3
+ #USER_STACK_BASE = self.USER_DATA + 4
+ #USER_STACK_LIMIT = self.USER_DATA + 5
+ #AFFINITY = self.USER_DATA + 6
+ #WIN32_START_ADDR = self.USER_DATA + 7
+ #TEB_BASE = self.USER_DATA + 8
+ #SUB_PROCESS_TAG = self.USER_DATA + 9
+ #BASE_PRIORITY = self.USER_DATA + 10
+ #PAGE_PRIORITY = self.USER_DATA + 11
+ #IO_PRIORITY = self.USER_DATA + 12
+ #THREAD_FLAGS = self.USER_DATA + 13
# Do not use self.PID here since a process' initial thread is created by
# the parent process.
pid = int(line[PROCESS_ID], 16)
tid = int(line[TTHREAD_ID], 16)
self._threads_active[tid] = pid
- def handle_Thread_Any(self, line):
- pass
-
- def handle_SystemConfig_Any(self, line):
- """If you have too many of these, check your hardware."""
- pass
+ @classmethod
+ def supported_events(cls):
+ """Returns all the procesed events."""
+ out = []
+ for member in dir(cls):
+ match = re.match(r'^handle_([A-Za-z]+)_([A-Za-z]+)$', member)
+ if match:
+ out.append(match.groups())
+ return out
class Tracer(ApiBase.Tracer):
@classmethod
@@ -2119,8 +2089,9 @@ class LogmanTrace(ApiBase):
# 3. Stop the log collection.
cls._stop_log()
- # 4. Convert the traces to text representation.
- cls._convert_log(logname, 'csv')
+ # 4. Convert the log.
+ # TODO(maruel): Temporary, remove me.
+ LogmanTrace.process_log(logname)
# 5. Save metadata.
value = {
@@ -2181,52 +2152,6 @@ class LogmanTrace(ApiBase):
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
- @staticmethod
- def _convert_log(logname, logformat):
- """Converts the ETL trace to text representation.
-
- Normally, 'csv' is sufficient. If complex scripts are used (like eastern
- languages), use 'csv_utf16'. If localization gets in the way, use 'xml'.
-
- Arguments:
- - logname: Base filename to convert.
- - logformat: Text format to be generated, csv, csv_utf16 or xml.
-
- Use "tracerpt -?" for help.
- """
- LOCALE_INVARIANT = 0x7F
- windll.kernel32.SetThreadLocale(LOCALE_INVARIANT)
- cmd_convert = [
- 'tracerpt.exe',
- '-l', logname + '.etl',
- '-o', logname + '.' + logformat,
- '-gmt', # Use UTC
- '-y', # No prompt
- # Use -of XML to get the header of each items after column 19, e.g. all
- # the actual headers of 'User Data'.
- ]
-
- if logformat == 'csv':
- # tracerpt localizes the 'Type' column, for major brainfuck
- # entertainment. I can't imagine any sane reason to do that.
- cmd_convert.extend(['-of', 'CSV'])
- elif logformat == 'csv_utf16':
- # This causes it to use UTF-16, which doubles the log size but ensures
- # the log is readable for non-ASCII characters.
- cmd_convert.extend(['-of', 'CSV', '-en', 'Unicode'])
- elif logformat == 'xml':
- cmd_convert.extend(['-of', 'XML'])
- else:
- raise ValueError('Unexpected log format \'%s\'' % logformat)
- logging.debug('Running: %s' % cmd_convert)
- # This can takes tens of minutes for large logs.
- # Redirects all output to stderr.
- subprocess.check_call(
- cmd_convert,
- stdin=subprocess.PIPE,
- stdout=sys.stderr,
- stderr=sys.stderr)
-
def __init__(self):
super(LogmanTrace, self).__init__()
# Most ignores need to be determined at runtime.
@@ -2255,6 +2180,96 @@ class LogmanTrace(ApiBase):
self.IGNORED.add('\\SystemRoot')
self.IGNORED = tuple(sorted(self.IGNORED))
+ @classmethod
+ def process_log(cls, logname):
+ """Converts the .etl file into .csv then into .json."""
+ logformat = 'csv'
+ cls._convert_log(logname, logformat)
+
+ if logformat == 'csv_utf16':
+ def load_file():
+ def utf_8_encoder(unicode_csv_data):
+ """Encodes the unicode object as utf-8 encoded str instance"""
+ for line in unicode_csv_data:
+ yield line.encode('utf-8')
+
+ def unicode_csv_reader(unicode_csv_data, **kwargs):
+ """Encodes temporarily as UTF-8 since csv module doesn't do unicode.
+ """
+ csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
+ for row in csv_reader:
+ # Decode str utf-8 instances back to unicode instances, cell by
+ # cell:
+ yield [cell.decode('utf-8') for cell in row]
+
+ # The CSV file is UTF-16 so use codecs.open() to load the file into the
+ # python internal unicode format (utf-8). Then explicitly re-encode as
+ # utf8 as str instances so csv can parse it fine. Then decode the utf-8
+ # str back into python unicode instances. This sounds about right.
+ for line in unicode_csv_reader(
+ codecs.open(logname + '.' + logformat, 'r', 'utf-16')):
+ # line is a list of unicode objects
+ # So much white space!
+ yield [i.strip() for i in line]
+
+ elif logformat == 'csv':
+ def load_file():
+ def ansi_csv_reader(ansi_csv_data, **kwargs):
+ """Loads an 'ANSI' code page and returns unicode() objects."""
+ assert sys.getfilesystemencoding() == 'mbcs'
+ encoding = get_current_encoding()
+ for row in csv.reader(ansi_csv_data, **kwargs):
+ # Decode str 'ansi' instances to unicode instances, cell by cell:
+ yield [cell.decode(encoding) for cell in row]
+
+ # The fastest and smallest format but only supports 'ANSI' file paths.
+ # E.g. the filenames are encoded in the 'current' encoding.
+ for line in ansi_csv_reader(open(logname + '.' + logformat)):
+ # line is a list of unicode objects.
+ yield [i.strip() for i in line]
+
+ supported_events = cls.Context.supported_events()
+
+ def trim(generator):
+ for index, line in enumerate(generator):
+ if not index:
+ if line != cls.EXPECTED_HEADER:
+ raise TracingFailure(
+ 'Found malformed header: %s' % ' '.join(line),
+ None, None, None)
+ continue
+ # As you can see, the CSV is full of useful non-redundant information:
+ if (line[2] != '0' or # Event ID
+ line[3] not in ('2', '3') or # Version
+ line[4] != '0' or # Channel
+ line[5] != '0' or # Level
+ line[7] != '0' or # Task
+ line[8] != '0x0000000000000000' or # Keyword
+ line[12] != '' or # Instance ID
+ line[13] != '' or # Parent Instance ID
+ line[14] != cls.NULL_GUID or # Activity ID
+ line[15] != ''): # Related Activity ID
+ raise TracingFailure(
+ 'Found unexpected values in line: %s' % ' '.join(line),
+ None, None, None)
+
+ if (line[cls.EVENT_NAME], line[cls.TYPE]) not in supported_events:
+ continue
+
+ # Convert the PID in-place from hex.
+ line[cls.PID] = int(line[cls.PID], 16)
+
+ yield [
+ line[cls.EVENT_NAME],
+ line[cls.TYPE],
+ line[cls.PID],
+ line[cls.TID],
+ line[cls.PROCESSOR_ID],
+ line[cls.TIMESTAMP],
+ ] + line[cls.USER_DATA:]
+
+ write_json('%s.json' % logname, list(trim(load_file())), True)
+
@staticmethod
def _convert_log(logname, logformat):
"""Converts the ETL trace to text representation.
@@ -2303,63 +2318,23 @@ class LogmanTrace(ApiBase):
@staticmethod
def clean_trace(logname):
- for ext in ('', '.csv', '.etl', '.xml'):
+ for ext in ('', '.csv', '.etl', '.json', '.xml'):
if os.path.isfile(logname + ext):
os.remove(logname + ext)
@classmethod
- def parse_log(cls, filename, blacklist):
- logging.info('parse_log(%s, %s)' % (filename, blacklist))
+ def parse_log(cls, logname, blacklist):
+ logging.info('parse_log(%s, %s)' % (logname, blacklist))
def blacklist_more(filepath):
# All the NTFS metadata is in the form x:\$EXTEND or stuff like that.
return blacklist(filepath) or re.match(r'[A-Z]\:\\\$EXTEND', filepath)
- data = read_json(filename)
- logformat = data['format']
-
+ data = read_json(logname)
+ lines = read_json(logname + '.json')
context = cls.Context(blacklist_more, data['pid'])
-
- if logformat == 'csv_utf16':
- def utf_8_encoder(unicode_csv_data):
- """Encodes the unicode object as utf-8 encoded str instance"""
- for line in unicode_csv_data:
- yield line.encode('utf-8')
-
- def unicode_csv_reader(unicode_csv_data, **kwargs):
- """Encodes temporarily as UTF-8 since csv module doesn't do unicode."""
- csv_reader = csv.reader(utf_8_encoder(unicode_csv_data), **kwargs)
- for row in csv_reader:
- # Decode str utf-8 instances back to unicode instances, cell by cell:
- yield [cell.decode('utf-8') for cell in row]
-
- # The CSV file is UTF-16 so use codecs.open() to load the file into the
- # python internal unicode format (utf-8). Then explicitly re-encode as
- # utf8 as str instances so csv can parse it fine. Then decode the utf-8
- # str back into python unicode instances. This sounds about right.
- for line in unicode_csv_reader(
- codecs.open(filename + '.' + logformat, 'r', 'utf-16')):
- # line is a list of unicode objects
- context.on_csv_line(line)
-
- elif logformat == 'csv':
- def ansi_csv_reader(ansi_csv_data, **kwargs):
- """Loads an 'ANSI' code page and returns unicode() objects."""
- assert sys.getfilesystemencoding() == 'mbcs'
- encoding = get_current_encoding()
- for row in csv.reader(ansi_csv_data, **kwargs):
- # Decode str 'ansi' instances to unicode instances, cell by cell:
- yield [cell.decode(encoding) for cell in row]
-
- # The fastest and smallest format but only supports 'ANSI' file paths.
- # E.g. the filenames are encoding in the 'current' encoding.
- for line in ansi_csv_reader(open(filename + '.' + logformat)):
- # line is a list of unicode objects.
- context.on_csv_line(line)
-
- else:
- raise NotImplementedError('Implement %s' % logformat)
-
+ for line in lines:
+ context.on_line(line)
return context.to_results()
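A rough sketch of the resulting two-step flow (hedged: the call sites are
inferred from the diff, the module name is taken from the file path, and
read_json/write_json are helpers the file is assumed to provide):

from trace_inputs import LogmanTrace  # module name taken from the diff

logname = 'trace'                # hypothetical base file name for the logs
blacklist = lambda path: False   # keep every file for this example

# 1. After tracing stops, convert the log once:
#    .etl -> .csv via tracerpt, then .csv -> <logname>.json via
#    process_log(), which validates each line, drops unsupported events
#    and keeps only the six fixed columns plus the event-specific user data.
LogmanTrace.process_log(logname)

# 2. Every later pass is then cheap: parse_log() loads the metadata JSON
#    (for the initial pid) plus the pre-trimmed <logname>.json and feeds
#    each line straight to Context.on_line().
results = LogmanTrace.parse_log(logname, blacklist)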