# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import xml.dom.minidom as minidom
from xml.parsers.expat import ExpatError

import crash_utils
from repository_parser_interface import ParserInterface


# This number is 6 because each linediff page in src.chromium.org should
# contain the following tables: table with revision number, table with actual
# diff, table with dropdown menu, table with legend, a border table and a table
# containing page information.
NUM_TABLES_IN_LINEDIFF_PAGE = 6
# Each of the linediff info should contain 3 tds, one for changed line number,
# and two for line contents before/after.
NUM_TDS_IN_LINEDIFF_PAGE = 3
# Each row of the blame page should contain 4 tds. The tds at index 1, 2 and 3
# are read as the author, the revision and the line content respectively.
NUM_TDS_IN_BLAME_PAGE = 4


def _GetFirstChildValue(node, default=''):
  """Returns the nodeValue of the first child of a DOM node.

  Args:
    node: A minidom node.
    default: Value to return when the node has no children, or when its first
        child has no nodeValue (for example, it is an element and not text).

  Returns:
    The nodeValue of the first child of node, or default.
  """
  child = node.firstChild
  if child is None or child.nodeValue is None:
    return default
  return child.nodeValue


class SVNParser(ParserInterface):
  """Parser for SVN repository using chromium.org, for components in config.

  Attributes:
    component_to_urls_map: A map from component to the urls, where urls are
        for changelog ('changelog_url'), revision ('revision_url'), line diff
        ('diff_url') and annotation ('blame_url'). Each url is a %-style
        format string.
  """

  def __init__(self, url_map):
    """Initializes the parser.

    Args:
      url_map: A map from component to its url map, stored as
          component_to_urls_map.
    """
    self.component_to_urls_map = url_map

  def ParseChangelog(self, component, range_start, range_end):
    """Parses the changelog of a component over a revision range.

    Args:
      component: Key used to look up the urls in component_to_urls_map.
      range_start: Start of the range; substituted into the changelog url as
          'range_start:range_end'.
      range_end: End of the range.

    Returns:
      A tuple (revision_map, file_to_revision_map). revision_map maps a
      revision number (int) to a dictionary with keys 'author', 'message' and
      'url'. file_to_revision_map maps a file path to a list of
      (revision_number, file_change_type) tuples. Both maps are empty if the
      component is not supported, if no data could be retrieved, or if the
      retrieved data is not well-formed XML.
    """
    file_to_revision_map = {}
    revision_map = {}

    # Check if the current component is supported by reading the components
    # parsed from config file. If it is not, fail.
    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return (revision_map, file_to_revision_map)

    # Retrieve data from the url, return empty maps if fails.
    revision_range_str = '%s:%s' % (range_start, range_end)
    url = url_map['changelog_url'] % revision_range_str
    response = crash_utils.GetDataFromURL(url)
    if not response:
      return (revision_map, file_to_revision_map)

    # Parse xml out of the returned string. If it fails, return empty maps.
    try:
      xml_revisions = minidom.parseString(response)
    except ExpatError:
      return (revision_map, file_to_revision_map)

    # Iterate through the returned XML object.
    for revision in xml_revisions.getElementsByTagName('logentry'):
      # Get the revision number from xml.
      revision_number = int(revision.getAttribute('revision'))

      # A log entry may have no <author>, and <msg> may be empty, so neither
      # can be dereferenced unconditionally. Missing values become ''.
      authors = revision.getElementsByTagName('author')
      messages = revision.getElementsByTagName('msg')
      revision_object = {
          'author': _GetFirstChildValue(authors[0]) if authors else '',
          'message': _GetFirstChildValue(messages[0]) if messages else '',
          'url': url_map['revision_url'] % revision_number,
      }

      # Iterate through the changed paths in the CL.
      paths = revision.getElementsByTagName('paths')
      if paths:
        for changed_path in paths[0].getElementsByTagName('path'):
          # Get path and file change type from the xml. A <path> without any
          # text cannot be attributed to a file, so it is skipped.
          file_path = _GetFirstChildValue(changed_path)
          if not file_path:
            continue
          file_change_type = changed_path.getAttribute('action')

          if file_path.startswith('/trunk/'):
            file_path = file_path[len('/trunk/'):]

          # Add file to the map.
          file_to_revision_map.setdefault(file_path, []).append(
              (revision_number, file_change_type))

      # Add this CL to the revision map.
      revision_map[revision_number] = revision_object

    return (revision_map, file_to_revision_map)

  def ParseLineDiff(self, path, component, file_change_type, revision_number):
    """Parses the line diff page of a file at a revision.

    Args:
      path: Path of the file, substituted into the diff url.
      component: Key used to look up the urls in component_to_urls_map.
      file_change_type: Change type of the file. 'A' (added) is treated as if
          nothing changed.
      revision_number: Revision (int) to diff against revision_number - 1.

    Returns:
      A tuple (url, changed_line_numbers, changed_line_contents).
      (None, None, None) if the component is not supported. The revision url
      and two empty lists if the file was added, if no data could be
      retrieved, if the page is not well-formed, if it does not have the
      expected number of tables, or if it reports no change. Otherwise the
      diff url, the list of changed line numbers (int) and the list of the
      corresponding stripped new line contents.
    """
    changed_line_numbers = []
    changed_line_contents = []

    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return (None, None, None)

    # If the file is added (not modified), treat it as if it is not changed.
    backup_url = url_map['revision_url'] % revision_number
    if file_change_type == 'A':
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Retrieve data from the url. If no data is retrieved, return empty lists.
    url = url_map['diff_url'] % (path, revision_number - 1,
                                 revision_number, revision_number)
    data = crash_utils.GetDataFromURL(url)
    if not data:
      return (backup_url, changed_line_numbers, changed_line_contents)

    # The page is only usable if it is well-formed; otherwise fall back the
    # same way as when nothing could be retrieved.
    try:
      line_diff_html = minidom.parseString(data)
    except ExpatError:
      return (backup_url, changed_line_numbers, changed_line_contents)

    tables = line_diff_html.getElementsByTagName('table')
    # If there are not NUM_TABLES tables in the html page, there should be an
    # error in the html page.
    if len(tables) != NUM_TABLES_IN_LINEDIFF_PAGE:
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Diff content is in the second table. Each line of the diff content
    # is in a <tr>.
    trs = tables[1].getElementsByTagName('tr')
    prefix_len = len('vc_diff_')

    # Filter trs so that it only contains diff chunk with contents.
    filtered_trs = []
    for tr in trs:
      tr_class = tr.getAttribute('class')

      # Check for the classes of the <tr>s.
      if tr_class:
        tr_class = tr_class[prefix_len:]

        # Do not have to add header.
        if tr_class == 'header' or tr_class == 'chunk_header':
          continue

        # If the class of tr is empty, this page does not have any change.
        if tr_class == 'empty':
          return (backup_url, changed_line_numbers, changed_line_contents)

      filtered_trs.append(tr)

    # Iterate through filtered trs, and grab line diff information.
    for tr in filtered_trs:
      tds = tr.getElementsByTagName('td')

      # If there aren't 3 tds, this line should not contain line diff.
      if len(tds) != NUM_TDS_IN_LINEDIFF_PAGE:
        continue

      # If line number information is not in hyperlink, ignore this line.
      links = tds[0].getElementsByTagName('a')
      if not links:
        continue
      line_num = _GetFirstChildValue(links[0], None)
      if line_num is None:
        continue

      left_diff_type = tds[1].getAttribute('class')[prefix_len:]
      right_diff_type = tds[2].getAttribute('class')[prefix_len:]

      # Treat the line as modified only if both left and right diff has type
      # changed or both have different change type, and if the change is not
      # deletion.
      is_modified = (left_diff_type != right_diff_type) or (
          left_diff_type == 'change' and right_diff_type == 'change')
      is_deletion = (left_diff_type == 'remove' and
                     right_diff_type == 'empty')
      if not is_modified or is_deletion:
        continue

      # A hyperlink whose text is not a number carries no line number.
      try:
        line_number = int(line_num)
      except ValueError:
        continue

      # The line content is '' if the cell is empty or does not start with
      # text.
      new_line = _GetFirstChildValue(tds[2])
      changed_line_numbers.append(line_number)
      changed_line_contents.append(new_line.strip())

    return (url, changed_line_numbers, changed_line_contents)

  def ParseBlameInfo(self, component, file_path, line, revision):
    """Parses the blame page of a file for a single line.

    Args:
      component: Key used to look up the urls in component_to_urls_map.
      file_path: Path of the file, substituted into the blame url.
      line: Index of the <tr> in the blame page that holds the line.
      revision: Revision substituted into the blame url.

    Returns:
      A tuple (line_content, revision, author, revision_url, message), where
      revision is the revision found in the blame page for the line and
      message is the commit message of that revision ('' if the changelog of
      that revision could not be retrieved). None if the component is not
      supported, if no data could be retrieved, if the page is not
      well-formed or is an exception page, or if the blame information for
      the line cannot be found.
    """
    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return None

    # Retrieve blame data from url, return None if fails.
    url = url_map['blame_url'] % (file_path, revision, revision)
    data = crash_utils.GetDataFromURL(url)
    if not data:
      return None

    try:
      blame_html = minidom.parseString(data)
    except ExpatError:
      return None

    # If the returned html page is an exception page, return None.
    titles = blame_html.getElementsByTagName('title')
    if titles and _GetFirstChildValue(titles[0]) == 'ViewVC Exception':
      return None

    # Each of the blame result is in a <tr>.
    blame_results = blame_html.getElementsByTagName('tr')
    try:
      blame_result = blame_results[line]
    except IndexError:
      return None

    # There must be 4 <td>s for each <tr>. If not, this page is wrong.
    tds = blame_result.getElementsByTagName('td')
    if len(tds) != NUM_TDS_IN_BLAME_PAGE:
      return None

    # The <td> at index 3 has the line content, possibly split over several
    # child nodes. Combine those to get a string of changed line. If it has
    # nothing, the line is empty.
    content_parts = []
    for content in tds[3].childNodes:
      if content.nodeType == minidom.Node.TEXT_NODE:
        content_parts.append(content.nodeValue)
      else:
        # Use the leading text of a child element; '' if it has none.
        content_parts.append(_GetFirstChildValue(content))
    line_content = ''.join(content_parts).strip()

    # If the current line has the same author/revision as the previous lines,
    # the result is not shown. Propagate up until we find the line with info,
    # and give up if the top of the page or a malformed row is reached.
    while not tds[1].firstChild:
      line -= 1
      if line < 0:
        return None
      tds = blame_results[line].getElementsByTagName('td')
      if len(tds) != NUM_TDS_IN_BLAME_PAGE:
        return None

    author = tds[1].firstChild.nodeValue

    # Revision can either be in hyperlink or plain text.
    links = tds[2].getElementsByTagName('a')
    blamed_revision = _GetFirstChildValue(links[0] if links else tds[2], None)
    try:
      blamed_revision_number = int(blamed_revision)
    except (TypeError, ValueError):
      return None

    # ParseChangelog returns empty maps on failure, so the message lookup
    # must tolerate a missing revision.
    (revision_info, _) = self.ParseChangelog(
        component, blamed_revision, blamed_revision)
    message = revision_info.get(blamed_revision_number, {}).get('message', '')

    # Return the parsed information.
    revision_url = url_map['revision_url'] % blamed_revision_number
    return (line_content, blamed_revision, author, revision_url, message)