1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
|
# Copyright (c) 2014 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
import xml.dom.minidom as minidom
from xml.parsers.expat import ExpatError
import crash_utils
from repository_parser_interface import ParserInterface
# This number is 6 because each linediff page in src.chromium.org should
# contain the following tables: table with revision number, table with actual
# diff, table with dropdown menu, table with legend, a border table and a table
# containing page information. A page with a different table count is treated
# as malformed and skipped.
NUM_TABLES_IN_LINEDIFF_PAGE = 6
# Each of the linediff info rows should contain 3 tds: one for the changed
# line number, and two for the line contents before/after the change.
NUM_TDS_IN_LINEDIFF_PAGE = 3
class SVNParser(ParserInterface):
  """Parser for SVN repository using chromium.org, for components in config.

  Attributes:
    url_map: A map from component to the urls, where urls are for changelog,
        revision, line diff and annotation.
  """

  def __init__(self, url_map):
    self.component_to_urls_map = url_map

  def ParseChangelog(self, component, range_start, range_end):
    """Parses the SVN changelog XML for a component over a revision range.

    Args:
      component: Component name, a key into the configured url map.
      range_start: First revision of the range (inclusive).
      range_end: Last revision of the range (inclusive).

    Returns:
      A tuple (revision_map, file_to_revision_map), where revision_map maps
      a revision number to a dict with keys 'author', 'message' and 'url',
      and file_to_revision_map maps a file path to a list of
      (revision_number, file_change_type) tuples. Both maps are empty if the
      component is unknown, the fetch fails, or the response is not valid
      XML.
    """
    file_to_revision_map = {}
    revision_map = {}

    # Check if the current component is supported by reading the components
    # parsed from config file. If it is not, fail.
    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return (revision_map, file_to_revision_map)

    # Retrieve data from the url, return empty map if fails.
    revision_range_str = '%s:%s' % (range_start, range_end)
    url = url_map['changelog_url'] % revision_range_str
    response = crash_utils.GetDataFromURL(url)
    if not response:
      return (revision_map, file_to_revision_map)

    # Parse xml out of the returned string. If it fails, return empty map.
    try:
      xml_revisions = minidom.parseString(response)
    except ExpatError:
      return (revision_map, file_to_revision_map)

    # Iterate through the returned XML object.
    revisions = xml_revisions.getElementsByTagName('logentry')
    for revision in revisions:
      # Create new revision object for each of the revision.
      revision_object = {}

      # Set author of the CL.
      revision_object['author'] = revision.getElementsByTagName(
          'author')[0].firstChild.nodeValue

      # Get the revision number from xml.
      revision_number = int(revision.getAttribute('revision'))

      # Iterate through the changed paths in the CL.
      paths = revision.getElementsByTagName('paths')
      if paths:
        for changed_path in paths[0].getElementsByTagName('path'):
          # Get path and file change type from the xml.
          file_path = changed_path.firstChild.nodeValue
          file_change_type = changed_path.getAttribute('action')

          # Paths are repository-absolute; strip the trunk prefix so keys
          # match source-relative paths.
          if file_path.startswith('/trunk/'):
            file_path = file_path[len('/trunk/'):]

          # Add file to the map.
          if file_path not in file_to_revision_map:
            file_to_revision_map[file_path] = []
          file_to_revision_map[file_path].append(
              (revision_number, file_change_type))

      # Set commit message of the CL.
      revision_object['message'] = revision.getElementsByTagName('msg')[
          0].firstChild.nodeValue

      # Set url of this CL.
      revision_url = url_map['revision_url'] % revision_number
      revision_object['url'] = revision_url

      # Add this CL to the revision map.
      revision_map[revision_number] = revision_object

    return (revision_map, file_to_revision_map)

  def ParseLineDiff(self, path, component, file_change_type, revision_number):
    """Parses the ViewVC line-diff page for one file in one revision.

    Args:
      path: Path of the file within the repository.
      component: Component name, a key into the configured url map.
      file_change_type: SVN action for the file ('A' = added, etc.).
      revision_number: Revision to diff against its predecessor.

    Returns:
      A tuple (url, changed_line_numbers, changed_line_contents). On any
      failure (unknown component, fetch failure, malformed page) the url
      falls back to the plain revision url and the lists are empty; for an
      unknown component all three elements are None.
    """
    changed_line_numbers = []
    changed_line_contents = []

    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return (None, None, None)

    # If the file is added (not modified), treat it as if it is not changed.
    backup_url = url_map['revision_url'] % revision_number
    if file_change_type == 'A':
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Retrieve data from the url. If no data is retrieved, return empty lists.
    url = url_map['diff_url'] % (path, revision_number - 1,
                                 revision_number, revision_number)
    data = crash_utils.GetDataFromURL(url)
    if not data:
      return (backup_url, changed_line_numbers, changed_line_contents)

    # BUGFIX: parseString raises ExpatError on markup that is not well-formed
    # XML; guard it like ParseChangelog does instead of crashing, and fall
    # back to the revision url.
    try:
      line_diff_html = minidom.parseString(data)
    except ExpatError:
      return (backup_url, changed_line_numbers, changed_line_contents)

    tables = line_diff_html.getElementsByTagName('table')

    # If there are not NUM_TABLES tables in the html page, there should be an
    # error in the html page.
    if len(tables) != NUM_TABLES_IN_LINEDIFF_PAGE:
      return (backup_url, changed_line_numbers, changed_line_contents)

    # Diff content is in the second table. Each line of the diff content
    # is in <tr>.
    trs = tables[1].getElementsByTagName('tr')
    prefix_len = len('vc_diff_')

    # Filter trs so that it only contains diff chunk with contents.
    filtered_trs = []
    for tr in trs:
      tr_class = tr.getAttribute('class')

      # Check for the classes of the <tr>s.
      if tr_class:
        tr_class = tr_class[prefix_len:]

        # Do not have to add header.
        if tr_class == 'header' or tr_class == 'chunk_header':
          continue

        # If the class of tr is empty, this page does not have any change.
        if tr_class == 'empty':
          return (backup_url, changed_line_numbers, changed_line_contents)

      filtered_trs.append(tr)

    # Iterate through filtered trs, and grab line diff information.
    for tr in filtered_trs:
      tds = tr.getElementsByTagName('td')

      # If there aren't 3 tds, this line does should not contain line diff.
      if len(tds) != NUM_TDS_IN_LINEDIFF_PAGE:
        continue

      # If line number information is not in hyperlink, ignore this line.
      try:
        line_num = tds[0].getElementsByTagName('a')[0].firstChild.nodeValue
        left_diff_type = tds[1].getAttribute('class')[prefix_len:]
        right_diff_type = tds[2].getAttribute('class')[prefix_len:]
      except IndexError:
        continue

      # Treat the line as modified only if both left and right diff has type
      # changed or both have different change type, and if the change is not
      # deletion.
      if (left_diff_type != right_diff_type) or (
          left_diff_type == 'change' and right_diff_type == 'change'):

        # Check if the line content is not empty.
        try:
          new_line = tds[2].firstChild.nodeValue
        except AttributeError:
          new_line = ''

        if not (left_diff_type == 'remove' and right_diff_type == 'empty'):
          changed_line_numbers.append(int(line_num))
          changed_line_contents.append(new_line.strip())

    return (url, changed_line_numbers, changed_line_contents)

  def ParseBlameInfo(self, component, file_path, line, revision):
    """Parses the ViewVC annotate (blame) page for one line of a file.

    Args:
      component: Component name, a key into the configured url map.
      file_path: Path of the file within the repository.
      line: Line number to look up; used as an index into the page's <tr>
          rows, so it presumably lines up with the annotate table layout.
      revision: Revision to annotate at.

    Returns:
      A tuple (line_content, revision, author, revision_url, message), or
      None on any failure (unknown component, fetch failure, exception page,
      malformed page, or line out of range).
    """
    url_map = self.component_to_urls_map.get(component)
    if not url_map:
      return None

    # Retrieve blame data from url, return None if fails.
    url = url_map['blame_url'] % (file_path, revision, revision)
    data = crash_utils.GetDataFromURL(url)
    if not data:
      return None

    # BUGFIX: guard parseString against malformed markup, consistent with
    # ParseChangelog; return None like the other failure paths here.
    try:
      blame_html = minidom.parseString(data)
    except ExpatError:
      return None

    title = blame_html.getElementsByTagName('title')
    # If the returned html page is an exception page, return None.
    if title[0].firstChild.nodeValue == 'ViewVC Exception':
      return None

    # Each of the blame result is in <tr>.
    blame_results = blame_html.getElementsByTagName('tr')
    try:
      blame_result = blame_results[line]
    except IndexError:
      return None

    # There must be 4 <td> for each <tr>. If not, this page is wrong.
    tds = blame_result.getElementsByTagName('td')
    if len(tds) != 4:
      return None

    # The third <td> has the line content, separated by <span>s. Combine
    # those to get a string of changed line. If it has nothing, the line
    # is empty.
    line_content = ''
    if tds[3].hasChildNodes():
      contents = tds[3].childNodes

      for content in contents:
        # Nodetype 3 means it is text node.
        if content.nodeType == minidom.Node.TEXT_NODE:
          line_content += content.nodeValue
        else:
          line_content += content.firstChild.nodeValue

      line_content = line_content.strip()

    # If the current line has the same author/revision as the previous lines,
    # the result is not shown. Propagate up until we find the line with info.
    while not tds[1].firstChild:
      line -= 1
      blame_result = blame_results[line]
      tds = blame_result.getElementsByTagName('td')
    author = tds[1].firstChild.nodeValue

    # Revision can either be in hyperlink or plain text.
    try:
      revision = tds[2].getElementsByTagName('a')[0].firstChild.nodeValue
    except IndexError:
      revision = tds[2].firstChild.nodeValue

    (revision_info, _) = self.ParseChangelog(component, revision, revision)
    message = revision_info[int(revision)]['message']

    # Return the parsed information.
    revision_url = url_map['revision_url'] % int(revision)
    return (line_content, revision, author, revision_url, message)
|