media/tools/bug_hunter/bug_hunter.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372

#!/usr/bin/env python
# Copyright (c) 2011 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""This script queries the Chromium issue tracker and e-mails the results.

It queries issue tracker using Issue Tracker API. The query
parameters can be specified by command-line arguments. For example, with the
following command:

  'python bug_hunter.py -q video Status:Unconfirmed OR audio Status:Unconfirmed
   -s sender@chromium.org -r receiver@chromium.org -v 100 -u days'

You will find all 'Unconfirmed' issues created in the last 100 days containing
'video' or 'audio' in their content/comments. The content of these issues is
sent to receiver@chromium.org.

TODO(imasaki): users can specify the interval as say: "100d" for "100 days".

There are two limitations in the current implementation of issue tracker API
and UI:
* only outermost OR is valid. For example, the query
  'video OR audio Status:Unconfirmed' is translated into
  'video OR (audio AND Status:Unconfirmed)'
* brackets are not supported. For example, the query
  '(video OR audio) Status:Unconfirmed' does not work.

You need to install the following to run this script:
  gdata-python-client (http://code.google.com/p/gdata-python-client/)
  rfc3339.py (http://henry.precheur.org/projects/rfc3339)

Links:
* Chromium issue tracker: http://code.google.com/p/chromium/issues/list
* Issue tracker API: http://code.google.com/p/support/wiki/IssueTrackerAPI
* Search tips for the issue tracker:
    http://code.google.com/p/chromium/issues/searchtips
"""

import csv
import datetime
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText
import logging
from operator import itemgetter
import optparse
import re
import smtplib
import socket
import sys
import urllib

try:
  import gdata.data
  import gdata.projecthosting.client
except ImportError:
  logging.error('gdata-client needs to be installed. Please install\n'
                'and try again (http://code.google.com/p/gdata-python-client/)')
  sys.exit(1)

try:
  import rfc3339
except ImportError:
  logging.error('rfc3339 needs to be installed. Please install\n'
                'and try again (http://henry.precheur.org/projects/rfc3339)')
  sys.exit(1)

# A list of default values.
# Unit used for --interval-value when -u/--interval_unit is not given.
_DEFAULT_INTERVAL_UNIT = 'hours'
# Issue fields included in the email when -e/--email-entries is not given (or
# when none of the given entries is valid).
_DEFAULT_ISSUE_ELEMENT_IN_EMAIL = ('author', 'status', 'state', 'content',
                                   'comments', 'labels', 'urls')
_DEFAULT_PROJECT_NAME = 'chromium'
# Used in the subject line of the result email.
_DEFAULT_QUERY_TITLE = 'potential media bugs'
# Only the outermost OR is honoured by the issue tracker, so the shared
# filters are repeated on both sides of the OR.
_DEFAULT_QUERY = ('video -has:Feature -has:Owner -label:nomedia '
                  'status:Unconfirmed OR audio -has:Feature -has:Owner '
                  '-label:nomedia status:Unconfirmed')
_DEFAULT_OUTPUT_FILENAME = 'output.csv'
# NOTE: 'DETAULT' is a misspelling of 'DEFAULT'. The name is kept as is because
# ParseArgs() refers to it when defining --max-comments.
_DETAULT_MAX_COMMENTS = 1000

# Values accepted by -u/--interval_unit. They are passed as keyword names to
# datetime.timedelta, so each must be a valid timedelta argument.
_INTERVAL_UNIT_CHOICES = ('hours', 'days', 'weeks')

# URLs in this list are excluded from URL extraction from bug
# content/comments. Each list element should not contain the url ending in
# '/'. For example, the element should be 'http://www.google.com' but not
# 'http://www.google.com/'
_URL_EXCLUSION_LIST = ('http://www.youtube.com/html5',
                       'http://www.google.com')
# Values accepted (comma-separated) by -e/--email-entries.
_ISSUE_ELEMENT_IN_EMAIL_CHOICES = ('issue_id', 'author', 'status', 'state',
                                   'content', 'comments', 'labels', 'urls')


def ParseArgs():
  """Parses the command line and returns the resulting options object.

  Returns:
    The optparse values object for the parsed command line. Its
    |email_entries| attribute is converted from the comma-separated string
    given on the command line into a list that only contains entries found in
    _ISSUE_ELEMENT_IN_EMAIL_CHOICES. When no valid entry remains, the default
    entries are used instead.
  """
  parser = optparse.OptionParser()

  parser.add_option('-e', '--email-entries',
                    help=('A comma-separated list of issue entries that are '
                          'sent in the email content. '
                          'Possible strings are %s. Default: %%default.' %
                          ', '.join(_ISSUE_ELEMENT_IN_EMAIL_CHOICES)),
                    default=','.join(_DEFAULT_ISSUE_ELEMENT_IN_EMAIL))
  parser.add_option('-l', '--max-comments',
                    help=('The maximum number of comments returned for each '
                          'issue in a reverse chronological order. '
                          'Default: %default.'),
                    type='int', default=_DETAULT_MAX_COMMENTS)
  parser.add_option('-o', '--output-filename',
                    help=('Filename for result output in CSV format. '
                          'Default: %default.'),
                    default=_DEFAULT_OUTPUT_FILENAME, metavar='FILE')
  parser.add_option('-p', '--project-name', default=_DEFAULT_PROJECT_NAME,
                    help='Project name string. Default: %default')
  parser.add_option('-q', '--query', default=_DEFAULT_QUERY,
                    help=('Query to be used to find bugs. The detail can be '
                          'found in Chromium Issue tracker page '
                          'http://code.google.com/p/chromium/issues/searchtips.'
                          ' Default: "%default".'))
  parser.add_option('-r', '--receiver-email-address',
                    help="Receiver's email address (Required).")
  parser.add_option('-s', '--sender-email-address',
                    help="Sender's email address (Required).")
  parser.add_option('-t', '--query-title',
                    default=_DEFAULT_QUERY_TITLE, dest='query_title',
                    help=('Query title string used in the subject of the '
                          'result email. Default: %default.'))
  parser.add_option('-u', '--interval_unit', default=_DEFAULT_INTERVAL_UNIT,
                    choices=_INTERVAL_UNIT_CHOICES,
                    help=('Unit name for |interval_value|. Valid options are '
                          '%s. Default: %%default' % (
                              ', '.join(_INTERVAL_UNIT_CHOICES))))
  parser.add_option('-v', '--interval-value', type='int',
                    help=('Interval value to find bugs. '
                          'The script looks for bugs during '
                          'that interval (up to now). This option is used in '
                          'conjunction with |--interval_unit| option. '
                          'The script looks for all bugs if this is not '
                          'specified.'))

  options = parser.parse_args()[0]

  # Strip whitespace so that '-e "author, status"' is not silently reduced to
  # just 'author'.
  requested_entries = [entry.strip()
                       for entry in options.email_entries.split(',')]
  options.email_entries = [entry for entry in requested_entries
                           if entry in _ISSUE_ELEMENT_IN_EMAIL_CHOICES]
  if not options.email_entries:
    logging.warning('No issue elements in email in option. '
                    'Default email entries will be used.')
    # Copy into a list so the attribute has the same type on both paths.
    options.email_entries = list(_DEFAULT_ISSUE_ELEMENT_IN_EMAIL)
  # The entries are passed as a logging argument; concatenating them to the
  # format string would leave a literal '%s' in the logged message.
  logging.info('The following is the issue elements in email: %s ',
               ', '.join(options.email_entries))
  return options


class BugHunter(object):
  """This class queries issue trackers and e-mails the results."""

  _ISSUE_SEARCH_LINK_BASE = ('http://code.google.com/p/chromium/issues/list?'
                             'can=2&colspec=ID+Pri+Mstone+ReleaseBlock+Area'
                             '+Feature+Status+Owner+Summary&cells=tiles'
                             '&sort=-id')
  # TODO(imasaki): Convert these into template library.
  _EMAIL_ISSUE_TEMPLATE = ('<li><a href="http://crbug.com/%(issue_id)s">'
                           '%(issue_id)s %(title)s</a> ')
  _EMAIL_SUBJECT_TEMPLATE = ('BugHunter found %(n_issues)d %(query_title)s '
                             'bug%(plural)s%(time_msg)s!')
  # NOTE: despite its name, %(unquote_query_text)s is filled with the
  # URL-quoted query (see _SetUpEmailSubjectMsg).
  _EMAIL_MSG_TEMPLATE = ('<a href="%(link_base)s&q=%(unquote_query_text)s">'
                         'Used Query</a>: %(query_text)s<br><br>'
                         'The number of issues : %(n_issues)d<br>'
                         '<ul>%(issues)s</ul>')

  def __init__(self, options):
    """Sets up initial state for Bug Hunter.

    Args:
      options: Command-line options.
    """
    self._client = gdata.projecthosting.client.ProjectHostingClient()
    self._options = options
    # Per-issue HTML template: the fixed link prefix followed by one
    # placeholder for every issue element requested on the command line.
    self._issue_template = BugHunter._EMAIL_ISSUE_TEMPLATE
    for entry in options.email_entries:
      self._issue_template += '%%(%s)s ' % entry
    self._issue_template += '</li>'

  def GetComments(self, issue_id, max_comments):
    """Get comments for a issue.

    Args:
      issue_id: Issue id for each issue in the issue tracker.
      max_comments: The maximum number of comments to be returned. The comments
        are returned in the reverse of the order used by the comments feed
        (newest first, assuming the feed is chronological).

    Returns:
      A list of (comment text, author name, updated time) tuples.
    """
    comments_feed = self._client.get_comments(self._options.project_name,
                                              issue_id)
    newest_first = list(reversed(comments_feed.entry))
    return [(comment.content.text, comment.author[0].name.text,
             comment.updated.text)
            for comment in newest_first[:max_comments]]

  @staticmethod
  def _JoinComments(comment_list):
    """Flattens comment tuples into a single string.

    Args:
      comment_list: A list of tuples of strings as returned by GetComments().
        Fields that are None or empty are skipped.

    Returns:
      All non-empty fields joined by a single space. The separator keeps a URL
      at the end of one field from running into the next field.
    """
    return ' '.join(field
                    for comment in comment_list
                    for field in comment
                    if field)

  @staticmethod
  def _SortIssuesByIdDescending(issues):
    """Returns a new list of issues sorted by numeric issue id, highest first.

    Args:
      issues: A list of dictionaries with an all-digit 'issue_id' string.

    Returns:
      A new sorted list. The ids are compared as integers because comparing
      them as strings would place '9' before '100'.
    """
    return sorted(issues, key=lambda issue: int(issue['issue_id']),
                  reverse=True)

  def GetIssues(self):
    """Get issues from issue tracker and return them.

    Returns:
      A list of issues in descending order by issue_id. Each element in the
        list is a dictionary where the keys are 'issue_id', 'title', 'author',
        'status', 'state', 'content', 'comments', 'labels', 'urls'.
        Returns an empty list when there is no matching issue.
    """
    min_time = None
    if self._options.interval_value:
      # Issue Tracker Data API uses RFC 3339 timestamp format, For example:
      # 2005-08-09T10:57:00-08:00
      # (http://code.google.com/p/support/wiki/IssueTrackerAPIPython)
      delta = datetime.timedelta(
          **{self._options.interval_unit: self._options.interval_value})
      dt = datetime.datetime.now() - delta
      min_time = rfc3339.rfc3339(dt)

    query = gdata.projecthosting.client.Query(text_query=self._options.query,
                                              max_results=1000,
                                              published_min=min_time)

    feed = self._client.get_issues(self._options.project_name, query=query)
    if not feed.entry:
      logging.info('No issues available to match query %s.',
                   self._options.query)
      return []
    issues = []
    for entry in feed.entry:
      # The fully qualified id is a URL. We just want the number.
      issue_id = entry.id.text.split('/')[-1]
      if not issue_id.isdigit():
        logging.warning('Issue_id is not correct: %s. Skipping.', issue_id)
        continue
      label_list = [label.text for label in entry.label]
      comments = ''
      # Comments need one extra request per issue, so they are only fetched
      # when they are going to be used in the email.
      if 'comments' in self._options.email_entries:
        comments = BugHunter._JoinComments(
            self.GetComments(issue_id, self._options.max_comments))
      # An issue without a description has no content text.
      content = BugHunterUtils.StripHTML(entry.content.text or '')
      # The space keeps a URL at the end of the content from being merged
      # with the beginning of the comments. Sorting makes the order stable.
      url_list = sorted(
          set(re.findall(r'(https?://\S+)', content + ' ' + comments)))
      url_list = [url for url in url_list
                  if url.rstrip('/') not in _URL_EXCLUSION_LIST]
      issues.append({'issue_id': issue_id, 'title': entry.title.text,
                     'author': entry.author[0].name.text,
                     'status': entry.status.text,
                     'state': entry.state.text, 'content': content,
                     'comments': comments, 'labels': label_list,
                     'urls': url_list})
    return BugHunter._SortIssuesByIdDescending(issues)

  def _SetUpEmailSubjectMsg(self, issues):
    """Set up email subject and its content.

    Args:
      issues: Please refer to the return value in GetIssues().

    Returns:
      A tuple of two strings (email subject and email content).
    """
    time_msg = ''
    if self._options.interval_value:
      # The unit names are plural ('hours', 'days', 'weeks'); drop the
      # trailing 's' and add it back only when the value calls for it.
      time_msg = ' in the past %s %s%s' % (
          self._options.interval_value, self._options.interval_unit[:-1],
          's' if self._options.interval_value > 1 else '')
    subject = BugHunter._EMAIL_SUBJECT_TEMPLATE % {
        'n_issues': len(issues),
        'query_title': self._options.query_title,
        'plural': 's' if len(issues) > 1 else '',
        'time_msg': time_msg}
    # NOTE(review): issue fields (e.g. the title) are inserted into the HTML
    # body without escaping - confirm whether the tracker data can be trusted.
    content = BugHunter._EMAIL_MSG_TEMPLATE % {
        'link_base': BugHunter._ISSUE_SEARCH_LINK_BASE,
        'unquote_query_text': urllib.quote(self._options.query),
        'query_text': self._options.query,
        'n_issues': len(issues),
        'issues': ''.join(
            [self._issue_template % issue for issue in issues])}
    return (subject, content)

  def SendResultEmail(self, issues):
    """Send result email.

    Args:
      issues: Please refer to the return value in GetIssues().
    """
    subject, content = self._SetUpEmailSubjectMsg(issues)
    BugHunterUtils.SendEmail(
        content, self._options.sender_email_address,
        self._options.receiver_email_address, subject)

  def WriteIssuesToFileInCSV(self, issues, filename):
    """Write issues to a file in CSV format.

    Nothing is written when |issues| is empty.

    Args:
      issues: Please refer to the return value in GetIssues().
      filename: File name for CSV file.
    """
    if not issues:
      logging.info('No issues to write to %s.', filename)
      return
    # Fix the column order once and look every value up by name so that each
    # row lines up with the header regardless of dictionary ordering.
    field_names = list(issues[0].keys())
    with open(filename, 'w') as f:
      writer = csv.writer(f)
      # Write header first.
      writer.writerow(field_names)
      for issue in issues:
        writer.writerow(
            [unicode(issue[name]).encode('utf-8') for name in field_names])


class BugHunterUtils(object):
  """Utility class for Bug Hunter."""

  @staticmethod
  def StripHTML(string_with_html):
    """Strip HTML tags from string.

    Args:
      string_with_html: A string with HTML tags. None is treated as an empty
        string.

    Returns:
      A string without HTML tags.
    """
    if not string_with_html:
      return ''
    # Non-greedy match of '<...>' that never spans another '<', so a lone '<'
    # in plain text (e.g. 'a < b') is left untouched.
    return re.sub(r'<[^<]+?>', '', string_with_html)

  @staticmethod
  def SendEmail(message, sender_email_address, receivers_email_address,
                subject):
    """Send email using localhost's mail server.

    Args:
      message: Email message to be sent.
      sender_email_address: Sender's email address.
      receivers_email_address: Receiver's email address.
      subject: Email subject.

    Returns:
      True if successful; False, otherwise.
    """
    html = '<html><head></head><body>%s</body></html>' % message
    msg = MIMEMultipart('alternative')
    msg['Subject'] = subject
    msg['From'] = sender_email_address
    msg['To'] = receivers_email_address
    msg.attach(MIMEText(html.encode('utf-8'), 'html', _charset='utf-8'))
    try:
      smtp_obj = smtplib.SMTP('localhost')
      try:
        smtp_obj.sendmail(sender_email_address, receivers_email_address,
                          msg.as_string())
      finally:
        # Always end the SMTP session, even when sendmail() fails, so the
        # connection is not left open.
        smtp_obj.quit()
      logging.info('Successfully sent email.')
      return True
    except smtplib.SMTPException:
      logging.exception('Authentication failed, unable to send email.')
    except (socket.gaierror, socket.error, socket.herror):
      logging.exception('Unable to send email.')
    return False


def Main():
  """Queries the issue tracker, e-mails the matches and saves them as CSV.

  Nothing is sent or written when the query returns no issues. The email is
  only sent when both the sender and the receiver address were given on the
  command line; the CSV file is written whenever there are issues.
  """
  ops = ParseArgs()
  bh = BugHunter(ops)
  issues = bh.GetIssues()
  if not issues:
    return
  if ops.sender_email_address and ops.receiver_email_address:
    bh.SendResultEmail(issues)
  else:
    # Both addresses are documented as required; say so instead of silently
    # skipping the email.
    logging.warning('Sender and/or receiver email address is missing; '
                    'skipping the result email.')
  bh.WriteIssuesToFileInCSV(issues, ops.output_filename)


# Run the bug hunter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
  Main()