tools/telemetry/telemetry/page/page_set_archive_info.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196

# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import json
import logging
import os
import re
import shutil
import tempfile

from telemetry import page as page_module
from telemetry.util import cloud_storage


def AssertValidCloudStorageBucket(bucket):
  """Raises ValueError unless |bucket| is None or a known cloud storage bucket.

  Args:
    bucket: None, or one of cloud_storage.PUBLIC_BUCKET,
        cloud_storage.PARTNER_BUCKET or cloud_storage.INTERNAL_BUCKET.

  Raises:
    ValueError: if |bucket| is any other value.
  """
  allowed_buckets = (None,
                     cloud_storage.PUBLIC_BUCKET,
                     cloud_storage.PARTNER_BUCKET,
                     cloud_storage.INTERNAL_BUCKET)
  if bucket in allowed_buckets:
    return
  raise ValueError("Cloud storage privacy bucket %s is invalid" % bucket)


# TODO(chrishenry): Rename this (and module) to wpr_archive_info.WprArchiveInfo
# and move to telemetry.user_story or telemetry.wpr or telemetry.core.
class PageSetArchiveInfo(object):
  """Maps the pages of a page set to the .wpr archive files that record them.

  The mapping is read from, and written back to, a JSON metadata file whose
  'archives' entry maps each .wpr file name (relative to the metadata file's
  directory) to the list of page names stored in that archive.
  """

  def __init__(self, file_path, data, bucket, ignore_archive=False):
    """Builds the page <-> archive maps and optionally fetches the archives.

    Args:
      file_path: Path of the JSON metadata file. Archive file names are
          resolved relative to its directory, which is created if missing.
      data: Dict with an 'archives' key mapping .wpr file names to lists of
          page names. The 'archives' dict is kept by reference and is mutated
          by later recordings.
      bucket: Cloud storage bucket to fetch archives from, or None.
      ignore_archive: If True, no archive is downloaded.

    Raises:
      ValueError: if |bucket| is rejected by AssertValidCloudStorageBucket.
    """
    AssertValidCloudStorageBucket(bucket)
    self._file_path = file_path
    self._base_dir = os.path.dirname(file_path)
    self._bucket = bucket

    # Ensure directory exists. A bare file name has an empty dirname, which
    # stands for the current directory and would make os.makedirs raise.
    if self._base_dir and not os.path.exists(self._base_dir):
      os.makedirs(self._base_dir)

    # Download all .wpr files.
    if not ignore_archive:
      if not self._bucket:
        logging.warning('page_set in %s has no bucket specified, and cannot be '
                        'downloaded from cloud_storage.', file_path)
      else:
        for archive_path in data['archives']:
          archive_path = self._WprFileNameToPath(archive_path)
          try:
            cloud_storage.GetIfChanged(archive_path, bucket)
          except (cloud_storage.CredentialsError,
                  cloud_storage.PermissionError):
            if os.path.exists(archive_path):
              # If the archive exists, assume the user recorded their own and
              # simply warn.
              logging.warning('Need credentials to update WPR archive: %s',
                              archive_path)
            else:
              # Still best-effort (no raise), but do not hide the fact that
              # the archive is unavailable.
              logging.warning('Need credentials to download WPR archive and '
                              'no local copy exists: %s', archive_path)

    # Map from the relative path (as it appears in the metadata file) of the
    # .wpr file to a list of page names it supports.
    self._wpr_file_to_page_names = data['archives']

    # Map from the page name to a relative path (as it appears in the metadata
    # file) of the .wpr file.
    self._page_name_to_wpr_file = dict()
    # Find out the wpr file names for each page.
    for wpr_file in data['archives']:
      page_names = data['archives'][wpr_file]
      for page_name in page_names:
        self._page_name_to_wpr_file[page_name] = wpr_file
    # Path of the recording in progress, set by AddNewTemporaryRecording.
    self.temp_target_wpr_file_path = None

  @classmethod
  def FromFile(cls, file_path, bucket, ignore_archive=False):
    """Creates an instance from the JSON metadata file at |file_path|.

    If the file does not exist, the instance starts with no archives.
    """
    if os.path.exists(file_path):
      with open(file_path, 'r') as f:
        data = json.load(f)
        return cls(file_path, data, bucket, ignore_archive=ignore_archive)
    return cls(file_path, {'archives': {}}, bucket,
               ignore_archive=ignore_archive)

  def WprFilePathForUserStory(self, story):
    """Returns the absolute .wpr path to use for |story|, or None if unknown.

    While a temporary recording is active, its path is returned for every
    story.
    """
    if self.temp_target_wpr_file_path:
      return self.temp_target_wpr_file_path
    wpr_file = self._page_name_to_wpr_file.get(story.display_name, None)
    if wpr_file is None and isinstance(story, page_module.Page):
      # Some old page sets always use the URL to identify a page rather than the
      # display_name, so try to look for that.
      wpr_file = self._page_name_to_wpr_file.get(story.url, None)
    if wpr_file:
      return self._WprFileNameToPath(wpr_file)
    return None

  def AddNewTemporaryRecording(self, temp_wpr_file_path=None):
    """Sets the temporary recording path, creating a temp file if none given."""
    if temp_wpr_file_path is None:
      temp_wpr_file_handle, temp_wpr_file_path = tempfile.mkstemp()
      # Only the path is needed; release the descriptor right away.
      os.close(temp_wpr_file_handle)
    self.temp_target_wpr_file_path = temp_wpr_file_path

  def AddRecordedPages(self, pages, upload_to_cloud_storage=False):
    """Promotes the temporary recording to a new archive holding |pages|.

    With no pages, the temporary recording is simply deleted. Otherwise the
    recording is moved to the next archive name, its .sha1 file and the
    metadata file are written, archives left without pages are removed, and
    the new archive is optionally uploaded.

    Args:
      pages: Pages (objects with a display_name) contained in the recording.
      upload_to_cloud_storage: If True, upload the new archive to the bucket;
          upload failures are logged, not raised.
    """
    if not pages:
      os.remove(self.temp_target_wpr_file_path)
      return

    (target_wpr_file, target_wpr_file_path) = self._NextWprFileName()
    for page in pages:
      self._SetWprFileForPage(page.display_name, target_wpr_file)
    shutil.move(self.temp_target_wpr_file_path, target_wpr_file_path)

    # Update the hash file.
    with open(target_wpr_file_path + '.sha1', 'wb') as f:
      f.write(cloud_storage.CalculateHash(target_wpr_file_path))

    self._WriteToFile()
    self._DeleteAbandonedWprFiles()

    # Upload to cloud storage
    if upload_to_cloud_storage:
      if not self._bucket:
        logging.warning('PageSet must have bucket specified to upload pages to'
                        ' cloud storage.')
        return
      try:
        cloud_storage.Insert(self._bucket, target_wpr_file,
                             target_wpr_file_path)
      except cloud_storage.CloudStorageError as e:
        # Pass both values as logging arguments: applying '%' to a single
        # value with two placeholders would raise TypeError here.
        logging.warning('Failed to upload wpr file %s to cloud storage. '
                        'Error:%s', target_wpr_file_path, e)

  def _DeleteAbandonedWprFiles(self):
    """Drops archives with no pages from the metadata and deletes their files."""
    # Update the metadata so that the abandoned wpr files don't have empty page
    # name arrays.
    abandoned_wpr_files = self._AbandonedWprFiles()
    for wpr_file in abandoned_wpr_files:
      del self._wpr_file_to_page_names[wpr_file]
      # Don't fail if we're unable to delete some of the files.
      wpr_file_path = self._WprFileNameToPath(wpr_file)
      try:
        os.remove(wpr_file_path)
      except Exception:
        logging.warning('Failed to delete file: %s', wpr_file_path)

  def _AbandonedWprFiles(self):
    """Returns the list of archive names whose page list is empty."""
    return [wpr_file
            for wpr_file, page_names in self._wpr_file_to_page_names.items()
            if not page_names]

  def _WriteToFile(self):
    """Writes the metadata into the file passed as constructor parameter."""
    metadata = dict()
    metadata['description'] = (
        'Describes the Web Page Replay archives for a page set. Don\'t edit by '
        'hand! Use record_wpr for updating.')
    metadata['archives'] = self._wpr_file_to_page_names.copy()
    # Don't write data for abandoned archives.
    abandoned_wpr_files = self._AbandonedWprFiles()
    for wpr_file in abandoned_wpr_files:
      del metadata['archives'][wpr_file]

    with open(self._file_path, 'w') as f:
      json.dump(metadata, f, indent=4)

  def _WprFileNameToPath(self, wpr_file):
    """Returns the absolute path of |wpr_file| under the metadata directory."""
    return os.path.abspath(os.path.join(self._base_dir, wpr_file))

  def _NextWprFileName(self):
    """Creates a new file name for a wpr archive file.

    Returns:
      A (file name, absolute path) tuple whose number is one above the highest
      number among the known archives.

    Raises:
      Exception: if a known archive name is not of the form
          "<base>_<number>.wpr" or the archives do not share one base.
    """
    # The names are of the format "some_thing_number.wpr". Read the numbers.
    highest_number = -1
    base = None
    for wpr_file in self._wpr_file_to_page_names:
      match = re.match(r'(?P<BASE>.*)_(?P<NUMBER>[0-9]+)\.wpr', wpr_file)
      if not match:
        raise Exception('Illegal wpr file name ' + wpr_file)
      highest_number = max(int(match.group('NUMBER')), highest_number)
      if base and match.group('BASE') != base:
        raise Exception('Illegal wpr file name ' + wpr_file +
                        ', doesn\'t begin with ' + base)
      base = match.group('BASE')
    if not base:
      # If we're creating a completely new info file, use the base name of the
      # page set file.
      base = os.path.splitext(os.path.basename(self._file_path))[0]
    new_filename = '%s_%03d.wpr' % (base, highest_number + 1)
    return new_filename, self._WprFileNameToPath(new_filename)

  def _SetWprFileForPage(self, page_name, wpr_file):
    """For modifying the metadata when we're going to record a new archive."""
    old_wpr_file = self._page_name_to_wpr_file.get(page_name, None)
    if old_wpr_file:
      # A page lives in a single archive; detach it from the previous one.
      self._wpr_file_to_page_names[old_wpr_file].remove(page_name)
    self._page_name_to_wpr_file[page_name] = wpr_file
    if wpr_file not in self._wpr_file_to_page_names:
      self._wpr_file_to_page_names[wpr_file] = []
    self._wpr_file_to_page_names[wpr_file].append(page_name)