summaryrefslogtreecommitdiffstats
path: root/chrome/common/extensions/docs/server2/new_github_file_system.py
blob: 5aa1c2d9d82bdda36e22785ea0b893f16bcc0f52 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

import json
import logging
from cStringIO import StringIO
import posixpath
import traceback
from zipfile import ZipFile

import appengine_blobstore as blobstore
from appengine_url_fetcher import AppEngineUrlFetcher
from appengine_wrappers import urlfetch
from docs_server_utils import StringIdentity
from file_system import FileNotFoundError, FileSystem, FileSystemError, StatInfo
from future import Future
from object_store_creator import ObjectStoreCreator
from path_util import AssertIsDirectory, IsDirectory
import url_constants


# Namespace passed to the blobstore whenever a repository zipball is read from
# or written to it (see GithubFileSystem._EnsureRepoZip).
_GITHUB_REPOS_NAMESPACE = 'GithubRepos'


def _LoadCredentials(object_store_creator):
  '''Returns a (username, password) tuple read from the 'password' category
  object store that |object_store_creator| creates for GithubFileSystem. The
  store is created with start_empty=False and app_version=None. Each element
  is whatever the fetched data's get() yields for its key.
  '''
  credentials = object_store_creator.Create(
      GithubFileSystem,
      app_version=None,
      category='password',
      start_empty=False).GetMulti(('username', 'password')).Get()
  username = credentials.get('username')
  password = credentials.get('password')
  return username, password


class _GithubZipFile(object):
  '''A view of a ZipFile with a more convenient interface which ignores the
  'zipball' prefix that all paths have. The zip files that come straight from
  GitHub have paths like ['zipball/foo.txt', 'zipball/bar.txt'] but we only
  care about ['foo.txt', 'bar.txt'].
  '''

  @classmethod
  def Create(cls, repo_name, blob):
    '''Parses |blob| as a zip archive and returns a _GithubZipFile over it.

    Returns None, after logging a warning that mentions |repo_name|, if |blob|
    cannot be opened as a zip, if the zip has no entries, or if its entries do
    not all live under one and the same top-level directory.
    '''
    try:
      zipball = ZipFile(StringIO(blob))
    except Exception:
      # Deliberately broad: whatever went wrong, the blob is unusable as a zip.
      # Unlike a bare "except:" this doesn't swallow KeyboardInterrupt or
      # SystemExit.
      logging.warning('zipball "%s" is not a valid zip' % repo_name)
      return None

    names = zipball.namelist()
    if not names:
      logging.warning('zipball "%s" is empty' % repo_name)
      return None

    name_prefix = None  # probably 'zipball'
    paths = []
    for name in names:
      prefix, slash, path = name.partition('/')
      if not slash:
        # An entry outside of any top-level directory has no prefix to strip;
        # treat the archive as malformed rather than crashing on it.
        logging.warning('zipball "%s" has a name without a prefix: %s' %
                        (repo_name, name))
        return None
      # Compare against None (not truthiness) so that an empty first prefix
      # still takes part in the consistency check.
      if name_prefix is not None and prefix != name_prefix:
        logging.warning('zipball "%s" has names with inconsistent prefix: %s' %
                        (repo_name, names))
        return None
      name_prefix = prefix
      paths.append(path)
    return cls(zipball, name_prefix, paths)

  def __init__(self, zipball, name_prefix, paths):
    '''|zipball| is the underlying ZipFile, |name_prefix| the top-level
    directory shared by all of its entries, and |paths| the entry names with
    that directory (and the following '/') stripped.
    '''
    self._zipball = zipball
    self._name_prefix = name_prefix
    self._paths = paths

  def Paths(self):
    '''Return all file paths in this zip file.
    '''
    return self._paths

  def List(self, path):
    '''Returns all files within a directory at |path|. Not recursive. Paths
    are returned relative to |path|.
    '''
    AssertIsDirectory(path)
    prefix_len = len(path)
    # A direct child has no '/' in its relative name, other than the trailing
    # one which marks a subdirectory.
    return [p[prefix_len:] for p in self._paths
            if p != path and
               p.startswith(path) and
               '/' not in p[prefix_len:].rstrip('/')]

  def Read(self, path):
    '''Returns the contents of |path|. Raises a KeyError if it doesn't exist.
    '''
    return self._zipball.read(posixpath.join(self._name_prefix, path))


class GithubFileSystem(FileSystem):
  '''Allows reading from a github.com repository.
  '''
  @staticmethod
  def Create(owner, repo, object_store_creator):
    '''Creates a GithubFileSystem that corresponds to a single github repository
    specified by |owner| and |repo|.
    '''
    return GithubFileSystem(
        url_constants.GITHUB_REPOS,
        owner,
        repo,
        object_store_creator,
        AppEngineUrlFetcher)

  @staticmethod
  def ForTest(repo, fake_fetcher, path=None, object_store_creator=None):
    '''Creates a GithubFileSystem that can be used for testing. It reads zip
    files and commit data from server2/test_data/github_file_system/test_owner
    instead of github.com. It reads from files specified by |repo|.
    '''
    return GithubFileSystem(
        path if path is not None else 'test_data/github_file_system',
        'test_owner',
        repo,
        object_store_creator or ObjectStoreCreator.ForTest(),
        fake_fetcher)

  def __init__(self, base_url, owner, repo, object_store_creator, Fetcher):
    '''|base_url|, |owner| and |repo| are joined to form the repository URL.
    |object_store_creator| creates the credential, up-to-date and stat stores.
    |Fetcher| is called with the repository URL; the object it returns is used
    through its Fetch and FetchAsync methods.
    '''
    self._repo_key = posixpath.join(owner, repo)
    self._repo_url = posixpath.join(base_url, owner, repo)
    self._username, self._password = _LoadCredentials(object_store_creator)
    self._blobstore = blobstore.AppEngineBlobstore()
    self._fetcher = Fetcher(self._repo_url)
    # Stores whether the github is up-to-date. This will either be True or
    # empty, the emptiness most likely due to this being a cron run.
    self._up_to_date_cache = object_store_creator.Create(
        GithubFileSystem, category='up-to-date')
    # Caches the zip file's stat. Overrides start_empty=False and use
    # |self._up_to_date_cache| to determine whether we need to refresh.
    self._stat_cache = object_store_creator.Create(
        GithubFileSystem, category='stat-cache', start_empty=False)

    # Created lazily in |_EnsureRepoZip|.
    self._repo_zip = None

  def _EnsureRepoZip(self):
    '''Initializes |self._repo_zip| if it hasn't already been (i.e. if
    _EnsureRepoZip has never been called before). In that case |self._repo_zip|
    will be set to a Future of _GithubZipFile and the fetch process started,
    whether that be from a blobstore or if necessary all the way from GitHub.
    '''
    if self._repo_zip is not None:
      return

    repo_key, repo_url, username, password = (
        self._repo_key, self._repo_url, self._username, self._password)

    def fetch_from_blobstore(version):
      '''Returns a Future which resolves to the _GithubZipFile for this repo
      fetched from blobstore. Falls back to fetching from GitHub if the blob is
      missing or cannot be parsed.
      '''
      blob = None
      try:
        blob = self._blobstore.Get(repo_url, _GITHUB_REPOS_NAMESPACE)
      except blobstore.BlobNotFoundError:
        pass

      if blob is None:
        logging.warning('No blob for %s found in datastore' % repo_key)
        return fetch_from_github(version)

      repo_zip = _GithubZipFile.Create(repo_key, blob)
      if repo_zip is None:
        logging.warning('Blob for %s was corrupted in blobstore!?' % repo_key)
        return fetch_from_github(version)

      return Future(value=repo_zip)

    def fetch_from_github(version):
      '''Returns a Future which resolves to the _GithubZipFile for this repo
      fetched new from GitHub, then writes it to blobstore and |version| to the
      stat caches.
      '''
      github_future = self._fetcher.FetchAsync(
          'zipball', username=username, password=password)
      def resolve():
        try:
          blob = github_future.Get().content
        except urlfetch.DownloadError:
          raise FileSystemError('Failed to download repo %s file from %s' %
                                (repo_key, repo_url))

        repo_zip = _GithubZipFile.Create(repo_key, blob)
        if repo_zip is None:
          raise FileSystemError('Blob for %s was fetched corrupted from %s' %
                                (repo_key, repo_url))

        # Only persist the blob and its version once it is known to parse.
        self._blobstore.Set(self._repo_url, blob, _GITHUB_REPOS_NAMESPACE)
        self._up_to_date_cache.Set(repo_key, True)
        self._stat_cache.Set(repo_key, version)
        return repo_zip
      return Future(callback=resolve)

    # To decide whether we need to re-stat, and from there whether to re-fetch,
    # make use of ObjectStore's start-empty configuration. If
    # |object_store_creator| is configured to start empty then our creator
    # wants to refresh (e.g. running a cron), so fetch the live stat from
    # GitHub. If the stat hasn't changed since last time then no reason to
    # re-fetch from GitHub, just take from blobstore.

    cached_version = self._stat_cache.Get(repo_key).Get()
    if self._up_to_date_cache.Get(repo_key).Get() is None:
      # This is either a cron or an instance where a cron has never been run.
      live_version = self._FetchLiveVersion(username, password)
      if cached_version != live_version:
        # Note: branch intentionally triggered if |cached_version| is None.
        logging.info('%s has changed, fetching from GitHub.' % repo_url)
        self._repo_zip = fetch_from_github(live_version)
      else:
        # Already up to date. Fetch from blobstore. No need to set up-to-date
        # to True here since it'll already be set for instances, and it'll
        # never be set for crons.
        logging.info('%s is up to date.' % repo_url)
        self._repo_zip = fetch_from_blobstore(cached_version)
    else:
      # Instance where cron has been run. It should be in blobstore.
      self._repo_zip = fetch_from_blobstore(cached_version)

    assert self._repo_zip is not None

  def _FetchLiveVersion(self, username, password):
    '''Fetches the current repository version from github.com and returns it.
    The version is a 'sha' hash value. Raises a FileSystemError if the response
    is not JSON or has no 'sha' key.
    '''
    # TODO(kalman): Do this asynchronously (use FetchAsync).
    result = self._fetcher.Fetch(
        'commits/HEAD', username=username, password=password)

    try:
      return json.loads(result.content)['sha']
    except (KeyError, ValueError):
      raise FileSystemError('Error parsing JSON from repo %s: %s' %
                            (self._repo_url, traceback.format_exc()))

  def Refresh(self):
    '''Returns the result of reading the root path ('').
    '''
    return self.ReadSingle('')

  def Read(self, paths, skip_not_found=False):
    '''Returns a Future of a dictionary mapping |paths| to the contents of the
    file at each path. If path ends with a '/', it is treated as a directory
    and is mapped to a list of filenames in that directory.

    If a path is not in the repository then, when the Future is resolved, it
    is left out of the result if |skip_not_found| is True; otherwise (the
    default) a FileNotFoundError is raised.
    '''
    self._EnsureRepoZip()
    def resolve():
      repo_zip = self._repo_zip.Get()
      # Build the lookup set once rather than scanning the list for each path.
      known_paths = set(repo_zip.Paths())
      reads = {}
      for path in paths:
        if path not in known_paths:
          if skip_not_found:
            continue
          raise FileNotFoundError('"%s": %s not found' %
                                  (self._repo_key, path))
        if IsDirectory(path):
          reads[path] = repo_zip.List(path)
        else:
          reads[path] = repo_zip.Read(path)
      return reads
    return Future(callback=resolve)

  def Stat(self, path):
    '''Stats |path| returning its version as a StatInfo object. If |path| ends
    with a '/', it is assumed to be a directory and the StatInfo object returned
    includes child_versions for all paths in the directory.

    File paths do not include the name of the zip file, which is arbitrary and
    useless to consumers.

    Every path shares a single version: the value held in the stat cache for
    this repository, which is written when the zipball is fetched from GitHub.

    Raises a FileNotFoundError if |path| is not in the repository.
    '''
    self._EnsureRepoZip()
    repo_zip = self._repo_zip.Get()

    if path not in repo_zip.Paths():
      raise FileNotFoundError('"%s" does not contain file "%s"' %
                              (self._repo_key, path))

    version = self._stat_cache.Get(self._repo_key).Get()
    assert version is not None, ('There was a zipball in datastore; there '
                                 'should be a version cached for it')

    stat_info = StatInfo(version)
    if IsDirectory(path):
      stat_info.child_versions = dict((p, StatInfo(version))
                                      for p in repo_zip.List(path))
    return stat_info

  def GetIdentity(self):
    '''Returns a string identity derived from the class name and the
    owner/repo key.
    '''
    return '%s' % StringIdentity(self.__class__.__name__ + self._repo_key)

  def __repr__(self):
    return '%s(key=%s, url=%s)' % (type(self).__name__,
                                   self._repo_key,
                                   self._repo_url)