summaryrefslogtreecommitdiffstats
path: root/chrome/common/extensions/docs/server2/path_canonicalizer.py
blob: 04ad9bef200d9c9a142f6618dd0ae89fa60c2a5f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from collections import defaultdict
import posixpath

from future import Future
from path_util import SplitParent
from special_paths import SITE_VERIFICATION_FILE

def _Normalize(file_name, splittext=False):
  normalized = file_name
  if splittext:
    normalized = posixpath.splitext(file_name)[0]
  normalized = normalized.replace('.', '').replace('-', '').replace('_', '')
  return normalized.lower()

def _CommonNormalizedPrefix(first_file, second_file):
  '''Returns the longest common leading string of the normalized forms (see
  _Normalize; extensions are kept) of |first_file| and |second_file|.
  '''
  normalized_pair = [_Normalize(name) for name in (first_file, second_file)]
  return posixpath.commonprefix(normalized_pair)


class PathCanonicalizer(object):
  '''Transforms paths into their canonical forms. Since the docserver has had
  many incarnations - e.g. there didn't use to be apps/ - there may be old
  paths lying around the webs. We try to redirect those to where they are now.

  The set of canonical paths is computed by walking the file system once and
  is then kept in an object store, alongside an index from normalized file
  names to the canonical paths that carry that name.
  '''
  def __init__(self,
               file_system,
               object_store_creator,
               strip_extensions):
    '''|file_system| is the file system whose paths are canonicalized; its
    identity is used as the object store category.
    |object_store_creator| creates the object store used as the cache.
    |strip_extensions| is a list of file extensions (e.g. .html) that should
    be stripped for a path's canonical form.
    '''
    self._cache = object_store_creator.Create(
        PathCanonicalizer, category=file_system.GetIdentity())
    self._file_system = file_system
    self._strip_extensions = strip_extensions

  def _LoadCache(self):
    '''Returns a Future to a (canonical_paths, simplified_paths_map) tuple.

    |canonical_paths| is the pre-calculated set of canonical paths.
    |simplified_paths_map| is a mapping of simplified file names to a list of
    full paths that contain them. For example,
     - browseraction: [extensions/browserAction.html]
     - storage: [apps/storage.html, extensions/storage.html]

    Both values are read from the object store when present there; otherwise
    they are computed by walking the file system and written back to the
    object store.
    '''
    cached_future = self._cache.GetMulti(('canonical_paths',
                                          'simplified_paths_map'))

    def resolve():
      cached = cached_future.Get()
      canonical_paths = cached.get('canonical_paths')
      simplified_paths_map = cached.get('simplified_paths_map')

      # The two entries are always written together below. If only one of
      # them comes back, treat that as a miss and rebuild both rather than
      # handing a None to the caller.
      if canonical_paths is not None and simplified_paths_map is not None:
        return canonical_paths, simplified_paths_map

      canonical_paths = set()
      simplified_paths_map = defaultdict(list)
      for base, dirs, files in self._file_system.Walk(''):
        for path in dirs + files:
          path_without_ext, ext = posixpath.splitext(path)
          canonical_path = posixpath.join(base, path_without_ext)
          # Only extensions listed in |strip_extensions| are dropped, and
          # never for the site verification file.
          if (ext not in self._strip_extensions or
              path == SITE_VERIFICATION_FILE):
            canonical_path += ext
          canonical_paths.add(canonical_path)
          simplified_paths_map[_Normalize(path, splittext=True)].append(
              canonical_path)
      # Store |simplified_paths_map| sorted. Ties in length are broken by
      # taking the shortest, lexicographically smallest path.
      # (values() rather than itervalues() so this runs on Python 2 and 3.)
      for path_list in simplified_paths_map.values():
        path_list.sort(key=lambda p: (len(p), p))
      self._cache.SetMulti({
        'canonical_paths': canonical_paths,
        'simplified_paths_map': simplified_paths_map,
      })
      return canonical_paths, simplified_paths_map

    return Future(callback=resolve)

  def Canonicalize(self, path):
    '''Returns the canonical path for |path|.

    If |path| is already canonical, or no known file has a name resembling
    the last component of |path|, then |path| is returned unchanged.
    Otherwise the known path with a matching name that shares the longest
    normalized prefix with |path| is returned.
    '''
    canonical_paths, simplified_paths_map = self._LoadCache().Get()

    # Path may already be the canonical path.
    if path in canonical_paths:
      return path

    # Path not found. Our single heuristic: find |base| in the directory
    # structure with the longest common prefix of |path|.
    _, base = SplitParent(path)

    # Paths with a non-extension dot separator lose information in
    # _Normalize, so we try paths both with and without the dot to
    # maximize the possibility of finding the right path. When both forms
    # normalize to the same key a single lookup is enough.
    lookup_keys = [_Normalize(base)]
    stem_key = _Normalize(base, splittext=True)
    if stem_key != lookup_keys[0]:
      lookup_keys.append(stem_key)

    potential_paths = []
    for key in lookup_keys:
      potential_paths.extend(simplified_paths_map.get(key, []))

    if not potential_paths:
      # There is no file with anything close to that name.
      return path

    # The most likely canonical file is the one with the longest common prefix
    # with |path|. This is slightly weaker than it could be; |path| is
    # compared without symbols, not the simplified form of |path|,
    # which may matter.
    # Only a strictly longer prefix replaces the current best, so among equal
    # candidates the earliest one in |potential_paths| wins.
    max_prefix = potential_paths[0]
    max_prefix_length = len(_CommonNormalizedPrefix(max_prefix, path))
    for path_for_file in potential_paths[1:]:
      prefix_length = len(_CommonNormalizedPrefix(path_for_file, path))
      if prefix_length > max_prefix_length:
        max_prefix, max_prefix_length = path_for_file, prefix_length

    return max_prefix

  def Cron(self):
    '''Returns the Future from _LoadCache(); resolving it populates the cache
    if it is not populated already.
    '''
    return self._LoadCache()