# Copyright 2013 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

from collections import defaultdict
import posixpath

from future import Future
from path_util import SplitParent
from special_paths import SITE_VERIFICATION_FILE


def _Normalize(file_name, splittext=False):
  '''Returns |file_name| lower-cased with every '.', '-' and '_' removed.

  If |splittext| is True the extension (as determined by posixpath.splitext)
  is dropped before the symbols are removed; otherwise the extension's
  characters are kept and only its dot disappears.
  '''
  normalized = file_name
  if splittext:
    normalized = posixpath.splitext(file_name)[0]
  normalized = normalized.replace('.', '').replace('-', '').replace('_', '')
  return normalized.lower()


def _CommonNormalizedPrefix(first_file, second_file):
  '''Returns the longest common leading substring of the normalized forms of
  |first_file| and |second_file|. The comparison is character-wise, not
  path-component-wise.
  '''
  return posixpath.commonprefix((_Normalize(first_file),
                                 _Normalize(second_file)))


class PathCanonicalizer(object):
  '''Transforms paths into their canonical forms. Since the docserver has had
  many incarnations - e.g. there didn't use to be apps/ - there may be old
  paths lying around the webs. We try to redirect those to where they are now.
  '''
  def __init__(self, file_system, object_store_creator, strip_extensions):
    '''|file_system| is walked to discover the canonical paths.
    |object_store_creator| creates the store used to cache the result of that
    walk, keyed by the file system's identity.
    |strip_extensions| is a list of file extensions (e.g. .html) that should
    be stripped for a path's canonical form.
    '''
    self._cache = object_store_creator.Create(
        PathCanonicalizer,
        category=file_system.GetIdentity())
    self._file_system = file_system
    self._strip_extensions = strip_extensions

  def _LoadCache(self):
    '''Returns the result of chaining |load| onto the cache lookup. Calling
    Get() on that result (as Canonicalize does) yields the tuple
    (canonical_paths, simplified_paths_map). The two are computed from the
    file system and written back to the cache if they were not cached.
    '''
    def load(cached):
      # |canonical_paths| is the pre-calculated set of canonical paths.
      # |simplified_paths_map| is a mapping of simplified file names to a list
      # of full paths that contain them. For example,
      #  - browseraction: [extensions/browserAction.html]
      #  - storage: [apps/storage.html, extensions/storage.html]
      canonical_paths, simplified_paths_map = (
          cached.get('canonical_paths'), cached.get('simplified_paths_map'))

      if canonical_paths is None:
        # Both values are always stored together, so either both are present
        # or neither is.
        assert simplified_paths_map is None
        canonical_paths = set()
        simplified_paths_map = defaultdict(list)
        for base, dirs, files in self._file_system.Walk(''):
          for path in dirs + files:
            path_without_ext, ext = posixpath.splitext(path)
            canonical_path = posixpath.join(base, path_without_ext)
            # Keep the extension unless it is one we were asked to strip. The
            # site verification file always keeps its extension.
            if (ext not in self._strip_extensions or
                path == SITE_VERIFICATION_FILE):
              canonical_path += ext
            canonical_paths.add(canonical_path)
            simplified_paths_map[_Normalize(path, splittext=True)].append(
                canonical_path)
        # Store each list in |simplified_paths_map| sorted by length and then
        # lexicographically, so that the shortest, lexicographically smallest
        # path comes first. values() is used rather than itervalues() so this
        # runs on both Python 2 and Python 3.
        for path_list in simplified_paths_map.values():
          path_list.sort(key=lambda p: (len(p), p))
        self._cache.SetMulti({
          'canonical_paths': canonical_paths,
          'simplified_paths_map': simplified_paths_map,
        })
      else:
        assert simplified_paths_map is not None

      return canonical_paths, simplified_paths_map

    return self._cache.GetMulti(('canonical_paths',
                                 'simplified_paths_map')).Then(load)

  def Canonicalize(self, path):
    '''Returns the canonical path for |path|.

    If |path| is already canonical, or no known file has a name resembling
    its last component, |path| is returned unchanged.
    '''
    canonical_paths, simplified_paths_map = self._LoadCache().Get()

    # Path may already be the canonical path.
    if path in canonical_paths:
      return path

    # Path not found. Our single heuristic: find |base| in the directory
    # structure with the longest common prefix of |path|.
    # NOTE(review): SplitParent presumably returns (parent, base name);
    # confirm against path_util.
    _, base = SplitParent(path)

    # Paths with a non-extension dot separator lose information in
    # _Normalize, so we try paths both with and without the dot to
    # maximize the possibility of finding the right path.
    potential_paths = (
        simplified_paths_map.get(_Normalize(base), []) +
        simplified_paths_map.get(_Normalize(base, splittext=True), []))

    if not potential_paths:
      # There is no file with anything close to that name.
      return path

    # The most likely canonical file is the one with the longest common prefix
    # with |path|. This is slightly weaker than it could be; |path| is
    # compared without symbols, not the simplified form of |path|,
    # which may matter.
    # max() returns the first maximal item, so on a tie the earliest entry of
    # |potential_paths| wins.
    return max(
        potential_paths,
        key=lambda candidate: len(_CommonNormalizedPrefix(candidate, path)))

  def Refresh(self):
    '''Ensures the canonical path data has been loaded (and cached if it was
    not already), returning the same object that _LoadCache returns.
    '''
    return self._LoadCache()