diff options
author | Rogério Brito <rbrito@ime.usp.br> | 2013-12-26 16:41:27 -0200 |
---|---|---|
committer | Rogério Brito <rbrito@ime.usp.br> | 2013-12-26 16:41:27 -0200 |
commit | ca4d08063804fb264eb0ae9cc57894198f66e1fb (patch) | |
tree | 417325d8523de104149ebbf967ffc14991921072 /youtube_dl/extractor/appletrailers.py | |
parent | b238854ce845f3796daac74edab2e8a373e8ba1a (diff) | |
download | youtube-dl-ca4d08063804fb264eb0ae9cc57894198f66e1fb.zip youtube-dl-ca4d08063804fb264eb0ae9cc57894198f66e1fb.tar.gz youtube-dl-ca4d08063804fb264eb0ae9cc57894198f66e1fb.tar.bz2 |
Imported Upstream version 2013.12.23
Diffstat (limited to 'youtube_dl/extractor/appletrailers.py')
-rw-r--r-- | youtube_dl/extractor/appletrailers.py | 25 |
1 files changed, 12 insertions, 13 deletions
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index 4befff3..ef5644a 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -1,5 +1,4 @@
 import re
-import xml.etree.ElementTree
 import json
 
 from .common import InfoExtractor
@@ -10,7 +9,7 @@ from ..utils import (
 
 
 class AppleTrailersIE(InfoExtractor):
-    _VALID_URL = r'https?://(?:www\.)?trailers.apple.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)'
+    _VALID_URL = r'https?://(?:www\.)?trailers\.apple\.com/trailers/(?P<company>[^/]+)/(?P<movie>[^/]+)'
     _TEST = {
         u"url": u"http://trailers.apple.com/trailers/wb/manofsteel/",
         u"playlist": [
@@ -65,18 +64,18 @@ class AppleTrailersIE(InfoExtractor):
         uploader_id = mobj.group('company')
 
         playlist_url = compat_urlparse.urljoin(url, u'includes/playlists/itunes.inc')
-        playlist_snippet = self._download_webpage(playlist_url, movie)
-        playlist_cleaned = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', playlist_snippet)
-        playlist_cleaned = re.sub(r'<img ([^<]*?)>', r'<img \1/>', playlist_cleaned)
-        # The ' in the onClick attributes are not escaped, it couldn't be parsed
-        # with xml.etree.ElementTree.fromstring
-        # like: http://trailers.apple.com/trailers/wb/gravity/
-        def _clean_json(m):
-            return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
-        playlist_cleaned = re.sub(self._JSON_RE, _clean_json, playlist_cleaned)
-        playlist_html = u'<html>' + playlist_cleaned + u'</html>'
+        def fix_html(s):
+            s = re.sub(r'(?s)<script[^<]*?>.*?</script>', u'', s)
+            s = re.sub(r'<img ([^<]*?)>', r'<img \1/>', s)
+            # The ' in the onClick attributes are not escaped, it couldn't be parsed
+            # like: http://trailers.apple.com/trailers/wb/gravity/
+            def _clean_json(m):
+                return u'iTunes.playURL(%s);' % m.group(1).replace('\'', '&#39;')
+            s = re.sub(self._JSON_RE, _clean_json, s)
+            s = u'<html>' + s + u'</html>'
+            return s
+        doc = self._download_xml(playlist_url, movie, transform_source=fix_html)
 
-        doc = xml.etree.ElementTree.fromstring(playlist_html)
         playlist = []
         for li in doc.findall('./div/ul/li'):
             on_click = li.find('.//a').attrib['onClick']