aboutsummaryrefslogtreecommitdiffstats
path: root/youtube_dl/extractor/mit.py
diff options
context:
space:
mode:
author: Jeff Smith <whydoubt@yahoo.com> 2013-08-28 14:00:59 -0500
committer: Jeff Smith <whydoubt@yahoo.com> 2013-08-28 14:24:42 -0500
commit: b5ba7b9dcfed5ded96c841a0ebbbf12132de838f (patch)
tree: 2622b78cd616c051fa1dadcef5571f97b72998be /youtube_dl/extractor/mit.py
parent: 2891932bf0a01acc025246438f890dca57f91c6b (diff)
download: youtube-dl-b5ba7b9dcfed5ded96c841a0ebbbf12132de838f.zip
youtube-dl-b5ba7b9dcfed5ded96c841a0ebbbf12132de838f.tar.gz
youtube-dl-b5ba7b9dcfed5ded96c841a0ebbbf12132de838f.tar.bz2
Fix MIT extractor for Python 2.6
The HTML for the MIT page does not parse cleanly for Python 2.6 due to script tags within an actual script element. The offending piece is inside a comment block, so removing all such comment blocks fixes the parsing.
Diffstat (limited to 'youtube_dl/extractor/mit.py')
-rw-r--r-- youtube_dl/extractor/mit.py 16
1 files changed, 7 insertions, 9 deletions
diff --git a/youtube_dl/extractor/mit.py b/youtube_dl/extractor/mit.py
index d09d03e..52be923 100644
--- a/youtube_dl/extractor/mit.py
+++ b/youtube_dl/extractor/mit.py
@@ -25,23 +25,21 @@ class TechTVMITIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- webpage = self._download_webpage(
+ raw_page = self._download_webpage(
'http://techtv.mit.edu/videos/%s' % video_id, video_id)
- embed_page = self._download_webpage(
- 'http://techtv.mit.edu/embeds/%s/' % video_id, video_id,
- note=u'Downloading embed page')
+ clean_page = re.compile(u'<!--.*?-->', re.S).sub(u'', raw_page)
base_url = self._search_regex(r'ipadUrl: \'(.+?cloudfront.net/)',
- embed_page, u'base url')
- formats_json = self._search_regex(r'bitrates: (\[.+?\])', embed_page,
+ raw_page, u'base url')
+ formats_json = self._search_regex(r'bitrates: (\[.+?\])', raw_page,
u'video formats')
formats = json.loads(formats_json)
formats = sorted(formats, key=lambda f: f['bitrate'])
- title = get_element_by_id('edit-title', webpage)
- description = clean_html(get_element_by_id('edit-description', webpage))
+ title = get_element_by_id('edit-title', clean_page)
+ description = clean_html(get_element_by_id('edit-description', clean_page))
thumbnail = self._search_regex(r'playlist:.*?url: \'(.+?)\'',
- embed_page, u'thumbnail', flags=re.DOTALL)
+ raw_page, u'thumbnail', flags=re.DOTALL)
return {'id': video_id,
'title': title,