diff options
author | Wolfgang Wiedmeyer <wolfgit@wiedmeyer.de> | 2017-07-02 12:51:03 +0200 |
---|---|---|
committer | Wolfgang Wiedmeyer <wolfgit@wiedmeyer.de> | 2017-07-02 12:51:03 +0200 |
commit | dc2f94d337af01e817b96920fed6b80368f17431 (patch) | |
tree | 1c379d9b98d2b56aec139e5136199053a7c67d3c /youtube_dl/extractor/youku.py | |
parent | 572b7af003229a08abc7bb62fc4b91f031e50733 (diff) | |
parent | 02d61a65e27c9f186ea24bce8e9f0c65f4ec508a (diff) | |
download | youtube-dl-dc2f94d337af01e817b96920fed6b80368f17431.zip youtube-dl-dc2f94d337af01e817b96920fed6b80368f17431.tar.gz youtube-dl-dc2f94d337af01e817b96920fed6b80368f17431.tar.bz2 |
Merge branch 'upstream'
Diffstat (limited to 'youtube_dl/extractor/youku.py')
-rw-r--r-- | youtube_dl/extractor/youku.py | 129 |
1 files changed, 99 insertions, 30 deletions
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 37be440..dcce15d 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -10,7 +10,11 @@ import time from .common import InfoExtractor from ..utils import ( ExtractorError, - get_element_by_attribute, + get_element_by_class, + js_to_json, + str_or_none, + strip_jsonp, + urljoin, ) @@ -19,7 +23,9 @@ class YoukuIE(InfoExtractor): IE_DESC = '优酷' _VALID_URL = r'''(?x) (?: - http://(?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)| + https?://( + (?:v|player)\.youku\.com/(?:v_show/id_|player\.php/sid/)| + video\.tudou\.com/v/)| youku:) (?P<id>[A-Za-z0-9]+)(?:\.html|/v\.swf|) ''' @@ -31,6 +37,12 @@ class YoukuIE(InfoExtractor): 'id': 'XMTc1ODE5Njcy', 'title': '★Smile﹗♡ Git Fresh -Booty Music舞蹈.', 'ext': 'mp4', + 'duration': 74.73, + 'thumbnail': r're:^https?://.*', + 'uploader': '。躲猫猫、', + 'uploader_id': '36017967', + 'uploader_url': 'http://i.youku.com/u/UMTQ0MDcxODY4', + 'tags': list, } }, { 'url': 'http://player.youku.com/player.php/sid/XNDgyMDQ2NTQw/v.swf', @@ -41,6 +53,12 @@ class YoukuIE(InfoExtractor): 'id': 'XODgxNjg1Mzk2', 'ext': 'mp4', 'title': '武媚娘传奇 85', + 'duration': 1999.61, + 'thumbnail': r're:^https?://.*', + 'uploader': '疯狂豆花', + 'uploader_id': '62583473', + 'uploader_url': 'http://i.youku.com/u/UMjUwMzMzODky', + 'tags': list, }, }, { 'url': 'http://v.youku.com/v_show/id_XMTI1OTczNDM5Mg==.html', @@ -48,6 +66,12 @@ class YoukuIE(InfoExtractor): 'id': 'XMTI1OTczNDM5Mg', 'ext': 'mp4', 'title': '花千骨 04', + 'duration': 2363, + 'thumbnail': r're:^https?://.*', + 'uploader': '放剧场-花千骨', + 'uploader_id': '772849359', + 'uploader_url': 'http://i.youku.com/u/UMzA5MTM5NzQzNg==', + 'tags': list, }, }, { 'url': 'http://v.youku.com/v_show/id_XNjA1NzA2Njgw.html', @@ -56,6 +80,12 @@ class YoukuIE(InfoExtractor): 'id': 'XNjA1NzA2Njgw', 'ext': 'mp4', 'title': '邢義田复旦讲座之想象中的胡人—从“左衽孔子”说起', + 'duration': 7264.5, + 'thumbnail': r're:^https?://.*', + 'uploader': 'FoxJin1006', + 'uploader_id': '322014285', + 'uploader_url': 'http://i.youku.com/u/UMTI4ODA1NzE0MA==', + 'tags': list, }, 'params': { 'videopassword': '100600', @@ -67,7 +97,29 @@ class YoukuIE(InfoExtractor): 'id': 'XOTUxMzg4NDMy', 'ext': 'mp4', 'title': '我的世界☆明月庄主☆车震猎杀☆杀人艺术Minecraft', + 'duration': 702.08, + 'thumbnail': r're:^https?://.*', + 'uploader': '明月庄主moon', + 'uploader_id': '38465621', + 'uploader_url': 'http://i.youku.com/u/UMTUzODYyNDg0', + 'tags': list, }, + }, { + 'url': 'http://video.tudou.com/v/XMjIyNzAzMTQ4NA==.html?f=46177805', + 'info_dict': { + 'id': 'XMjIyNzAzMTQ4NA', + 'ext': 'mp4', + 'title': '卡马乔国足开大脚长传冲吊集锦', + 'duration': 289, + 'thumbnail': r're:^https?://.*', + 'uploader': '阿卜杜拉之星', + 'uploader_id': '2382249', + 'uploader_url': 'http://i.youku.com/u/UOTUyODk5Ng==', + 'tags': list, + }, + }, { + 'url': 'http://video.tudou.com/v/XMjE4ODI3OTg2MA==.html', + 'only_matching': True, }] @staticmethod @@ -104,7 +156,7 @@ class YoukuIE(InfoExtractor): # request basic data basic_data_params = { 'vid': video_id, - 'ccode': '0401', + 'ccode': '0402' if 'tudou.com' in url else '0401', 'client_ip': '192.168.1.1', 'utid': cna, 'client_ts': time.time() / 1000, @@ -139,7 +191,8 @@ class YoukuIE(InfoExtractor): raise ExtractorError(msg) # get video title - title = data['video']['title'] + video_data = data['video'] + title = video_data['title'] formats = [{ 'url': stream['m3u8_url'], @@ -156,53 +209,69 @@ class YoukuIE(InfoExtractor): 'id': video_id, 'title': title, 'formats': formats, + 'duration': video_data.get('seconds'), + 'thumbnail': video_data.get('logo'), + 'uploader': video_data.get('username'), + 'uploader_id': str_or_none(video_data.get('userid')), + 'uploader_url': data.get('uploader', {}).get('homepage'), + 'tags': video_data.get('tags'), } class YoukuShowIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?youku\.com/show_page/id_(?P<id>[0-9a-z]+)\.html' + _VALID_URL = r'https?://list\.youku\.com/show/id_(?P<id>[0-9a-z]+)\.html' IE_NAME = 'youku:show' _TEST = { - 'url': 'http://www.youku.com/show_page/id_zc7c670be07ff11e48b3f.html', + 'url': 'http://list.youku.com/show/id_zc7c670be07ff11e48b3f.html', 'info_dict': { 'id': 'zc7c670be07ff11e48b3f', 'title': '花千骨 未删减版', - 'description': 'md5:578d4f2145ae3f9128d9d4d863312910', + 'description': 'md5:a1ae6f5618571bbeb5c9821f9c81b558', }, 'playlist_count': 50, } _PAGE_SIZE = 40 - def _find_videos_in_page(self, webpage): - videos = re.findall( - r'<li><a[^>]+href="(?P<url>https?://v\.youku\.com/[^"]+)"[^>]+title="(?P<title>[^"]+)"', webpage) - return [ - self.url_result(video_url, YoukuIE.ie_key(), title) - for video_url, title in videos] - def _real_extract(self, url): show_id = self._match_id(url) webpage = self._download_webpage(url, show_id) - entries = self._find_videos_in_page(webpage) - - playlist_title = self._html_search_regex( - r'<span[^>]+class="name">([^<]+)</span>', webpage, 'playlist title', fatal=False) - detail_div = get_element_by_attribute('class', 'detail', webpage) or '' - playlist_description = self._html_search_regex( - r'<span[^>]+style="display:none"[^>]*>([^<]+)</span>', - detail_div, 'playlist description', fatal=False) - - for idx in itertools.count(1): - episodes_page = self._download_webpage( - 'http://www.youku.com/show_episode/id_%s.html' % show_id, - show_id, query={'divid': 'reload_%d' % (idx * self._PAGE_SIZE + 1)}, - note='Downloading episodes page %d' % idx) - new_entries = self._find_videos_in_page(episodes_page) + entries = [] + page_config = self._parse_json(self._search_regex( + r'var\s+PageConfig\s*=\s*({.+});', webpage, 'page config'), + show_id, transform_source=js_to_json) + for idx in itertools.count(0): + if idx == 0: + playlist_data_url = 'http://list.youku.com/show/module' + query = {'id': page_config['showid'], 'tab': 'point'} + else: + playlist_data_url = 'http://list.youku.com/show/point' + query = { + 'id': page_config['showid'], + 'stage': 'reload_%d' % (self._PAGE_SIZE * idx + 1), + } + query['callback'] = 'cb' + playlist_data = self._download_json( + playlist_data_url, show_id, query=query, + note='Downloading playlist data page %d' % (idx + 1), + transform_source=lambda s: js_to_json(strip_jsonp(s)))['html'] + video_urls = re.findall( + r'<div[^>]+class="p-thumb"[^<]+<a[^>]+href="([^"]+)"', + playlist_data) + new_entries = [ + self.url_result(urljoin(url, video_url), YoukuIE.ie_key()) + for video_url in video_urls] entries.extend(new_entries) if len(new_entries) < self._PAGE_SIZE: break - return self.playlist_result(entries, show_id, playlist_title, playlist_description) + desc = self._html_search_meta('description', webpage, fatal=False) + playlist_title = desc.split(',')[0] if desc else None + detail_li = get_element_by_class('p-intro', webpage) + playlist_description = get_element_by_class( + 'intro-more', detail_li) if detail_li else None + + return self.playlist_result( + entries, show_id, playlist_title, playlist_description) |