diff options
Diffstat (limited to 'youtube_dl/extractor')
135 files changed, 5095 insertions, 2195 deletions
diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py new file mode 100644 index 0000000..66caf6a --- /dev/null +++ b/youtube_dl/extractor/adn.py @@ -0,0 +1,136 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import json +import os + +from .common import InfoExtractor +from ..aes import aes_cbc_decrypt +from ..compat import compat_ord +from ..utils import ( + bytes_to_intlist, + ExtractorError, + float_or_none, + intlist_to_bytes, + srt_subtitles_timecode, + strip_or_none, +) + + +class ADNIE(InfoExtractor): + IE_DESC = 'Anime Digital Network' + _VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P<id>\d+)' + _TEST = { + 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites', + 'md5': 'e497370d847fd79d9d4c74be55575c7a', + 'info_dict': { + 'id': '7778', + 'ext': 'mp4', + 'title': 'Blue Exorcist - Kyôto Saga - Épisode 1', + 'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5', + } + } + + def _get_subtitles(self, sub_path, video_id): + if not sub_path: + return None + + enc_subtitles = self._download_webpage( + 'http://animedigitalnetwork.fr/' + sub_path, + video_id, fatal=False) + if not enc_subtitles: + return None + + # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js + dec_subtitles = intlist_to_bytes(aes_cbc_decrypt( + bytes_to_intlist(base64.b64decode(enc_subtitles[24:])), + bytes_to_intlist(b'\nd\xaf\xd2J\xd0\xfc\xe1\xfc\xdf\xb61\xe8\xe1\xf0\xcc'), + bytes_to_intlist(base64.b64decode(enc_subtitles[:24])) + )) + subtitles_json = self._parse_json( + dec_subtitles[:-compat_ord(dec_subtitles[-1])], + None, fatal=False) + if not subtitles_json: + return None + + subtitles = {} + for sub_lang, sub in subtitles_json.items(): + srt = '' + for num, current in enumerate(sub): + start, end, text = ( + float_or_none(current.get('startTime')), + float_or_none(current.get('endTime')), + current.get('text')) + if start is None or end is None or text is None: + continue + srt += os.linesep.join( + ( + '%d' % num, + '%s --> %s' % ( + srt_subtitles_timecode(start), + srt_subtitles_timecode(end)), + text, + os.linesep, + )) + + if sub_lang == 'vostf': + sub_lang = 'fr' + subtitles.setdefault(sub_lang, []).extend([{ + 'ext': 'json', + 'data': json.dumps(sub), + }, { + 'ext': 'srt', + 'data': srt, + }]) + return subtitles + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + player_config = self._parse_json(self._search_regex( + r'playerConfig\s*=\s*({.+});', webpage, 'player config'), video_id) + + video_info = {} + video_info_str = self._search_regex( + r'videoInfo\s*=\s*({.+});', webpage, + 'video info', fatal=False) + if video_info_str: + video_info = self._parse_json( + video_info_str, video_id, fatal=False) or {} + + options = player_config.get('options') or {} + metas = options.get('metas') or {} + title = metas.get('title') or video_info['title'] + links = player_config.get('links') or {} + + formats = [] + for format_id, qualities in links.items(): + for load_balancer_url in qualities.values(): + load_balancer_data = self._download_json( + load_balancer_url, video_id, fatal=False) or {} + m3u8_url = load_balancer_data.get('location') + if not m3u8_url: + continue + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id=format_id, fatal=False) + if format_id == 'vf': + for f in m3u8_formats: + f['language'] = 'fr' + formats.extend(m3u8_formats) + error = options.get('error') + if not formats and error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': strip_or_none(metas.get('summary') or video_info.get('resume')), + 'thumbnail': video_info.get('image'), + 'formats': formats, + 'subtitles': self.extract_subtitles(player_config.get('subtitles'), video_id), + 'episode': metas.get('subtitle') or video_info.get('videoTitle'), + 'series': video_info.get('playlistTitle'), + } diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py index 1b2d364..7da96c6 100644 --- a/youtube_dl/extractor/adobepass.py +++ b/youtube_dl/extractor/adobepass.py @@ -41,6 +41,11 @@ MSO_INFO = { 'username_field': 'IDToken1', 'password_field': 'IDToken2', }, + 'Verizon': { + 'name': 'Verizon FiOS', + 'username_field': 'IDToken1', + 'password_field': 'IDToken2', + }, 'thr030': { 'name': '3 Rivers Communications' }, @@ -1303,6 +1308,12 @@ class AdobePassIE(InfoExtractor): _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' _MVPD_CACHE = 'ap-mvpd' + def _download_webpage_handle(self, *args, **kwargs): + headers = kwargs.get('headers', {}) + headers.update(self.geo_verification_headers()) + kwargs['headers'] = headers + return super(AdobePassIE, self)._download_webpage_handle(*args, **kwargs) + @staticmethod def _get_mvpd_resource(provider_id, title, guid, rating): channel = etree.Element('channel') @@ -1384,40 +1395,72 @@ class AdobePassIE(InfoExtractor): # Comcast page flow varies by video site and whether you # are on Comcast's network. provider_redirect_page, urlh = provider_redirect_page_res - # Check for Comcast auto login if 'automatically signing you in' in provider_redirect_page: oauth_redirect_url = self._html_search_regex( r'window\.location\s*=\s*[\'"]([^\'"]+)', provider_redirect_page, 'oauth redirect') - # Just need to process the request. No useful data comes back self._download_webpage( oauth_redirect_url, video_id, 'Confirming auto login') else: if '<form name="signin"' in provider_redirect_page: - # already have the form, just fill it provider_login_page_res = provider_redirect_page_res elif 'http-equiv="refresh"' in provider_redirect_page: - # redirects to the login page oauth_redirect_url = self._html_search_regex( r'content="0;\s*url=([^\'"]+)', provider_redirect_page, 'meta refresh redirect') provider_login_page_res = self._download_webpage_handle( - oauth_redirect_url, - video_id, 'Downloading Provider Login Page') + oauth_redirect_url, video_id, + 'Downloading Provider Login Page') else: provider_login_page_res = post_form( - provider_redirect_page_res, 'Downloading Provider Login Page') + provider_redirect_page_res, + 'Downloading Provider Login Page') - mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', { - mso_info.get('username_field', 'username'): username, - mso_info.get('password_field', 'password'): password, - }) + mvpd_confirm_page_res = post_form( + provider_login_page_res, 'Logging in', { + mso_info['username_field']: username, + mso_info['password_field']: password, + }) mvpd_confirm_page, urlh = mvpd_confirm_page_res if '<button class="submit" value="Resume">Resume</button>' in mvpd_confirm_page: post_form(mvpd_confirm_page_res, 'Confirming Login') - + elif mso_id == 'Verizon': + # In general, if you're connecting from a Verizon-assigned IP, + # you will not actually pass your credentials. + provider_redirect_page, urlh = provider_redirect_page_res + if 'Please wait ...' in provider_redirect_page: + saml_redirect_url = self._html_search_regex( + r'self\.parent\.location=(["\'])(?P<url>.+?)\1', + provider_redirect_page, + 'SAML Redirect URL', group='url') + saml_login_page = self._download_webpage( + saml_redirect_url, video_id, + 'Downloading SAML Login Page') + else: + saml_login_page_res = post_form( + provider_redirect_page_res, 'Logging in', { + mso_info['username_field']: username, + mso_info['password_field']: password, + }) + saml_login_page, urlh = saml_login_page_res + if 'Please try again.' in saml_login_page: + raise ExtractorError( + 'We\'re sorry, but either the User ID or Password entered is not correct.') + saml_login_url = self._search_regex( + r'xmlHttp\.open\("POST"\s*,\s*(["\'])(?P<url>.+?)\1', + saml_login_page, 'SAML Login URL', group='url') + saml_response_json = self._download_json( + saml_login_url, video_id, 'Downloading SAML Response', + headers={'Content-Type': 'text/xml'}) + self._download_webpage( + saml_response_json['targetValue'], video_id, + 'Confirming Login', data=urlencode_postdata({ + 'SAMLResponse': saml_response_json['SAMLResponse'], + 'RelayState': saml_response_json['RelayState'] + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded' + }) else: - # Normal, non-Comcast flow provider_login_page_res = post_form( provider_redirect_page_res, 'Downloading Provider Login Page') mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', { diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 989505c..acc4ce3 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -5,91 +5,52 @@ import re from .turner import TurnerBaseIE from ..utils import ( - ExtractorError, int_or_none, + strip_or_none, ) class AdultSwimIE(TurnerBaseIE): - _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<is_playlist>playlists/)?(?P<show_path>[^/]+)/(?P<episode_path>[^/?#]+)/?' + _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<show_path>[^/?#]+)(?:/(?P<episode_path>[^/?#]+))?' _TESTS = [{ 'url': 'http://adultswim.com/videos/rick-and-morty/pilot', - 'playlist': [ - { - 'md5': '247572debc75c7652f253c8daa51a14d', - 'info_dict': { - 'id': 'rQxZvXQ4ROaSOqq-or2Mow-0', - 'ext': 'flv', - 'title': 'Rick and Morty - Pilot Part 1', - 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. " - }, - }, - { - 'md5': '77b0e037a4b20ec6b98671c4c379f48d', - 'info_dict': { - 'id': 'rQxZvXQ4ROaSOqq-or2Mow-3', - 'ext': 'flv', - 'title': 'Rick and Morty - Pilot Part 4', - 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. " - }, - }, - ], 'info_dict': { 'id': 'rQxZvXQ4ROaSOqq-or2Mow', + 'ext': 'mp4', 'title': 'Rick and Morty - Pilot', - 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. " + 'description': 'Rick moves in with his daughter\'s family and establishes himself as a bad influence on his grandson, Morty.', + 'timestamp': 1493267400, + 'upload_date': '20170427', }, - 'skip': 'This video is only available for registered users', - }, { - 'url': 'http://www.adultswim.com/videos/playlists/american-parenting/putting-francine-out-of-business/', - 'playlist': [ - { - 'md5': '2eb5c06d0f9a1539da3718d897f13ec5', - 'info_dict': { - 'id': '-t8CamQlQ2aYZ49ItZCFog-0', - 'ext': 'flv', - 'title': 'American Dad - Putting Francine Out of Business', - 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].' - }, - } - ], - 'info_dict': { - 'id': '-t8CamQlQ2aYZ49ItZCFog', - 'title': 'American Dad - Putting Francine Out of Business', - 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].' + 'params': { + # m3u8 download + 'skip_download': True, }, + 'expected_warnings': ['Unable to download f4m manifest'], }, { 'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/', - 'playlist': [ - { - 'md5': '3e346a2ab0087d687a05e1e7f3b3e529', - 'info_dict': { - 'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0', - 'ext': 'mp4', - 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', - 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', - }, - } - ], 'info_dict': { 'id': 'sY3cMUR_TbuE4YmdjzbIcQ', + 'ext': 'mp4', 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine', - 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n', + 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.', + 'upload_date': '20080124', + 'timestamp': 1201150800, }, 'params': { # m3u8 download 'skip_download': True, - } + }, }, { - # heroMetadata.trailer 'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/', 'info_dict': { 'id': 'I0LQFQkaSUaFp8PnAWHhoQ', 'ext': 'mp4', 'title': 'Decker - Inside Decker: A New Hero', - 'description': 'md5:c916df071d425d62d70c86d4399d3ee0', - 'duration': 249.008, + 'description': 'The guys recap the conclusion of the season. They announce a new hero, take a peek into the Victorville Film Archive and welcome back the talented James Dean.', + 'timestamp': 1469480460, + 'upload_date': '20160725', }, 'params': { # m3u8 download @@ -97,136 +58,102 @@ class AdultSwimIE(TurnerBaseIE): }, 'expected_warnings': ['Unable to download f4m manifest'], }, { - 'url': 'http://www.adultswim.com/videos/toonami/friday-october-14th-2016/', + 'url': 'http://www.adultswim.com/videos/attack-on-titan', + 'info_dict': { + 'id': 'b7A69dzfRzuaXIECdxW8XQ', + 'title': 'Attack on Titan', + 'description': 'md5:6c8e003ea0777b47013e894767f5e114', + }, + 'playlist_mincount': 12, + }, { + 'url': 'http://www.adultswim.com/videos/streams/williams-stream', 'info_dict': { - 'id': 'eYiLsKVgQ6qTC6agD67Sig', - 'title': 'Toonami - Friday, October 14th, 2016', - 'description': 'md5:99892c96ffc85e159a428de85c30acde', + 'id': 'd8DEBj7QRfetLsRgFnGEyg', + 'ext': 'mp4', + 'title': r're:^Williams Stream \d{4}-\d{2}-\d{2} \d{2}:\d{2}$', + 'description': 'original programming', }, - 'playlist': [{ - 'md5': '', - 'info_dict': { - 'id': 'eYiLsKVgQ6qTC6agD67Sig', - 'ext': 'mp4', - 'title': 'Toonami - Friday, October 14th, 2016', - 'description': 'md5:99892c96ffc85e159a428de85c30acde', - }, - }], 'params': { # m3u8 download 'skip_download': True, }, - 'expected_warnings': ['Unable to download f4m manifest'], }] - @staticmethod - def find_video_info(collection, slug): - for video in collection.get('videos'): - if video.get('slug') == slug: - return video - - @staticmethod - def find_collection_by_linkURL(collections, linkURL): - for collection in collections: - if collection.get('linkURL') == linkURL: - return collection - - @staticmethod - def find_collection_containing_video(collections, slug): - for collection in collections: - for video in collection.get('videos'): - if video.get('slug') == slug: - return collection, video - return None, None - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - show_path = mobj.group('show_path') - episode_path = mobj.group('episode_path') - is_playlist = True if mobj.group('is_playlist') else False - - webpage = self._download_webpage(url, episode_path) - - # Extract the value of `bootstrappedData` from the Javascript in the page. - bootstrapped_data = self._parse_json(self._search_regex( - r'var bootstrappedData = ({.*});', webpage, 'bootstraped data'), episode_path) - - # Downloading videos from a /videos/playlist/ URL needs to be handled differently. - # NOTE: We are only downloading one video (the current one) not the playlist - if is_playlist: - collections = bootstrapped_data['playlists']['collections'] - collection = self.find_collection_by_linkURL(collections, show_path) - video_info = self.find_video_info(collection, episode_path) - - show_title = video_info['showTitle'] - segment_ids = [video_info['videoPlaybackID']] + show_path, episode_path = re.match(self._VALID_URL, url).groups() + display_id = episode_path or show_path + webpage = self._download_webpage(url, display_id) + initial_data = self._parse_json(self._search_regex( + r'AS_INITIAL_DATA(?:__)?\s*=\s*({.+?});', + webpage, 'initial data'), display_id) + + is_stream = show_path == 'streams' + if is_stream: + if not episode_path: + episode_path = 'live-stream' + + video_data = next(stream for stream_path, stream in initial_data['streams'].items() if stream_path == episode_path) + video_id = video_data.get('stream') + + if not video_id: + entries = [] + for episode in video_data.get('archiveEpisodes', []): + episode_url = episode.get('url') + if not episode_url: + continue + entries.append(self.url_result( + episode_url, 'AdultSwim', episode.get('id'))) + return self.playlist_result( + entries, video_data.get('id'), video_data.get('title'), + strip_or_none(video_data.get('description'))) else: - collections = bootstrapped_data['show']['collections'] - collection, video_info = self.find_collection_containing_video(collections, episode_path) - # Video wasn't found in the collections, let's try `slugged_video`. - if video_info is None: - if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path: - video_info = bootstrapped_data['slugged_video'] - if not video_info: - video_info = bootstrapped_data.get( - 'heroMetadata', {}).get('trailer', {}).get('video') - if not video_info: - video_info = bootstrapped_data.get('onlineOriginals', [None])[0] - if not video_info: - raise ExtractorError('Unable to find video info') - - show = bootstrapped_data['show'] - show_title = show['title'] - stream = video_info.get('stream') - if stream and stream.get('videoPlaybackID'): - segment_ids = [stream['videoPlaybackID']] - elif video_info.get('clips'): - segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']] - elif video_info.get('videoPlaybackID'): - segment_ids = [video_info['videoPlaybackID']] - elif video_info.get('id'): - segment_ids = [video_info['id']] - else: - if video_info.get('auth') is True: - raise ExtractorError( - 'This video is only available via cable service provider subscription that' - ' is not currently supported. You may want to use --cookies.', expected=True) - else: - raise ExtractorError('Unable to find stream or clips') - - episode_id = video_info['id'] - episode_title = video_info['title'] - episode_description = video_info.get('description') - episode_duration = int_or_none(video_info.get('duration')) - view_count = int_or_none(video_info.get('views')) + show_data = initial_data['show'] + + if not episode_path: + entries = [] + for video in show_data.get('videos', []): + slug = video.get('slug') + if not slug: + continue + entries.append(self.url_result( + 'http://adultswim.com/videos/%s/%s' % (show_path, slug), + 'AdultSwim', video.get('id'))) + return self.playlist_result( + entries, show_data.get('id'), show_data.get('title'), + strip_or_none(show_data.get('metadata', {}).get('description'))) + + video_data = show_data['sluggedVideo'] + video_id = video_data['id'] + + info = self._extract_cvp_info( + 'http://www.adultswim.com/videos/api/v0/assets?platform=desktop&id=' + video_id, + video_id, { + 'secure': { + 'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big', + 'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do', + }, + }, { + 'url': url, + 'site_name': 'AdultSwim', + 'auth_required': video_data.get('auth'), + }) - entries = [] - for part_num, segment_id in enumerate(segment_ids): - segement_info = self._extract_cvp_info( - 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id, - segment_id, { - 'secure': { - 'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big', - 'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do', - }, - }) - segment_title = '%s - %s' % (show_title, episode_title) - if len(segment_ids) > 1: - segment_title += ' Part %d' % (part_num + 1) - segement_info.update({ - 'id': segment_id, - 'title': segment_title, - 'description': episode_description, + info.update({ + 'id': video_id, + 'display_id': display_id, + 'description': info.get('description') or strip_or_none(video_data.get('description')), + }) + if not is_stream: + info.update({ + 'duration': info.get('duration') or int_or_none(video_data.get('duration')), + 'timestamp': info.get('timestamp') or int_or_none(video_data.get('launch_date')), + 'season_number': info.get('season_number') or int_or_none(video_data.get('season_number')), + 'episode': info['title'], + 'episode_number': info.get('episode_number') or int_or_none(video_data.get('episode_number')), }) - entries.append(segement_info) - return { - '_type': 'playlist', - 'id': episode_id, - 'display_id': episode_path, - 'entries': entries, - 'title': '%s - %s' % (show_title, episode_title), - 'description': episode_description, - 'duration': episode_duration, - 'view_count': view_count, - } + info['series'] = video_data.get('collection_title') or info.get('series') + if info['series'] and info['series'] != info['title']: + info['title'] = '%s - %s' % (info['series'], info['title']) + + return info diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index dd96a47..2dcdba9 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -23,7 +23,19 @@ class AENetworksBaseIE(ThePlatformIE): class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network' - _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|aetv|mylifetime|lifetimemovieclub)\.com|fyi\.tv)/(?:shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})|movies/(?P<movie_display_id>[^/]+)(?:/full-movie)?)' + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + (?P<domain> + (?:history|aetv|mylifetime|lifetimemovieclub)\.com| + fyi\.tv + )/ + (?: + shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})| + movies/(?P<movie_display_id>[^/]+)(?:/full-movie)?| + specials/(?P<special_display_id>[^/]+)/full-special + ) + ''' _TESTS = [{ 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', 'md5': 'a97a65f7e823ae10e9244bc5433d5fe6', @@ -65,6 +77,9 @@ class AENetworksIE(AENetworksBaseIE): }, { 'url': 'https://www.lifetimemovieclub.com/movies/a-killer-among-us', 'only_matching': True + }, { + 'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special', + 'only_matching': True }] _DOMAIN_TO_REQUESTOR_ID = { 'history.com': 'HISTORY', @@ -75,8 +90,8 @@ class AENetworksIE(AENetworksBaseIE): } def _real_extract(self, url): - domain, show_path, movie_display_id = re.match(self._VALID_URL, url).groups() - display_id = show_path or movie_display_id + domain, show_path, movie_display_id, special_display_id = re.match(self._VALID_URL, url).groups() + display_id = show_path or movie_display_id or special_display_id webpage = self._download_webpage(url, display_id) if show_path: url_parts = show_path.split('/') @@ -86,10 +101,14 @@ class AENetworksIE(AENetworksBaseIE): for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage): entries.append(self.url_result( compat_urlparse.urljoin(url, season_url_path), 'AENetworks')) - return self.playlist_result( - entries, self._html_search_meta('aetn:SeriesId', webpage), - self._html_search_meta('aetn:SeriesTitle', webpage)) - elif url_parts_len == 2: + if entries: + return self.playlist_result( + entries, self._html_search_meta('aetn:SeriesId', webpage), + self._html_search_meta('aetn:SeriesTitle', webpage)) + else: + # single season + url_parts_len = 2 + if url_parts_len == 2: entries = [] for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage): episode_attributes = extract_attributes(episode_item) @@ -97,7 +116,7 @@ class AENetworksIE(AENetworksBaseIE): url, episode_attributes['data-canonical']) entries.append(self.url_result( episode_url, 'AENetworks', - episode_attributes['data-videoid'])) + episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id'))) return self.playlist_result( entries, self._html_search_meta('aetn:SeasonId', webpage)) @@ -107,7 +126,10 @@ class AENetworksIE(AENetworksBaseIE): } video_id = self._html_search_meta('aetn:VideoID', webpage) media_url = self._search_regex( - r"media_url\s*=\s*'([^']+)'", webpage, 'video url') + [r"media_url\s*=\s*'(?P<url>[^']+)'", + r'data-media-url=(?P<url>(?:https?:)?//[^\s>]+)', + r'data-media-url=(["\'])(?P<url>(?:(?!\1).)+?)\1'], + webpage, 'video url', group='url') theplatform_metadata = self._download_theplatform_metadata(self._search_regex( r'https?://link.theplatform.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) info = self._parse_theplatform_metadata(theplatform_metadata) diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py index b774d6d..c8cb91d 100644 --- a/youtube_dl/extractor/afreecatv.py +++ b/youtube_dl/extractor/afreecatv.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..compat import compat_xpath from ..utils import ( + determine_ext, ExtractorError, int_or_none, xpath_text, @@ -72,13 +73,70 @@ class AfreecaTVIE(InfoExtractor): 'url': 'http://vod.afreecatv.com/PLAYER/STATION/18650793', 'info_dict': { 'id': '18650793', - 'ext': 'flv', + 'ext': 'mp4', + 'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!', + 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': '윈아디', 'uploader_id': 'badkids', - 'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!', + 'duration': 107, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://vod.afreecatv.com/PLAYER/STATION/10481652', + 'info_dict': { + 'id': '10481652', + 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'", + 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'uploader': 'dailyapril', + 'uploader_id': 'dailyapril', + 'duration': 6492, + }, + 'playlist_count': 2, + 'playlist': [{ + 'md5': 'd8b7c174568da61d774ef0203159bf97', + 'info_dict': { + 'id': '20160502_c4c62b9d_174361386_1', + 'ext': 'mp4', + 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 1)", + 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'uploader': 'dailyapril', + 'uploader_id': 'dailyapril', + 'upload_date': '20160502', + 'duration': 3601, + }, + }, { + 'md5': '58f2ce7f6044e34439ab2d50612ab02b', + 'info_dict': { + 'id': '20160502_39e739bb_174361386_2', + 'ext': 'mp4', + 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 2)", + 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'uploader': 'dailyapril', + 'uploader_id': 'dailyapril', + 'upload_date': '20160502', + 'duration': 2891, + }, + }], + 'params': { + 'skip_download': True, + }, + }, { + # non standard key + 'url': 'http://vod.afreecatv.com/PLAYER/STATION/20515605', + 'info_dict': { + 'id': '20170411_BE689A0E_190960999_1_2_h', + 'ext': 'mp4', + 'title': '혼자사는여자집', + 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$', + 'uploader': '♥이슬이', + 'uploader_id': 'dasl8121', + 'upload_date': '20170411', + 'duration': 213, }, 'params': { - 'skip_download': True, # requires rtmpdump + 'skip_download': True, }, }, { 'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652', @@ -94,7 +152,7 @@ class AfreecaTVIE(InfoExtractor): m = re.match(r'^(?P<upload_date>\d{8})_\w+_(?P<part>\d+)$', key) if m: video_key['upload_date'] = m.group('upload_date') - video_key['part'] = m.group('part') + video_key['part'] = int(m.group('part')) return video_key def _real_extract(self, url): @@ -109,23 +167,64 @@ class AfreecaTVIE(InfoExtractor): raise ExtractorError('Specified AfreecaTV video does not exist', expected=True) - video_url_raw = video_element.text - - app, playpath = video_url_raw.split('mp4:') + video_url = video_element.text.strip() title = xpath_text(video_xml, './track/title', 'title', fatal=True) + uploader = xpath_text(video_xml, './track/nickname', 'uploader') uploader_id = xpath_text(video_xml, './track/bj_id', 'uploader id') - duration = int_or_none(xpath_text(video_xml, './track/duration', - 'duration')) + duration = int_or_none(xpath_text( + video_xml, './track/duration', 'duration')) thumbnail = xpath_text(video_xml, './track/titleImage', 'thumbnail') - return { + common_entry = { + 'uploader': uploader, + 'uploader_id': uploader_id, + 'thumbnail': thumbnail, + } + + info = common_entry.copy() + info.update({ + 'id': video_id, + 'title': title, + 'duration': duration, + }) + + if not video_url: + entries = [] + file_elements = video_element.findall(compat_xpath('./file')) + one = len(file_elements) == 1 + for file_num, file_element in enumerate(file_elements, start=1): + file_url = file_element.text + if not file_url: + continue + key = file_element.get('key', '') + upload_date = self._search_regex( + r'^(\d{8})_', key, 'upload date', default=None) + file_duration = int_or_none(file_element.get('duration')) + format_id = key if key else '%s_%s' % (video_id, file_num) + formats = self._extract_m3u8_formats( + file_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', + note='Downloading part %d m3u8 information' % file_num) + file_info = common_entry.copy() + file_info.update({ + 'id': format_id, + 'title': title if one else '%s (part %d)' % (title, file_num), + 'upload_date': upload_date, + 'duration': file_duration, + 'formats': formats, + }) + entries.append(file_info) + entries_info = info.copy() + entries_info.update({ + '_type': 'multi_video', + 'entries': entries, + }) + return entries_info + + info = { 'id': video_id, - 'url': app, - 'ext': 'flv', - 'play_path': 'mp4:' + playpath, - 'rtmp_live': True, # downloading won't end without this 'title': title, 'uploader': uploader, 'uploader_id': uploader_id, @@ -133,6 +232,21 @@ class AfreecaTVIE(InfoExtractor): 'thumbnail': thumbnail, } + if determine_ext(video_url) == 'm3u8': + info['formats'] = self._extract_m3u8_formats( + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + else: + app, playpath = video_url.split('mp4:') + info.update({ + 'url': app, + 'ext': 'flv', + 'play_path': 'mp4:' + playpath, + 'rtmp_live': True, # downloading won't end without this + }) + + return info + class AfreecaTVGlobalIE(AfreecaTVIE): IE_NAME = 'afreecatv:global' diff --git a/youtube_dl/extractor/airmozilla.py b/youtube_dl/extractor/airmozilla.py index 0e06918..9e38136 100644 --- a/youtube_dl/extractor/airmozilla.py +++ b/youtube_dl/extractor/airmozilla.py @@ -15,12 +15,12 @@ class AirMozillaIE(InfoExtractor): _VALID_URL = r'https?://air\.mozilla\.org/(?P<id>[0-9a-z-]+)/?' _TEST = { 'url': 'https://air.mozilla.org/privacy-lab-a-meetup-for-privacy-minded-people-in-san-francisco/', - 'md5': '2e3e7486ba5d180e829d453875b9b8bf', + 'md5': '8d02f53ee39cf006009180e21df1f3ba', 'info_dict': { 'id': '6x4q2w', 'ext': 'mp4', 'title': 'Privacy Lab - a meetup for privacy minded people in San Francisco', - 'thumbnail': r're:https?://vid\.ly/(?P<id>[0-9a-z-]+)/poster', + 'thumbnail': r're:https?://.*/poster\.jpg', 'description': 'Brings together privacy professionals and others interested in privacy at for-profits, non-profits, and NGOs in an effort to contribute to the state of the ecosystem...', 'timestamp': 1422487800, 'upload_date': '20150128', @@ -34,21 +34,13 @@ class AirMozillaIE(InfoExtractor): def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._html_search_regex(r'//vid.ly/(.*?)/embed', webpage, 'id') + video_id = self._html_search_regex(r'//vid\.ly/(.*?)/embed', webpage, 'id') embed_script = self._download_webpage('https://vid.ly/{0}/embed'.format(video_id), video_id) - jwconfig = self._search_regex(r'\svar jwconfig = (\{.*?\});\s', embed_script, 'metadata') - metadata = self._parse_json(jwconfig, video_id) - - formats = [{ - 'url': source['file'], - 'ext': source['type'], - 'format_id': self._search_regex(r'&format=(.*)$', source['file'], 'video format'), - 'format': source['label'], - 'height': int(source['label'].rstrip('p')), - } for source in metadata['playlist'][0]['sources']] - self._sort_formats(formats) + jwconfig = self._parse_json(self._search_regex( + r'initCallback\((.*)\);', embed_script, 'metadata'), video_id)['config'] + info_dict = self._parse_jwplayer_data(jwconfig, video_id) view_count = int_or_none(self._html_search_regex( r'Views since archived: ([0-9]+)', webpage, 'view count', fatal=False)) @@ -58,17 +50,17 @@ class AirMozillaIE(InfoExtractor): r'Duration:\s*(\d+\s*hours?\s*\d+\s*minutes?)', webpage, 'duration', fatal=False)) - return { + info_dict.update({ 'id': video_id, 'title': self._og_search_title(webpage), - 'formats': formats, 'url': self._og_search_url(webpage), 'display_id': display_id, - 'thumbnail': metadata['playlist'][0].get('image'), 'description': self._og_search_description(webpage), 'timestamp': timestamp, 'location': self._html_search_regex(r'Location: (.*)', webpage, 'location', default=None), 'duration': duration, 'view_count': view_count, 'categories': re.findall(r'<a href=".*?" class="channel">(.*?)</a>', webpage), - } + }) + + return info_dict diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py index 388e578..c68be31 100644 --- a/youtube_dl/extractor/aljazeera.py +++ b/youtube_dl/extractor/aljazeera.py @@ -4,9 +4,9 @@ from .common import InfoExtractor class AlJazeeraIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/programmes/.*?/(?P<id>[^/]+)\.html' + _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P<id>[^/]+)\.html' - _TEST = { + _TESTS = [{ 'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html', 'info_dict': { 'id': '3792260579001', @@ -19,7 +19,10 @@ class AlJazeeraIE(InfoExtractor): }, 'add_ie': ['BrightcoveNew'], 'skip': 'Not accessible from Travis CI server', - } + }, { + 'url': 'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html', + 'only_matching': True, + }] BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s' def _real_extract(self, url): diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py index 90f11d3..cd533ac 100644 --- a/youtube_dl/extractor/allocine.py +++ b/youtube_dl/extractor/allocine.py @@ -2,9 +2,13 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - remove_end, + int_or_none, qualities, + remove_end, + try_get, + unified_timestamp, url_basename, ) @@ -22,6 +26,10 @@ class AllocineIE(InfoExtractor): 'title': 'Astérix - Le Domaine des Dieux Teaser VF', 'description': 'md5:4a754271d9c6f16c72629a8a993ee884', 'thumbnail': r're:http://.*\.jpg', + 'duration': 39, + 'timestamp': 1404273600, + 'upload_date': '20140702', + 'view_count': int, }, }, { 'url': 'http://www.allocine.fr/video/player_gen_cmedia=19540403&cfilm=222257.html', @@ -33,6 +41,10 @@ class AllocineIE(InfoExtractor): 'title': 'Planes 2 Bande-annonce VF', 'description': 'Regardez la bande annonce du film Planes 2 (Planes 2 Bande-annonce VF). Planes 2, un film de Roberts Gannaway', 'thumbnail': r're:http://.*\.jpg', + 'duration': 69, + 'timestamp': 1385659800, + 'upload_date': '20131128', + 'view_count': int, }, }, { 'url': 'http://www.allocine.fr/video/player_gen_cmedia=19544709&cfilm=181290.html', @@ -44,6 +56,10 @@ class AllocineIE(InfoExtractor): 'title': 'Dragons 2 - Bande annonce finale VF', 'description': 'md5:6cdd2d7c2687d4c6aafe80a35e17267a', 'thumbnail': r're:http://.*\.jpg', + 'duration': 144, + 'timestamp': 1397589900, + 'upload_date': '20140415', + 'view_count': int, }, }, { 'url': 'http://www.allocine.fr/video/video-19550147/', @@ -69,34 +85,37 @@ class AllocineIE(InfoExtractor): r'data-model="([^"]+)"', webpage, 'data model', default=None) if model: model_data = self._parse_json(model, display_id) - - for video_url in model_data['sources'].values(): + video = model_data['videos'][0] + title = video['title'] + for video_url in video['sources'].values(): video_id, format_id = url_basename(video_url).split('_')[:2] formats.append({ 'format_id': format_id, 'quality': quality(format_id), 'url': video_url, }) - - title = model_data['title'] + duration = int_or_none(video.get('duration')) + view_count = int_or_none(video.get('view_count')) + timestamp = unified_timestamp(try_get( + video, lambda x: x['added_at']['date'], compat_str)) else: video_id = display_id media_data = self._download_json( 'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id) + title = remove_end( + self._html_search_regex( + r'(?s)<title>(.+?)</title>', webpage, 'title').strip(), + ' - AlloCiné') for key, value in media_data['video'].items(): if not key.endswith('Path'): continue - format_id = key[:-len('Path')] formats.append({ 'format_id': format_id, 'quality': quality(format_id), 'url': value, }) - - title = remove_end(self._html_search_regex( - r'(?s)<title>(.+?)</title>', webpage, 'title' - ).strip(), ' - AlloCiné') + duration, view_count, timestamp = [None] * 3 self._sort_formats(formats) @@ -104,7 +123,10 @@ class AllocineIE(InfoExtractor): 'id': video_id, 'display_id': display_id, 'title': title, + 'description': self._og_search_description(webpage), 'thumbnail': self._og_search_thumbnail(webpage), + 'duration': duration, + 'timestamp': timestamp, + 'view_count': view_count, 'formats': formats, - 'description': self._og_search_description(webpage), } diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index e8e4012..fde1a8f 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -7,15 +7,19 @@ from ..utils import ( parse_iso8601, mimetype2ext, determine_ext, + ExtractorError, ) class AMPIE(InfoExtractor): # parse Akamai Adaptive Media Player feed def _extract_feed_info(self, url): - item = self._download_json( + feed = self._download_json( url, None, 'Downloading Akamai AMP feed', - 'Unable to download Akamai AMP feed')['channel']['item'] + 'Unable to download Akamai AMP feed') + item = feed.get('channel', {}).get('item') + if not item: + raise ExtractorError('%s said: %s' % (self.IE_NAME, feed['error'])) video_id = item['guid'] @@ -30,9 +34,12 @@ class AMPIE(InfoExtractor): if isinstance(media_thumbnail, dict): media_thumbnail = [media_thumbnail] for thumbnail_data in media_thumbnail: - thumbnail = thumbnail_data['@attributes'] + thumbnail = thumbnail_data.get('@attributes', {}) + thumbnail_url = thumbnail.get('url') + if not thumbnail_url: + continue thumbnails.append({ - 'url': self._proto_relative_url(thumbnail['url'], 'http:'), + 'url': self._proto_relative_url(thumbnail_url, 'http:'), 'width': int_or_none(thumbnail.get('width')), 'height': int_or_none(thumbnail.get('height')), }) @@ -43,9 +50,14 @@ class AMPIE(InfoExtractor): if isinstance(media_subtitle, dict): media_subtitle = [media_subtitle] for subtitle_data in media_subtitle: - subtitle = subtitle_data['@attributes'] - lang = subtitle.get('lang') or 'en' - subtitles[lang] = [{'url': subtitle['href']}] + subtitle = subtitle_data.get('@attributes', {}) + subtitle_href = subtitle.get('href') + if not subtitle_href: + continue + subtitles.setdefault(subtitle.get('lang') or 'en', []).append({ + 'url': subtitle_href, + 'ext': mimetype2ext(subtitle.get('type')) or determine_ext(subtitle_href), + }) formats = [] media_content = get_media_node('content') diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py index 623f44d..8023da7 100644 --- a/youtube_dl/extractor/anvato.py +++ b/youtube_dl/extractor/anvato.py @@ -5,6 +5,7 @@ import base64 import hashlib import json import random +import re import time from .common import InfoExtractor @@ -16,6 +17,7 @@ from ..utils import ( intlist_to_bytes, int_or_none, strip_jsonp, + unescapeHTML, ) @@ -26,6 +28,8 @@ def md5_text(s): class AnvatoIE(InfoExtractor): + _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)' + # Copied from anvplayer.min.js _ANVACK_TABLE = { 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ', @@ -114,6 +118,22 @@ class AnvatoIE(InfoExtractor): 'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ' } + _MCP_TO_ACCESS_KEY_TABLE = { + 'qa': 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922', + 'lin': 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749', + 'univison': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa', + 'uni': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa', + 'dev': 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a', + 'sps': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336', + 'spsstg': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336', + 'anv': 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3', + 'gray': 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900', + 'hearst': 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99', + 'cbs': 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe', + 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582' + } + + _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1' _AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce' def __init__(self, *args, **kwargs): @@ -178,12 +198,7 @@ class AnvatoIE(InfoExtractor): } if ext == 'm3u8' or media_format in ('m3u8', 'm3u8-variant'): - # Not using _extract_m3u8_formats here as individual media - # playlists are also included in published_urls. - if tbr is None: - formats.append(self._m3u8_meta_format(video_url, ext='mp4', m3u8_id='hls')) - continue - else: + if tbr is not None: a_format.update({ 'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])), 'ext': 'mp4', @@ -222,9 +237,42 @@ class AnvatoIE(InfoExtractor): 'subtitles': subtitles, } + @staticmethod + def _extract_urls(ie, webpage, video_id): + entries = [] + for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage): + anvplayer_data = ie._parse_json( + mobj.group('anvp'), video_id, transform_source=unescapeHTML, + fatal=False) + if not anvplayer_data: + continue + video = anvplayer_data.get('video') + if not isinstance(video, compat_str) or not video.isdigit(): + continue + access_key = anvplayer_data.get('accessKey') + if not access_key: + mcp = anvplayer_data.get('mcp') + if mcp: + access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get( + mcp.lower()) + if not access_key: + continue + entries.append(ie.url_result( + 'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(), + video_id=video)) + return entries + def _extract_anvato_videos(self, webpage, video_id): - anvplayer_data = self._parse_json(self._html_search_regex( - r'<script[^>]+data-anvp=\'([^\']+)\'', webpage, - 'Anvato player data'), video_id) + anvplayer_data = self._parse_json( + self._html_search_regex( + self._ANVP_RE, webpage, 'Anvato player data', group='anvp'), + video_id) return self._get_anvato_videos( anvplayer_data['accessKey'], anvplayer_data['video']) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + access_key, video_id = mobj.group('access_key_or_mcp', 'id') + if access_key not in self._ANVACK_TABLE: + access_key = self._MCP_TO_ACCESS_KEY_TABLE[access_key] + return self._get_anvato_videos(access_key, video_id) diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py index ea7a703..a84b8b1 100644 --- a/youtube_dl/extractor/appleconnect.py +++ b/youtube_dl/extractor/appleconnect.py @@ -12,13 +12,13 @@ class AppleConnectIE(InfoExtractor): _VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)' _TEST = { 'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3', - 'md5': '10d0f2799111df4cb1c924520ca78f98', + 'md5': 'e7c38568a01ea45402570e6029206723', 'info_dict': { 'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3', 'ext': 'm4v', 'title': 'Energy', 'uploader': 'Drake', - 'thumbnail': 'http://is5.mzstatic.com/image/thumb/Video5/v4/78/61/c5/7861c5fa-ad6d-294b-1464-cf7605b911d6/source/1920x1080sr.jpg', + 'thumbnail': r're:^https?://.*\.jpg$', 'upload_date': '20150710', 'timestamp': 1436545535, }, diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py index a6801f3..b45b431 100644 --- a/youtube_dl/extractor/appletrailers.py +++ b/youtube_dl/extractor/appletrailers.py @@ -70,7 +70,8 @@ class AppleTrailersIE(InfoExtractor): }, { 'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/', 'info_dict': { - 'id': 'blackthorn', + 'id': '4489', + 'title': 'Blackthorn', }, 'playlist_mincount': 2, 'expected_warnings': ['Unable to download JSON metadata'], @@ -261,7 +262,7 @@ class AppleTrailersSectionIE(InfoExtractor): 'title': 'Most Popular', 'id': 'mostpopular', }, - 'playlist_mincount': 80, + 'playlist_mincount': 30, }, { 'url': 'http://trailers.apple.com/#section=moviestudios', 'info_dict': { diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index e21045b..3c7d725 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -24,12 +24,12 @@ class ArchiveOrgIE(InfoExtractor): } }, { 'url': 'https://archive.org/details/Cops1922', - 'md5': 'bc73c8ab3838b5a8fc6c6651fa7b58ba', + 'md5': '0869000b4ce265e8ca62738b336b268a', 'info_dict': { 'id': 'Cops1922', 'ext': 'mp4', 'title': 'Buster Keaton\'s "Cops" (1922)', - 'description': 'md5:b4544662605877edd99df22f9620d858', + 'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6', } }, { 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 69a23e8..56baef2 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -180,7 +180,7 @@ class ArteTVBaseIE(InfoExtractor): class ArteTVPlus7IE(ArteTVBaseIE): IE_NAME = 'arte.tv:+7' - _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/[^/]+/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/(?:[^/]+/)?(?P<lang>fr|de|en|es)/(?:videos/)?(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D', @@ -188,6 +188,9 @@ class ArteTVPlus7IE(ArteTVBaseIE): }, { 'url': 'http://sites.arte.tv/karambolage/de/video/karambolage-22', 'only_matching': True, + }, { + 'url': 'http://www.arte.tv/de/videos/048696-000-A/der-kluge-bauch-unser-zweites-gehirn', + 'only_matching': True, }] @classmethod diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py index 99af6dc..01fa308 100644 --- a/youtube_dl/extractor/atresplayer.py +++ b/youtube_dl/extractor/atresplayer.py @@ -36,7 +36,7 @@ class AtresPlayerIE(InfoExtractor): }, { 'url': 'http://www.atresplayer.com/television/especial/videoencuentros/temporada-1/capitulo-112-david-bustamante_2014121600375.html', - 'md5': '0d0e918533bbd4b263f2de4d197d4aac', + 'md5': '6e52cbb513c405e403dbacb7aacf8747', 'info_dict': { 'id': 'capitulo-112-david-bustamante', 'ext': 'flv', diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py index 8fc5f65..e48bb89 100644 --- a/youtube_dl/extractor/audioboom.py +++ b/youtube_dl/extractor/audioboom.py @@ -16,7 +16,7 @@ class AudioBoomIE(InfoExtractor): 'title': '3/09/2016 Czaban Hour 3', 'description': 'Guest: Nate Davis - NFL free agency, Guest: Stan Gans', 'duration': 2245.72, - 'uploader': 'Steve Czaban', + 'uploader': 'SB Nation A.M.', 'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio', } }, { diff --git a/youtube_dl/extractor/azubu.py b/youtube_dl/extractor/azubu.py deleted file mode 100644 index 3ba2f00..0000000 --- a/youtube_dl/extractor/azubu.py +++ /dev/null @@ -1,140 +0,0 @@ -from __future__ import unicode_literals - -import json - -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - sanitized_Request, -) - - -class AzubuIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?azubu\.(?:tv|uol.com.br)/[^/]+#!/play/(?P<id>\d+)' - _TESTS = [ - { - 'url': 'http://www.azubu.tv/GSL#!/play/15575/2014-hot6-cup-last-big-match-ro8-day-1', - 'md5': 'a88b42fcf844f29ad6035054bd9ecaf4', - 'info_dict': { - 'id': '15575', - 'ext': 'mp4', - 'title': '2014 HOT6 CUP LAST BIG MATCH Ro8 Day 1', - 'description': 'md5:d06bdea27b8cc4388a90ad35b5c66c01', - 'thumbnail': r're:^https?://.*\.jpe?g', - 'timestamp': 1417523507.334, - 'upload_date': '20141202', - 'duration': 9988.7, - 'uploader': 'GSL', - 'uploader_id': 414310, - 'view_count': int, - }, - }, - { - 'url': 'http://www.azubu.tv/FnaticTV#!/play/9344/-fnatic-at-worlds-2014:-toyz---%22i-love-rekkles,-he-has-amazing-mechanics%22-', - 'md5': 'b72a871fe1d9f70bd7673769cdb3b925', - 'info_dict': { - 'id': '9344', - 'ext': 'mp4', - 'title': 'Fnatic at Worlds 2014: Toyz - "I love Rekkles, he has amazing mechanics"', - 'description': 'md5:4a649737b5f6c8b5c5be543e88dc62af', - 'thumbnail': r're:^https?://.*\.jpe?g', - 'timestamp': 1410530893.320, - 'upload_date': '20140912', - 'duration': 172.385, - 'uploader': 'FnaticTV', - 'uploader_id': 272749, - 'view_count': int, - }, - 'skip': 'Channel offline', - }, - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - - data = self._download_json( - 'http://www.azubu.tv/api/video/%s' % video_id, video_id)['data'] - - title = data['title'].strip() - description = data.get('description') - thumbnail = data.get('thumbnail') - view_count = data.get('view_count') - user = data.get('user', {}) - uploader = user.get('username') - uploader_id = user.get('id') - - stream_params = json.loads(data['stream_params']) - - timestamp = float_or_none(stream_params.get('creationDate'), 1000) - duration = float_or_none(stream_params.get('length'), 1000) - - renditions = stream_params.get('renditions') or [] - video = stream_params.get('FLVFullLength') or stream_params.get('videoFullLength') - if video: - renditions.append(video) - - if not renditions and not user.get('channel', {}).get('is_live', True): - raise ExtractorError('%s said: channel is offline.' % self.IE_NAME, expected=True) - - formats = [{ - 'url': fmt['url'], - 'width': fmt['frameWidth'], - 'height': fmt['frameHeight'], - 'vbr': float_or_none(fmt['encodingRate'], 1000), - 'filesize': fmt['size'], - 'vcodec': fmt['videoCodec'], - 'container': fmt['videoContainer'], - } for fmt in renditions if fmt['url']] - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'timestamp': timestamp, - 'duration': duration, - 'uploader': uploader, - 'uploader_id': uploader_id, - 'view_count': view_count, - 'formats': formats, - } - - -class AzubuLiveIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?azubu\.(?:tv|uol.com.br)/(?P<id>[^/]+)$' - - _TESTS = [{ - 'url': 'http://www.azubu.tv/MarsTVMDLen', - 'only_matching': True, - }, { - 'url': 'http://azubu.uol.com.br/adolfz', - 'only_matching': True, - }] - - def _real_extract(self, url): - user = self._match_id(url) - - info = self._download_json( - 'http://api.azubu.tv/public/modules/last-video/{0}/info'.format(user), - user)['data'] - if info['type'] != 'STREAM': - raise ExtractorError('{0} is not streaming live'.format(user), expected=True) - - req = sanitized_Request( - 'https://edge-elb.api.brightcove.com/playback/v1/accounts/3361910549001/videos/ref:' + info['reference_id']) - req.add_header('Accept', 'application/json;pk=BCpkADawqM1gvI0oGWg8dxQHlgT8HkdE2LnAlWAZkOlznO39bSZX726u4JqnDsK3MDXcO01JxXK2tZtJbgQChxgaFzEVdHRjaDoxaOu8hHOO8NYhwdxw9BzvgkvLUlpbDNUuDoc4E4wxDToV') - bc_info = self._download_json(req, user) - m3u8_url = next(source['src'] for source in bc_info['sources'] if source['container'] == 'M2TS') - formats = self._extract_m3u8_formats(m3u8_url, user, ext='mp4') - self._sort_formats(formats) - - return { - 'id': info['id'], - 'title': self._live_title(info['title']), - 'uploader_id': user, - 'formats': formats, - 'is_live': True, - 'thumbnail': bc_info['poster'], - } diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py index 056e063..489d0ba 100644 --- a/youtube_dl/extractor/bandcamp.py +++ b/youtube_dl/extractor/bandcamp.py @@ -34,12 +34,12 @@ class BandcampIE(InfoExtractor): '_skip': 'There is a limit of 200 free downloads / month for the test song' }, { 'url': 'http://benprunty.bandcamp.com/track/lanius-battle', - 'md5': '73d0b3171568232574e45652f8720b5c', + 'md5': '0369ace6b939f0927e62c67a1a8d9fa7', 'info_dict': { 'id': '2650410135', - 'ext': 'mp3', - 'title': 'Lanius (Battle)', - 'uploader': 'Ben Prunty Music', + 'ext': 'aiff', + 'title': 'Ben Prunty - Lanius (Battle)', + 'uploader': 'Ben Prunty', }, }] @@ -47,6 +47,7 @@ class BandcampIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) title = mobj.group('title') webpage = self._download_webpage(url, title) + thumbnail = self._html_search_meta('og:image', webpage, default=None) m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage) if not m_download: m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage) @@ -75,6 +76,7 @@ class BandcampIE(InfoExtractor): return { 'id': track_id, 'title': data['title'], + 'thumbnail': thumbnail, 'formats': formats, 'duration': float_or_none(data.get('duration')), } @@ -143,7 +145,7 @@ class BandcampIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'thumbnail': info.get('thumb_url'), + 'thumbnail': info.get('thumb_url') or thumbnail, 'uploader': info.get('artist'), 'artist': artist, 'track': track, diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 8a2ed0a..dd65b8d 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -361,7 +361,7 @@ class BBCCoUkIE(InfoExtractor): fmt.update({ 'width': width, 'height': height, - 'vbr': bitrate, + 'tbr': bitrate, 'vcodec': encoding, }) else: @@ -370,7 +370,7 @@ class BBCCoUkIE(InfoExtractor): 'acodec': encoding, 'vcodec': 'none', }) - if protocol == 'http': + if protocol in ('http', 'https'): # Direct link fmt.update({ 'url': href, @@ -389,6 +389,8 @@ class BBCCoUkIE(InfoExtractor): 'rtmp_live': False, 'ext': 'flv', }) + else: + continue formats.append(fmt) elif kind == 'captions': subtitles = self.extract_subtitles(media, programme_id) @@ -407,7 +409,7 @@ class BBCCoUkIE(InfoExtractor): description = smp_config['summary'] for item in smp_config['items']: kind = item['kind'] - if kind != 'programme' and kind != 'radioProgramme': + if kind not in ('programme', 'radioProgramme'): continue programme_id = item.get('vpid') duration = int_or_none(item.get('duration')) @@ -448,7 +450,7 @@ class BBCCoUkIE(InfoExtractor): for item in self._extract_items(playlist): kind = item.get('kind') - if kind != 'programme' and kind != 'radioProgramme': + if kind not in ('programme', 'radioProgramme'): continue title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS) diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py index b0b7914..d5c5822 100644 --- a/youtube_dl/extractor/beeg.py +++ b/youtube_dl/extractor/beeg.py @@ -16,7 +16,7 @@ class BeegIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?beeg\.com/(?P<id>\d+)' _TEST = { 'url': 'http://beeg.com/5416503', - 'md5': '46c384def73b33dbc581262e5ee67cef', + 'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820', 'info_dict': { 'id': '5416503', 'ext': 'mp4', diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index 80dd838..1e3f255 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -122,6 +122,11 @@ class BiliBiliIE(InfoExtractor): 'preference': -2 if 'hd.mp4' in backup_url else -3, }) + for a_format in formats: + a_format.setdefault('http_headers', {}).update({ + 'Referer': url, + }) + self._sort_formats(formats) entries.append({ diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py index 7a8e1f6..e829974 100644 --- a/youtube_dl/extractor/bleacherreport.py +++ b/youtube_dl/extractor/bleacherreport.py @@ -35,7 +35,7 @@ class BleacherReportIE(InfoExtractor): 'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo', 'timestamp': 1446839961, 'uploader': 'Sean Fay', - 'description': 'md5:825e94e0f3521df52fa83b2ed198fa20', + 'description': 'md5:b1601e2314c4d8eec23b6eafe086a757', 'uploader_id': 6466954, 'upload_date': '20151011', }, @@ -90,17 +90,13 @@ class BleacherReportCMSIE(AMPIE): _VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36})' _TESTS = [{ 'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1', - 'md5': '8c2c12e3af7805152675446c905d159b', + 'md5': '2e4b0a997f9228ffa31fada5c53d1ed1', 'info_dict': { 'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Cena vs. Rollins Would Expose the Heavyweight Division', 'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e', }, - 'params': { - # m3u8 download - 'skip_download': True, - }, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py index ff0aa11..2c32b6a 100644 --- a/youtube_dl/extractor/br.py +++ b/youtube_dl/extractor/br.py @@ -77,7 +77,7 @@ class BRIE(InfoExtractor): 'description': 'md5:bb659990e9e59905c3d41e369db1fbe3', 'duration': 893, 'uploader': 'Eva Maria Steimle', - 'upload_date': '20140117', + 'upload_date': '20170208', } }, ] diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index 46ef8e6..0ed59bc 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -5,6 +5,7 @@ import re import json from .common import InfoExtractor +from .adobepass import AdobePassIE from ..compat import ( compat_etree_fromstring, compat_parse_qs, @@ -17,6 +18,7 @@ from ..compat import ( from ..utils import ( determine_ext, ExtractorError, + extract_attributes, find_xpath_attr, fix_xml_ampersands, float_or_none, @@ -109,6 +111,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'upload_date': '20140827', 'uploader_id': '710858724001', }, + 'skip': 'Video gone', }, { # playlist with 'videoList' @@ -129,6 +132,12 @@ class BrightcoveLegacyIE(InfoExtractor): }, 'playlist_mincount': 10, }, + { + # playerID inferred from bcpid + # from http://www.un.org/chinese/News/story.asp?NewsID=27724 + 'url': 'https://link.brightcove.com/services/player/bcpid1722935254001/?bctid=5360463607001&autoStart=false&secureConnections=true&width=650&height=350', + 'only_matching': True, # Tested in GenericIE + } ] FLV_VCODECS = { 1: 'SORENSON', @@ -264,9 +273,13 @@ class BrightcoveLegacyIE(InfoExtractor): if matches: return list(filter(None, [cls._build_brighcove_url(m) for m in matches])) - return list(filter(None, [ - cls._build_brighcove_url_from_js(custom_bc) - for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)])) + matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage) + if matches: + return list(filter(None, [ + cls._build_brighcove_url_from_js(custom_bc) + for custom_bc in matches])) + return [src for _, src in re.findall( + r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)] def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -283,6 +296,10 @@ class BrightcoveLegacyIE(InfoExtractor): if videoPlayer: # We set the original url as the default 'Referer' header referer = smuggled_data.get('Referer', url) + if 'playerID' not in query: + mobj = re.search(r'/bcpid(\d+)', url) + if mobj is not None: + query['playerID'] = [mobj.group(1)] return self._get_video_info( videoPlayer[0], query, referer=referer) elif 'playerKey' in query: @@ -432,7 +449,7 @@ class BrightcoveLegacyIE(InfoExtractor): return info -class BrightcoveNewIE(InfoExtractor): +class BrightcoveNewIE(AdobePassIE): IE_NAME = 'brightcove:new' _VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+|ref:[^&]+)' _TESTS = [{ @@ -482,17 +499,18 @@ class BrightcoveNewIE(InfoExtractor): }] @staticmethod - def _extract_url(webpage): - urls = BrightcoveNewIE._extract_urls(webpage) + def _extract_url(ie, webpage): + urls = BrightcoveNewIE._extract_urls(ie, webpage) return urls[0] if urls else None @staticmethod - def _extract_urls(webpage): + def _extract_urls(ie, webpage): # Reference: # 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe - # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript - # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html - # 4. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player + # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag + # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript + # 4. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/in-page-embed-player-implementation.html + # 5. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player entries = [] @@ -501,22 +519,48 @@ class BrightcoveNewIE(InfoExtractor): r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage): entries.append(url if url.startswith('http') else 'http:' + url) - # Look for embed_in_page embeds [2] - for video_id, account_id, player_id, embed in re.findall( - # According to examples from [3] it's unclear whether video id - # may be optional and what to do when it is - # According to [4] data-video-id may be prefixed with ref: - r'''(?sx) - <video[^>]+ - data-video-id=["\'](\d+|ref:[^"\']+)["\'][^>]*>.*? - </video>.*? - <script[^>]+ - src=["\'](?:https?:)?//players\.brightcove\.net/ - (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js + # Look for <video> tags [2] and embed_in_page embeds [3] + # [2] looks like: + for video, script_tag, account_id, player_id, embed in re.findall( + r'''(?isx) + (<video\s+[^>]*\bdata-video-id\s*=\s*['"]?[^>]+>) + (?:.*? + (<script[^>]+ + src=["\'](?:https?:)?//players\.brightcove\.net/ + (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js + ) + )? ''', webpage): - entries.append( - 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' - % (account_id, player_id, embed, video_id)) + attrs = extract_attributes(video) + + # According to examples from [4] it's unclear whether video id + # may be optional and what to do when it is + video_id = attrs.get('data-video-id') + if not video_id: + continue + + account_id = account_id or attrs.get('data-account') + if not account_id: + continue + + player_id = player_id or attrs.get('data-player') or 'default' + embed = embed or attrs.get('data-embed') or 'default' + + bc_url = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % ( + account_id, player_id, embed, video_id) + + # Some brightcove videos may be embedded with video tag only and + # without script tag or any mentioning of brightcove at all. Such + # embeds are considered ambiguous since they are matched based only + # on data-video-id and data-account attributes and in the wild may + # not be brightcove embeds at all. Let's check reconstructed + # brightcove URLs in case of such embeds and only process valid + # ones. By this we ensure there is indeed a brightcove embed. + if not script_tag and not ie._is_valid_url( + bc_url, video_id, 'possible brightcove video'): + continue + + entries.append(bc_url) return entries @@ -559,6 +603,20 @@ class BrightcoveNewIE(InfoExtractor): raise ExtractorError(message, expected=True) raise + errors = json_data.get('errors') + if errors and errors[0].get('error_subcode') == 'TVE_AUTH': + custom_fields = json_data['custom_fields'] + tve_token = self._extract_mvpd_auth( + smuggled_data['source_url'], video_id, + custom_fields['bcadobepassrequestorid'], + custom_fields['bcadobepassresourceid']) + json_data = self._download_json( + api_url, video_id, headers={ + 'Accept': 'application/json;pk=%s' % policy_key + }, query={ + 'tveToken': tve_token, + }) + title = json_data['name'].strip() formats = [] @@ -624,7 +682,6 @@ class BrightcoveNewIE(InfoExtractor): }) formats.append(f) - errors = json_data.get('errors') if not formats and errors: error = errors[0] raise ExtractorError( @@ -641,7 +698,7 @@ class BrightcoveNewIE(InfoExtractor): is_live = False duration = float_or_none(json_data.get('duration'), 1000) - if duration and duration < 0: + if duration is not None and duration <= 0: is_live = True return { diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py index f1f128c..acd87e3 100644 --- a/youtube_dl/extractor/canalc2.py +++ b/youtube_dl/extractor/canalc2.py @@ -16,13 +16,10 @@ class Canalc2IE(InfoExtractor): 'md5': '060158428b650f896c542dfbb3d6487f', 'info_dict': { 'id': '12163', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Terrasses du Numérique', 'duration': 122, }, - 'params': { - 'skip_download': True, # Requires rtmpdump - } }, { 'url': 'http://archives-canalc2.u-strasbg.fr/video.asp?idVideo=11427&voir=oui', 'only_matching': True, diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py index 4b9fa2d..d8bf073 100644 --- a/youtube_dl/extractor/canalplus.py +++ b/youtube_dl/extractor/canalplus.py @@ -7,8 +7,8 @@ from .common import InfoExtractor from ..compat import compat_urllib_parse_urlparse from ..utils import ( dict_get, - ExtractorError, - HEADRequest, + # ExtractorError, + # HEADRequest, int_or_none, qualities, remove_end, @@ -45,6 +45,9 @@ class CanalplusIE(InfoExtractor): 'itele': 'itele', } + # Only works for direct mp4 URLs + _GEO_COUNTRIES = ['FR'] + _TESTS = [{ 'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1192814', 'info_dict': { @@ -56,6 +59,7 @@ class CanalplusIE(InfoExtractor): 'upload_date': '20160702', }, }, { + # geo restricted, bypassed 'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190', 'info_dict': { 'id': '1108190', @@ -65,19 +69,20 @@ class CanalplusIE(InfoExtractor): 'description': 'md5:4cea7a37153be42c1ba2c1d3064376ff', 'upload_date': '20140724', }, - 'skip': 'Only works from France', + 'expected_warnings': ['HTTP Error 403: Forbidden'], }, { - 'url': 'http://www.c8.fr/c8-divertissement/ms-touche-pas-a-mon-poste/pid6318-videos-integrales.html', - 'md5': '4b47b12b4ee43002626b97fad8fb1de5', + # geo restricted, bypassed + 'url': 'http://www.c8.fr/c8-divertissement/ms-touche-pas-a-mon-poste/pid6318-videos-integrales.html?vid=1443684', + 'md5': 'bb6f9f343296ab7ebd88c97b660ecf8d', 'info_dict': { - 'id': '1420213', + 'id': '1443684', 'display_id': 'pid6318-videos-integrales', 'ext': 'mp4', - 'title': 'TPMP ! Même le matin - Les 35H de Baba - 14/10/2016', - 'description': 'md5:f96736c1b0ffaa96fd5b9e60ad871799', - 'upload_date': '20161014', + 'title': 'Guess my iep ! - TPMP - 07/04/2017', + 'description': 'md5:6f005933f6e06760a9236d9b3b5f17fa', + 'upload_date': '20170407', }, - 'skip': 'Only works from France', + 'expected_warnings': ['HTTP Error 403: Forbidden'], }, { 'url': 'http://www.itele.fr/chroniques/invite-michael-darmon/rachida-dati-nicolas-sarkozy-est-le-plus-en-phase-avec-les-inquietudes-des-francais-171510', 'info_dict': { @@ -134,15 +139,15 @@ class CanalplusIE(InfoExtractor): preference = qualities(['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD']) - fmt_url = next(iter(media.get('VIDEOS'))) - if '/geo' in fmt_url.lower(): - response = self._request_webpage( - HEADRequest(fmt_url), video_id, - 'Checking if the video is georestricted') - if '/blocage' in response.geturl(): - raise ExtractorError( - 'The video is not available in your country', - expected=True) + # _, fmt_url = next(iter(media['VIDEOS'].items())) + # if '/geo' in fmt_url.lower(): + # response = self._request_webpage( + # HEADRequest(fmt_url), video_id, + # 'Checking if the video is georestricted') + # if '/blocage' in response.geturl(): + # raise ExtractorError( + # 'The video is not available in your country', + # expected=True) formats = [] for format_id, format_url in media['VIDEOS'].items(): diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py index 544c665..aada029 100644 --- a/youtube_dl/extractor/canvas.py +++ b/youtube_dl/extractor/canvas.py @@ -7,6 +7,7 @@ from ..utils import float_or_none class CanvasIE(InfoExtractor): + IE_DESC = 'canvas.be and een.be' _VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TESTS = [{ 'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week', diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index cf678e7..87ad14e 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -96,6 +96,7 @@ class CBCIE(InfoExtractor): 'info_dict': { 'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks', 'id': 'dog-indoor-exercise-winter-1.3928238', + 'description': 'md5:c18552e41726ee95bd75210d1ca9194c', }, 'playlist_mincount': 6, }] @@ -165,12 +166,11 @@ class CBCPlayerIE(InfoExtractor): 'uploader': 'CBCC-NEW', }, }, { - # available only when we add `formats=MPEG4,FLV,MP3` to theplatform url 'url': 'http://www.cbc.ca/player/play/2164402062', - 'md5': '17a61eb813539abea40618d6323a7f82', + 'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6', 'info_dict': { 'id': '2164402062', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Cancer survivor four times over', 'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.', 'timestamp': 1320410746, diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py index 8d5f11d..7d78e3a 100644 --- a/youtube_dl/extractor/cbslocal.py +++ b/youtube_dl/extractor/cbslocal.py @@ -60,8 +60,8 @@ class CBSLocalIE(AnvatoIE): 'title': 'A Very Blue Anniversary', 'description': 'CBS2’s Cindy Hsu has more.', 'thumbnail': 're:^https?://.*', - 'timestamp': 1479962220, - 'upload_date': '20161124', + 'timestamp': int, + 'upload_date': r're:^\d{8}$', 'uploader': 'CBS', 'subtitles': { 'en': 'mincount:5', diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py index 1ee35b5..78b7a92 100755 --- a/youtube_dl/extractor/cda.py +++ b/youtube_dl/extractor/cda.py @@ -9,7 +9,10 @@ from ..utils import ( ExtractorError, float_or_none, int_or_none, + multipart_encode, parse_duration, + random_birthday, + urljoin, ) @@ -27,7 +30,8 @@ class CDAIE(InfoExtractor): 'description': 'md5:269ccd135d550da90d1662651fcb9772', 'thumbnail': r're:^https?://.*\.jpg$', 'average_rating': float, - 'duration': 39 + 'duration': 39, + 'age_limit': 0, } }, { 'url': 'http://www.cda.pl/video/57413289', @@ -41,13 +45,41 @@ class CDAIE(InfoExtractor): 'uploader': 'crash404', 'view_count': int, 'average_rating': float, - 'duration': 137 + 'duration': 137, + 'age_limit': 0, } }, { + # Age-restricted + 'url': 'http://www.cda.pl/video/1273454c4', + 'info_dict': { + 'id': '1273454c4', + 'ext': 'mp4', + 'title': 'Bronson (2008) napisy HD 1080p', + 'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c', + 'height': 1080, + 'uploader': 'boniek61', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 5554, + 'age_limit': 18, + 'view_count': int, + 'average_rating': float, + }, + }, { 'url': 'http://ebd.cda.pl/0x0/5749950c', 'only_matching': True, }] + def _download_age_confirm_page(self, url, video_id, *args, **kwargs): + form_data = random_birthday('rok', 'miesiac', 'dzien') + form_data.update({'return': url, 'module': 'video', 'module_id': video_id}) + data, content_type = multipart_encode(form_data) + return self._download_webpage( + urljoin(url, '/a/validatebirth'), video_id, *args, + data=data, headers={ + 'Referer': url, + 'Content-Type': content_type, + }, **kwargs) + def _real_extract(self, url): video_id = self._match_id(url) self._set_cookie('cda.pl', 'cda.player', 'html5') @@ -57,6 +89,13 @@ class CDAIE(InfoExtractor): if 'Ten film jest dostępny dla użytkowników premium' in webpage: raise ExtractorError('This video is only available for premium users.', expected=True) + need_confirm_age = False + if self._html_search_regex(r'(<form[^>]+action="/a/validatebirth")', + webpage, 'birthday validate form', default=None): + webpage = self._download_age_confirm_page( + url, video_id, note='Confirming age') + need_confirm_age = True + formats = [] uploader = self._search_regex(r'''(?x) @@ -81,6 +120,7 @@ class CDAIE(InfoExtractor): 'thumbnail': self._og_search_thumbnail(webpage), 'formats': formats, 'duration': None, + 'age_limit': 18 if need_confirm_age else 0, } def extract_format(page, version): @@ -121,7 +161,12 @@ class CDAIE(InfoExtractor): for href, resolution in re.findall( r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)', webpage): - webpage = self._download_webpage( + if need_confirm_age: + handler = self._download_age_confirm_page + else: + handler = self._download_webpage + + webpage = handler( self._BASE_URL + href, video_id, 'Downloading %s version information' % resolution, fatal=False) if not webpage: @@ -129,6 +174,7 @@ class CDAIE(InfoExtractor): # invalid version is requested. self.report_warning('Unable to download %s version information' % resolution) continue + extract_format(webpage, resolution) self._sort_formats(formats) diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py index dd2529a..e250de1 100644 --- a/youtube_dl/extractor/ceskatelevize.py +++ b/youtube_dl/extractor/ceskatelevize.py @@ -12,13 +12,14 @@ from ..utils import ( ExtractorError, float_or_none, sanitized_Request, + unescapeHTML, urlencode_postdata, USER_AGENTS, ) class CeskaTelevizeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(porady|ivysilani)/(?:[^/]+/)*(?P<id>[^/#?]+)/*(?:[#?].*)?$' + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' _TESTS = [{ 'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220', 'info_dict': { @@ -62,40 +63,12 @@ class CeskaTelevizeIE(InfoExtractor): }, 'skip': 'Georestricted to Czech Republic', }, { - # video with 18+ caution trailer - 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', - 'info_dict': { - 'id': '215562210900007-bogotart', - 'title': 'Queer: Bogotart', - 'description': 'Alternativní průvodce současným queer světem', - }, - 'playlist': [{ - 'info_dict': { - 'id': '61924494876844842', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Varování 18+)', - 'duration': 10.2, - }, - }, { - 'info_dict': { - 'id': '61924494877068022', - 'ext': 'mp4', - 'title': 'Queer: Bogotart (Queer)', - 'thumbnail': r're:^https?://.*\.jpg', - 'duration': 1558.3, - }, - }], - 'params': { - # m3u8 download - 'skip_download': True, - }, + 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25', + 'only_matching': True, }] def _real_extract(self, url): - url = url.replace('/porady/', '/ivysilani/').replace('/video/', '') - - mobj = re.match(self._VALID_URL, url) - playlist_id = mobj.group('id') + playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) @@ -103,13 +76,28 @@ class CeskaTelevizeIE(InfoExtractor): if '%s</p>' % NOT_AVAILABLE_STRING in webpage: raise ExtractorError(NOT_AVAILABLE_STRING, expected=True) - typ = self._html_search_regex( - r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type') - episode_id = self._html_search_regex( - r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id') + type_ = None + episode_id = None + + playlist = self._parse_json( + self._search_regex( + r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist', + default='{}'), playlist_id) + if playlist: + type_ = playlist.get('type') + episode_id = playlist.get('id') + + if not type_: + type_ = self._html_search_regex( + r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', + webpage, 'type') + if not episode_id: + episode_id = self._html_search_regex( + r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', + webpage, 'episode_id') data = { - 'playlist[0][type]': typ, + 'playlist[0][type]': type_, 'playlist[0][id]': episode_id, 'requestUrl': compat_urllib_parse_urlparse(url).path, 'requestSource': 'iVysilani', @@ -245,3 +233,47 @@ class CeskaTelevizeIE(InfoExtractor): yield line return '\r\n'.join(_fix_subtitle(subtitles)) + + +class CeskaTelevizePoradyIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)' + _TESTS = [{ + # video with 18+ caution trailer + 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/', + 'info_dict': { + 'id': '215562210900007-bogotart', + 'title': 'Queer: Bogotart', + 'description': 'Alternativní průvodce současným queer světem', + }, + 'playlist': [{ + 'info_dict': { + 'id': '61924494876844842', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Varování 18+)', + 'duration': 10.2, + }, + }, { + 'info_dict': { + 'id': '61924494877068022', + 'ext': 'mp4', + 'title': 'Queer: Bogotart (Queer)', + 'thumbnail': r're:^https?://.*\.jpg', + 'duration': 1558.3, + }, + }], + 'params': { + # m3u8 download + 'skip_download': True, + }, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + data_url = unescapeHTML(self._search_regex( + r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'iframe player url', group='url')) + + return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key()) diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py index 8fbc91c..e3eba4b 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/youtube_dl/extractor/chaturbate.py @@ -33,10 +33,17 @@ class ChaturbateIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - m3u8_formats = [(m.group('id').lower(), m.group('url')) for m in re.finditer( - r'hlsSource(?P<id>.+?)\s*=\s*(?P<q>["\'])(?P<url>http.+?)(?P=q)', webpage)] + m3u8_urls = [] - if not m3u8_formats: + for m in re.finditer( + r'(["\'])(?P<url>http.+?\.m3u8.*?)\1', webpage): + m3u8_fast_url, m3u8_no_fast_url = m.group('url'), m.group( + 'url').replace('_fast', '') + for m3u8_url in (m3u8_fast_url, m3u8_no_fast_url): + if m3u8_url not in m3u8_urls: + m3u8_urls.append(m3u8_url) + + if not m3u8_urls: error = self._search_regex( [r'<span[^>]+class=(["\'])desc_span\1[^>]*>(?P<error>[^<]+)</span>', r'<div[^>]+id=(["\'])defchat\1[^>]*>\s*<p><strong>(?P<error>[^<]+)<'], @@ -50,7 +57,8 @@ class ChaturbateIE(InfoExtractor): raise ExtractorError('Unable to find stream URL') formats = [] - for m3u8_id, m3u8_url in m3u8_formats: + for m3u8_url in m3u8_urls: + m3u8_id = 'fast' if '_fast' in m3u8_url else 'slow' formats.extend(self._extract_m3u8_formats( m3u8_url, video_id, ext='mp4', # ffmpeg skips segments for fast m3u8 diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py index bb52e0c..0920f62 100644 --- a/youtube_dl/extractor/clipfish.py +++ b/youtube_dl/extractor/clipfish.py @@ -12,7 +12,7 @@ class ClipfishIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.clipfish.de/special/ugly-americans/video/4343170/s01-e01-ugly-americans-date-in-der-hoelle/', - 'md5': '720563e467b86374c194bdead08d207d', + 'md5': 'b9a5dc46294154c1193e2d10e0c95693', 'info_dict': { 'id': '4343170', 'ext': 'mp4', diff --git a/youtube_dl/extractor/collegerama.py b/youtube_dl/extractor/collegerama.py index 18c7347..6a41db8 100644 --- a/youtube_dl/extractor/collegerama.py +++ b/youtube_dl/extractor/collegerama.py @@ -21,7 +21,7 @@ class CollegeRamaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.', 'description': '', - 'thumbnail': r're:^https?://.*\.jpg$', + 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$', 'duration': 7713.088, 'timestamp': 1413309600, 'upload_date': '20141014', @@ -35,6 +35,7 @@ class CollegeRamaIE(InfoExtractor): 'ext': 'wmv', 'title': '64ste Vakantiecursus: Afvalwater', 'description': 'md5:7fd774865cc69d972f542b157c328305', + 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$', 'duration': 10853, 'timestamp': 1326446400, 'upload_date': '20120113', diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 6c3c095..fec39da 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import base64 @@ -244,6 +245,10 @@ class InfoExtractor(object): specified in the URL. end_time: Time in seconds where the reproduction should end, as specified in the URL. + chapters: A list of dictionaries, with the following entries: + * "start_time" - The start time of the chapter in seconds + * "end_time" - The end time of the chapter in seconds + * "title" (optional, string) The following fields should only be used when the video belongs to some logical chapter or section: @@ -547,6 +552,34 @@ class InfoExtractor(object): return encoding + def __check_blocked(self, content): + first_block = content[:512] + if ('<title>Access to this site is blocked</title>' in content and + 'Websense' in first_block): + msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' + blocked_iframe = self._html_search_regex( + r'<iframe src="([^"]+)"', content, + 'Websense information URL', default=None) + if blocked_iframe: + msg += ' Visit %s for more details' % blocked_iframe + raise ExtractorError(msg, expected=True) + if '<title>The URL you requested has been blocked</title>' in first_block: + msg = ( + 'Access to this webpage has been blocked by Indian censorship. ' + 'Use a VPN or proxy server (with --proxy) to route around it.') + block_msg = self._html_search_regex( + r'</h1><p>(.*?)</p>', + content, 'block message', default=None) + if block_msg: + msg += ' (Message: "%s")' % block_msg.replace('\n', ' ') + raise ExtractorError(msg, expected=True) + if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and + 'blocklist.rkn.gov.ru' in content): + raise ExtractorError( + 'Access to this webpage has been blocked by decision of the Russian government. ' + 'Visit http://blocklist.rkn.gov.ru/ for a block reason.', + expected=True) + def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None): content_type = urlh.headers.get('Content-Type', '') webpage_bytes = urlh.read() @@ -588,25 +621,7 @@ class InfoExtractor(object): except LookupError: content = webpage_bytes.decode('utf-8', 'replace') - if ('<title>Access to this site is blocked</title>' in content and - 'Websense' in content[:512]): - msg = 'Access to this webpage has been blocked by Websense filtering software in your network.' - blocked_iframe = self._html_search_regex( - r'<iframe src="([^"]+)"', content, - 'Websense information URL', default=None) - if blocked_iframe: - msg += ' Visit %s for more details' % blocked_iframe - raise ExtractorError(msg, expected=True) - if '<title>The URL you requested has been blocked</title>' in content[:512]: - msg = ( - 'Access to this webpage has been blocked by Indian censorship. ' - 'Use a VPN or proxy server (with --proxy) to route around it.') - block_msg = self._html_search_regex( - r'</h1><p>(.*?)</p>', - content, 'block message', default=None) - if block_msg: - msg += ' (Message: "%s")' % block_msg.replace('\n', ' ') - raise ExtractorError(msg, expected=True) + self.__check_blocked(content) return content @@ -965,6 +980,23 @@ class InfoExtractor(object): return info if isinstance(json_ld, dict): json_ld = [json_ld] + + def extract_video_object(e): + assert e['@type'] == 'VideoObject' + info.update({ + 'url': e.get('contentUrl'), + 'title': unescapeHTML(e.get('name')), + 'description': unescapeHTML(e.get('description')), + 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), + 'duration': parse_duration(e.get('duration')), + 'timestamp': unified_timestamp(e.get('uploadDate')), + 'filesize': float_or_none(e.get('contentSize')), + 'tbr': int_or_none(e.get('bitrate')), + 'width': int_or_none(e.get('width')), + 'height': int_or_none(e.get('height')), + 'view_count': int_or_none(e.get('interactionCount')), + }) + for e in json_ld: if e.get('@context') == 'http://schema.org': item_type = e.get('@type') @@ -989,18 +1021,11 @@ class InfoExtractor(object): 'description': unescapeHTML(e.get('articleBody')), }) elif item_type == 'VideoObject': - info.update({ - 'url': e.get('contentUrl'), - 'title': unescapeHTML(e.get('name')), - 'description': unescapeHTML(e.get('description')), - 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'), - 'duration': parse_duration(e.get('duration')), - 'timestamp': unified_timestamp(e.get('uploadDate')), - 'filesize': float_or_none(e.get('contentSize')), - 'tbr': int_or_none(e.get('bitrate')), - 'width': int_or_none(e.get('width')), - 'height': int_or_none(e.get('height')), - }) + extract_video_object(e) + elif item_type == 'WebPage': + video = e.get('video') + if isinstance(video, dict) and video.get('@type') == 'VideoObject': + extract_video_object(video) break return dict((k, v) for k, v in info.items() if v is not None) @@ -1292,40 +1317,50 @@ class InfoExtractor(object): entry_protocol='m3u8', preference=None, m3u8_id=None, note=None, errnote=None, fatal=True, live=False): - res = self._download_webpage_handle( m3u8_url, video_id, note=note or 'Downloading m3u8 information', errnote=errnote or 'Failed to download m3u8 information', fatal=fatal) + if res is False: return [] + m3u8_doc, urlh = res m3u8_url = urlh.geturl() + return self._parse_m3u8_formats( + m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol, + preference=preference, m3u8_id=m3u8_id, live=live) + + def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None, + entry_protocol='m3u8', preference=None, + m3u8_id=None, live=False): if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access return [] - formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)] + formats = [] format_url = lambda u: ( u if re.match(r'^https?://', u) else compat_urlparse.urljoin(m3u8_url, u)) - # We should try extracting formats only from master playlists [1], i.e. - # playlists that describe available qualities. On the other hand media - # playlists [2] should be returned as is since they contain just the media - # without qualities renditions. + # References: + # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21 + # 2. https://github.com/rg3/youtube-dl/issues/12211 + + # We should try extracting formats only from master playlists [1, 4.3.4], + # i.e. playlists that describe available qualities. On the other hand + # media playlists [1, 4.3.3] should be returned as is since they contain + # just the media without qualities renditions. # Fortunately, master playlist can be easily distinguished from media - # playlist based on particular tags availability. As of [1, 2] master - # playlist tags MUST NOT appear in a media playist and vice versa. - # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist - # and MUST NOT appear in master playlist thus we can clearly detect media - # playlist with this criterion. - # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4 - # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3 - # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1 + # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4] + # master playlist tags MUST NOT appear in a media playist and vice versa. + # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every + # media playlist and MUST NOT appear in master playlist thus we can + # clearly detect media playlist with this criterion. + if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is return [{ 'url': m3u8_url, @@ -1334,52 +1369,72 @@ class InfoExtractor(object): 'protocol': entry_protocol, 'preference': preference, }] - audio_in_video_stream = {} - last_info = {} - last_media = {} + + groups = {} + last_stream_inf = {} + + def extract_media(x_media_line): + media = parse_m3u8_attributes(x_media_line) + # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED + media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME') + if not (media_type and group_id and name): + return + groups.setdefault(group_id, []).append(media) + if media_type not in ('VIDEO', 'AUDIO'): + return + media_url = media.get('URI') + if media_url: + format_id = [] + for v in (group_id, name): + if v: + format_id.append(v) + f = { + 'format_id': '-'.join(format_id), + 'url': format_url(media_url), + 'manifest_url': m3u8_url, + 'language': media.get('LANGUAGE'), + 'ext': ext, + 'protocol': entry_protocol, + 'preference': preference, + } + if media_type == 'AUDIO': + f['vcodec'] = 'none' + formats.append(f) + + def build_stream_name(): + # Despite specification does not mention NAME attribute for + # EXT-X-STREAM-INF tag it still sometimes may be present (see [1] + # or vidio test in TestInfoExtractor.test_parse_m3u8_formats) + # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015 + stream_name = last_stream_inf.get('NAME') + if stream_name: + return stream_name + # If there is no NAME in EXT-X-STREAM-INF it will be obtained + # from corresponding rendition group + stream_group_id = last_stream_inf.get('VIDEO') + if not stream_group_id: + return + stream_group = groups.get(stream_group_id) + if not stream_group: + return stream_group_id + rendition = stream_group[0] + return rendition.get('NAME') or stream_group_id + for line in m3u8_doc.splitlines(): if line.startswith('#EXT-X-STREAM-INF:'): - last_info = parse_m3u8_attributes(line) + last_stream_inf = parse_m3u8_attributes(line) elif line.startswith('#EXT-X-MEDIA:'): - media = parse_m3u8_attributes(line) - media_type = media.get('TYPE') - if media_type in ('VIDEO', 'AUDIO'): - group_id = media.get('GROUP-ID') - media_url = media.get('URI') - if media_url: - format_id = [] - for v in (group_id, media.get('NAME')): - if v: - format_id.append(v) - f = { - 'format_id': '-'.join(format_id), - 'url': format_url(media_url), - 'language': media.get('LANGUAGE'), - 'ext': ext, - 'protocol': entry_protocol, - 'preference': preference, - } - if media_type == 'AUDIO': - f['vcodec'] = 'none' - if group_id and not audio_in_video_stream.get(group_id): - audio_in_video_stream[group_id] = False - formats.append(f) - else: - # When there is no URI in EXT-X-MEDIA let this tag's - # data be used by regular URI lines below - last_media = media - if media_type == 'AUDIO' and group_id: - audio_in_video_stream[group_id] = True + extract_media(line) elif line.startswith('#') or not line.strip(): continue else: - tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000) + tbr = float_or_none( + last_stream_inf.get('AVERAGE-BANDWIDTH') or + last_stream_inf.get('BANDWIDTH'), scale=1000) format_id = [] if m3u8_id: format_id.append(m3u8_id) - # Despite specification does not mention NAME attribute for - # EXT-X-STREAM-INF it still sometimes may be present - stream_name = last_info.get('NAME') or last_media.get('NAME') + stream_name = build_stream_name() # Bandwidth of live streams may differ over time thus making # format_id unpredictable. So it's better to keep provided # format_id intact. @@ -1389,14 +1444,14 @@ class InfoExtractor(object): f = { 'format_id': '-'.join(format_id), 'url': manifest_url, - 'manifest_url': manifest_url, + 'manifest_url': m3u8_url, 'tbr': tbr, 'ext': ext, - 'fps': float_or_none(last_info.get('FRAME-RATE')), + 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')), 'protocol': entry_protocol, 'preference': preference, } - resolution = last_info.get('RESOLUTION') + resolution = last_stream_inf.get('RESOLUTION') if resolution: mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution) if mobj: @@ -1412,13 +1467,26 @@ class InfoExtractor(object): 'vbr': vbr, 'abr': abr, }) - f.update(parse_codecs(last_info.get('CODECS'))) - if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none': - # TODO: update acodec for audio only formats with the same GROUP-ID - f['acodec'] = 'none' + codecs = parse_codecs(last_stream_inf.get('CODECS')) + f.update(codecs) + audio_group_id = last_stream_inf.get('AUDIO') + # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which + # references a rendition group MUST have a CODECS attribute. + # However, this is not always respected, for example, [2] + # contains EXT-X-STREAM-INF tag which references AUDIO + # rendition group but does not have CODECS and despite + # referencing audio group an audio group, it represents + # a complete (with audio and video) format. So, for such cases + # we will ignore references to rendition groups and treat them + # as complete formats. + if audio_group_id and codecs and f.get('vcodec') != 'none': + audio_group = groups.get(audio_group_id) + if audio_group and audio_group[0].get('URI'): + # TODO: update acodec for audio only formats with + # the same GROUP-ID + f['acodec'] = 'none' formats.append(f) - last_info = {} - last_media = {} + last_stream_inf = {} return formats @staticmethod @@ -1768,7 +1836,7 @@ class InfoExtractor(object): if content_type == 'text': # TODO implement WebVTT downloading pass - elif content_type == 'video' or content_type == 'audio': + elif content_type in ('video', 'audio'): base_url = '' for element in (representation, adaptation_set, period, mpd_doc): base_url_e = element.find(_add_ns('BaseURL')) @@ -1792,7 +1860,7 @@ class InfoExtractor(object): 'ext': mimetype2ext(mime_type), 'width': int_or_none(representation_attrib.get('width')), 'height': int_or_none(representation_attrib.get('height')), - 'tbr': int_or_none(bandwidth, 1000), + 'tbr': float_or_none(bandwidth, 1000), 'asr': int_or_none(representation_attrib.get('audioSamplingRate')), 'fps': int_or_none(representation_attrib.get('frameRate')), 'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None, @@ -1933,6 +2001,12 @@ class InfoExtractor(object): compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id) def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None): + """ + Parse formats from ISM manifest. + References: + 1. [MS-SSTR]: Smooth Streaming Protocol, + https://msdn.microsoft.com/en-us/library/ff469518.aspx + """ if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None: return [] @@ -1954,8 +2028,11 @@ class InfoExtractor(object): self.report_warning('%s is not a supported codec' % fourcc) continue tbr = int(track.attrib['Bitrate']) // 1000 - width = int_or_none(track.get('MaxWidth')) - height = int_or_none(track.get('MaxHeight')) + # [1] does not mention Width and Height attributes. However, + # they're often present while MaxWidth and MaxHeight are + # missing, so should be used as fallbacks + width = int_or_none(track.get('MaxWidth') or track.get('Width')) + height = int_or_none(track.get('MaxHeight') or track.get('Height')) sampling_rate = int_or_none(track.get('SamplingRate')) track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern) @@ -2106,7 +2183,7 @@ class InfoExtractor(object): def _extract_akamai_formats(self, manifest_url, video_id, hosts={}): formats = [] hdcore_sign = 'hdcore=3.7.0' - f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') + f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m') hds_host = hosts.get('hds') if hds_host: f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url) @@ -2128,8 +2205,9 @@ class InfoExtractor(object): def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]): url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url) - url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url') - http_base_url = 'http' + url_base + url_base = self._search_regex( + r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url') + http_base_url = '%s:%s' % ('http', url_base) formats = [] if 'm3u8' not in skip_protocols: formats.extend(self._extract_m3u8_formats( @@ -2163,7 +2241,7 @@ class InfoExtractor(object): for protocol in ('rtmp', 'rtsp'): if protocol not in skip_protocols: formats.append({ - 'url': protocol + url_base, + 'url': '%s:%s' % (protocol, url_base), 'format_id': protocol, 'protocol': protocol, }) @@ -2171,7 +2249,7 @@ class InfoExtractor(object): def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json): mobj = re.search( - r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)', + r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)', webpage) if mobj: try: @@ -2247,11 +2325,17 @@ class InfoExtractor(object): def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None, m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None): + urls = [] formats = [] for source in jwplayer_sources_data: - source_url = self._proto_relative_url(source['file']) + source_url = self._proto_relative_url(source.get('file')) + if not source_url: + continue if base_url: source_url = compat_urlparse.urljoin(base_url, source_url) + if source_url in urls: + continue + urls.append(source_url) source_type = source.get('type') or '' ext = mimetype2ext(source_type) or determine_ext(source_url) if source_type == 'hls' or ext == 'm3u8': diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index d3463b8..0c3f0c0 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -16,7 +16,6 @@ from ..utils import ( mimetype2ext, orderedSet, parse_iso8601, - remove_end, ) @@ -50,10 +49,17 @@ class CondeNastIE(InfoExtractor): 'wmagazine': 'W Magazine', } - _VALID_URL = r'https?://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed(?:js)?)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys()) + _VALID_URL = r'''(?x)https?://(?:video|www|player(?:-backend)?)\.(?:%s)\.com/ + (?: + (?: + embed(?:js)?| + (?:script|inline)/video + )/(?P<id>[0-9a-f]{24})(?:/(?P<player_id>[0-9a-f]{24}))?(?:.+?\btarget=(?P<target>[^&]+))?| + (?P<type>watch|series|video)/(?P<display_id>[^/?#]+) + )''' % '|'.join(_SITES.keys()) IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values())) - EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed(?:js)?)/.+?' % '|'.join(_SITES.keys()) + EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' % '|'.join(_SITES.keys()) _TESTS = [{ 'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led', @@ -89,6 +95,12 @@ class CondeNastIE(InfoExtractor): 'upload_date': '20150916', 'timestamp': 1442434955, } + }, { + 'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player', + 'only_matching': True, + }, { + 'url': 'http://player-backend.cnevids.com/script/video/59138decb57ac36b83000005.js', + 'only_matching': True, }] def _extract_series(self, url, webpage): @@ -104,7 +116,7 @@ class CondeNastIE(InfoExtractor): entries = [self.url_result(build_url(path), 'CondeNast') for path in paths] return self.playlist_result(entries, playlist_title=title) - def _extract_video(self, webpage, url_type): + def _extract_video_params(self, webpage): query = {} params = self._search_regex( r'(?s)var params = {(.+?)}[;,]', webpage, 'player params', default=None) @@ -123,17 +135,30 @@ class CondeNastIE(InfoExtractor): 'playerId': params['data-player'], 'target': params['id'], }) - video_id = query['videoId'] + return query + + def _extract_video(self, params): + video_id = params['videoId'] + video_info = None - info_page = self._download_json( - 'http://player.cnevids.com/player/video.js', - video_id, 'Downloading video info', fatal=False, query=query) - if info_page: - video_info = info_page.get('video') - if not video_info: + if params.get('playerId'): + info_page = self._download_json( + 'http://player.cnevids.com/player/video.js', + video_id, 'Downloading video info', fatal=False, query=params) + if info_page: + video_info = info_page.get('video') + if not video_info: + info_page = self._download_webpage( + 'http://player.cnevids.com/player/loader.js', + video_id, 'Downloading loader info', query=params) + else: info_page = self._download_webpage( - 'http://player.cnevids.com/player/loader.js', - video_id, 'Downloading loader info', query=query) + 'https://player.cnevids.com/inline/video/%s.js' % video_id, + video_id, 'Downloading inline info', query={ + 'target': params.get('target', 'embedplayer') + }) + + if not video_info: video_info = self._parse_json( self._search_regex( r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'), @@ -161,9 +186,7 @@ class CondeNastIE(InfoExtractor): }) self._sort_formats(formats) - info = self._search_json_ld( - webpage, video_id, fatal=False) if url_type != 'embed' else {} - info.update({ + return { 'id': video_id, 'formats': formats, 'title': title, @@ -174,22 +197,26 @@ class CondeNastIE(InfoExtractor): 'series': video_info.get('series_title'), 'season': video_info.get('season_title'), 'timestamp': parse_iso8601(video_info.get('premiere_date')), - }) - return info + 'categories': video_info.get('categories'), + } def _real_extract(self, url): - site, url_type, item_id = re.match(self._VALID_URL, url).groups() + video_id, player_id, target, url_type, display_id = re.match(self._VALID_URL, url).groups() - # Convert JS embed to regular embed - if url_type == 'embedjs': - parsed_url = compat_urlparse.urlparse(url) - url = compat_urlparse.urlunparse(parsed_url._replace( - path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/'))) - url_type = 'embed' + if video_id: + return self._extract_video({ + 'videoId': video_id, + 'playerId': player_id, + 'target': target, + }) - webpage = self._download_webpage(url, item_id) + webpage = self._download_webpage(url, display_id) if url_type == 'series': return self._extract_series(url, webpage) else: - return self._extract_video(webpage, url_type) + params = self._extract_video_params(webpage) + info = self._search_json_ld( + webpage, display_id, fatal=False) + info.update(self._extract_video(params)) + return info diff --git a/youtube_dl/extractor/coub.py b/youtube_dl/extractor/coub.py index 5fa1f00..6ea03e6 100644 --- a/youtube_dl/extractor/coub.py +++ b/youtube_dl/extractor/coub.py @@ -24,12 +24,11 @@ class CoubIE(InfoExtractor): 'duration': 4.6, 'timestamp': 1428527772, 'upload_date': '20150408', - 'uploader': 'Артём Лоскутников', + 'uploader': 'Artyom Loskutnikov', 'uploader_id': 'artyom.loskutnikov', 'view_count': int, 'like_count': int, 'repost_count': int, - 'comment_count': int, 'age_limit': 0, }, }, { @@ -118,7 +117,6 @@ class CoubIE(InfoExtractor): view_count = int_or_none(coub.get('views_count') or coub.get('views_increase_count')) like_count = int_or_none(coub.get('likes_count')) repost_count = int_or_none(coub.get('recoubs_count')) - comment_count = int_or_none(coub.get('comments_count')) age_restricted = coub.get('age_restricted', coub.get('age_restricted_by_admin')) if age_restricted is not None: @@ -137,7 +135,6 @@ class CoubIE(InfoExtractor): 'view_count': view_count, 'like_count': like_count, 'repost_count': repost_count, - 'comment_count': comment_count, 'age_limit': age_limit, 'formats': formats, } diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py index f919ed2..13f425b 100644 --- a/youtube_dl/extractor/crackle.py +++ b/youtube_dl/extractor/crackle.py @@ -21,9 +21,10 @@ class CrackleIE(InfoExtractor): 'season_number': 8, 'episode_number': 4, 'subtitles': { - 'en-US': [{ - 'ext': 'ttml', - }] + 'en-US': [ + {'ext': 'vtt'}, + {'ext': 'tt'}, + ] }, }, 'params': { diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index d15fd37..2ffa4a7 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -171,7 +171,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'info_dict': { 'id': '727589', 'ext': 'mp4', - 'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance from this Judicial Injustice!", + 'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance From This Judicial Injustice!", 'description': 'md5:cbcf05e528124b0f3a0a419fc805ea7d', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': 'Kadokawa Pictures Inc.', @@ -179,7 +179,7 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'series': "KONOSUBA -God's blessing on this wonderful world!", 'season': "KONOSUBA -God's blessing on this wonderful world! 2", 'season_number': 2, - 'episode': 'Give Me Deliverance from this Judicial Injustice!', + 'episode': 'Give Me Deliverance From This Judicial Injustice!', 'episode_number': 1, }, 'params': { @@ -390,7 +390,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text else: webpage_url = 'http://www.' + mobj.group('url') - webpage = self._download_webpage(self._add_skip_wall(webpage_url), video_id, 'Downloading webpage') + webpage = self._download_webpage( + self._add_skip_wall(webpage_url), video_id, + headers=self.geo_verification_headers()) note_m = self._html_search_regex( r'<div class="showmedia-trailer-notice">(.+?)</div>', webpage, 'trailer-notice', default='') @@ -565,7 +567,9 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE): def _real_extract(self, url): show_id = self._match_id(url) - webpage = self._download_webpage(self._add_skip_wall(url), show_id) + webpage = self._download_webpage( + self._add_skip_wall(url), show_id, + headers=self.geo_verification_headers()) title = self._html_search_regex( r'(?s)<h1[^>]*>\s*<span itemprop="name">(.*?)</span>', webpage, 'title') diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index d457616..171820e 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -10,6 +10,7 @@ from ..utils import ( smuggle_url, determine_ext, ExtractorError, + extract_attributes, ) from .senateisvp import SenateISVPIE from .ustream import UstreamIE @@ -68,6 +69,7 @@ class CSpanIE(InfoExtractor): 'uploader_id': '12987475', }, }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' def _real_extract(self, url): video_id = self._match_id(url) @@ -78,6 +80,19 @@ class CSpanIE(InfoExtractor): if ustream_url: return self.url_result(ustream_url, UstreamIE.ie_key()) + if '&vod' not in url: + bc = self._search_regex( + r"(<[^>]+id='brightcove-player-embed'[^>]+>)", + webpage, 'brightcove embed', default=None) + if bc: + bc_attr = extract_attributes(bc) + bc_url = self.BRIGHTCOVE_URL_TEMPLATE % ( + bc_attr.get('data-bcaccountid', '3162030207001'), + bc_attr.get('data-noprebcplayerid', 'SyGGpuJy3g'), + bc_attr.get('data-newbcplayerid', 'default'), + bc_attr['data-bcid']) + return self.url_result(smuggle_url(bc_url, {'source_url': url})) + # We first look for clipid, because clipprog always appears before patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')] results = list(filter(None, (re.search(p, webpage) for p in patterns))) diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py index e3c9946..8e45923 100644 --- a/youtube_dl/extractor/curiositystream.py +++ b/youtube_dl/extractor/curiositystream.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( int_or_none, @@ -46,9 +48,50 @@ class CuriosityStreamBaseIE(InfoExtractor): def _extract_media_info(self, media): video_id = compat_str(media['id']) - limelight_media_id = media['limelight_media_id'] title = media['title'] + formats = [] + for encoding in media.get('encodings', []): + m3u8_url = encoding.get('master_playlist_url') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + encoding_url = encoding.get('url') + file_url = encoding.get('file_url') + if not encoding_url and not file_url: + continue + f = { + 'width': int_or_none(encoding.get('width')), + 'height': int_or_none(encoding.get('height')), + 'vbr': int_or_none(encoding.get('video_bitrate')), + 'abr': int_or_none(encoding.get('audio_bitrate')), + 'filesize': int_or_none(encoding.get('size_in_bytes')), + 'vcodec': encoding.get('video_codec'), + 'acodec': encoding.get('audio_codec'), + 'container': encoding.get('container_type'), + } + for f_url in (encoding_url, file_url): + if not f_url: + continue + fmt = f.copy() + rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', f_url) + if rtmp: + fmt.update({ + 'url': rtmp.group('url'), + 'play_path': rtmp.group('playpath'), + 'app': rtmp.group('app'), + 'ext': 'flv', + 'format_id': 'rtmp', + }) + else: + fmt.update({ + 'url': f_url, + 'format_id': 'http', + }) + formats.append(fmt) + self._sort_formats(formats) + subtitles = {} for closed_caption in media.get('closed_captions', []): sub_url = closed_caption.get('file') @@ -60,16 +103,14 @@ class CuriosityStreamBaseIE(InfoExtractor): }) return { - '_type': 'url_transparent', 'id': video_id, - 'url': 'limelight:media:' + limelight_media_id, + 'formats': formats, 'title': title, 'description': media.get('description'), 'thumbnail': media.get('image_large') or media.get('image_medium') or media.get('image_small'), 'duration': int_or_none(media.get('duration')), 'tags': media.get('tags'), 'subtitles': subtitles, - 'ie_key': 'LimelightMedia', } @@ -78,14 +119,12 @@ class CuriosityStreamIE(CuriosityStreamBaseIE): _VALID_URL = r'https?://app\.curiositystream\.com/video/(?P<id>\d+)' _TEST = { 'url': 'https://app.curiositystream.com/video/2', - 'md5': 'a0074c190e6cddaf86900b28d3e9ee7a', + 'md5': '262bb2f257ff301115f1973540de8983', 'info_dict': { 'id': '2', 'ext': 'mp4', 'title': 'How Did You Develop The Internet?', 'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.', - 'timestamp': 1448388615, - 'upload_date': '20151124', } } @@ -105,7 +144,7 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE): 'title': 'Curious Minds: The Internet', 'description': 'How is the internet shaping our lives in the 21st Century?', }, - 'playlist_mincount': 17, + 'playlist_mincount': 12, } def _real_extract(self, url): diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py index 1ab9333..f4cf0f1 100644 --- a/youtube_dl/extractor/cwtv.py +++ b/youtube_dl/extractor/cwtv.py @@ -82,6 +82,11 @@ class CWTVIE(InfoExtractor): 'url': quality_url, 'tbr': tbr, }) + video_metadata = video_data['assetFields'] + ism_url = video_metadata.get('smoothStreamingUrl') + if ism_url: + formats.extend(self._extract_ism_formats( + ism_url, video_id, ism_id='mss', fatal=False)) self._sort_formats(formats) thumbnails = [{ @@ -90,8 +95,6 @@ class CWTVIE(InfoExtractor): 'height': image.get('height'), } for image_id, image in video_data['images'].items() if image.get('uri')] if video_data.get('images') else None - video_metadata = video_data['assetFields'] - subtitles = { 'en': [{ 'url': video_metadata['UnicornCcUrl'], diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py index 98c835b..538565c 100644 --- a/youtube_dl/extractor/dailymail.py +++ b/youtube_dl/extractor/dailymail.py @@ -2,9 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( int_or_none, determine_protocol, + try_get, unescapeHTML, ) @@ -28,8 +30,14 @@ class DailyMailIE(InfoExtractor): video_data = self._parse_json(self._search_regex( r"data-opts='({.+?})'", webpage, 'video data'), video_id) title = unescapeHTML(video_data['title']) - video_sources = self._download_json(video_data.get( - 'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id) + + sources_url = (try_get( + video_data, + (lambda x: x['plugins']['sources']['url'], + lambda x: x['sources']['url']), compat_str) or + 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id) + + video_sources = self._download_json(sources_url, video_id) formats = [] for rendition in video_sources['renditions']: diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 246efde..f8db76c 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -38,7 +38,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor): class DailymotionIE(DailymotionBaseInfoExtractor): - _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:embed|swf|#)/)?video/(?P<id>[^/?_]+)' + _VALID_URL = r'(?i)https?://(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|#)/)?video|swf)/(?P<id>[^/?_]+)' IE_NAME = 'dailymotion' _FORMATS = [ @@ -49,68 +49,82 @@ class DailymotionIE(DailymotionBaseInfoExtractor): ('stream_h264_hd1080_url', 'hd180'), ] - _TESTS = [ - { - 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', - 'md5': '2137c41a8e78554bb09225b8eb322406', - 'info_dict': { - 'id': 'x2iuewm', - 'ext': 'mp4', - 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News', - 'description': 'Several come bundled with the Steam Controller.', - 'thumbnail': r're:^https?:.*\.(?:jpg|png)$', - 'duration': 74, - 'timestamp': 1425657362, - 'upload_date': '20150306', - 'uploader': 'IGN', - 'uploader_id': 'xijv66', - 'age_limit': 0, - 'view_count': int, - } + _TESTS = [{ + 'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news', + 'md5': '074b95bdee76b9e3654137aee9c79dfe', + 'info_dict': { + 'id': 'x5kesuj', + 'ext': 'mp4', + 'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller', + 'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. Miller', + 'thumbnail': r're:^https?:.*\.(?:jpg|png)$', + 'duration': 187, + 'timestamp': 1493651285, + 'upload_date': '20170501', + 'uploader': 'Deadline', + 'uploader_id': 'x1xm8ri', + 'age_limit': 0, + 'view_count': int, + }, + }, { + 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames', + 'md5': '2137c41a8e78554bb09225b8eb322406', + 'info_dict': { + 'id': 'x2iuewm', + 'ext': 'mp4', + 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News', + 'description': 'Several come bundled with the Steam Controller.', + 'thumbnail': r're:^https?:.*\.(?:jpg|png)$', + 'duration': 74, + 'timestamp': 1425657362, + 'upload_date': '20150306', + 'uploader': 'IGN', + 'uploader_id': 'xijv66', + 'age_limit': 0, + 'view_count': int, }, + 'skip': 'video gone', + }, { # Vevo video - { - 'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi', - 'info_dict': { - 'title': 'Roar (Official)', - 'id': 'USUV71301934', - 'ext': 'mp4', - 'uploader': 'Katy Perry', - 'upload_date': '20130905', - }, - 'params': { - 'skip_download': True, - }, - 'skip': 'VEVO is only available in some countries', + 'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi', + 'info_dict': { + 'title': 'Roar (Official)', + 'id': 'USUV71301934', + 'ext': 'mp4', + 'uploader': 'Katy Perry', + 'upload_date': '20130905', + }, + 'params': { + 'skip_download': True, }, + 'skip': 'VEVO is only available in some countries', + }, { # age-restricted video - { - 'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband', - 'md5': '0d667a7b9cebecc3c89ee93099c4159d', - 'info_dict': { - 'id': 'xyh2zz', - 'ext': 'mp4', - 'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]', - 'uploader': 'HotWaves1012', - 'age_limit': 18, - }, - 'skip': 'video gone', + 'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband', + 'md5': '0d667a7b9cebecc3c89ee93099c4159d', + 'info_dict': { + 'id': 'xyh2zz', + 'ext': 'mp4', + 'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]', + 'uploader': 'HotWaves1012', + 'age_limit': 18, }, + 'skip': 'video gone', + }, { # geo-restricted, player v5 - { - 'url': 'http://www.dailymotion.com/video/xhza0o', - 'only_matching': True, - }, + 'url': 'http://www.dailymotion.com/video/xhza0o', + 'only_matching': True, + }, { # with subtitles - { - 'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news', - 'only_matching': True, - }, - { - 'url': 'http://www.dailymotion.com/swf/video/x3n92nf', - 'only_matching': True, - } - ] + 'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news', + 'only_matching': True, + }, { + 'url': 'http://www.dailymotion.com/swf/video/x3n92nf', + 'only_matching': True, + }, { + 'url': 'http://www.dailymotion.com/swf/x3ss1m_funny-magic-trick-barry-and-stuart_fun', + 'only_matching': True, + }] @staticmethod def _extract_urls(webpage): diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py index bdfe638..5c9c0ec 100644 --- a/youtube_dl/extractor/democracynow.py +++ b/youtube_dl/extractor/democracynow.py @@ -21,7 +21,8 @@ class DemocracynowIE(InfoExtractor): 'info_dict': { 'id': '2015-0703-001', 'ext': 'mp4', - 'title': 'Daily Show', + 'title': 'Daily Show for July 03, 2015', + 'description': 'md5:80eb927244d6749900de6072c7cc2c86', }, }, { 'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree', diff --git a/youtube_dl/extractor/discoveryvr.py b/youtube_dl/extractor/discoveryvr.py new file mode 100644 index 0000000..cb63c26 --- /dev/null +++ b/youtube_dl/extractor/discoveryvr.py @@ -0,0 +1,59 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import parse_duration + + +class DiscoveryVRIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?discoveryvr\.com/watch/(?P<id>[^/?#]+)' + _TEST = { + 'url': 'http://www.discoveryvr.com/watch/discovery-vr-an-introduction', + 'md5': '32b1929798c464a54356378b7912eca4', + 'info_dict': { + 'id': 'discovery-vr-an-introduction', + 'ext': 'mp4', + 'title': 'Discovery VR - An Introduction', + 'description': 'md5:80d418a10efb8899d9403e61d8790f06', + } + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + bootstrap_data = self._search_regex( + r'root\.DVR\.bootstrapData\s+=\s+"({.+?})";', + webpage, 'bootstrap data') + bootstrap_data = self._parse_json( + bootstrap_data.encode('utf-8').decode('unicode_escape'), + display_id) + videos = self._parse_json(bootstrap_data['videos'], display_id)['allVideos'] + video_data = next(video for video in videos if video.get('slug') == display_id) + + series = video_data.get('showTitle') + title = episode = video_data.get('title') or series + if series and series != title: + title = '%s - %s' % (series, title) + + formats = [] + for f, format_id in (('cdnUriM3U8', 'mobi'), ('webVideoUrlSd', 'sd'), ('webVideoUrlHd', 'hd')): + f_url = video_data.get(f) + if not f_url: + continue + formats.append({ + 'format_id': format_id, + 'url': f_url, + }) + + return { + 'id': display_id, + 'display_id': display_id, + 'title': title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('thumbnail'), + 'duration': parse_duration(video_data.get('runTime')), + 'formats': formats, + 'episode': episode, + 'series': series, + } diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py index 1f75352..148605c 100644 --- a/youtube_dl/extractor/dotsub.py +++ b/youtube_dl/extractor/dotsub.py @@ -35,7 +35,7 @@ class DotsubIE(InfoExtractor): 'thumbnail': 're:^https?://dotsub.com/media/747bcf58-bd59-45b7-8c8c-ac312d084ee6/p', 'duration': 290, 'timestamp': 1476767794.2809999, - 'upload_date': '20160525', + 'upload_date': '20161018', 'uploader': 'parthivi001', 'uploader_id': 'user52596202', 'view_count': int, diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py index 82d8a04..9757f44 100644 --- a/youtube_dl/extractor/douyutv.py +++ b/youtube_dl/extractor/douyutv.py @@ -3,11 +3,14 @@ from __future__ import unicode_literals import time import hashlib +import re from .common import InfoExtractor from ..utils import ( ExtractorError, unescapeHTML, + unified_strdate, + urljoin, ) @@ -20,7 +23,7 @@ class DouyuTVIE(InfoExtractor): 'id': '17732', 'display_id': 'iseven', 'ext': 'flv', - 'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': r're:.*m7show@163\.com.*', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': '7师傅', @@ -51,7 +54,7 @@ class DouyuTVIE(InfoExtractor): 'id': '17732', 'display_id': '17732', 'ext': 'flv', - 'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', 'description': r're:.*m7show@163\.com.*', 'thumbnail': r're:^https?://.*\.jpg$', 'uploader': '7师傅', @@ -117,3 +120,82 @@ class DouyuTVIE(InfoExtractor): 'uploader': uploader, 'is_live': True, } + + +class DouyuShowIE(InfoExtractor): + _VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)' + + _TESTS = [{ + 'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw', + 'md5': '0c2cfd068ee2afe657801269b2d86214', + 'info_dict': { + 'id': 'rjNBdvnVXNzvE2yw', + 'ext': 'mp4', + 'title': '陈一发儿:砒霜 我有个室友系列!04-01 22点场', + 'duration': 7150.08, + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': '陈一发儿', + 'uploader_id': 'XrZwYelr5wbK', + 'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK', + 'upload_date': '20170402', + }, + }, { + 'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw', + 'only_matching': True, + }] + + def _real_extract(self, url): + url = url.replace('vmobile.', 'v.') + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + room_info = self._parse_json(self._search_regex( + r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info'), video_id) + + video_info = None + + for trial in range(5): + # Sometimes Douyu rejects our request. Let's try it more times + try: + video_info = self._download_json( + 'https://vmobile.douyu.com/video/getInfo', video_id, + query={'vid': video_id}, + headers={ + 'Referer': url, + 'x-requested-with': 'XMLHttpRequest', + }) + break + except ExtractorError: + self._sleep(1, video_id) + + if not video_info: + raise ExtractorError('Can\'t fetch video info') + + formats = self._extract_m3u8_formats( + video_info['data']['video_url'], video_id, + entry_protocol='m3u8_native', ext='mp4') + + upload_date = unified_strdate(self._html_search_regex( + r'<em>上传时间:</em><span>([^<]+)</span>', webpage, + 'upload date', fatal=False)) + + uploader = uploader_id = uploader_url = None + mobj = re.search( + r'(?m)<a[^>]+href="/author/([0-9a-zA-Z]+)".+?<strong[^>]+title="([^"]+)"', + webpage) + if mobj: + uploader_id, uploader = mobj.groups() + uploader_url = urljoin(url, '/author/' + uploader_id) + + return { + 'id': video_id, + 'title': room_info['name'], + 'formats': formats, + 'duration': room_info.get('duration'), + 'thumbnail': room_info.get('pic'), + 'upload_date': upload_date, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'uploader_url': uploader_url, + } diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py index e491701..c84624f 100644 --- a/youtube_dl/extractor/drtv.py +++ b/youtube_dl/extractor/drtv.py @@ -20,7 +20,7 @@ class DRTVIE(InfoExtractor): IE_NAME = 'drtv' _TESTS = [{ 'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10', - 'md5': '25e659cccc9a2ed956110a299fdf5983', + 'md5': '7ae17b4e18eb5d29212f424a7511c184', 'info_dict': { 'id': 'klassen-darlig-taber-10', 'ext': 'mp4', @@ -30,21 +30,37 @@ class DRTVIE(InfoExtractor): 'upload_date': '20160823', 'duration': 606.84, }, - 'params': { - 'skip_download': True, - }, }, { + # embed 'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang', - 'md5': '2c37175c718155930f939ef59952474a', 'info_dict': { 'id': 'christiania-pusher-street-ryddes-drdkrjpo', 'ext': 'mp4', 'title': 'LIVE Christianias rydning af Pusher Street er i gang', - 'description': '- Det er det fedeste, der er sket i 20 år, fortæller christianit til DR Nyheder.', + 'description': 'md5:2a71898b15057e9b97334f61d04e6eb5', 'timestamp': 1472800279, 'upload_date': '20160902', 'duration': 131.4, }, + 'params': { + 'skip_download': True, + }, + }, { + # with SignLanguage formats + 'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder', + 'info_dict': { + 'id': 'historien-om-danmark-stenalder', + 'ext': 'mp4', + 'title': 'Historien om Danmark: Stenalder (1)', + 'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a', + 'timestamp': 1490401996, + 'upload_date': '20170325', + 'duration': 3502.04, + 'formats': 'mincount:20', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -88,7 +104,7 @@ class DRTVIE(InfoExtractor): elif kind in ('VideoResource', 'AudioResource'): duration = float_or_none(asset.get('DurationInMilliseconds'), 1000) restricted_to_denmark = asset.get('RestrictedToDenmark') - spoken_subtitles = asset.get('Target') == 'SpokenSubtitles' + asset_target = asset.get('Target') for link in asset.get('Links', []): uri = link.get('Uri') if not uri: @@ -96,9 +112,9 @@ class DRTVIE(InfoExtractor): target = link.get('Target') format_id = target or '' preference = None - if spoken_subtitles: + if asset_target in ('SpokenSubtitles', 'SignLanguage'): preference = -1 - format_id += '-spoken-subtitles' + format_id += '-%s' % asset_target if target == 'HDS': f4m_formats = self._extract_f4m_formats( uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43', diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6a7028a..ed603eb 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -19,6 +19,7 @@ from .acast import ( ACastChannelIE, ) from .addanime import AddAnimeIE +from .adn import ADNIE from .adobetv import ( AdobeTVIE, AdobeTVShowIE, @@ -40,6 +41,7 @@ from .alphaporno import AlphaPornoIE from .amcnetworks import AMCNetworksIE from .animeondemand import AnimeOnDemandIE from .anitube import AnitubeIE +from .anvato import AnvatoIE from .anysex import AnySexIE from .aol import AolIE from .allocine import AllocineIE @@ -86,7 +88,6 @@ from .azmedien import ( AZMedienPlaylistIE, AZMedienShowPlaylistIE, ) -from .azubu import AzubuIE, AzubuLiveIE from .baidu import BaiduVideoIE from .bambuser import BambuserIE, BambuserChannelIE from .bandcamp import BandcampIE, BandcampAlbumIE @@ -164,7 +165,10 @@ from .ccc import CCCIE from .ccma import CCMAIE from .cctv import CCTVIE from .cda import CDAIE -from .ceskatelevize import CeskaTelevizeIE +from .ceskatelevize import ( + CeskaTelevizeIE, + CeskaTelevizePoradyIE, +) from .channel9 import Channel9IE from .charlierose import CharlieRoseIE from .chaturbate import ChaturbateIE @@ -247,7 +251,10 @@ from .democracynow import DemocracynowIE from .dfb import DFBIE from .dhm import DHMIE from .dotsub import DotsubIE -from .douyutv import DouyuTVIE +from .douyutv import ( + DouyuShowIE, + DouyuTVIE, +) from .dplay import ( DPlayIE, DPlayItIE, @@ -272,6 +279,7 @@ from .discoverygo import ( DiscoveryGoPlaylistIE, ) from .discoverynetworks import DiscoveryNetworksDeIE +from .discoveryvr import DiscoveryVRIE from .disney import DisneyIE from .dispeak import DigitallySpeakingIE from .dropbox import DropboxIE @@ -345,9 +353,9 @@ from .foxsports import FoxSportsIE from .franceculture import FranceCultureIE from .franceinter import FranceInterIE from .francetv import ( - PluzzIE, - FranceTvInfoIE, FranceTVIE, + FranceTVEmbedIE, + FranceTVInfoIE, GenerationQuoiIE, CultureboxIE, ) @@ -379,6 +387,7 @@ from .globo import ( GloboArticleIE, ) from .go import GoIE +from .go90 import Go90IE from .godtube import GodTubeIE from .godtv import GodTVIE from .golem import GolemIE @@ -536,6 +545,8 @@ from .mangomolo import ( ) from .matchtv import MatchTVIE from .mdr import MDRIE +from .mediaset import MediasetIE +from .medici import MediciIE from .meipai import MeipaiIE from .melonvod import MelonVODIE from .meta import METAIE @@ -656,6 +667,8 @@ from .nintendo import NintendoIE from .njpwworld import NJPWWorldIE from .nobelprize import NobelPrizeIE from .noco import NocoIE +from .nonktube import NonkTubeIE +from .noovo import NoovoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE from .nova import NovaIE @@ -724,10 +737,14 @@ from .openload import OpenloadIE from .ora import OraTVIE from .orf import ( ORFTVthekIE, - ORFOE1IE, ORFFM4IE, + ORFOE1IE, ORFIPTVIE, ) +from .packtpub import ( + PacktPubIE, + PacktPubCourseIE, +) from .pandatv import PandaTVIE from .pandoratv import PandoraTVIE from .parliamentliveuk import ParliamentLiveUKIE @@ -797,7 +814,7 @@ from .radiojavan import RadioJavanIE from .radiobremen import RadioBremenIE from .radiofrance import RadioFranceIE from .rai import ( - RaiTVIE, + RaiPlayIE, RaiIE, ) from .rbmaradio import RBMARadioIE @@ -828,7 +845,11 @@ from .rozhlas import RozhlasIE from .rtbf import RTBFIE from .rte import RteIE, RteRadioIE from .rtlnl import RtlNlIE -from .rtl2 import RTL2IE +from .rtl2 import ( + RTL2IE, + RTL2YouIE, + RTL2YouSeriesIE, +) from .rtp import RTPIE from .rts import RTSIE from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE @@ -924,6 +945,7 @@ from .srmediathek import SRMediathekIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE from .streamable import StreamableIE +from .streamango import StreamangoIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE from .streetvoice import StreetVoiceIE @@ -970,6 +992,7 @@ from .theplatform import ( from .thescene import TheSceneIE from .thesixtyone import TheSixtyOneIE from .thestar import TheStarIE +from .thesun import TheSunIE from .theweatherchannel import TheWeatherChannelIE from .thisamericanlife import ThisAmericanLifeIE from .thisav import ThisAVIE @@ -1016,8 +1039,10 @@ from .tv2 import ( TV2IE, TV2ArticleIE, ) +from .tv2hu import TV2HuIE from .tv3 import TV3IE from .tv4 import TV4IE +from .tv5mondeplus import TV5MondePlusIE from .tva import TVAIE from .tvanouvelles import ( TVANouvellesIE, @@ -1078,6 +1103,10 @@ from .uplynk import ( UplynkIE, UplynkPreplayIE, ) +from .upskill import ( + UpskillIE, + UpskillCourseIE, +) from .urort import UrortIE from .urplay import URPlayIE from .usanetwork import USANetworkIE @@ -1105,6 +1134,7 @@ from .vgtv import ( from .vh1 import VH1IE from .vice import ( ViceIE, + ViceArticleIE, ViceShowIE, ) from .viceland import VicelandIE @@ -1177,6 +1207,11 @@ from .voxmedia import VoxMediaIE from .vporn import VpornIE from .vrt import VRTIE from .vrak import VrakIE +from .vrv import ( + VRVIE, + VRVSeriesIE, +) +from .vshare import VShareIE from .medialaan import MedialaanIE from .vube import VubeIE from .vuclip import VuClipIE @@ -1210,7 +1245,10 @@ from .wrzuta import ( WrzutaIE, WrzutaPlaylistIE, ) -from .wsj import WSJIE +from .wsj import ( + WSJIE, + WSJArticleIE, +) from .xbef import XBefIE from .xboxclips import XboxClipsIE from .xfileshare import XFileShareIE @@ -1272,5 +1310,6 @@ from .youtube import ( YoutubeWatchLaterIE, ) from .zapiks import ZapiksIE +from .zaq1 import Zaq1IE from .zdf import ZDFIE, ZDFChannelIE from .zingmp3 import ZingMp3IE diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py index a3bb983..9855427 100644 --- a/youtube_dl/extractor/foxsports.py +++ b/youtube_dl/extractor/foxsports.py @@ -11,10 +11,10 @@ class FoxSportsIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*(?P<id>[^/]+)' _TEST = { - 'url': 'http://www.foxsports.com/video?vid=432609859715', + 'url': 'http://www.foxsports.com/tennessee/video/432609859715', 'md5': 'b49050e955bebe32c301972e4012ac17', 'info_dict': { - 'id': 'i0qKWsk3qJaM', + 'id': 'bwduI3X_TgUB', 'ext': 'mp4', 'title': 'Courtney Lee on going up 2-0 in series vs. Blazers', 'description': 'Courtney Lee talks about Memphis being focused.', @@ -31,8 +31,9 @@ class FoxSportsIE(InfoExtractor): webpage = self._download_webpage(url, video_id) config = self._parse_json( - self._search_regex( - r"data-player-config='([^']+)'", webpage, 'data player config'), + self._html_search_regex( + r"""class="[^"]*(?:fs-player|platformPlayer-wrapper)[^"]*".+?data-player-config='([^']+)'""", + webpage, 'data player config'), video_id) return self.url_result(smuggle_url(update_url_query( diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index 48d43ae..546d5ca 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -21,11 +21,13 @@ from .dailymotion import ( class FranceTVBaseInfoExtractor(InfoExtractor): - def _extract_video(self, video_id, catalogue): + def _extract_video(self, video_id, catalogue=None): info = self._download_json( - 'http://webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=%s&catalogue=%s' - % (video_id, catalogue), - video_id, 'Downloading video JSON') + 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/', + video_id, 'Downloading video JSON', query={ + 'idDiffusion': video_id, + 'catalogue': catalogue or '', + }) if info.get('status') == 'NOK': raise ExtractorError( @@ -109,27 +111,97 @@ class FranceTVBaseInfoExtractor(InfoExtractor): } -class PluzzIE(FranceTVBaseInfoExtractor): - IE_NAME = 'pluzz.francetv.fr' - _VALID_URL = r'https?://(?:m\.)?pluzz\.francetv\.fr/videos/(?P<id>.+?)\.html' +class FranceTVIE(FranceTVBaseInfoExtractor): + _VALID_URL = r'https?://(?:(?:www\.)?france\.tv|mobile\.france\.tv)/(?:[^/]+/)+(?P<id>[^/]+)\.html' - # Can't use tests, videos expire in 7 days + _TESTS = [{ + 'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html', + 'info_dict': { + 'id': '157550144', + 'ext': 'mp4', + 'title': '13h15, le dimanche... - Les mystères de Jésus', + 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42', + 'timestamp': 1494156300, + 'upload_date': '20170507', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + }, { + # france3 + 'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html', + 'only_matching': True, + }, { + # france4 + 'url': 'https://www.france.tv/france-4/hero-corp/saison-1/134151-apres-le-calme.html', + 'only_matching': True, + }, { + # france5 + 'url': 'https://www.france.tv/france-5/c-a-dire/saison-10/137013-c-a-dire.html', + 'only_matching': True, + }, { + # franceo + 'url': 'https://www.france.tv/france-o/archipels/132249-mon-ancetre-l-esclave.html', + 'only_matching': True, + }, { + # france2 live + 'url': 'https://www.france.tv/france-2/direct.html', + 'only_matching': True, + }, { + 'url': 'https://www.france.tv/documentaires/histoire/136517-argentine-les-500-bebes-voles-de-la-dictature.html', + 'only_matching': True, + }, { + 'url': 'https://www.france.tv/jeux-et-divertissements/divertissements/133965-le-web-contre-attaque.html', + 'only_matching': True, + }, { + 'url': 'https://mobile.france.tv/france-5/c-dans-l-air/137347-emission-du-vendredi-12-mai-2017.html', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - video_id = self._html_search_meta( - 'id_video', webpage, 'video id', default=None) + catalogue = None + video_id = self._search_regex( + r'data-main-video=(["\'])(?P<id>(?:(?!\1).)+)\1', + webpage, 'video id', default=None, group='id') + if not video_id: - video_id = self._search_regex( - r'data-diffusion=["\'](\d+)', webpage, 'video id') + video_id, catalogue = self._html_search_regex( + r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', + webpage, 'video ID').split('@') + return self._extract_video(video_id, catalogue) - return self._extract_video(video_id, 'Pluzz') +class FranceTVEmbedIE(FranceTVBaseInfoExtractor): + _VALID_URL = r'https?://embed\.francetv\.fr/*\?.*?\bue=(?P<id>[^&]+)' -class FranceTvInfoIE(FranceTVBaseInfoExtractor): + _TEST = { + 'url': 'http://embed.francetv.fr/?ue=7fd581a2ccf59d2fc5719c5c13cf6961', + 'info_dict': { + 'id': 'NI_983319', + 'ext': 'mp4', + 'title': 'Le Pen Reims', + 'upload_date': '20170505', + 'timestamp': 1493981780, + 'duration': 16, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + video = self._download_json( + 'http://api-embed.webservices.francetelevisions.fr/key/%s' % video_id, + video_id) + + return self._extract_video(video['video_id'], video.get('catalog')) + + +class FranceTVInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetvinfo.fr' _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<title>[^/?#&.]+)' @@ -233,124 +305,6 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): return self._extract_video(video_id, catalogue) -class FranceTVIE(FranceTVBaseInfoExtractor): - IE_NAME = 'francetv' - IE_DESC = 'France 2, 3, 4, 5 and Ô' - _VALID_URL = r'''(?x) - https?:// - (?: - (?:www\.)?france[2345o]\.fr/ - (?: - emissions/[^/]+/(?:videos|diffusions)| - emission/[^/]+| - videos| - jt - ) - /| - embed\.francetv\.fr/\?ue= - ) - (?P<id>[^/?]+) - ''' - - _TESTS = [ - # france2 - { - 'url': 'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104', - 'md5': 'c03fc87cb85429ffd55df32b9fc05523', - 'info_dict': { - 'id': '109169362', - 'ext': 'flv', - 'title': '13h15, le dimanche...', - 'description': 'md5:9a0932bb465f22d377a449be9d1a0ff7', - 'upload_date': '20140914', - 'timestamp': 1410693600, - }, - }, - # france3 - { - 'url': 'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575', - 'md5': '679bb8f8921f8623bd658fa2f8364da0', - 'info_dict': { - 'id': '000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au', - 'ext': 'mp4', - 'title': 'Le scandale du prix des médicaments', - 'description': 'md5:1384089fbee2f04fc6c9de025ee2e9ce', - 'upload_date': '20131113', - 'timestamp': 1384380000, - }, - }, - # france4 - { - 'url': 'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', - 'md5': 'a182bf8d2c43d88d46ec48fbdd260c1c', - 'info_dict': { - 'id': 'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4', - 'ext': 'mp4', - 'title': 'Hero Corp Making of - Extrait 1', - 'description': 'md5:c87d54871b1790679aec1197e73d650a', - 'upload_date': '20131106', - 'timestamp': 1383766500, - }, - }, - # france5 - { - 'url': 'http://www.france5.fr/emissions/c-a-dire/videos/quels_sont_les_enjeux_de_cette_rentree_politique__31-08-2015_908948?onglet=tous&page=1', - 'md5': 'f6c577df3806e26471b3d21631241fd0', - 'info_dict': { - 'id': '123327454', - 'ext': 'flv', - 'title': 'C à dire ?! - Quels sont les enjeux de cette rentrée politique ?', - 'description': 'md5:4a0d5cb5dce89d353522a84462bae5a4', - 'upload_date': '20150831', - 'timestamp': 1441035120, - }, - }, - # franceo - { - 'url': 'http://www.franceo.fr/jt/info-soir/18-07-2015', - 'md5': '47d5816d3b24351cdce512ad7ab31da8', - 'info_dict': { - 'id': '125377621', - 'ext': 'flv', - 'title': 'Infô soir', - 'description': 'md5:01b8c6915a3d93d8bbbd692651714309', - 'upload_date': '20150718', - 'timestamp': 1437241200, - 'duration': 414, - }, - }, - { - # francetv embed - 'url': 'http://embed.francetv.fr/?ue=8d7d3da1e3047c42ade5a5d7dfd3fc87', - 'info_dict': { - 'id': 'EV_30231', - 'ext': 'flv', - 'title': 'Alcaline, le concert avec Calogero', - 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff', - 'upload_date': '20150226', - 'timestamp': 1424989860, - 'duration': 5400, - }, - }, - { - 'url': 'http://www.france4.fr/emission/highlander/diffusion-du-17-07-2015-04h05', - 'only_matching': True, - }, - { - 'url': 'http://www.franceo.fr/videos/125377617', - 'only_matching': True, - } - ] - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_id, catalogue = self._html_search_regex( - r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"', - webpage, 'video ID').split('@') - return self._extract_video(video_id, catalogue) - - class GenerationQuoiIE(InfoExtractor): IE_NAME = 'france2.fr:generation-quoi' _VALID_URL = r'https?://generation-quoi\.france2\.fr/portrait/(?P<id>[^/?#]+)' diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py index eba00cd..8c37509 100644 --- a/youtube_dl/extractor/funimation.py +++ b/youtube_dl/extractor/funimation.py @@ -2,231 +2,148 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_HTTPError, - compat_urllib_parse_unquote_plus, -) +from ..compat import compat_HTTPError from ..utils import ( - clean_html, determine_ext, int_or_none, - sanitized_Request, + js_to_json, ExtractorError, urlencode_postdata ) class FunimationIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?funimation\.com/shows/[^/]+/videos/(?:official|promotional)/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/shows/[^/]+/(?P<id>[^/?#&]+)' _NETRC_MACHINE = 'funimation' + _TOKEN = None _TESTS = [{ - 'url': 'http://www.funimation.com/shows/air/videos/official/breeze', + 'url': 'https://www.funimation.com/shows/hacksign/role-play/', 'info_dict': { - 'id': '658', - 'display_id': 'breeze', - 'ext': 'mp4', - 'title': 'Air - 1 - Breeze', - 'description': 'md5:1769f43cd5fc130ace8fd87232207892', - 'thumbnail': r're:https?://.*\.jpg', - }, - 'skip': 'Access without user interaction is forbidden by CloudFlare, and video removed', - }, { - 'url': 'http://www.funimation.com/shows/hacksign/videos/official/role-play', - 'info_dict': { - 'id': '31128', + 'id': '91144', 'display_id': 'role-play', 'ext': 'mp4', - 'title': '.hack//SIGN - 1 - Role Play', + 'title': '.hack//SIGN - Role Play', 'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd', 'thumbnail': r're:https?://.*\.jpg', }, - 'skip': 'Access without user interaction is forbidden by CloudFlare', + 'params': { + # m3u8 download + 'skip_download': True, + }, }, { - 'url': 'http://www.funimation.com/shows/attack-on-titan-junior-high/videos/promotional/broadcast-dub-preview', + 'url': 'https://www.funimation.com/shows/attack-on-titan-junior-high/broadcast-dub-preview/', 'info_dict': { - 'id': '9635', + 'id': '210051', 'display_id': 'broadcast-dub-preview', 'ext': 'mp4', 'title': 'Attack on Titan: Junior High - Broadcast Dub Preview', - 'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803', 'thumbnail': r're:https?://.*\.(?:jpg|png)', }, - 'skip': 'Access without user interaction is forbidden by CloudFlare', + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/', + 'only_matching': True, }] - _LOGIN_URL = 'http://www.funimation.com/login' - - def _download_webpage(self, *args, **kwargs): - try: - return super(FunimationIE, self)._download_webpage(*args, **kwargs) - except ExtractorError as ee: - if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403: - response = ee.cause.read() - if b'>Please complete the security check to access<' in response: - raise ExtractorError( - 'Access to funimation.com is blocked by CloudFlare. ' - 'Please browse to http://www.funimation.com/, solve ' - 'the reCAPTCHA, export browser cookies to a text file,' - ' and then try again with --cookies YOUR_COOKIE_FILE.', - expected=True) - raise - - def _extract_cloudflare_session_ua(self, url): - ci_session_cookie = self._get_cookies(url).get('ci_session') - if ci_session_cookie: - ci_session = compat_urllib_parse_unquote_plus(ci_session_cookie.value) - # ci_session is a string serialized by PHP function serialize() - # This case is simple enough to use regular expressions only - return self._search_regex( - r'"user_agent";s:\d+:"([^"]+)"', ci_session, 'user agent', - default=None) - def _login(self): (username, password) = self._get_login_info() if username is None: return - data = urlencode_postdata({ - 'email_field': username, - 'password_field': password, - }) - user_agent = self._extract_cloudflare_session_ua(self._LOGIN_URL) - if not user_agent: - user_agent = 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0' - login_request = sanitized_Request(self._LOGIN_URL, data, headers={ - 'User-Agent': user_agent, - 'Content-Type': 'application/x-www-form-urlencoded' - }) - login_page = self._download_webpage( - login_request, None, 'Logging in as %s' % username) - if any(p in login_page for p in ('funimation.com/logout', '>Log Out<')): - return - error = self._html_search_regex( - r'(?s)<div[^>]+id=["\']errorMessages["\'][^>]*>(.+?)</div>', - login_page, 'error messages', default=None) - if error: - raise ExtractorError('Unable to login: %s' % error, expected=True) - raise ExtractorError('Unable to log in') + try: + data = self._download_json( + 'https://prod-api-funimationnow.dadcdigital.com/api/auth/login/', + None, 'Logging in as %s' % username, data=urlencode_postdata({ + 'username': username, + 'password': password, + })) + self._TOKEN = data['token'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401: + error = self._parse_json(e.cause.read().decode(), None)['error'] + raise ExtractorError(error, expected=True) + raise def _real_initialize(self): self._login() def _real_extract(self, url): display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) - errors = [] - formats = [] - - ERRORS_MAP = { - 'ERROR_MATURE_CONTENT_LOGGED_IN': 'matureContentLoggedIn', - 'ERROR_MATURE_CONTENT_LOGGED_OUT': 'matureContentLoggedOut', - 'ERROR_SUBSCRIPTION_LOGGED_OUT': 'subscriptionLoggedOut', - 'ERROR_VIDEO_EXPIRED': 'videoExpired', - 'ERROR_TERRITORY_UNAVAILABLE': 'territoryUnavailable', - 'SVODBASIC_SUBSCRIPTION_IN_PLAYER': 'basicSubscription', - 'SVODNON_SUBSCRIPTION_IN_PLAYER': 'nonSubscription', - 'ERROR_PLAYER_NOT_RESPONDING': 'playerNotResponding', - 'ERROR_UNABLE_TO_CONNECT_TO_CDN': 'unableToConnectToCDN', - 'ERROR_STREAM_NOT_FOUND': 'streamNotFound', - } - - USER_AGENTS = ( - # PC UA is served with m3u8 that provides some bonus lower quality formats - ('pc', 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0'), - # Mobile UA allows to extract direct links and also does not fail when - # PC UA fails with hulu error (e.g. - # http://www.funimation.com/shows/hacksign/videos/official/role-play) - ('mobile', 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36'), - ) - - user_agent = self._extract_cloudflare_session_ua(url) - if user_agent: - USER_AGENTS = ((None, user_agent),) - - for kind, user_agent in USER_AGENTS: - request = sanitized_Request(url) - request.add_header('User-Agent', user_agent) - webpage = self._download_webpage( - request, display_id, - 'Downloading %s webpage' % kind if kind else 'Downloading webpage') - - playlist = self._parse_json( - self._search_regex( - r'var\s+playersData\s*=\s*(\[.+?\]);\n', - webpage, 'players data'), - display_id)[0]['playlist'] - - items = next(item['items'] for item in playlist if item.get('items')) - item = next(item for item in items if item.get('itemAK') == display_id) - - error_messages = {} - video_error_messages = self._search_regex( - r'var\s+videoErrorMessages\s*=\s*({.+?});\n', - webpage, 'error messages', default=None) - if video_error_messages: - error_messages_json = self._parse_json(video_error_messages, display_id, fatal=False) - if error_messages_json: - for _, error in error_messages_json.items(): - type_ = error.get('type') - description = error.get('description') - content = error.get('content') - if type_ == 'text' and description and content: - error_message = ERRORS_MAP.get(description) - if error_message: - error_messages[error_message] = content - - for video in item.get('videoSet', []): - auth_token = video.get('authToken') - if not auth_token: - continue - funimation_id = video.get('FUNImationID') or video.get('videoId') - preference = 1 if video.get('languageMode') == 'dub' else 0 - if not auth_token.startswith('?'): - auth_token = '?%s' % auth_token - for quality, height in (('sd', 480), ('hd', 720), ('hd1080', 1080)): - format_url = video.get('%sUrl' % quality) - if not format_url: - continue - if not format_url.startswith(('http', '//')): - errors.append(format_url) - continue - if determine_ext(format_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url + auth_token, display_id, 'mp4', entry_protocol='m3u8_native', - preference=preference, m3u8_id='%s-hls' % funimation_id, fatal=False)) - else: - tbr = int_or_none(self._search_regex( - r'-(\d+)[Kk]', format_url, 'tbr', default=None)) - formats.append({ - 'url': format_url + auth_token, - 'format_id': '%s-http-%dp' % (funimation_id, height), - 'height': height, - 'tbr': tbr, - 'preference': preference, - }) + def _search_kane(name): + return self._search_regex( + r"KANE_customdimensions\.%s\s*=\s*'([^']+)';" % name, + webpage, name, default=None) + + title_data = self._parse_json(self._search_regex( + r'TITLE_DATA\s*=\s*({[^}]+})', + webpage, 'title data', default=''), + display_id, js_to_json, fatal=False) or {} + + video_id = title_data.get('id') or self._search_regex([ + r"KANE_customdimensions.videoID\s*=\s*'(\d+)';", + r'<iframe[^>]+src="/player/(\d+)"', + ], webpage, 'video_id', default=None) + if not video_id: + player_url = self._html_search_meta([ + 'al:web:url', + 'og:video:url', + 'og:video:secure_url', + ], webpage, fatal=True) + video_id = self._search_regex(r'/player/(\d+)', player_url, 'video id') + + title = episode = title_data.get('title') or _search_kane('videoTitle') or self._og_search_title(webpage) + series = _search_kane('showName') + if series: + title = '%s - %s' % (series, title) + description = self._html_search_meta(['description', 'og:description'], webpage, fatal=True) - if not formats and errors: - raise ExtractorError( - '%s returned error: %s' - % (self.IE_NAME, clean_html(error_messages.get(errors[0], errors[0]))), - expected=True) + try: + headers = {} + if self._TOKEN: + headers['Authorization'] = 'Token %s' % self._TOKEN + sources = self._download_json( + 'https://prod-api-funimationnow.dadcdigital.com/api/source/catalog/video/%s/signed/' % video_id, + video_id, headers=headers)['items'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + error = self._parse_json(e.cause.read(), video_id)['errors'][0] + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, error.get('detail') or error.get('title')), expected=True) + raise + formats = [] + for source in sources: + source_url = source.get('src') + if not source_url: + continue + source_type = source.get('videoType') or determine_ext(source_url) + if source_type == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'format_id': source_type, + 'url': source_url, + }) self._sort_formats(formats) - title = item['title'] - artist = item.get('artist') - if artist: - title = '%s - %s' % (artist, title) - description = self._og_search_description(webpage) or item.get('description') - thumbnail = self._og_search_thumbnail(webpage) or item.get('posterUrl') - video_id = item.get('itemId') or display_id - return { 'id': video_id, 'display_id': display_id, 'title': title, 'description': description, - 'thumbnail': thumbnail, + 'thumbnail': self._og_search_thumbnail(webpage), + 'series': series, + 'season_number': int_or_none(title_data.get('seasonNum') or _search_kane('season')), + 'episode_number': int_or_none(title_data.get('episodeNum')), + 'episode': episode, + 'season_id': title_data.get('seriesId'), 'formats': formats, } diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py index 81c0ce9..4940936 100644 --- a/youtube_dl/extractor/funnyordie.py +++ b/youtube_dl/extractor/funnyordie.py @@ -58,8 +58,7 @@ class FunnyOrDieIE(InfoExtractor): m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) source_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - m3u8_formats)) + lambda f: f.get('vcodec') != 'none', m3u8_formats)) bitrates = [int(bitrate) for bitrate in re.findall(r'[,/]v(\d+)(?=[,/])', m3u8_url)] bitrates.sort() diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 682c49e..00d3111 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -78,8 +78,7 @@ class GameSpotIE(OnceIE): if m3u8_formats: self._sort_formats(m3u8_formats) m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - m3u8_formats)) + lambda f: f.get('vcodec') != 'none', m3u8_formats)) if len(qualities) == len(m3u8_formats): for q, m3u8_format in zip(qualities, m3u8_formats): f = m3u8_format.copy() diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py index 3136427..f71d909 100644 --- a/youtube_dl/extractor/gdcvault.py +++ b/youtube_dl/extractor/gdcvault.py @@ -75,6 +75,19 @@ class GDCVaultIE(InfoExtractor): 'format': 'jp', # The japanese audio } }, + { + # gdc-player.html + 'url': 'http://www.gdcvault.com/play/1435/An-American-engine-in-Tokyo', + 'info_dict': { + 'id': '1435', + 'display_id': 'An-American-engine-in-Tokyo', + 'ext': 'flv', + 'title': 'An American Engine in Tokyo:/nThe collaboration of Epic Games and Square Enix/nFor THE LAST REMINANT', + }, + 'params': { + 'skip_download': True, # Requires rtmpdump + }, + }, ] def _login(self, webpage_url, display_id): @@ -128,7 +141,7 @@ class GDCVaultIE(InfoExtractor): 'title': title, } - PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/player.*?\.html.*?".*?</iframe>' + PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/(?:gdc-)?player.*?\.html.*?".*?</iframe>' xml_root = self._html_search_regex( PLAYER_REGEX, start_page, 'xml root', default=None) diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 274f817..c108d4a 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -85,6 +85,11 @@ from .ustream import UstreamIE from .openload import OpenloadIE from .videopress import VideoPressIE from .rutube import RutubeIE +from .limelight import LimelightBaseIE +from .anvato import AnvatoIE +from .washingtonpost import WashingtonPostIE +from .wistia import WistiaIE +from .mediaset import MediasetIE class GenericIE(InfoExtractor): @@ -430,6 +435,22 @@ class GenericIE(InfoExtractor): }, }, { + # Brightcove video in <iframe> + 'url': 'http://www.un.org/chinese/News/story.asp?NewsID=27724', + 'md5': '36d74ef5e37c8b4a2ce92880d208b968', + 'info_dict': { + 'id': '5360463607001', + 'ext': 'mp4', + 'title': '叙利亚失明儿童在废墟上演唱《心跳》 呼吁获得正常童年生活', + 'description': '联合国儿童基金会中东和北非区域大使、作曲家扎德·迪拉尼(Zade Dirani)在3月15日叙利亚冲突爆发7周年纪念日之际发布了为叙利亚谱写的歌曲《心跳》(HEARTBEAT),为受到六年冲突影响的叙利亚儿童发出强烈呐喊,呼吁世界做出共同努力,使叙利亚儿童重新获得享有正常童年生活的权利。', + 'uploader': 'United Nations', + 'uploader_id': '1362235914001', + 'timestamp': 1489593889, + 'upload_date': '20170315', + }, + 'add_ie': ['BrightcoveLegacy'], + }, + { # Brightcove with alternative playerID key 'url': 'http://www.nature.com/nmeth/journal/v9/n7/fig_tab/nmeth.2062_SV1.html', 'info_dict': { @@ -465,6 +486,59 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, # m3u8 download }, + 'skip': 'video rotates...weekly?', + }, + { + # Brightcove:new type [2]. + 'url': 'http://www.delawaresportszone.com/video-st-thomas-more-earns-first-trip-to-basketball-semis', + 'md5': '2b35148fcf48da41c9fb4591650784f3', + 'info_dict': { + 'id': '5348741021001', + 'ext': 'mp4', + 'upload_date': '20170306', + 'uploader_id': '4191638492001', + 'timestamp': 1488769918, + 'title': 'VIDEO: St. Thomas More earns first trip to basketball semis', + + }, + }, + { + # Alternative brightcove <video> attributes + 'url': 'http://www.programme-tv.net/videos/extraits/81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche/', + 'info_dict': { + 'id': '81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche', + 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche, Extraits : toutes les vidéos avec Télé-Loisirs", + }, + 'playlist': [{ + 'md5': '732d22ba3d33f2f3fc253c39f8f36523', + 'info_dict': { + 'id': '5311302538001', + 'ext': 'mp4', + 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche", + 'description': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche (France 2, 5 février 2017)", + 'timestamp': 1486321708, + 'upload_date': '20170205', + 'uploader_id': '800000640001', + }, + 'only_matching': True, + }], + }, + { + # Brightcove with UUID in videoPlayer + 'url': 'http://www8.hp.com/cn/zh/home.html', + 'info_dict': { + 'id': '5255815316001', + 'ext': 'mp4', + 'title': 'Sprocket Video - China', + 'description': 'Sprocket Video - China', + 'uploader': 'HP-Video Gallery', + 'timestamp': 1482263210, + 'upload_date': '20161220', + 'uploader_id': '1107601872001', + }, + 'params': { + 'skip_download': True, # m3u8 download + }, }, # ooyala video { @@ -730,6 +804,21 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, + # YouTube <object> embed + { + 'url': 'http://www.improbable.com/2017/04/03/untrained-modern-youths-and-ancient-masters-in-selfie-portraits/', + 'md5': '516718101ec834f74318df76259fb3cc', + 'info_dict': { + 'id': 'msN87y-iEx0', + 'ext': 'webm', + 'title': 'Feynman: Mirrors FUN TO IMAGINE 6', + 'upload_date': '20080526', + 'description': 'md5:0ffc78ea3f01b2e2c247d5f8d1d3c18d', + 'uploader': 'Christopher Sykes', + 'uploader_id': 'ChristopherJSykes', + }, + 'add_ie': ['Youtube'], + }, # Camtasia studio { 'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/', @@ -1080,6 +1169,21 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, + { + # Kaltura iframe embed + 'url': 'http://www.gsd.harvard.edu/event/i-m-pei-a-centennial-celebration/', + 'md5': 'ae5ace8eb09dc1a35d03b579a9c2cc44', + 'info_dict': { + 'id': '0_f2cfbpwy', + 'ext': 'mp4', + 'title': 'I. M. Pei: A Centennial Celebration', + 'description': 'md5:1db8f40c69edc46ca180ba30c567f37c', + 'upload_date': '20170403', + 'uploader_id': 'batchUser', + 'timestamp': 1491232186, + }, + 'add_ie': ['Kaltura'], + }, # Eagle.Platform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', @@ -1327,6 +1431,22 @@ class GenericIE(InfoExtractor): 'skip_download': True, }, }, + { + # Brightcove embed with whitespace around attribute names + 'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill', + 'info_dict': { + 'id': '3167554373001', + 'ext': 'mp4', + 'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill", + 'description': 'md5:57bacb0e0f29349de4972bfda3191713', + 'uploader_id': '1079349493', + 'upload_date': '20140207', + 'timestamp': 1391810548, + }, + 'params': { + 'skip_download': True, + }, + }, # Another form of arte.tv embed { 'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html', @@ -1568,6 +1688,51 @@ class GenericIE(InfoExtractor): }, 'add_ie': [SenateISVPIE.ie_key()], }, + { + # Limelight embeds (1 channel embed + 4 media embeds) + 'url': 'http://www.sedona.com/FacilitatorTraining2017', + 'info_dict': { + 'id': 'FacilitatorTraining2017', + 'title': 'Facilitator Training 2017', + }, + 'playlist_mincount': 5, + }, + { + 'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/', + 'info_dict': { + 'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest', + 'title': 'Standoff with Walnut Creek murder suspect ends', + 'description': 'md5:3ccc48a60fc9441eeccfc9c469ebf788', + }, + 'playlist_mincount': 4, + }, + { + # WashingtonPost embed + 'url': 'http://www.vanityfair.com/hollywood/2017/04/donald-trump-tv-pitches', + 'info_dict': { + 'id': '8caf6e88-d0ec-11e5-90d3-34c2c42653ac', + 'ext': 'mp4', + 'title': "No one has seen the drama series based on Trump's life \u2014 until now", + 'description': 'Donald Trump wanted a weekly TV drama based on his life. It never aired. But The Washington Post recently obtained a scene from the pilot script — and enlisted actors.', + 'timestamp': 1455216756, + 'uploader': 'The Washington Post', + 'upload_date': '20160211', + }, + 'add_ie': [WashingtonPostIE.ie_key()], + }, + { + # Mediaset embed + 'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml', + 'info_dict': { + 'id': '720642', + 'ext': 'mp4', + 'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [MediasetIE.ie_key()], + }, # { # # TODO: find another test # # http://schema.org/VideoObject @@ -1610,7 +1775,7 @@ class GenericIE(InfoExtractor): continue entries.append({ - '_type': 'url', + '_type': 'url_transparent', 'url': next_url, 'title': it.find('title').text, }) @@ -1870,7 +2035,6 @@ class GenericIE(InfoExtractor): # Look for Brightcove Legacy Studio embeds bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage) if bc_urls: - self.to_screen('Brightcove video detected.') entries = [{ '_type': 'url', 'url': smuggle_url(bc_url, {'Referer': url}), @@ -1885,7 +2049,7 @@ class GenericIE(InfoExtractor): } # Look for Brightcove New Studio embeds - bc_urls = BrightcoveNewIE._extract_urls(webpage) + bc_urls = BrightcoveNewIE._extract_urls(self, webpage) if bc_urls: return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew') @@ -1923,6 +2087,7 @@ class GenericIE(InfoExtractor): data-video-url=| <embed[^>]+?src=| embedSWF\(?:\s*| + <object[^>]+data=| new\s+SWFObject\( ) (["\']) @@ -1961,57 +2126,20 @@ class GenericIE(InfoExtractor): playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p) # Look for embedded Wistia player - match = re.search( - r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) - if match: - embed_url = self._proto_relative_url( - unescapeHTML(match.group('url'))) + wistia_url = WistiaIE._extract_url(webpage) + if wistia_url: return { '_type': 'url_transparent', - 'url': embed_url, - 'ie_key': 'Wistia', + 'url': self._proto_relative_url(wistia_url), + 'ie_key': WistiaIE.ie_key(), 'uploader': video_uploader, } - match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) - if match: - return { - '_type': 'url_transparent', - 'url': 'wistia:%s' % match.group('id'), - 'ie_key': 'Wistia', - 'uploader': video_uploader, - } - - match = re.search( - r'''(?sx) - <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? - <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2 - ''', webpage) - if match: - return self.url_result(self._proto_relative_url( - 'wistia:%s' % match.group('id')), 'Wistia') - # Look for SVT player svt_url = SVTIE._extract_url(webpage) if svt_url: return self.url_result(svt_url, 'SVT') - # Look for embedded condenast player - matches = re.findall( - r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")', - webpage) - if matches: - return { - '_type': 'playlist', - 'entries': [{ - '_type': 'url', - 'ie_key': 'CondeNast', - 'url': ma, - } for ma in matches], - 'title': video_title, - 'id': video_id, - } - # Look for Bandcamp pages with custom domain mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage) if mobj is not None: @@ -2400,28 +2528,16 @@ class GenericIE(InfoExtractor): return self.url_result(piksel_url, PikselIE.ie_key()) # Look for Limelight embeds - mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage) - if mobj: - lm = { - 'Media': 'media', - 'Channel': 'channel', - 'ChannelList': 'channel_list', - } - return self.url_result(smuggle_url('limelight:%s:%s' % ( - lm[mobj.group(1)], mobj.group(2)), {'source_url': url}), - 'Limelight%s' % mobj.group(1), mobj.group(2)) + limelight_urls = LimelightBaseIE._extract_urls(webpage, url) + if limelight_urls: + return self.playlist_result( + limelight_urls, video_id, video_title, video_description) - mobj = re.search( - r'''(?sx) - <object[^>]+class=(["\'])LimelightEmbeddedPlayerFlash\1[^>]*>.*? - <param[^>]+ - name=(["\'])flashVars\2[^>]+ - value=(["\'])(?:(?!\3).)*mediaId=(?P<id>[a-z0-9]{32}) - ''', webpage) - if mobj: - return self.url_result(smuggle_url( - 'limelight:media:%s' % mobj.group('id'), - {'source_url': url}), 'LimelightMedia', mobj.group('id')) + # Look for Anvato embeds + anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id) + if anvato_urls: + return self.playlist_result( + anvato_urls, video_id, video_title, video_description) # Look for AdobeTVVideo embeds mobj = re.search( @@ -2540,6 +2656,18 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( rutube_urls, ie=RutubeIE.ie_key()) + # Look for WashingtonPost embeds + wapo_urls = WashingtonPostIE._extract_urls(webpage) + if wapo_urls: + return self.playlist_from_matches( + wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key()) + + # Look for Mediaset embeds + mediaset_urls = MediasetIE._extract_urls(webpage) + if mediaset_urls: + return self.playlist_from_matches( + mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key()) + # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( webpage, video_id, default={}, expected_type='VideoObject') @@ -2568,7 +2696,7 @@ class GenericIE(InfoExtractor): webpage, video_id, transform_source=js_to_json) if jwplayer_data: info = self._parse_jwplayer_data( - jwplayer_data, video_id, require_title=False) + jwplayer_data, video_id, require_title=False, base_url=url) if not info.get('title'): info['title'] = video_title return info @@ -2580,7 +2708,7 @@ class GenericIE(InfoExtractor): return True vpath = compat_urlparse.urlparse(vurl).path vext = determine_ext(vpath) - return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js') + return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml') def filter_video(urls): return list(filter(check_video, urls)) diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py index 4c9be47..9c7b1bd 100644 --- a/youtube_dl/extractor/go.py +++ b/youtube_dl/extractor/go.py @@ -36,22 +36,26 @@ class GoIE(AdobePassIE): 'requestor_id': 'DisneyXD', } } - _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys()) + _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:(?:[^/]+/)*(?P<id>vdka\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys()) _TESTS = [{ - 'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx', + 'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643', 'info_dict': { - 'id': '0_g86w5onx', + 'id': 'VDKA3807643', 'ext': 'mp4', - 'title': 'Sneak Peek: Language Arts', - 'description': 'md5:7dcdab3b2d17e5217c953256af964e9c', + 'title': 'The Traitor in the White House', + 'description': 'md5:05b009d2d145a1e85d25111bd37222e8', }, 'params': { # m3u8 download 'skip_download': True, }, }, { - 'url': 'http://abc.go.com/shows/after-paradise/video/most-recent/vdka3335601', - 'only_matching': True, + 'url': 'http://watchdisneyxd.go.com/doraemon', + 'info_dict': { + 'title': 'Doraemon', + 'id': 'SH55574025', + }, + 'playlist_mincount': 51, }, { 'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding', 'only_matching': True, @@ -60,19 +64,36 @@ class GoIE(AdobePassIE): 'only_matching': True, }] + def _extract_videos(self, brand, video_id='-1', show_id='-1'): + display_id = video_id if video_id != '-1' else show_id + return self._download_json( + 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/%s/-1/%s/-1/-1.json' % (brand, show_id, video_id), + display_id)['video'] + def _real_extract(self, url): sub_domain, video_id, display_id = re.match(self._VALID_URL, url).groups() + site_info = self._SITE_INFO[sub_domain] + brand = site_info['brand'] if not video_id: webpage = self._download_webpage(url, display_id) video_id = self._search_regex( # There may be inner quotes, e.g. data-video-id="'VDKA3609139'" # from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood - r'data-video-id=["\']*VDKA(\w+)', webpage, 'video id') - site_info = self._SITE_INFO[sub_domain] - brand = site_info['brand'] - video_data = self._download_json( - 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/-1/-1/%s/-1/-1.json' % (brand, video_id), - video_id)['video'][0] + r'data-video-id=["\']*(VDKA\w+)', webpage, 'video id', default=None) + if not video_id: + # show extraction works for Disney, DisneyJunior and DisneyXD + # ABC and Freeform has different layout + show_id = self._search_regex(r'data-show-id=["\']*(SH\d+)', webpage, 'show id') + videos = self._extract_videos(brand, show_id=show_id) + show_title = self._search_regex(r'data-show-title="([^"]+)"', webpage, 'show title', fatal=False) + entries = [] + for video in videos: + entries.append(self.url_result( + video['url'], 'Go', video.get('id'), video.get('title'))) + entries.reverse() + return self.playlist_result(entries, show_id, show_title) + video_data = self._extract_videos(brand, video_id)[0] + video_id = video_data['id'] title = video_data['title'] formats = [] @@ -105,7 +126,7 @@ class GoIE(AdobePassIE): self._initialize_geo_bypass(['US']) entitlement = self._download_json( 'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json', - video_id, data=urlencode_postdata(data), headers=self.geo_verification_headers()) + video_id, data=urlencode_postdata(data)) errors = entitlement.get('errors', {}).get('errors', []) if errors: for error in errors: diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py new file mode 100644 index 0000000..9b2e1c1 --- /dev/null +++ b/youtube_dl/extractor/go90.py @@ -0,0 +1,126 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + parse_iso8601, +) + + +class Go90IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?go90\.com/videos/(?P<id>[0-9a-zA-Z]+)' + _TEST = { + 'url': 'https://www.go90.com/videos/84BUqjLpf9D', + 'md5': 'efa7670dbbbf21a7b07b360652b24a32', + 'info_dict': { + 'id': '84BUqjLpf9D', + 'ext': 'mp4', + 'title': 'Daily VICE - Inside The Utah Coalition Against Pornography Convention', + 'description': 'VICE\'s Karley Sciortino meets with activists who discuss the state\'s strong anti-porn stance. Then, VICE Sports explains NFL contracts.', + 'timestamp': 1491868800, + 'upload_date': '20170411', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + video_data = self._download_json( + 'https://www.go90.com/api/view/items/' + video_id, + video_id, headers={ + 'Content-Type': 'application/json; charset=utf-8', + }, data=b'{"client":"web","device_type":"pc"}') + main_video_asset = video_data['main_video_asset'] + + episode_number = int_or_none(video_data.get('episode_number')) + series = None + season = None + season_id = None + season_number = None + for metadata in video_data.get('__children', {}).get('Item', {}).values(): + if metadata.get('type') == 'show': + series = metadata.get('title') + elif metadata.get('type') == 'season': + season = metadata.get('title') + season_id = metadata.get('id') + season_number = int_or_none(metadata.get('season_number')) + + title = episode = video_data.get('title') or series + if series and series != title: + title = '%s - %s' % (series, title) + + thumbnails = [] + formats = [] + subtitles = {} + for asset in video_data.get('assets'): + if asset.get('id') == main_video_asset: + for source in asset.get('sources', []): + source_location = source.get('location') + if not source_location: + continue + source_type = source.get('type') + if source_type == 'hls': + m3u8_formats = self._extract_m3u8_formats( + source_location, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + for f in m3u8_formats: + mobj = re.search(r'/hls-(\d+)-(\d+)K', f['url']) + if mobj: + height, tbr = mobj.groups() + height = int_or_none(height) + f.update({ + 'height': f.get('height') or height, + 'width': f.get('width') or int_or_none(height / 9.0 * 16.0 if height else None), + 'tbr': f.get('tbr') or int_or_none(tbr), + }) + formats.extend(m3u8_formats) + elif source_type == 'dash': + formats.extend(self._extract_mpd_formats( + source_location, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'format_id': source.get('name'), + 'url': source_location, + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + 'tbr': int_or_none(source.get('bitrate')), + }) + + for caption in asset.get('caption_metadata', []): + caption_url = caption.get('source_url') + if not caption_url: + continue + subtitles.setdefault(caption.get('language', 'en'), []).append({ + 'url': caption_url, + 'ext': determine_ext(caption_url, 'vtt'), + }) + elif asset.get('type') == 'image': + asset_location = asset.get('location') + if not asset_location: + continue + thumbnails.append({ + 'url': asset_location, + 'width': int_or_none(asset.get('width')), + 'height': int_or_none(asset.get('height')), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'thumbnails': thumbnails, + 'description': video_data.get('short_description'), + 'like_count': int_or_none(video_data.get('like_count')), + 'timestamp': parse_iso8601(video_data.get('released_at')), + 'series': series, + 'episode': episode, + 'season': season, + 'season_id': season_id, + 'season_number': season_number, + 'episode_number': episode_number, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/hbo.py b/youtube_dl/extractor/hbo.py index 931f71a..859ad54 100644 --- a/youtube_dl/extractor/hbo.py +++ b/youtube_dl/extractor/hbo.py @@ -92,12 +92,14 @@ class HBOBaseIE(InfoExtractor): video_url.replace('.tar', '/base_index_w8.m3u8'), video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) elif source.tag == 'hls': - # #EXT-X-BYTERANGE is not supported by native hls downloader - # and ffmpeg (#10955) - # formats.extend(self._extract_m3u8_formats( - # video_url.replace('.tar', '/base_index.m3u8'), - # video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - continue + m3u8_formats = self._extract_m3u8_formats( + video_url.replace('.tar', '/base_index.m3u8'), + video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) + for f in m3u8_formats: + if f.get('vcodec') == 'none' and not f.get('tbr'): + f['tbr'] = int_or_none(self._search_regex( + r'-(\d+)k/', f['url'], 'tbr', default=None)) + formats.extend(m3u8_formats) elif source.tag == 'dash': formats.extend(self._extract_mpd_formats( video_url.replace('.tar', '/manifest.mpd'), @@ -110,7 +112,7 @@ class HBOBaseIE(InfoExtractor): 'width': format_info.get('width'), 'height': format_info.get('height'), }) - self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id')) + self._sort_formats(formats) thumbnails = [] card_sizes = xpath_element(video_data, 'titleCardSizes') diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py index f95c00c..3ff672a 100644 --- a/youtube_dl/extractor/imdb.py +++ b/youtube_dl/extractor/imdb.py @@ -13,7 +13,7 @@ from ..utils import ( class ImdbIE(InfoExtractor): IE_NAME = 'imdb' IE_DESC = 'Internet Movie Database trailers' - _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video/[^/]+/|title/tt\d+.*?#lb-|videoplayer/)vi(?P<id>\d+)' + _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title).+?[/-]vi(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.imdb.com/video/imdb/vi2524815897', @@ -35,6 +35,9 @@ class ImdbIE(InfoExtractor): }, { 'url': 'http://www.imdb.com/videoplayer/vi1562949145', 'only_matching': True, + }, { + 'url': 'http://www.imdb.com/title/tt4218696/videoplayer/vi2608641561', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py index 9fb71e8..fe425e7 100644 --- a/youtube_dl/extractor/infoq.py +++ b/youtube_dl/extractor/infoq.py @@ -87,8 +87,8 @@ class InfoQIE(BokeCCBaseIE): def _extract_http_audio(self, webpage, video_id): fields = self._hidden_inputs(webpage) - http_audio_url = fields['filename'] - if http_audio_url is None: + http_audio_url = fields.get('filename') + if not http_audio_url: return [] cookies_header = {'Cookie': self._extract_cookies(webpage)} diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index c1921cb..4667335 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -112,7 +112,8 @@ class InstagramIE(InfoExtractor): webpage = self._download_webpage(url, video_id) (video_url, description, thumbnail, timestamp, uploader, - uploader_id, like_count, comment_count, height, width) = [None] * 10 + uploader_id, like_count, comment_count, comments, height, + width) = [None] * 11 shared_data = self._parse_json( self._search_regex( @@ -121,7 +122,10 @@ class InstagramIE(InfoExtractor): video_id, fatal=False) if shared_data: media = try_get( - shared_data, lambda x: x['entry_data']['PostPage'][0]['media'], dict) + shared_data, + (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'], + lambda x: x['entry_data']['PostPage'][0]['media']), + dict) if media: video_url = media.get('video_url') height = int_or_none(media.get('dimensions', {}).get('height')) diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index 2af6a6d..fdfa7de 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -189,7 +189,11 @@ class IqiyiIE(InfoExtractor): 'only_matching': True, }, { 'url': 'http://yule.iqiyi.com/pcb.html', - 'only_matching': True, + 'info_dict': { + 'id': '4a0af228fddb55ec96398a364248ed7f', + 'ext': 'mp4', + 'title': '第2017-04-21期 女艺人频遭极端粉丝骚扰', + }, }, { # VIP-only video. The first 2 parts (6 minutes) are available without login # MD5 sums omitted as values are different on Travis CI and my machine @@ -337,15 +341,18 @@ class IqiyiIE(InfoExtractor): url, 'temp_id', note='download video page') # There's no simple way to determine whether an URL is a playlist or not - # So detect it - playlist_result = self._extract_playlist(webpage) - if playlist_result: - return playlist_result - + # Sometimes there are playlist links in individual videos, so treat it + # as a single video first tvid = self._search_regex( - r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid') + r'data-(?:player|shareplattrigger)-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid', default=None) + if tvid is None: + playlist_result = self._extract_playlist(webpage) + if playlist_result: + return playlist_result + raise ExtractorError('Can\'t find any video') + video_id = self._search_regex( - r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id') + r'data-(?:player|shareplattrigger)-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id') formats = [] for _ in range(5): @@ -377,7 +384,8 @@ class IqiyiIE(InfoExtractor): self._sort_formats(formats) title = (get_element_by_id('widget-videotitle', webpage) or - clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage))) + clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)) or + self._html_search_regex(r'<span[^>]+data-videochanged-title="word"[^>]*>([^<]+)</span>', webpage, 'title')) return { 'id': video_id, diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py index 021c6b2..f315680 100644 --- a/youtube_dl/extractor/itv.py +++ b/youtube_dl/extractor/itv.py @@ -116,13 +116,25 @@ class ITVIE(InfoExtractor): if not play_path: continue tbr = int_or_none(media_file.get('bitrate'), 1000) - formats.append({ + f = { 'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''), - 'url': rtmp_url, 'play_path': play_path, + # Providing this swfVfy allows to avoid truncated downloads + 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf', + 'page_url': url, 'tbr': tbr, 'ext': 'flv', - }) + } + app = self._search_regex( + 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None) + if app: + f.update({ + 'url': rtmp_url.split('?', 1)[0], + 'app': app, + }) + else: + f['url'] = rtmp_url + formats.append(f) ios_playlist_url = params.get('data-video-playlist') hmac = params.get('data-video-hmac') @@ -172,7 +184,9 @@ class ITVIE(InfoExtractor): href = ios_base_url + href ext = determine_ext(href) if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats(href, video_id, 'mp4', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_m3u8_formats( + href, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) else: formats.append({ 'url': href, @@ -189,7 +203,8 @@ class ITVIE(InfoExtractor): 'ext': 'ttml' if ext == 'xml' else ext, }) - return { + info = self._search_json_ld(webpage, video_id, default={}) + info.update({ 'id': video_id, 'title': title, 'formats': formats, @@ -198,4 +213,5 @@ class ITVIE(InfoExtractor): 'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')), 'series': xpath_text(playlist, 'ProgrammeTitle'), 'duartion': parse_duration(xpath_text(playlist, 'Duration')), - } + }) + return info diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index 54374ea..41c1f3d 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -91,6 +91,7 @@ class KalturaIE(InfoExtractor): }], }, }, + 'skip': 'Gone. Maybe https://www.safaribooksonline.com/library/tutorials/introduction-to-python-anon/3469/', 'params': { 'skip_download': True, }, @@ -107,27 +108,37 @@ class KalturaIE(InfoExtractor): @staticmethod def _extract_url(webpage): + # Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site mobj = ( re.search( r"""(?xs) kWidget\.(?:thumb)?[Ee]mbed\( \{.*? - (?P<q1>['\"])wid(?P=q1)\s*:\s* - (?P<q2>['\"])_?(?P<partner_id>(?:(?!(?P=q2)).)+)(?P=q2),.*? - (?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s* - (?P<q4>['\"])(?P<id>(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\}) + (?P<q1>['"])wid(?P=q1)\s*:\s* + (?P<q2>['"])_?(?P<partner_id>(?:(?!(?P=q2)).)+)(?P=q2),.*? + (?P<q3>['"])entry_?[Ii]d(?P=q3)\s*:\s* + (?P<q4>['"])(?P<id>(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\}) """, webpage) or re.search( r'''(?xs) - (?P<q1>["\']) + (?P<q1>["']) (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+)(?:(?!(?P=q1)).)* (?P=q1).*? (?: entry_?[Ii]d| - (?P<q2>["\'])entry_?[Ii]d(?P=q2) + (?P<q2>["'])entry_?[Ii]d(?P=q2) )\s*:\s* - (?P<q3>["\'])(?P<id>(?:(?!(?P=q3)).)+)(?P=q3) - ''', webpage)) + (?P<q3>["'])(?P<id>(?:(?!(?P=q3)).)+)(?P=q3) + ''', webpage) or + re.search( + r'''(?xs) + <iframe[^>]+src=(?P<q1>["']) + (?:https?:)?//(?:www\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+) + (?:(?!(?P=q1)).)* + [?&]entry_id=(?P<id>(?:(?!(?P=q1))[^&])+) + (?P=q1) + ''', webpage) + ) if mobj: embed_info = mobj.groupdict() url = 'kaltura:%(partner_id)s:%(id)s' % embed_info diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py index 3190b18..1f91ba0 100644 --- a/youtube_dl/extractor/laola1tv.py +++ b/youtube_dl/extractor/laola1tv.py @@ -1,6 +1,8 @@ # coding: utf-8 from __future__ import unicode_literals +import json + from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -8,15 +10,15 @@ from ..utils import ( urlencode_postdata, xpath_element, xpath_text, - urljoin, update_url_query, + js_to_json, ) class Laola1TvEmbedIE(InfoExtractor): IE_NAME = 'laola1tv:embed' _VALID_URL = r'https?://(?:www\.)?laola1\.tv/titanplayer\.php\?.*?\bvideoid=(?P<id>\d+)' - _TEST = { + _TESTS = [{ # flashvars.premium = "false"; 'url': 'https://www.laola1.tv/titanplayer.php?videoid=708065&type=V&lang=en&portal=int&customer=1024', 'info_dict': { @@ -26,7 +28,30 @@ class Laola1TvEmbedIE(InfoExtractor): 'uploader': 'ITTF - International Table Tennis Federation', 'upload_date': '20161211', }, - } + }] + + def _extract_token_url(self, stream_access_url, video_id, data): + return self._download_json( + stream_access_url, video_id, headers={ + 'Content-Type': 'application/json', + }, data=json.dumps(data).encode())['data']['stream-access'][0] + + def _extract_formats(self, token_url, video_id): + token_doc = self._download_xml( + token_url, video_id, 'Downloading token', + headers=self.geo_verification_headers()) + + token_attrib = xpath_element(token_doc, './/token').attrib + + if token_attrib['status'] != '0': + raise ExtractorError( + 'Token error: %s' % token_attrib['comment'], expected=True) + + formats = self._extract_akamai_formats( + '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']), + video_id) + self._sort_formats(formats) + return formats def _real_extract(self, url): video_id = self._match_id(url) @@ -68,29 +93,16 @@ class Laola1TvEmbedIE(InfoExtractor): else: data_abo = urlencode_postdata( dict((i, v) for i, v in enumerate(_v('req_liga_abos').split(',')))) - token_url = self._download_json( - 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access', - video_id, query={ + stream_access_url = update_url_query( + 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access', { 'videoId': _v('id'), 'target': self._search_regex(r'vs_target = (\d+);', webpage, 'vs target'), 'label': _v('label'), 'area': _v('area'), - }, data=data_abo)['data']['stream-access'][0] - - token_doc = self._download_xml( - token_url, video_id, 'Downloading token', - headers=self.geo_verification_headers()) - - token_attrib = xpath_element(token_doc, './/token').attrib - - if token_attrib['status'] != '0': - raise ExtractorError( - 'Token error: %s' % token_attrib['comment'], expected=True) + }) + token_url = self._extract_token_url(stream_access_url, video_id, data_abo) - formats = self._extract_akamai_formats( - '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']), - video_id) - self._sort_formats(formats) + formats = self._extract_formats(token_url, video_id) categories_str = _v('meta_sports') categories = categories_str.split(',') if categories_str else [] @@ -107,7 +119,7 @@ class Laola1TvEmbedIE(InfoExtractor): } -class Laola1TvIE(InfoExtractor): +class Laola1TvIE(Laola1TvEmbedIE): IE_NAME = 'laola1tv' _VALID_URL = r'https?://(?:www\.)?laola1\.tv/[a-z]+-[a-z]+/[^/]+/(?P<id>[^/?#&]+)' _TESTS = [{ @@ -164,13 +176,42 @@ class Laola1TvIE(InfoExtractor): if 'Dieser Livestream ist bereits beendet.' in webpage: raise ExtractorError('This live stream has already finished.', expected=True) - iframe_url = urljoin(url, self._search_regex( - r'<iframe[^>]*?id="videoplayer"[^>]*?src="([^"]+)"', - webpage, 'iframe url')) + conf = self._parse_json(self._search_regex( + r'(?s)conf\s*=\s*({.+?});', webpage, 'conf'), + display_id, js_to_json) + + video_id = conf['videoid'] + + config = self._download_json(conf['configUrl'], video_id, query={ + 'videoid': video_id, + 'partnerid': conf['partnerid'], + 'language': conf.get('language', ''), + 'portal': conf.get('portalid', ''), + }) + error = config.get('error') + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + video_data = config['video'] + title = video_data['title'] + is_live = video_data.get('isLivestream') and video_data.get('isLive') + meta = video_data.get('metaInformation') + sports = meta.get('sports') + categories = sports.split(',') if sports else [] + + token_url = self._extract_token_url( + video_data['streamAccess'], video_id, + video_data['abo']['required']) + + formats = self._extract_formats(token_url, video_id) return { - '_type': 'url', + 'id': video_id, 'display_id': display_id, - 'url': iframe_url, - 'ie_key': 'Laola1TvEmbed', + 'title': self._live_title(title) if is_live else title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('image'), + 'categories': categories, + 'formats': formats, + 'is_live': is_live, } diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index 9eda956..0a07c13 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -23,7 +23,6 @@ from ..utils import ( str_or_none, url_basename, urshift, - update_url_query, ) @@ -51,7 +50,7 @@ class LeIE(InfoExtractor): 'id': '1415246', 'ext': 'mp4', 'title': '美人天下01', - 'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda', + 'description': 'md5:28942e650e82ed4fcc8e4de919ee854d', }, 'params': { 'hls_prefer_native': True, @@ -69,7 +68,6 @@ class LeIE(InfoExtractor): 'params': { 'hls_prefer_native': True, }, - 'skip': 'Only available in China', }, { 'url': 'http://sports.le.com/video/25737697.html', 'only_matching': True, @@ -81,7 +79,7 @@ class LeIE(InfoExtractor): 'only_matching': True, }] - # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf + # ror() and calc_time_key() are reversed from a embedded swf file in LetvPlayer.swf def ror(self, param1, param2): _loc3_ = 0 while _loc3_ < param2: @@ -90,15 +88,8 @@ class LeIE(InfoExtractor): return param1 def calc_time_key(self, param1): - _loc2_ = 773625421 - _loc3_ = self.ror(param1, _loc2_ % 13) - _loc3_ = _loc3_ ^ _loc2_ - _loc3_ = self.ror(_loc3_, _loc2_ % 17) - return _loc3_ - - # reversed from http://jstatic.letvcdn.com/sdk/player.js - def get_mms_key(self, time): - return self.ror(time, 8) ^ 185025305 + _loc2_ = 185025305 + return self.ror(param1, _loc2_ % 17) ^ _loc2_ # see M3U8Encryption class in KLetvPlayer.swf @staticmethod @@ -122,7 +113,7 @@ class LeIE(InfoExtractor): def _check_errors(self, play_json): # Check for errors - playstatus = play_json['playstatus'] + playstatus = play_json['msgs']['playstatus'] if playstatus['status'] == 0: flag = playstatus['flag'] if flag == 1: @@ -134,58 +125,31 @@ class LeIE(InfoExtractor): media_id = self._match_id(url) page = self._download_webpage(url, media_id) - play_json_h5 = self._download_json( - 'http://api.le.com/mms/out/video/playJsonH5', - media_id, 'Downloading html5 playJson data', query={ - 'id': media_id, - 'platid': 3, - 'splatid': 304, - 'format': 1, - 'tkey': self.get_mms_key(int(time.time())), - 'domain': 'www.le.com', - 'tss': 'no', - }, - headers=self.geo_verification_headers()) - self._check_errors(play_json_h5) - play_json_flash = self._download_json( - 'http://api.le.com/mms/out/video/playJson', + 'http://player-pc.le.com/mms/out/video/playJson', media_id, 'Downloading flash playJson data', query={ 'id': media_id, 'platid': 1, 'splatid': 101, 'format': 1, + 'source': 1000, 'tkey': self.calc_time_key(int(time.time())), 'domain': 'www.le.com', + 'region': 'cn', }, headers=self.geo_verification_headers()) self._check_errors(play_json_flash) - def get_h5_urls(media_url, format_id): - location = self._download_json( - media_url, media_id, - 'Download JSON metadata for format %s' % format_id, query={ - 'format': 1, - 'expect': 3, - 'tss': 'no', - })['location'] - - return { - 'http': update_url_query(location, {'tss': 'no'}), - 'hls': update_url_query(location, {'tss': 'ios'}), - } - def get_flash_urls(media_url, format_id): - media_url += '&' + compat_urllib_parse_urlencode({ - 'm3v': 1, - 'format': 1, - 'expect': 3, - 'rateid': format_id, - }) - nodes_data = self._download_json( media_url, media_id, - 'Download JSON metadata for format %s' % format_id) + 'Download JSON metadata for format %s' % format_id, + query={ + 'm3v': 1, + 'format': 1, + 'expect': 3, + 'tss': 'ios', + }) req = self._request_webpage( nodes_data['nodelist'][0]['location'], media_id, @@ -199,29 +163,28 @@ class LeIE(InfoExtractor): extracted_formats = [] formats = [] - for play_json, get_urls in ((play_json_h5, get_h5_urls), (play_json_flash, get_flash_urls)): - playurl = play_json['playurl'] - play_domain = playurl['domain'][0] - - for format_id, format_data in playurl.get('dispatch', []).items(): - if format_id in extracted_formats: - continue - extracted_formats.append(format_id) - - media_url = play_domain + format_data[0] - for protocol, format_url in get_urls(media_url, format_id).items(): - f = { - 'url': format_url, - 'ext': determine_ext(format_data[1]), - 'format_id': '%s-%s' % (protocol, format_id), - 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', - 'quality': int_or_none(format_id), - } - - if format_id[-1:] == 'p': - f['height'] = int_or_none(format_id[:-1]) - - formats.append(f) + playurl = play_json_flash['msgs']['playurl'] + play_domain = playurl['domain'][0] + + for format_id, format_data in playurl.get('dispatch', []).items(): + if format_id in extracted_formats: + continue + extracted_formats.append(format_id) + + media_url = play_domain + format_data[0] + for protocol, format_url in get_flash_urls(media_url, format_id).items(): + f = { + 'url': format_url, + 'ext': determine_ext(format_data[1]), + 'format_id': '%s-%s' % (protocol, format_id), + 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', + 'quality': int_or_none(format_id), + } + + if format_id[-1:] == 'p': + f['height'] = int_or_none(format_id[:-1]) + + formats.append(f) self._sort_formats(formats, ('height', 'quality', 'format_id')) publish_time = parse_iso8601(self._html_search_regex( diff --git a/youtube_dl/extractor/lego.py b/youtube_dl/extractor/lego.py index d3bca64..b312e77 100644 --- a/youtube_dl/extractor/lego.py +++ b/youtube_dl/extractor/lego.py @@ -86,7 +86,7 @@ class LEGOIE(InfoExtractor): formats = self._extract_akamai_formats( '%si/s/public/%s_,%s,.mp4.csmil/master.m3u8' % (streaming_base, path, streaming_path), video_id) m3u8_formats = list(filter( - lambda f: f.get('protocol') == 'm3u8_native' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + lambda f: f.get('protocol') == 'm3u8_native' and f.get('vcodec') != 'none', formats)) if len(m3u8_formats) == len(self._BITRATES): self._sort_formats(m3u8_formats) diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 422be25..0a5a395 100644 --- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -9,6 +9,7 @@ from ..utils import ( determine_ext, float_or_none, int_or_none, + smuggle_url, unsmuggle_url, ExtractorError, ) @@ -18,6 +19,42 @@ class LimelightBaseIE(InfoExtractor): _PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s' _API_URL = 'http://api.video.limelight.com/rest/organizations/%s/%s/%s/%s.json' + @classmethod + def _extract_urls(cls, webpage, source_url): + lm = { + 'Media': 'media', + 'Channel': 'channel', + 'ChannelList': 'channel_list', + } + entries = [] + for kind, video_id in re.findall( + r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', + webpage): + entries.append(cls.url_result( + smuggle_url( + 'limelight:%s:%s' % (lm[kind], video_id), + {'source_url': source_url}), + 'Limelight%s' % kind, video_id)) + for mobj in re.finditer( + # As per [1] class attribute should be exactly equal to + # LimelightEmbeddedPlayerFlash but numerous examples seen + # that don't exactly match it (e.g. [2]). + # 1. http://support.3playmedia.com/hc/en-us/articles/227732408-Limelight-Embedding-the-Captions-Plugin-with-the-Limelight-Player-on-Your-Webpage + # 2. http://www.sedona.com/FacilitatorTraining2017 + r'''(?sx) + <object[^>]+class=(["\'])(?:(?!\1).)*\bLimelightEmbeddedPlayerFlash\b(?:(?!\1).)*\1[^>]*>.*? + <param[^>]+ + name=(["\'])flashVars\2[^>]+ + value=(["\'])(?:(?!\3).)*(?P<kind>media|channel(?:List)?)Id=(?P<id>[a-z0-9]{32}) + ''', webpage): + kind, video_id = mobj.group('kind'), mobj.group('id') + entries.append(cls.url_result( + smuggle_url( + 'limelight:%s:%s' % (kind, video_id), + {'source_url': source_url}), + 'Limelight%s' % kind.capitalize(), video_id)) + return entries + def _call_playlist_service(self, item_id, method, fatal=True, referer=None): headers = {} if referer: @@ -62,13 +99,21 @@ class LimelightBaseIE(InfoExtractor): fmt = { 'url': stream_url, 'abr': float_or_none(stream.get('audioBitRate')), - 'vbr': float_or_none(stream.get('videoBitRate')), 'fps': float_or_none(stream.get('videoFrameRate')), - 'width': int_or_none(stream.get('videoWidthInPixels')), - 'height': int_or_none(stream.get('videoHeightInPixels')), 'ext': ext, } - rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp4:.+)$', stream_url) + width = int_or_none(stream.get('videoWidthInPixels')) + height = int_or_none(stream.get('videoHeightInPixels')) + vbr = float_or_none(stream.get('videoBitRate')) + if width or height or vbr: + fmt.update({ + 'width': width, + 'height': height, + 'vbr': vbr, + }) + else: + fmt['vcodec'] = 'none' + rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', stream_url) if rtmp: format_id = 'rtmp' if stream.get('videoBitRate'): diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py index c7de653..c545196 100644 --- a/youtube_dl/extractor/liveleak.py +++ b/youtube_dl/extractor/liveleak.py @@ -1,6 +1,5 @@ from __future__ import unicode_literals -import json import re from .common import InfoExtractor @@ -11,10 +10,10 @@ class LiveLeakIE(InfoExtractor): _VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)' _TESTS = [{ 'url': 'http://www.liveleak.com/view?i=757_1364311680', - 'md5': '50f79e05ba149149c1b4ea961223d5b3', + 'md5': '0813c2430bea7a46bf13acf3406992f4', 'info_dict': { 'id': '757_1364311680', - 'ext': 'flv', + 'ext': 'mp4', 'description': 'extremely bad day for this guy..!', 'uploader': 'ljfriel2', 'title': 'Most unlucky car accident', @@ -22,7 +21,7 @@ class LiveLeakIE(InfoExtractor): } }, { 'url': 'http://www.liveleak.com/view?i=f93_1390833151', - 'md5': 'b13a29626183c9d33944e6a04f41aafc', + 'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf', 'info_dict': { 'id': 'f93_1390833151', 'ext': 'mp4', @@ -32,6 +31,7 @@ class LiveLeakIE(InfoExtractor): 'thumbnail': r're:^https?://.*\.jpg$' } }, { + # Prochan embed 'url': 'http://www.liveleak.com/view?i=4f7_1392687779', 'md5': '42c6d97d54f1db107958760788c5f48f', 'info_dict': { @@ -41,11 +41,13 @@ class LiveLeakIE(InfoExtractor): 'uploader': 'CapObveus', 'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck', 'age_limit': 18, - } + }, + 'skip': 'Video is dead', }, { # Covers https://github.com/rg3/youtube-dl/pull/5983 + # Multiple resolutions 'url': 'http://www.liveleak.com/view?i=801_1409392012', - 'md5': '0b3bec2d888c20728ca2ad3642f0ef15', + 'md5': 'c3a449dbaca5c0d1825caecd52a57d7b', 'info_dict': { 'id': '801_1409392012', 'ext': 'mp4', @@ -93,57 +95,38 @@ class LiveLeakIE(InfoExtractor): webpage, 'age limit', default=None)) video_thumbnail = self._og_search_thumbnail(webpage) - sources_raw = self._search_regex( - r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None) - if sources_raw is None: - alt_source = self._search_regex( - r'(file: ".*?"),', webpage, 'video URL', default=None) - if alt_source: - sources_raw = '[{ %s}]' % alt_source - else: - # Maybe an embed? - embed_url = self._search_regex( - r'<iframe[^>]+src="(https?://(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"', - webpage, 'embed URL') - return { - '_type': 'url_transparent', - 'url': embed_url, - 'id': video_id, - 'title': video_title, - 'description': video_description, - 'uploader': video_uploader, - 'age_limit': age_limit, - } + entries = self._parse_html5_media_entries(url, webpage, video_id) + if not entries: + # Maybe an embed? + embed_url = self._search_regex( + r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"', + webpage, 'embed URL') + return { + '_type': 'url_transparent', + 'url': embed_url, + 'id': video_id, + 'title': video_title, + 'description': video_description, + 'uploader': video_uploader, + 'age_limit': age_limit, + } - sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw) - sources = json.loads(sources_json) + info_dict = entries[0] - formats = [{ - 'format_id': '%s' % i, - 'format_note': s.get('label'), - 'url': s['file'], - } for i, s in enumerate(sources)] + for a_format in info_dict['formats']: + if not a_format.get('height'): + a_format['height'] = self._search_regex( + r'([0-9]+)p\.mp4', a_format['url'], 'height label', default=None) - for i, s in enumerate(sources): - # Removing '.h264_*.mp4' gives the raw video, which is essentially - # the same video without the LiveLeak logo at the top (see - # https://github.com/rg3/youtube-dl/pull/4768) - orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file']) - if s['file'] != orig_url: - formats.append({ - 'format_id': 'original-%s' % i, - 'format_note': s.get('label'), - 'url': orig_url, - 'preference': 1, - }) - self._sort_formats(formats) + self._sort_formats(info_dict['formats']) - return { + info_dict.update({ 'id': video_id, 'title': video_title, 'description': video_description, 'uploader': video_uploader, - 'formats': formats, 'age_limit': age_limit, 'thumbnail': video_thumbnail, - } + }) + + return info_dict diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py new file mode 100644 index 0000000..9760eaf --- /dev/null +++ b/youtube_dl/extractor/mediaset.py @@ -0,0 +1,118 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + parse_duration, + try_get, + unified_strdate, +) + + +class MediasetIE(InfoExtractor): + _VALID_URL = r'''(?x) + (?: + mediaset:| + https?:// + (?:www\.)?video\.mediaset\.it/ + (?: + (?:video|on-demand)/(?:[^/]+/)+[^/]+_| + player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid= + ) + )(?P<id>[0-9]+) + ''' + _TESTS = [{ + # full episode + 'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html', + 'md5': '9b75534d42c44ecef7bf1ffeacb7f85d', + 'info_dict': { + 'id': '661824', + 'ext': 'mp4', + 'title': 'Quarta puntata', + 'description': 'md5:7183696d6df570e3412a5ef74b27c5e2', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1414, + 'creator': 'mediaset', + 'upload_date': '20161107', + 'series': 'Hello Goodbye', + 'categories': ['reality'], + }, + 'expected_warnings': ['is not a supported codec'], + }, { + # clip + 'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html', + 'only_matching': True, + }, { + # iframe simple + 'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true', + 'only_matching': True, + }, { + # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/) + 'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true', + 'only_matching': True, + }, { + 'url': 'mediaset:661824', + 'only_matching': True, + }] + + @staticmethod + def _extract_urls(webpage): + return [ + mobj.group('url') + for mobj in re.finditer( + r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>https?://(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=\d+.*?)\1', + webpage)] + + def _real_extract(self, url): + video_id = self._match_id(url) + + video_list = self._download_json( + 'http://cdnsel01.mediaset.net/GetCdn.aspx', + video_id, 'Downloading video CDN JSON', query={ + 'streamid': video_id, + 'format': 'json', + })['videoList'] + + formats = [] + for format_url in video_list: + if '.ism' in format_url: + formats.extend(self._extract_ism_formats( + format_url, video_id, ism_id='mss', fatal=False)) + else: + formats.append({ + 'url': format_url, + 'format_id': determine_ext(format_url), + }) + self._sort_formats(formats) + + mediainfo = self._download_json( + 'http://plr.video.mediaset.it/html/metainfo.sjson', + video_id, 'Downloading video info JSON', query={ + 'id': video_id, + })['video'] + + title = mediainfo['title'] + + creator = try_get( + mediainfo, lambda x: x['brand-info']['publisher'], compat_str) + category = try_get( + mediainfo, lambda x: x['brand-info']['category'], compat_str) + categories = [category] if category else None + + return { + 'id': video_id, + 'title': title, + 'description': mediainfo.get('short-description'), + 'thumbnail': mediainfo.get('thumbnail'), + 'duration': parse_duration(mediainfo.get('duration')), + 'creator': creator, + 'upload_date': unified_strdate(mediainfo.get('production-date')), + 'webpage_url': mediainfo.get('url'), + 'series': mediainfo.get('brand-value'), + 'categories': categories, + 'formats': formats, + } diff --git a/youtube_dl/extractor/medici.py b/youtube_dl/extractor/medici.py new file mode 100644 index 0000000..cd91023 --- /dev/null +++ b/youtube_dl/extractor/medici.py @@ -0,0 +1,70 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + unified_strdate, + update_url_query, + urlencode_postdata, +) + + +class MediciIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?medici\.tv/#!/(?P<id>[^?#&]+)' + _TEST = { + 'url': 'http://www.medici.tv/#!/daniel-harding-frans-helmerson-verbier-festival-music-camp', + 'md5': '004c21bb0a57248085b6ff3fec72719d', + 'info_dict': { + 'id': '3059', + 'ext': 'flv', + 'title': 'Daniel Harding conducts the Verbier Festival Music Camp \u2013 With Frans Helmerson', + 'description': 'md5:322a1e952bafb725174fd8c1a8212f58', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20170408', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + # Sets csrftoken cookie + self._download_webpage(url, video_id) + + MEDICI_URL = 'http://www.medici.tv/' + + data = self._download_json( + MEDICI_URL, video_id, + data=urlencode_postdata({ + 'json': 'true', + 'page': '/%s' % video_id, + 'timezone_offset': -420, + }), headers={ + 'X-CSRFToken': self._get_cookies(url)['csrftoken'].value, + 'X-Requested-With': 'XMLHttpRequest', + 'Referer': MEDICI_URL, + 'Content-Type': 'application/x-www-form-urlencoded', + }) + + video = data['video']['videos']['video1'] + + title = video.get('nom') or data['title'] + + video_id = video.get('id') or video_id + formats = self._extract_f4m_formats( + update_url_query(video['url_akamai'], { + 'hdcore': '3.1.0', + 'plugin=aasp': '3.1.0.43.124', + }), video_id, f4m_id='hds') + + description = data.get('meta_description') + thumbnail = video.get('url_thumbnail') or data.get('main_image') + upload_date = unified_strdate(data['video'].get('date')) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'formats': formats, + } diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py index a24b316..0efbe66 100644 --- a/youtube_dl/extractor/mixcloud.py +++ b/youtube_dl/extractor/mixcloud.py @@ -97,7 +97,7 @@ class MixcloudIE(InfoExtractor): view_count = str_to_int(self._search_regex( [r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"', r'/listeners/?">([0-9,.]+)</a>', - r'm-tooltip=["\']([\d,.]+) plays'], + r'(?:m|data)-tooltip=["\']([\d,.]+) plays'], webpage, 'play count', default=None)) return { @@ -138,12 +138,12 @@ class MixcloudPlaylistBaseIE(InfoExtractor): def _get_user_description(self, page_content): return self._html_search_regex( - r'<div[^>]+class="description-text"[^>]*>(.+?)</div>', + r'<div[^>]+class="profile-bio"[^>]*>(.+?)</div>', page_content, 'user description', fatal=False) class MixcloudUserIE(MixcloudPlaylistBaseIE): - _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/(?P<type>uploads|favorites|listens)?/?$' + _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/(?P<type>uploads|favorites|listens)?/?$' IE_NAME = 'mixcloud:user' _TESTS = [{ @@ -151,7 +151,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'info_dict': { 'id': 'dholbach_uploads', 'title': 'Daniel Holbach (uploads)', - 'description': 'md5:327af72d1efeb404a8216c27240d1370', + 'description': 'md5:def36060ac8747b3aabca54924897e47', }, 'playlist_mincount': 11, }, { @@ -159,7 +159,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'info_dict': { 'id': 'dholbach_uploads', 'title': 'Daniel Holbach (uploads)', - 'description': 'md5:327af72d1efeb404a8216c27240d1370', + 'description': 'md5:def36060ac8747b3aabca54924897e47', }, 'playlist_mincount': 11, }, { @@ -167,7 +167,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'info_dict': { 'id': 'dholbach_favorites', 'title': 'Daniel Holbach (favorites)', - 'description': 'md5:327af72d1efeb404a8216c27240d1370', + 'description': 'md5:def36060ac8747b3aabca54924897e47', }, 'params': { 'playlist_items': '1-100', @@ -178,7 +178,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): 'info_dict': { 'id': 'dholbach_listens', 'title': 'Daniel Holbach (listens)', - 'description': 'md5:327af72d1efeb404a8216c27240d1370', + 'description': 'md5:def36060ac8747b3aabca54924897e47', }, 'params': { 'playlist_items': '1-100', @@ -216,7 +216,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE): class MixcloudPlaylistIE(MixcloudPlaylistBaseIE): - _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/playlists/(?P<playlist>[^/]+)/?$' + _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/playlists/(?P<playlist>[^/]+)/?$' IE_NAME = 'mixcloud:playlist' _TESTS = [{ @@ -229,12 +229,7 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE): 'playlist_mincount': 16, }, { 'url': 'https://www.mixcloud.com/maxvibes/playlists/jazzcat-on-ness-radio/', - 'info_dict': { - 'id': 'maxvibes_jazzcat-on-ness-radio', - 'title': 'Jazzcat on Ness Radio', - 'description': 'md5:7bbbf0d6359a0b8cda85224be0f8f263', - }, - 'playlist_mincount': 23 + 'only_matching': True, }] def _real_extract(self, url): @@ -243,15 +238,16 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE): playlist_id = mobj.group('playlist') video_id = '%s_%s' % (user_id, playlist_id) - profile = self._download_webpage( + webpage = self._download_webpage( url, user_id, note='Downloading playlist page', errnote='Unable to download playlist page') - description = self._get_user_description(profile) - playlist_title = self._html_search_regex( - r'<span[^>]+class="[^"]*list-playlist-title[^"]*"[^>]*>(.*?)</span>', - profile, 'playlist title') + title = self._html_search_regex( + r'<a[^>]+class="parent active"[^>]*><b>\d+</b><span[^>]*>([^<]+)', + webpage, 'playlist title', + default=None) or self._og_search_title(webpage, fatal=False) + description = self._get_user_description(webpage) entries = OnDemandPagedList( functools.partial( @@ -259,11 +255,11 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE): '%s/playlists/%s' % (user_id, playlist_id), video_id, 'tracklist'), self._PAGE_SIZE) - return self.playlist_result(entries, video_id, playlist_title, description) + return self.playlist_result(entries, video_id, title, description) class MixcloudStreamIE(MixcloudPlaylistBaseIE): - _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/stream/?$' + _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/stream/?$' IE_NAME = 'mixcloud:stream' _TEST = { diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py index f281238..e164d59 100644 --- a/youtube_dl/extractor/myspace.py +++ b/youtube_dl/extractor/myspace.py @@ -12,64 +12,62 @@ from ..utils import ( class MySpaceIE(InfoExtractor): - _VALID_URL = r'https?://myspace\.com/([^/]+)/(?P<mediatype>video/[^/]+/|music/song/.*?)(?P<id>\d+)' + _VALID_URL = r'''(?x) + https?:// + myspace\.com/[^/]+/ + (?P<mediatype> + video/[^/]+/(?P<video_id>\d+)| + music/song/[^/?#&]+-(?P<song_id>\d+)-\d+(?:[/?#&]|$) + ) + ''' - _TESTS = [ - { - 'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919', - 'md5': '9c1483c106f4a695c47d2911feed50a7', - 'info_dict': { - 'id': '109594919', - 'ext': 'mp4', - 'title': 'Little Big Town', - 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.', - 'uploader': 'Five Minutes to the Stage', - 'uploader_id': 'fiveminutestothestage', - 'timestamp': 1414108751, - 'upload_date': '20141023', - }, + _TESTS = [{ + 'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919', + 'md5': '9c1483c106f4a695c47d2911feed50a7', + 'info_dict': { + 'id': '109594919', + 'ext': 'mp4', + 'title': 'Little Big Town', + 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.', + 'uploader': 'Five Minutes to the Stage', + 'uploader_id': 'fiveminutestothestage', + 'timestamp': 1414108751, + 'upload_date': '20141023', }, + }, { # songs - { - 'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681', - 'md5': '1d7ee4604a3da226dd69a123f748b262', - 'info_dict': { - 'id': '93388656', - 'ext': 'm4a', - 'title': 'Of weakened soul...', - 'uploader': 'Killsorrow', - 'uploader_id': 'killsorrow', - }, - }, { - 'add_ie': ['Youtube'], - 'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041', - 'info_dict': { - 'id': 'xqds0B_meys', - 'ext': 'webm', - 'title': 'Three Days Grace - Animal I Have Become', - 'description': 'md5:8bd86b3693e72a077cf863a8530c54bb', - 'uploader': 'ThreeDaysGraceVEVO', - 'uploader_id': 'ThreeDaysGraceVEVO', - 'upload_date': '20091002', - }, - }, { - 'add_ie': ['Youtube'], - 'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426', - 'info_dict': { - 'id': 'ypWvQgnJrSU', - 'ext': 'mp4', - 'title': 'Starset - First Light', - 'description': 'md5:2d5db6c9d11d527683bcda818d332414', - 'uploader': 'Yumi K', - 'uploader_id': 'SorenPromotions', - 'upload_date': '20140725', - } + 'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681', + 'md5': '1d7ee4604a3da226dd69a123f748b262', + 'info_dict': { + 'id': '93388656', + 'ext': 'm4a', + 'title': 'Of weakened soul...', + 'uploader': 'Killsorrow', + 'uploader_id': 'killsorrow', }, - ] + }, { + 'add_ie': ['Youtube'], + 'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041', + 'info_dict': { + 'id': 'xqds0B_meys', + 'ext': 'webm', + 'title': 'Three Days Grace - Animal I Have Become', + 'description': 'md5:8bd86b3693e72a077cf863a8530c54bb', + 'uploader': 'ThreeDaysGraceVEVO', + 'uploader_id': 'ThreeDaysGraceVEVO', + 'upload_date': '20091002', + }, + }, { + 'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426', + 'only_matching': True, + }, { + 'url': 'https://myspace.com/thelargemouthbassband/music/song/02-pure-eyes.mp3-94422330-105113388', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = mobj.group('video_id') or mobj.group('song_id') is_song = mobj.group('mediatype').startswith('music/song') webpage = self._download_webpage(url, video_id) player_url = self._search_regex( diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py index d2a44d0..62db70b 100644 --- a/youtube_dl/extractor/nbc.py +++ b/youtube_dl/extractor/nbc.py @@ -5,10 +5,8 @@ import re from .common import InfoExtractor from .theplatform import ThePlatformIE from .adobepass import AdobePassIE -from ..compat import compat_urllib_parse_urlparse from ..utils import ( find_xpath_attr, - lowercase_escape, smuggle_url, unescapeHTML, update_url_query, @@ -17,7 +15,7 @@ from ..utils import ( class NBCIE(AdobePassIE): - _VALID_URL = r'https?://(?:www\.)?nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)' + _VALID_URL = r'(?P<permalink>https?://(?:www\.)?nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+))' _TESTS = [ { @@ -37,16 +35,6 @@ class NBCIE(AdobePassIE): }, }, { - 'url': 'http://www.nbc.com/the-tonight-show/episodes/176', - 'info_dict': { - 'id': '176', - 'ext': 'flv', - 'title': 'Ricky Gervais, Steven Van Zandt, ILoveMakonnen', - 'description': 'A brand new episode of The Tonight Show welcomes Ricky Gervais, Steven Van Zandt and ILoveMakonnen.', - }, - 'skip': '404 Not Found', - }, - { 'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821', 'info_dict': { 'id': '2832821', @@ -64,11 +52,6 @@ class NBCIE(AdobePassIE): 'skip': 'Only works from US', }, { - # This video has expired but with an escaped embedURL - 'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515', - 'only_matching': True, - }, - { # HLS streams requires the 'hdnea3' cookie 'url': 'http://www.nbc.com/Kings/video/goliath/n1806', 'info_dict': { @@ -88,59 +71,38 @@ class NBCIE(AdobePassIE): ] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - info = { + permalink, video_id = re.match(self._VALID_URL, url).groups() + video_data = self._download_json( + 'https://api.nbc.com/v3/videos', video_id, query={ + 'filter[permalink]': permalink, + })['data'][0]['attributes'] + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + video_id = video_data['guid'] + title = video_data['title'] + if video_data.get('entitlement') == 'auth': + resource = self._get_mvpd_resource( + 'nbcentertainment', title, video_id, + video_data.get('vChipRating')) + query['auth'] = self._extract_mvpd_auth( + url, video_id, 'nbcentertainment', resource) + theplatform_url = smuggle_url(update_url_query( + 'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id, + query), {'force_smil_url': True}) + return { '_type': 'url_transparent', - 'ie_key': 'ThePlatform', 'id': video_id, + 'title': title, + 'url': theplatform_url, + 'description': video_data.get('description'), + 'keywords': video_data.get('keywords'), + 'season_number': int_or_none(video_data.get('seasonNumber')), + 'episode_number': int_or_none(video_data.get('episodeNumber')), + 'series': video_data.get('showName'), + 'ie_key': 'ThePlatform', } - video_data = None - preload = self._search_regex( - r'PRELOAD\s*=\s*({.+})', webpage, 'preload data', default=None) - if preload: - preload_data = self._parse_json(preload, video_id) - path = compat_urllib_parse_urlparse(url).path.rstrip('/') - entity_id = preload_data.get('xref', {}).get(path) - video_data = preload_data.get('entities', {}).get(entity_id) - if video_data: - query = { - 'mbr': 'true', - 'manifest': 'm3u', - } - video_id = video_data['guid'] - title = video_data['title'] - if video_data.get('entitlement') == 'auth': - resource = self._get_mvpd_resource( - 'nbcentertainment', title, video_id, - video_data.get('vChipRating')) - query['auth'] = self._extract_mvpd_auth( - url, video_id, 'nbcentertainment', resource) - theplatform_url = smuggle_url(update_url_query( - 'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id, - query), {'force_smil_url': True}) - info.update({ - 'id': video_id, - 'title': title, - 'url': theplatform_url, - 'description': video_data.get('description'), - 'keywords': video_data.get('keywords'), - 'season_number': int_or_none(video_data.get('seasonNumber')), - 'episode_number': int_or_none(video_data.get('episodeNumber')), - 'series': video_data.get('showName'), - }) - else: - theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex( - [ - r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"', - r'<iframe[^>]+src="((?:https?:)?//player\.theplatform\.com/[^"]+)"', - r'"embedURL"\s*:\s*"([^"]+)"' - ], - webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/'))) - if theplatform_url.startswith('//'): - theplatform_url = 'http:' + theplatform_url - info['url'] = smuggle_url(theplatform_url, {'source_url': url}) - return info class NBCSportsVPlayerIE(InfoExtractor): diff --git a/youtube_dl/extractor/nonktube.py b/youtube_dl/extractor/nonktube.py new file mode 100644 index 0000000..63e58aa --- /dev/null +++ b/youtube_dl/extractor/nonktube.py @@ -0,0 +1,33 @@ +from __future__ import unicode_literals + +from .nuevo import NuevoBaseIE + + +class NonkTubeIE(NuevoBaseIE): + _VALID_URL = r'https?://(?:www\.)?nonktube\.com/(?:(?:video|embed)/|media/nuevo/embed\.php\?.*?\bid=)(?P<id>\d+)' + _TESTS = [{ + 'url': 'https://www.nonktube.com/video/118636/sensual-wife-uncensored-fucked-in-hairy-pussy-and-facialized', + 'info_dict': { + 'id': '118636', + 'ext': 'mp4', + 'title': 'Sensual Wife Uncensored Fucked In Hairy Pussy And Facialized', + 'age_limit': 18, + 'duration': 1150.98, + }, + 'params': { + 'skip_download': True, + } + }, { + 'url': 'https://www.nonktube.com/embed/118636', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + info = self._extract_nuevo( + 'https://www.nonktube.com/media/nuevo/econfig.php?key=%s' + % video_id, video_id) + + info['age_limit'] = 18 + return info diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py new file mode 100644 index 0000000..f7fa098 --- /dev/null +++ b/youtube_dl/extractor/noovo.py @@ -0,0 +1,97 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .brightcove import BrightcoveNewIE +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + smuggle_url, + try_get, +) + + +class NoovoIE(InfoExtractor): + _VALID_URL = r'https?://(?:[^/]+\.)?noovo\.ca/videos/(?P<id>[^/]+/[^/?#&]+)' + _TESTS = [{ + # clip + 'url': 'http://noovo.ca/videos/rpm-plus/chrysler-imperial', + 'info_dict': { + 'id': '5386045029001', + 'ext': 'mp4', + 'title': 'Chrysler Imperial', + 'description': 'md5:de3c898d1eb810f3e6243e08c8b4a056', + 'timestamp': 1491399228, + 'upload_date': '20170405', + 'uploader_id': '618566855001', + 'creator': 'vtele', + 'view_count': int, + 'series': 'RPM+', + }, + 'params': { + 'skip_download': True, + }, + }, { + # episode + 'url': 'http://noovo.ca/videos/l-amour-est-dans-le-pre/episode-13-8', + 'info_dict': { + 'id': '5395865725001', + 'title': 'Épisode 13 : Les retrouvailles', + 'description': 'md5:336d5ebc5436534e61d16e63ddfca327', + 'ext': 'mp4', + 'timestamp': 1492019320, + 'upload_date': '20170412', + 'uploader_id': '618566855001', + 'creator': 'vtele', + 'view_count': int, + 'series': "L'amour est dans le pré", + 'season_number': 5, + 'episode': 'Épisode 13', + 'episode_number': 13, + }, + 'params': { + 'skip_download': True, + }, + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/618566855001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + video_id = self._match_id(url) + + data = self._download_json( + 'http://api.noovo.ca/api/v1/pages/single-episode/%s' % video_id, + video_id)['data'] + + content = try_get(data, lambda x: x['contents'][0]) + + brightcove_id = data.get('brightcoveId') or content['brightcoveId'] + + series = try_get( + data, ( + lambda x: x['show']['title'], + lambda x: x['season']['show']['title']), + compat_str) + + episode = None + og = data.get('og') + if isinstance(og, dict) and og.get('type') == 'video.episode': + episode = og.get('title') + + video = content or data + + return { + '_type': 'url_transparent', + 'ie_key': BrightcoveNewIE.ie_key(), + 'url': smuggle_url( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + {'geo_countries': ['CA']}), + 'id': brightcove_id, + 'title': video.get('title'), + 'creator': video.get('source'), + 'view_count': int_or_none(video.get('viewsCount')), + 'series': series, + 'season_number': int_or_none(try_get( + data, lambda x: x['season']['seasonNumber'])), + 'episode': episode, + 'episode_number': int_or_none(data.get('episodeNumber')), + } diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py index b6c5ee6..f26dafb 100644 --- a/youtube_dl/extractor/nowness.py +++ b/youtube_dl/extractor/nowness.py @@ -28,7 +28,7 @@ class NownessBaseIE(InfoExtractor): bc_url = BrightcoveLegacyIE._extract_brightcove_url(player_code) if bc_url: return self.url_result(bc_url, BrightcoveLegacyIE.ie_key()) - bc_url = BrightcoveNewIE._extract_url(player_code) + bc_url = BrightcoveNewIE._extract_url(self, player_code) if bc_url: return self.url_result(bc_url, BrightcoveNewIE.ie_key()) raise ExtractorError('Could not find player definition') diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py index 38fefe4..79296f0 100644 --- a/youtube_dl/extractor/npo.py +++ b/youtube_dl/extractor/npo.py @@ -313,9 +313,9 @@ class NPOIE(NPOBaseIE): class NPOLiveIE(NPOBaseIE): IE_NAME = 'npo.nl:live' - _VALID_URL = r'https?://(?:www\.)?npo\.nl/live/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?npo\.nl/live(?:/(?P<id>[^/?#&]+))?' - _TEST = { + _TESTS = [{ 'url': 'http://www.npo.nl/live/npo-1', 'info_dict': { 'id': 'LI_NL1_4188102', @@ -327,10 +327,13 @@ class NPOLiveIE(NPOBaseIE): 'params': { 'skip_download': True, } - } + }, { + 'url': 'http://www.npo.nl/live', + 'only_matching': True, + }] def _real_extract(self, url): - display_id = self._match_id(url) + display_id = self._match_id(url) or 'npo-1' webpage = self._download_webpage(url, display_id) diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py index 7fe79cb..3b4f51f 100644 --- a/youtube_dl/extractor/nrk.py +++ b/youtube_dl/extractor/nrk.py @@ -148,13 +148,34 @@ class NRKBaseIE(InfoExtractor): vcodec = 'none' if data.get('mediaType') == 'Audio' else None - # TODO: extract chapters when https://github.com/rg3/youtube-dl/pull/9409 is merged - for entry in entries: entry.update(common_info) for f in entry['formats']: f['vcodec'] = vcodec + points = data.get('shortIndexPoints') + if isinstance(points, list): + chapters = [] + for next_num, point in enumerate(points, start=1): + if not isinstance(point, dict): + continue + start_time = parse_duration(point.get('startPoint')) + if start_time is None: + continue + end_time = parse_duration( + data.get('duration') + if next_num == len(points) + else points[next_num].get('startPoint')) + if end_time is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': point.get('title'), + }) + if chapters and len(entries) == 1: + entries[0]['chapters'] = chapters + return self.playlist_result(entries, video_id, title, description) diff --git a/youtube_dl/extractor/nuevo.py b/youtube_dl/extractor/nuevo.py index 87fb94d..be1e09d 100644 --- a/youtube_dl/extractor/nuevo.py +++ b/youtube_dl/extractor/nuevo.py @@ -10,9 +10,10 @@ from ..utils import ( class NuevoBaseIE(InfoExtractor): - def _extract_nuevo(self, config_url, video_id): + def _extract_nuevo(self, config_url, video_id, headers={}): config = self._download_xml( - config_url, video_id, transform_source=lambda s: s.strip()) + config_url, video_id, transform_source=lambda s: s.strip(), + headers=headers) title = xpath_text(config, './title', 'title', fatal=True).strip() video_id = xpath_text(config, './mediaid', default=video_id) diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py index 986708e..854b680 100644 --- a/youtube_dl/extractor/odnoklassniki.py +++ b/youtube_dl/extractor/odnoklassniki.py @@ -3,6 +3,7 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import ( + compat_etree_fromstring, compat_parse_qs, compat_urllib_parse_unquote, compat_urllib_parse_urlparse, @@ -37,7 +38,7 @@ class OdnoklassnikiIE(InfoExtractor): }, { # metadataUrl 'url': 'http://ok.ru/video/63567059965189-0?fromTime=5', - 'md5': '9676cf86eff5391d35dea675d224e131', + 'md5': '6ff470ea2dd51d5d18c295a355b0b6bc', 'info_dict': { 'id': '63567059965189-0', 'ext': 'mp4', @@ -53,7 +54,7 @@ class OdnoklassnikiIE(InfoExtractor): }, { # YouTube embed (metadataUrl, provider == USER_YOUTUBE) 'url': 'http://ok.ru/video/64211978996595-1', - 'md5': '5d7475d428845cd2e13bae6f1a992278', + 'md5': '2f206894ffb5dbfcce2c5a14b909eea5', 'info_dict': { 'id': '64211978996595-1', 'ext': 'mp4', @@ -61,8 +62,8 @@ class OdnoklassnikiIE(InfoExtractor): 'description': 'md5:848eb8b85e5e3471a3a803dae1343ed0', 'duration': 440, 'upload_date': '20150826', - 'uploader_id': '750099571', - 'uploader': 'Алина П', + 'uploader_id': 'tvroscosmos', + 'uploader': 'Телестудия Роскосмоса', 'age_limit': 0, }, }, { @@ -81,6 +82,7 @@ class OdnoklassnikiIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'Video has not been found', }, { 'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452', 'only_matching': True, @@ -176,14 +178,32 @@ class OdnoklassnikiIE(InfoExtractor): }) return info - quality = qualities(('mobile', 'lowest', 'low', 'sd', 'hd')) + quality = qualities(('4', '0', '1', '2', '3', '5')) formats = [{ 'url': f['url'], 'ext': 'mp4', 'format_id': f['name'], - 'quality': quality(f['name']), } for f in metadata['videos']] + + m3u8_url = metadata.get('hlsManifestUrl') + if m3u8_url: + formats.extend(self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + + dash_manifest = metadata.get('metadataEmbedded') + if dash_manifest: + formats.extend(self._parse_mpd_formats( + compat_etree_fromstring(dash_manifest), 'mpd')) + + for fmt in formats: + fmt_type = self._search_regex( + r'\btype[/=](\d)', fmt['url'], + 'format type', default=None) + if fmt_type: + fmt['quality'] = quality(fmt_type) + self._sort_formats(formats) info['formats'] = formats diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 58ffde5..d8036b5 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -75,51 +75,38 @@ class OpenloadIE(InfoExtractor): '<span[^>]+id="[^"]+"[^>]*>([0-9A-Za-z]+)</span>', webpage, 'openload ID') - video_url_chars = [] - - first_char = ord(ol_id[0]) - key = first_char - 55 - maxKey = max(2, key) - key = min(maxKey, len(ol_id) - 38) - t = ol_id[key:key + 36] - - hashMap = {} - v = ol_id.replace(t, '') - h = 0 - - while h < len(t): - f = t[h:h + 3] - i = int(f, 8) - hashMap[h / 3] = i - h += 3 - - h = 0 - H = 0 - while h < len(v): - B = '' - C = '' - if len(v) >= h + 2: - B = v[h:h + 2] - if len(v) >= h + 3: - C = v[h:h + 3] - i = int(B, 16) - h += 2 - if H % 3 == 0: - i = int(C, 8) - h += 1 - elif H % 2 == 0 and H != 0 and ord(v[H - 1]) < 60: - i = int(C, 10) - h += 1 - index = H % 7 - - A = hashMap[index] - i ^= 213 - i ^= A - video_url_chars.append(compat_chr(i)) - H += 1 + decoded = '' + a = ol_id[0:24] + b = [] + for i in range(0, len(a), 8): + b.append(int(a[i:i + 8] or '0', 16)) + ol_id = ol_id[24:] + j = 0 + k = 0 + while j < len(ol_id): + c = 128 + d = 0 + e = 0 + f = 0 + _more = True + while _more: + if j + 1 >= len(ol_id): + c = 143 + f = int(ol_id[j:j + 2] or '0', 16) + j += 2 + d += (f & 127) << e + e += 7 + _more = f >= c + g = d ^ b[k % 3] + for i in range(4): + char_dec = (g >> 8 * i) & (c + 127) + char = compat_chr(char_dec) + if char != '#': + decoded += char + k += 1 video_url = 'https://openload.co/stream/%s?mime=true' - video_url = video_url % (''.join(video_url_chars)) + video_url = video_url % decoded title = self._og_search_title(webpage, default=None) or self._search_regex( r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage, diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 1e2c54e..cc296ea 100644 --- a/youtube_dl/extractor/orf.py +++ b/youtube_dl/extractor/orf.py @@ -2,8 +2,6 @@ from __future__ import unicode_literals import re -import calendar -import datetime from .common import InfoExtractor from ..compat import compat_str @@ -144,77 +142,25 @@ class ORFTVthekIE(InfoExtractor): } -class ORFOE1IE(InfoExtractor): - IE_NAME = 'orf:oe1' - IE_DESC = 'Radio Österreich 1' - _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole\?.*?\btrack_id=)(?P<id>[0-9]+)' - - # Audios on ORF radio are only available for 7 days, so we can't add tests. - _TESTS = [{ - 'url': 'http://oe1.orf.at/konsole?show=on_demand#?track_id=394211', - 'only_matching': True, - }, { - 'url': 'http://oe1.orf.at/konsole?show=ondemand&track_id=443608&load_day=/programm/konsole/tag/20160726', - 'only_matching': True, - }] - - def _real_extract(self, url): - show_id = self._match_id(url) - data = self._download_json( - 'http://oe1.orf.at/programm/%s/konsole' % show_id, - show_id - ) - - timestamp = datetime.datetime.strptime('%s %s' % ( - data['item']['day_label'], - data['item']['time'] - ), '%d.%m.%Y %H:%M') - unix_timestamp = calendar.timegm(timestamp.utctimetuple()) - - return { - 'id': show_id, - 'title': data['item']['title'], - 'url': data['item']['url_stream'], - 'ext': 'mp3', - 'description': data['item'].get('info'), - 'timestamp': unix_timestamp - } - - -class ORFFM4IE(InfoExtractor): - IE_NAME = 'orf:fm4' - IE_DESC = 'radio FM4' - _VALID_URL = r'https?://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)' - - _TEST = { - 'url': 'http://fm4.orf.at/player/20160110/IS/', - 'md5': '01e736e8f1cef7e13246e880a59ad298', - 'info_dict': { - 'id': '2016-01-10_2100_tl_54_7DaysSun13_11244', - 'ext': 'mp3', - 'title': 'Im Sumpf', - 'description': 'md5:384c543f866c4e422a55f66a62d669cd', - 'duration': 7173, - 'timestamp': 1452456073, - 'upload_date': '20160110', - }, - 'skip': 'Live streams on FM4 got deleted soon', - } - +class ORFRadioIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) + station = mobj.group('station') show_date = mobj.group('date') show_id = mobj.group('show') + if station == 'fm4': + show_id = '4%s' % show_id + data = self._download_json( - 'http://audioapi.orf.at/fm4/json/2.0/broadcasts/%s/4%s' % (show_date, show_id), + 'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s' % (station, show_id, show_date), show_id ) def extract_entry_dict(info, title, subtitle): return { 'id': info['loopStreamId'].replace('.mp3', ''), - 'url': 'http://loopstream01.apa.at/?channel=fm4&id=%s' % info['loopStreamId'], + 'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (station, info['loopStreamId']), 'title': title, 'description': subtitle, 'duration': (info['end'] - info['start']) / 1000, @@ -233,6 +179,47 @@ class ORFFM4IE(InfoExtractor): } +class ORFFM4IE(ORFRadioIE): + IE_NAME = 'orf:fm4' + IE_DESC = 'radio FM4' + _VALID_URL = r'https?://(?P<station>fm4)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + + _TEST = { + 'url': 'http://fm4.orf.at/player/20170107/CC', + 'md5': '2b0be47375432a7ef104453432a19212', + 'info_dict': { + 'id': '2017-01-07_2100_tl_54_7DaysSat18_31295', + 'ext': 'mp3', + 'title': 'Solid Steel Radioshow', + 'description': 'Die Mixshow von Coldcut und Ninja Tune.', + 'duration': 3599, + 'timestamp': 1483819257, + 'upload_date': '20170107', + }, + 'skip': 'Shows from ORF radios are only available for 7 days.' + } + + +class ORFOE1IE(ORFRadioIE): + IE_NAME = 'orf:oe1' + IE_DESC = 'Radio Österreich 1' + _VALID_URL = r'https?://(?P<station>oe1)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)' + + _TEST = { + 'url': 'http://oe1.orf.at/player/20170108/456544', + 'md5': '34d8a6e67ea888293741c86a099b745b', + 'info_dict': { + 'id': '2017-01-08_0759_tl_51_7DaysSun6_256141', + 'ext': 'mp3', + 'title': 'Morgenjournal', + 'duration': 609, + 'timestamp': 1483858796, + 'upload_date': '20170108', + }, + 'skip': 'Shows from ORF radios are only available for 7 days.' + } + + class ORFIPTVIE(InfoExtractor): IE_NAME = 'orf:iptv' IE_DESC = 'iptv.ORF.at' diff --git a/youtube_dl/extractor/packtpub.py b/youtube_dl/extractor/packtpub.py new file mode 100644 index 0000000..bb668c9 --- /dev/null +++ b/youtube_dl/extractor/packtpub.py @@ -0,0 +1,171 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_HTTPError, +) +from ..utils import ( + clean_html, + ExtractorError, + remove_end, + strip_or_none, + unified_timestamp, + urljoin, + urlencode_postdata, +) + + +class PacktPubBaseIE(InfoExtractor): + _PACKT_BASE = 'https://www.packtpub.com' + _MAPT_REST = '%s/mapt-rest' % _PACKT_BASE + + +class PacktPubIE(PacktPubBaseIE): + _VALID_URL = r'https?://(?:www\.)?packtpub\.com/mapt/video/[^/]+/(?P<course_id>\d+)/(?P<chapter_id>\d+)/(?P<id>\d+)' + + _TEST = { + 'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215/20528/20530/Project+Intro', + 'md5': '1e74bd6cfd45d7d07666f4684ef58f70', + 'info_dict': { + 'id': '20530', + 'ext': 'mp4', + 'title': 'Project Intro', + 'thumbnail': r're:(?i)^https?://.*\.jpg', + 'timestamp': 1490918400, + 'upload_date': '20170331', + }, + } + _NETRC_MACHINE = 'packtpub' + _TOKEN = None + + def _real_initialize(self): + (username, password) = self._get_login_info() + if username is None: + return + webpage = self._download_webpage(self._PACKT_BASE, None) + login_form = self._form_hidden_inputs( + 'packt-user-login-form', webpage) + login_form.update({ + 'email': username, + 'password': password, + }) + self._download_webpage( + self._PACKT_BASE, None, 'Logging in as %s' % username, + data=urlencode_postdata(login_form)) + try: + self._TOKEN = self._download_json( + '%s/users/tokens/sessions' % self._MAPT_REST, None, + 'Downloading Authorization Token')['data']['token'] + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 404): + message = self._parse_json(e.cause.read().decode(), None)['message'] + raise ExtractorError(message, expected=True) + raise + + def _handle_error(self, response): + if response.get('status') != 'success': + raise ExtractorError( + '% said: %s' % (self.IE_NAME, response['message']), + expected=True) + + def _download_json(self, *args, **kwargs): + response = super(PacktPubIE, self)._download_json(*args, **kwargs) + self._handle_error(response) + return response + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + course_id, chapter_id, video_id = mobj.group( + 'course_id', 'chapter_id', 'id') + + headers = {} + if self._TOKEN: + headers['Authorization'] = self._TOKEN + video = self._download_json( + '%s/users/me/products/%s/chapters/%s/sections/%s' + % (self._MAPT_REST, course_id, chapter_id, video_id), video_id, + 'Downloading JSON video', headers=headers)['data'] + + content = video.get('content') + if not content: + self.raise_login_required('This video is locked') + + video_url = content['file'] + + metadata = self._download_json( + '%s/products/%s/chapters/%s/sections/%s/metadata' + % (self._MAPT_REST, course_id, chapter_id, video_id), + video_id)['data'] + + title = metadata['pageTitle'] + course_title = metadata.get('title') + if course_title: + title = remove_end(title, ' - %s' % course_title) + timestamp = unified_timestamp(metadata.get('publicationDate')) + thumbnail = urljoin(self._PACKT_BASE, metadata.get('filepath')) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'thumbnail': thumbnail, + 'timestamp': timestamp, + } + + +class PacktPubCourseIE(PacktPubBaseIE): + _VALID_URL = r'(?P<url>https?://(?:www\.)?packtpub\.com/mapt/video/[^/]+/(?P<id>\d+))' + _TEST = { + 'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215', + 'info_dict': { + 'id': '9781787122215', + 'title': 'Learn Nodejs by building 12 projects [Video]', + }, + 'playlist_count': 90, + } + + @classmethod + def suitable(cls, url): + return False if PacktPubIE.suitable(url) else super( + PacktPubCourseIE, cls).suitable(url) + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + url, course_id = mobj.group('url', 'id') + + course = self._download_json( + '%s/products/%s/metadata' % (self._MAPT_REST, course_id), + course_id)['data'] + + entries = [] + for chapter_num, chapter in enumerate(course['tableOfContents'], 1): + if chapter.get('type') != 'chapter': + continue + children = chapter.get('children') + if not isinstance(children, list): + continue + chapter_info = { + 'chapter': chapter.get('title'), + 'chapter_number': chapter_num, + 'chapter_id': chapter.get('id'), + } + for section in children: + if section.get('type') != 'section': + continue + section_url = section.get('seoUrl') + if not isinstance(section_url, compat_str): + continue + entry = { + '_type': 'url_transparent', + 'url': urljoin(url + '/', section_url), + 'title': strip_or_none(section.get('title')), + 'description': clean_html(section.get('summary')), + 'ie_key': PacktPubIE.ie_key(), + } + entry.update(chapter_info) + entries.append(entry) + + return self.playlist_result(entries, course_id, course.get('title')) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 3e51b4d..16cc667 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -8,7 +8,9 @@ from ..utils import ( ExtractorError, determine_ext, int_or_none, + float_or_none, js_to_json, + orderedSet, strip_jsonp, strip_or_none, unified_strdate, @@ -264,6 +266,13 @@ class PBSIE(InfoExtractor): 'playlist_count': 2, }, { + 'url': 'http://www.pbs.org/wgbh/americanexperience/films/great-war/', + 'info_dict': { + 'id': 'great-war', + }, + 'playlist_count': 3, + }, + { 'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/', 'info_dict': { 'id': '2276541483', @@ -381,10 +390,10 @@ class PBSIE(InfoExtractor): # tabbed frontline videos MULTI_PART_REGEXES = ( r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"', - r'<a[^>]+href=["\']#video-\d+["\'][^>]+data-coveid=["\'](\d+)', + r'<a[^>]+href=["\']#(?:video-|part)\d+["\'][^>]+data-cove[Ii]d=["\'](\d+)', ) for p in MULTI_PART_REGEXES: - tabbed_videos = re.findall(p, webpage) + tabbed_videos = orderedSet(re.findall(p, webpage)) if tabbed_videos: return tabbed_videos, presumptive_id, upload_date, description @@ -464,6 +473,7 @@ class PBSIE(InfoExtractor): redirects.append(redirect) redirect_urls.add(redirect_url) + chapters = [] # Player pages may also serve different qualities for page in ('widget/partnerplayer', 'portalplayer'): player = self._download_webpage( @@ -479,6 +489,20 @@ class PBSIE(InfoExtractor): extract_redirect_urls(video_info) if not info: info = video_info + if not chapters: + for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player): + chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False) + if not chapter: + continue + start_time = float_or_none(chapter.get('start_time'), 1000) + duration = float_or_none(chapter.get('duration'), 1000) + if start_time is None or duration is None: + continue + chapters.append({ + 'start_time': start_time, + 'end_time': start_time + duration, + 'title': chapter.get('title'), + }) formats = [] http_url = None @@ -515,7 +539,7 @@ class PBSIE(InfoExtractor): http_url = format_url self._remove_duplicate_formats(formats) m3u8_formats = list(filter( - lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none', formats)) if http_url: for m3u8_format in m3u8_formats: @@ -588,4 +612,5 @@ class PBSIE(InfoExtractor): 'upload_date': upload_date, 'formats': formats, 'subtitles': subtitles, + 'chapters': chapters, } diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index 0e36230..1add6b8 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -20,7 +20,7 @@ class PeriscopeBaseIE(InfoExtractor): class PeriscopeIE(PeriscopeBaseIE): IE_DESC = 'Periscope' IE_NAME = 'periscope' - _VALID_URL = r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P<id>[^/?#]+)' + _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P<id>[^/?#]+)' # Alive example URLs can be found here http://onperiscope.com/ _TESTS = [{ 'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==', @@ -41,6 +41,9 @@ class PeriscopeIE(PeriscopeBaseIE): }, { 'url': 'https://www.periscope.tv/bastaakanoggano/1OdKrlkZZjOJX', 'only_matching': True, + }, { + 'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv', + 'only_matching': True, }] @staticmethod @@ -103,7 +106,7 @@ class PeriscopeIE(PeriscopeBaseIE): class PeriscopeUserIE(PeriscopeBaseIE): - _VALID_URL = r'https?://(?:www\.)?periscope\.tv/(?P<id>[^/]+)/?$' + _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/(?P<id>[^/]+)/?$' IE_DESC = 'Periscope user videos' IE_NAME = 'periscope:user' diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py index 073fc3e..24c3600 100644 --- a/youtube_dl/extractor/porn91.py +++ b/youtube_dl/extractor/porn91.py @@ -1,10 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_urlencode, -) from .common import InfoExtractor from ..utils import ( parse_duration, @@ -19,7 +15,7 @@ class Porn91IE(InfoExtractor): _TEST = { 'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134', - 'md5': '6df8f6d028bc8b14f5dbd73af742fb20', + 'md5': '7fcdb5349354f40d41689bd0fa8db05a', 'info_dict': { 'id': '7e42283b4f5ab36da134', 'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!', @@ -43,24 +39,7 @@ class Porn91IE(InfoExtractor): r'<div id="viewvideo-title">([^<]+)</div>', webpage, 'title') title = title.replace('\n', '') - # get real url - file_id = self._search_regex( - r'so.addVariable\(\'file\',\'(\d+)\'', webpage, 'file id') - sec_code = self._search_regex( - r'so.addVariable\(\'seccode\',\'([^\']+)\'', webpage, 'sec code') - max_vid = self._search_regex( - r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage, 'max vid') - url_params = compat_urllib_parse_urlencode({ - 'VID': file_id, - 'mp4': '1', - 'seccode': sec_code, - 'max_vid': max_vid, - }) - info_cn = self._download_webpage( - 'http://91porn.com/getfile.php?' + url_params, video_id, - 'Downloading real video url') - video_url = compat_urllib_parse_unquote(self._search_regex( - r'file=([^&]+)&', info_cn, 'url')) + info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0] duration = parse_duration(self._search_regex( r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False)) @@ -68,11 +47,12 @@ class Porn91IE(InfoExtractor): comment_count = int_or_none(self._search_regex( r'留言:\s*</span>\s*(\d+)', webpage, 'comment count', fatal=False)) - return { + info_dict.update({ 'id': video_id, 'title': title, - 'url': video_url, 'duration': duration, 'comment_count': comment_count, 'age_limit': self._rta_search(webpage), - } + }) + + return info_dict diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index b25f1f1..1dcc8df 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -33,7 +33,7 @@ class PornHubIE(InfoExtractor): _VALID_URL = r'''(?x) https?:// (?: - (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)| + (?:[a-z]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)| (?:www\.)?thumbzilla\.com/video/ ) (?P<id>[\da-z]+) @@ -97,6 +97,9 @@ class PornHubIE(InfoExtractor): }, { 'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex', 'only_matching': True, + }, { + 'url': 'http://www.pornhub.com/video/show?viewkey=648719015', + 'only_matching': True, }] @staticmethod diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py index ed38c77..e2202d6 100644 --- a/youtube_dl/extractor/r7.py +++ b/youtube_dl/extractor/r7.py @@ -62,8 +62,7 @@ class R7IE(InfoExtractor): # m3u8 format always matches the http format, let's copy metadata from # one to another m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - formats)) + lambda f: f.get('vcodec') != 'none', formats)) if len(m3u8_formats) == 1: f_copy = m3u8_formats[0].copy() f_copy.update(f) diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index 41afbd9..81eb9db 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -1,23 +1,40 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_urlparse, + compat_str, +) from ..utils import ( - determine_ext, ExtractorError, + determine_ext, find_xpath_attr, fix_xml_ampersands, + GeoRestrictedError, int_or_none, parse_duration, + strip_or_none, + try_get, unified_strdate, + unified_timestamp, update_url_query, + urljoin, xpath_text, ) class RaiBaseIE(InfoExtractor): - def _extract_relinker_formats(self, relinker_url, video_id): + _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' + _GEO_COUNTRIES = ['IT'] + _GEO_BYPASS = False + + def _extract_relinker_info(self, relinker_url, video_id): formats = [] + geoprotection = None + is_live = None + duration = None for platform in ('mon', 'flash', 'native'): relinker = self._download_xml( @@ -27,9 +44,27 @@ class RaiBaseIE(InfoExtractor): query={'output': 45, 'pl': platform}, headers=self.geo_verification_headers()) - media_url = find_xpath_attr(relinker, './url', 'type', 'content').text + if not geoprotection: + geoprotection = xpath_text( + relinker, './geoprotection', default=None) == 'Y' + + if not is_live: + is_live = xpath_text( + relinker, './is_live', default=None) == 'Y' + if not duration: + duration = parse_duration(xpath_text( + relinker, './duration', default=None)) + + url_elem = find_xpath_attr(relinker, './url', 'type', 'content') + if url_elem is None: + continue + + media_url = url_elem.text + + # This does not imply geo restriction (e.g. + # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html) if media_url == 'http://download.rai.it/video_no_available.mp4': - self.raise_geo_restricted() + continue ext = determine_ext(media_url) if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'): @@ -53,215 +88,333 @@ class RaiBaseIE(InfoExtractor): 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', }) - return formats + if not formats and geoprotection is True: + self.raise_geo_restricted(countries=self._GEO_COUNTRIES) + + return dict((k, v) for k, v in { + 'is_live': is_live, + 'duration': duration, + 'formats': formats, + }.items() if v is not None) + + @staticmethod + def _extract_subtitles(url, subtitle_url): + subtitles = {} + if subtitle_url and isinstance(subtitle_url, compat_str): + subtitle_url = urljoin(url, subtitle_url) + STL_EXT = '.stl' + SRT_EXT = '.srt' + subtitles['it'] = [{ + 'ext': 'stl', + 'url': subtitle_url, + }] + if subtitle_url.endswith(STL_EXT): + srt_url = subtitle_url[:-len(STL_EXT)] + SRT_EXT + subtitles['it'].append({ + 'ext': 'srt', + 'url': srt_url, + }) + return subtitles + + +class RaiPlayIE(RaiBaseIE): + _VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.html)' % RaiBaseIE._UUID_RE + _TESTS = [{ + 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter', + 'md5': '340aa3b7afb54bfd14a8c11786450d76', + 'info_dict': { + 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66', + 'ext': 'mp4', + 'title': 'La Casa Bianca', + 'alt_title': 'S2016 - Puntata del 23/10/2016', + 'description': 'md5:a09d45890850458077d1f68bb036e0a5', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Rai 3', + 'creator': 'Rai 3', + 'duration': 3278, + 'timestamp': 1477764300, + 'upload_date': '20161029', + 'series': 'La Casa Bianca', + 'season': '2016', + }, + }, { + 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html', + 'md5': '8970abf8caf8aef4696e7b1f2adfc696', + 'info_dict': { + 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', + 'ext': 'mp4', + 'title': 'Report del 07/04/2014', + 'alt_title': 'S2013/14 - Puntata del 07/04/2014', + 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', + 'thumbnail': r're:^https?://.*\.jpg$', + 'uploader': 'Rai 5', + 'creator': 'Rai 5', + 'duration': 6160, + 'series': 'Report', + 'season_number': 5, + 'season': '2013/14', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + url, video_id = mobj.group('url', 'id') - def _extract_from_content_id(self, content_id, base_url): + media = self._download_json( + '%s?json' % url, video_id, 'Downloading video JSON') + + title = media['name'] + + video = media['video'] + + relinker_info = self._extract_relinker_info(video['contentUrl'], video_id) + self._sort_formats(relinker_info['formats']) + + thumbnails = [] + if 'images' in media: + for _, value in media.get('images').items(): + if value: + thumbnails.append({ + 'url': value.replace('[RESOLUTION]', '600x400') + }) + + timestamp = unified_timestamp(try_get( + media, lambda x: x['availabilities'][0]['start'], compat_str)) + + subtitles = self._extract_subtitles(url, video.get('subtitles')) + + info = { + 'id': video_id, + 'title': title, + 'alt_title': media.get('subtitle'), + 'description': media.get('description'), + 'uploader': media.get('channel'), + 'creator': media.get('editor'), + 'duration': parse_duration(video.get('duration')), + 'timestamp': timestamp, + 'thumbnails': thumbnails, + 'series': try_get( + media, lambda x: x['isPartOf']['name'], compat_str), + 'season_number': int_or_none(try_get( + media, lambda x: x['isPartOf']['numeroStagioni'])), + 'season': media.get('stagione') or None, + 'subtitles': subtitles, + } + + info.update(relinker_info) + + return info + + +class RaiIE(RaiBaseIE): + _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE + _TESTS = [{ + # var uniquename = "ContentItem-..." + # data-id="ContentItem-..." + 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', + 'info_dict': { + 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', + 'ext': 'mp4', + 'title': 'TG PRIMO TEMPO', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 1758, + 'upload_date': '20140612', + } + }, { + # with ContentItem in many metas + 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html', + 'info_dict': { + 'id': '1632c009-c843-4836-bb65-80c33084a64b', + 'ext': 'mp4', + 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"', + 'description': 'I film in uscita questa settimana.', + 'thumbnail': r're:^https?://.*\.png$', + 'duration': 833, + 'upload_date': '20161103', + } + }, { + # with ContentItem in og:url + 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html', + 'md5': '11959b4e44fa74de47011b5799490adf', + 'info_dict': { + 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c', + 'ext': 'mp4', + 'title': 'TG1 ore 20:00 del 03/11/2016', + 'description': 'TG1 edizione integrale ore 20:00 del giorno 03/11/2016', + 'thumbnail': r're:^https?://.*\.jpg$', + 'duration': 2214, + 'upload_date': '20161103', + } + }, { + # drawMediaRaiTV(...) + 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', + 'md5': '2dd727e61114e1ee9c47f0da6914e178', + 'info_dict': { + 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', + 'ext': 'mp4', + 'title': 'Il pacco', + 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', + 'thumbnail': r're:^https?://.*\.jpg$', + 'upload_date': '20141221', + }, + }, { + # initEdizione('ContentItem-...' + 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', + 'info_dict': { + 'id': 'c2187016-8484-4e3a-8ac8-35e475b07303', + 'ext': 'mp4', + 'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}', + 'duration': 2274, + 'upload_date': '20170401', + }, + 'skip': 'Changes daily', + }, { + # HDS live stream with only relinker URL + 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', + 'info_dict': { + 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', + 'ext': 'flv', + 'title': 'EuroNews', + }, + 'params': { + 'skip_download': True, + }, + }, { + # HLS live stream with ContentItem in og:url + 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', + 'info_dict': { + 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', + 'ext': 'mp4', + 'title': 'La diretta di Rainews24', + }, + 'params': { + 'skip_download': True, + }, + }] + + def _extract_from_content_id(self, content_id, url): media = self._download_json( 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, content_id, 'Downloading video JSON') - thumbnails = [] - for image_type in ('image', 'image_medium', 'image_300'): - thumbnail_url = media.get(image_type) - if thumbnail_url: - thumbnails.append({ - 'url': compat_urlparse.urljoin(base_url, thumbnail_url), - }) + title = media['name'].strip() - formats = [] media_type = media['type'] if 'Audio' in media_type: - formats.append({ - 'format_id': media.get('formatoAudio'), - 'url': media['audioUrl'], - 'ext': media.get('formatoAudio'), - }) + relinker_info = { + 'formats': { + 'format_id': media.get('formatoAudio'), + 'url': media['audioUrl'], + 'ext': media.get('formatoAudio'), + } + } elif 'Video' in media_type: - formats.extend(self._extract_relinker_formats(media['mediaUri'], content_id)) - self._sort_formats(formats) + relinker_info = self._extract_relinker_info(media['mediaUri'], content_id) else: raise ExtractorError('not a media file') - subtitles = {} - captions = media.get('subtitlesUrl') - if captions: - STL_EXT = '.stl' - SRT_EXT = '.srt' - if captions.endswith(STL_EXT): - captions = captions[:-len(STL_EXT)] + SRT_EXT - subtitles['it'] = [{ - 'ext': 'srt', - 'url': captions, - }] + self._sort_formats(relinker_info['formats']) + + thumbnails = [] + for image_type in ('image', 'image_medium', 'image_300'): + thumbnail_url = media.get(image_type) + if thumbnail_url: + thumbnails.append({ + 'url': compat_urlparse.urljoin(url, thumbnail_url), + }) - return { + subtitles = self._extract_subtitles(url, media.get('subtitlesUrl')) + + info = { 'id': content_id, - 'title': media['name'], - 'description': media.get('desc'), + 'title': title, + 'description': strip_or_none(media.get('desc')), 'thumbnails': thumbnails, 'uploader': media.get('author'), 'upload_date': unified_strdate(media.get('date')), 'duration': parse_duration(media.get('length')), - 'formats': formats, 'subtitles': subtitles, } + info.update(relinker_info) -class RaiTVIE(RaiBaseIE): - _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+(?:media|ondemand)/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' - _TESTS = [ - { - 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', - 'md5': '8970abf8caf8aef4696e7b1f2adfc696', - 'info_dict': { - 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', - 'ext': 'mp4', - 'title': 'Report del 07/04/2014', - 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', - 'upload_date': '20140407', - 'duration': 6160, - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, - { - # no m3u8 stream - 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', - # HDS download, MD5 is unstable - 'info_dict': { - 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', - 'ext': 'flv', - 'title': 'TG PRIMO TEMPO', - 'upload_date': '20140612', - 'duration': 1758, - 'thumbnail': r're:^https?://.*\.jpg$', - }, - 'skip': 'Geo-restricted to Italy', - }, - { - 'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html', - 'md5': '35cf7c229f22eeef43e48b5cf923bef0', - 'info_dict': { - 'id': '7aafdea9-0e5d-49d5-88a6-7e65da67ae13', - 'ext': 'mp4', - 'title': 'State of the Net, Antonella La Carpia: regole virali', - 'description': 'md5:b0ba04a324126903e3da7763272ae63c', - 'upload_date': '20140613', - }, - 'skip': 'Error 404', - }, - { - 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-b4a49761-e0cc-4b14-8736-2729f6f73132-tg2.html', - 'info_dict': { - 'id': 'b4a49761-e0cc-4b14-8736-2729f6f73132', - 'ext': 'mp4', - 'title': 'Alluvione in Sardegna e dissesto idrogeologico', - 'description': 'Edizione delle ore 20:30 ', - }, - 'skip': 'invalid urls', - }, - { - 'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html', - 'md5': 'e57493e1cb8bc7c564663f363b171847', - 'info_dict': { - 'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6', - 'ext': 'mp4', - 'title': 'Il Candidato - Primo episodio: "Le Primarie"', - 'description': 'md5:364b604f7db50594678f483353164fb8', - 'upload_date': '20140923', - 'duration': 386, - 'thumbnail': r're:^https?://.*\.jpg$', - } - }, - ] + return info def _real_extract(self, url): video_id = self._match_id(url) - return self._extract_from_content_id(video_id, url) + webpage = self._download_webpage(url, video_id) + content_item_id = None -class RaiIE(RaiBaseIE): - _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' - _TESTS = [ - { - 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', - 'md5': '2dd727e61114e1ee9c47f0da6914e178', - 'info_dict': { - 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', - 'ext': 'mp4', - 'title': 'Il pacco', - 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', - 'upload_date': '20141221', - }, - }, - { - # Direct relinker URL - 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', - # HDS live stream, MD5 is unstable - 'info_dict': { - 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', - 'ext': 'flv', - 'title': 'EuroNews', - }, - 'skip': 'Geo-restricted to Italy', - }, - { - # Embedded content item ID - 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', - 'md5': '84c1135ce960e8822ae63cec34441d63', - 'info_dict': { - 'id': '0960e765-62c8-474a-ac4b-7eb3e2be39c8', - 'ext': 'mp4', - 'title': 'TG1 ore 20:00 del 02/07/2016', - 'upload_date': '20160702', - }, - }, - { - 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', - # HDS live stream, MD5 is unstable - 'info_dict': { - 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', - 'ext': 'flv', - 'title': 'La diretta di Rainews24', - }, - }, - ] + content_item_url = self._html_search_meta( + ('og:url', 'og:video', 'og:video:secure_url', 'twitter:url', + 'twitter:player', 'jsonlink'), webpage, default=None) + if content_item_url: + content_item_id = self._search_regex( + r'ContentItem-(%s)' % self._UUID_RE, content_item_url, + 'content item id', default=None) - @classmethod - def suitable(cls, url): - return False if RaiTVIE.suitable(url) else super(RaiIE, cls).suitable(url) - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + if not content_item_id: + content_item_id = self._search_regex( + r'''(?x) + (?: + (?:initEdizione|drawMediaRaiTV)\(| + <(?:[^>]+\bdata-id|var\s+uniquename)= + ) + (["\']) + (?:(?!\1).)*\bContentItem-(?P<id>%s) + ''' % self._UUID_RE, + webpage, 'content item id', default=None, group='id') - iframe_url = self._search_regex( - [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', - r'drawMediaRaiTV\(["\'](.+?)["\']'], - webpage, 'iframe', default=None) - if iframe_url: - if not iframe_url.startswith('http'): - iframe_url = compat_urlparse.urljoin(url, iframe_url) - return self.url_result(iframe_url) - - content_item_id = self._search_regex( - r'initEdizione\((?P<q1>[\'"])ContentItem-(?P<content_id>[^\'"]+)(?P=q1)', - webpage, 'content item ID', group='content_id', default=None) + content_item_ids = set() if content_item_id: - return self._extract_from_content_id(content_item_id, url) + content_item_ids.add(content_item_id) + if video_id not in content_item_ids: + content_item_ids.add(video_id) + + for content_item_id in content_item_ids: + try: + return self._extract_from_content_id(content_item_id, url) + except GeoRestrictedError: + raise + except ExtractorError: + pass + + relinker_url = self._search_regex( + r'''(?x) + (?: + var\s+videoURL| + mediaInfo\.mediaUri + )\s*=\s* + ([\'"]) + (?P<url> + (?:https?:)? + //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\? + (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1 + ''', + webpage, 'relinker URL', group='url') - relinker_url = compat_urlparse.urljoin(url, self._search_regex( - r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)', - webpage, 'relinker URL', group='url')) - formats = self._extract_relinker_formats(relinker_url, video_id) - self._sort_formats(formats) + relinker_info = self._extract_relinker_info( + urljoin(url, relinker_url), video_id) + self._sort_formats(relinker_info['formats']) title = self._search_regex( r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', - webpage, 'title', group='title', default=None) or self._og_search_title(webpage) + webpage, 'title', group='title', + default=None) or self._og_search_title(webpage) - return { + info = { 'id': video_id, 'title': title, - 'formats': formats, } + + info.update(relinker_info) + + return info diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py index 53b82fb..afa7b91 100644 --- a/youtube_dl/extractor/rbmaradio.py +++ b/youtube_dl/extractor/rbmaradio.py @@ -13,15 +13,15 @@ from ..utils import ( class RBMARadioIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<show_id>[^/]+)/episodes/(?P<id>[^/?#&]+)' + _VALID_URL = r'https?://(?:www\.)?(?:rbmaradio|redbullradio)\.com/shows/(?P<show_id>[^/]+)/episodes/(?P<id>[^/?#&]+)' _TEST = { 'url': 'https://www.rbmaradio.com/shows/main-stage/episodes/ford-lopatin-live-at-primavera-sound-2011', 'md5': '6bc6f9bcb18994b4c983bc3bf4384d95', 'info_dict': { 'id': 'ford-lopatin-live-at-primavera-sound-2011', 'ext': 'mp3', - 'title': 'Main Stage - Ford & Lopatin', - 'description': 'md5:4f340fb48426423530af5a9d87bd7b91', + 'title': 'Main Stage - Ford & Lopatin at Primavera Sound', + 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e', 'thumbnail': r're:^https?://.*\.jpg', 'duration': 2452, 'timestamp': 1307103164, diff --git a/youtube_dl/extractor/rmcdecouverte.py b/youtube_dl/extractor/rmcdecouverte.py index 2340dae..e921ca3 100644 --- a/youtube_dl/extractor/rmcdecouverte.py +++ b/youtube_dl/extractor/rmcdecouverte.py @@ -13,21 +13,20 @@ class RMCDecouverteIE(InfoExtractor): _VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/mediaplayer-replay.*?\bid=(?P<id>\d+)' _TEST = { - 'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=1430&title=LES%20HEROS%20DU%2088e%20ETAGE', + 'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=13502&title=AQUAMEN:LES%20ROIS%20DES%20AQUARIUMS%20:UN%20DELICIEUX%20PROJET', 'info_dict': { - 'id': '5111223049001', + 'id': '5419055995001', 'ext': 'mp4', - 'title': ': LES HEROS DU 88e ETAGE', - 'description': 'Découvrez comment la bravoure de deux hommes dans la Tour Nord du World Trade Center a sauvé la vie d\'innombrables personnes le 11 septembre 2001.', + 'title': 'UN DELICIEUX PROJET', + 'description': 'md5:63610df7c8b1fc1698acd4d0d90ba8b5', 'uploader_id': '1969646226001', - 'upload_date': '20160904', - 'timestamp': 1472951103, + 'upload_date': '20170502', + 'timestamp': 1493745308, }, 'params': { - # rtmp download 'skip_download': True, }, - 'skip': 'Only works from France', + 'skip': 'only available for a week', } BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s' @@ -35,5 +34,12 @@ class RMCDecouverteIE(InfoExtractor): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage) - brightcove_id = compat_parse_qs(compat_urlparse.urlparse(brightcove_legacy_url).query)['@videoPlayer'][0] - return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) + if brightcove_legacy_url: + brightcove_id = compat_parse_qs(compat_urlparse.urlparse( + brightcove_legacy_url).query)['@videoPlayer'][0] + else: + brightcove_id = self._search_regex( + r'data-video-id=["\'](\d+)', webpage, 'brightcove id') + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', + brightcove_id) diff --git a/youtube_dl/extractor/rtl2.py b/youtube_dl/extractor/rtl2.py index 721ee73..666e90e 100644 --- a/youtube_dl/extractor/rtl2.py +++ b/youtube_dl/extractor/rtl2.py @@ -1,13 +1,26 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 import re from .common import InfoExtractor -from ..utils import int_or_none +from ..aes import aes_cbc_decrypt +from ..compat import ( + compat_ord, + compat_str, +) +from ..utils import ( + bytes_to_intlist, + ExtractorError, + intlist_to_bytes, + int_or_none, + strip_or_none, +) class RTL2IE(InfoExtractor): + IE_NAME = 'rtl2' _VALID_URL = r'http?://(?:www\.)?rtl2\.de/[^?#]*?/(?P<id>[^?#/]*?)(?:$|/(?:$|[?#]))' _TESTS = [{ 'url': 'http://www.rtl2.de/sendung/grip-das-motormagazin/folge/folge-203-0', @@ -98,3 +111,98 @@ class RTL2IE(InfoExtractor): 'duration': int_or_none(video_info.get('duration')), 'formats': formats, } + + +class RTL2YouBaseIE(InfoExtractor): + _BACKWERK_BASE_URL = 'https://p-you-backwerk.rtl2apps.de/' + + +class RTL2YouIE(RTL2YouBaseIE): + IE_NAME = 'rtl2:you' + _VALID_URL = r'http?://you\.rtl2\.de/(?:video/\d+/|youplayer/index\.html\?.*?\bvid=)(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://you.rtl2.de/video/3002/15740/MJUNIK%20%E2%80%93%20Home%20of%20YOU/307-hirn-wo-bist-du', + 'info_dict': { + 'id': '15740', + 'ext': 'mp4', + 'title': 'MJUNIK – Home of YOU - #307 Hirn, wo bist du?!', + 'description': 'md5:ddaa95c61b372b12b66e115b2772fe01', + 'age_limit': 12, + }, + }, { + 'url': 'http://you.rtl2.de/youplayer/index.html?vid=15712', + 'only_matching': True, + }] + _AES_KEY = b'\xe9W\xe4.<*\xb8\x1a\xd2\xb6\x92\xf3C\xd3\xefL\x1b\x03*\xbbbH\xc0\x03\xffo\xc2\xf2(\xaa\xaa!' + _GEO_COUNTRIES = ['DE'] + + def _real_extract(self, url): + video_id = self._match_id(url) + + stream_data = self._download_json( + self._BACKWERK_BASE_URL + 'stream/video/' + video_id, video_id) + + data, iv = base64.b64decode(stream_data['streamUrl']).decode().split(':') + stream_url = intlist_to_bytes(aes_cbc_decrypt( + bytes_to_intlist(base64.b64decode(data)), + bytes_to_intlist(self._AES_KEY), + bytes_to_intlist(base64.b64decode(iv)) + )) + if b'rtl2_you_video_not_found' in stream_url: + raise ExtractorError('video not found', expected=True) + + formats = self._extract_m3u8_formats( + stream_url[:-compat_ord(stream_url[-1])].decode(), + video_id, 'mp4', 'm3u8_native') + self._sort_formats(formats) + + video_data = self._download_json( + self._BACKWERK_BASE_URL + 'video/' + video_id, video_id) + + series = video_data.get('formatTitle') + title = episode = video_data.get('title') or series + if series and series != title: + title = '%s - %s' % (series, title) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'description': strip_or_none(video_data.get('description')), + 'thumbnail': video_data.get('image'), + 'duration': int_or_none(stream_data.get('duration') or video_data.get('duration'), 1000), + 'series': series, + 'episode': episode, + 'age_limit': int_or_none(video_data.get('minimumAge')), + } + + +class RTL2YouSeriesIE(RTL2YouBaseIE): + IE_NAME = 'rtl2:you:series' + _VALID_URL = r'http?://you\.rtl2\.de/videos/(?P<id>\d+)' + _TEST = { + 'url': 'http://you.rtl2.de/videos/115/dragon-ball', + 'info_dict': { + 'id': '115', + }, + 'playlist_mincount': 5, + } + + def _real_extract(self, url): + series_id = self._match_id(url) + stream_data = self._download_json( + self._BACKWERK_BASE_URL + 'videos', + series_id, query={ + 'formatId': series_id, + 'limit': 1000000000, + }) + + entries = [] + for video in stream_data.get('videos', []): + video_id = compat_str(video['videoId']) + if not video_id: + continue + entries.append(self.url_result( + 'http://you.rtl2.de/video/%s/%s' % (series_id, video_id), + 'RTL2You', video_id)) + return self.playlist_result(entries, series_id) diff --git a/youtube_dl/extractor/rudo.py b/youtube_dl/extractor/rudo.py index 5164401..f036f67 100644 --- a/youtube_dl/extractor/rudo.py +++ b/youtube_dl/extractor/rudo.py @@ -26,7 +26,7 @@ class RudoIE(InfoExtractor): } @classmethod - def _extract_url(self, webpage): + def _extract_url(cls, webpage): mobj = re.search( r'<iframe[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//rudo\.video/vod/[0-9a-zA-Z]+)(?P=q1)', webpage) diff --git a/youtube_dl/extractor/streamable.py b/youtube_dl/extractor/streamable.py index 9f5c237..3472527 100644 --- a/youtube_dl/extractor/streamable.py +++ b/youtube_dl/extractor/streamable.py @@ -12,7 +12,7 @@ from ..utils import ( class StreamableIE(InfoExtractor): - _VALID_URL = r'https?://streamable\.com/(?:e/)?(?P<id>\w+)' + _VALID_URL = r'https?://streamable\.com/(?:[es]/)?(?P<id>\w+)' _TESTS = [ { 'url': 'https://streamable.com/dnd1', @@ -47,6 +47,10 @@ class StreamableIE(InfoExtractor): { 'url': 'https://streamable.com/e/dnd1', 'only_matching': True, + }, + { + 'url': 'https://streamable.com/s/okkqk/drxjds', + 'only_matching': True, } ] diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py new file mode 100644 index 0000000..aa4fad1 --- /dev/null +++ b/youtube_dl/extractor/streamango.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + int_or_none, + js_to_json, +) + + +class StreamangoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?streamango\.com/(?:f|embed)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4', + 'md5': 'e992787515a182f55e38fc97588d802a', + 'info_dict': { + 'id': 'clapasobsptpkdfe', + 'ext': 'mp4', + 'title': '20170315_150006.mp4', + } + }, { + 'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + title = self._og_search_title(webpage) + + formats = [] + for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage): + video = self._parse_json( + format_, video_id, transform_source=js_to_json, fatal=False) + if not video: + continue + src = video.get('src') + if not src: + continue + ext = determine_ext(src, default_ext=None) + if video.get('type') == 'application/dash+xml' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': src, + 'ext': ext or 'mp4', + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + 'tbr': int_or_none(video.get('bitrate')), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'url': url, + 'title': title, + 'formats': formats, + } diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py index 1b1afab..3f3c681 100644 --- a/youtube_dl/extractor/ted.py +++ b/youtube_dl/extractor/ted.py @@ -210,7 +210,7 @@ class TEDIE(InfoExtractor): resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False)) m3u8_formats = list(filter( - lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', + lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none', formats)) if http_url: for m3u8_format in m3u8_formats: diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 9a424b1..de236bb 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -80,14 +80,33 @@ class ThePlatformBaseIE(OnceIE): 'url': src, }) + duration = info.get('duration') + tp_chapters = info.get('chapters', []) + chapters = [] + if tp_chapters: + def _add_chapter(start_time, end_time): + start_time = float_or_none(start_time, 1000) + end_time = float_or_none(end_time, 1000) + if start_time is None or end_time is None: + return + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + }) + + for chapter in tp_chapters[:-1]: + _add_chapter(chapter.get('startTime'), chapter.get('endTime')) + _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration) + return { 'title': info['title'], 'subtitles': subtitles, 'description': info['description'], 'thumbnail': info['defaultThumbnailUrl'], - 'duration': int_or_none(info.get('duration'), 1000), + 'duration': float_or_none(duration, 1000), 'timestamp': int_or_none(info.get('pubDate'), 1000) or None, 'uploader': info.get('billingCode'), + 'chapters': chapters, } def _extract_theplatform_metadata(self, path, video_id): diff --git a/youtube_dl/extractor/thescene.py b/youtube_dl/extractor/thescene.py index b8504f0..cd64235 100644 --- a/youtube_dl/extractor/thescene.py +++ b/youtube_dl/extractor/thescene.py @@ -3,10 +3,6 @@ from __future__ import unicode_literals from .common import InfoExtractor from ..compat import compat_urlparse -from ..utils import ( - int_or_none, - qualities, -) class TheSceneIE(InfoExtractor): @@ -24,6 +20,9 @@ class TheSceneIE(InfoExtractor): 'season': 'Ready To Wear Spring 2013', 'tags': list, 'categories': list, + 'upload_date': '20120913', + 'timestamp': 1347512400, + 'uploader': 'vogue', }, } @@ -37,32 +36,9 @@ class TheSceneIE(InfoExtractor): self._html_search_regex( r'id=\'js-player-script\'[^>]+src=\'(.+?)\'', webpage, 'player url')) - player = self._download_webpage(player_url, display_id) - info = self._parse_json( - self._search_regex( - r'(?m)video\s*:\s*({.+?}),$', player, 'info json'), - display_id) - - video_id = info['id'] - title = info['title'] - - qualities_order = qualities(('low', 'high')) - formats = [{ - 'format_id': '{0}-{1}'.format(f['type'].split('/')[0], f['quality']), - 'url': f['src'], - 'quality': qualities_order(f['quality']), - } for f in info['sources']] - self._sort_formats(formats) - return { - 'id': video_id, + '_type': 'url_transparent', 'display_id': display_id, - 'title': title, - 'formats': formats, - 'thumbnail': info.get('poster_frame'), - 'duration': int_or_none(info.get('duration')), - 'series': info.get('series_title'), - 'season': info.get('season_title'), - 'tags': info.get('tags'), - 'categories': info.get('categories'), + 'url': player_url, + 'ie_key': 'CondeNast', } diff --git a/youtube_dl/extractor/thesun.py b/youtube_dl/extractor/thesun.py new file mode 100644 index 0000000..22d0037 --- /dev/null +++ b/youtube_dl/extractor/thesun.py @@ -0,0 +1,32 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .ooyala import OoyalaIE + + +class TheSunIE(InfoExtractor): + _VALID_URL = r'https://(?:www\.)?thesun\.co\.uk/[^/]+/(?P<id>\d+)' + _TEST = { + 'url': 'https://www.thesun.co.uk/tvandshowbiz/2261604/orlando-bloom-and-katy-perry-post-adorable-instagram-video-together-celebrating-thanksgiving-after-split-rumours/', + 'info_dict': { + 'id': '2261604', + 'title': 'md5:cba22f48bad9218b64d5bbe0e16afddf', + }, + 'playlist_count': 2, + } + + def _real_extract(self, url): + article_id = self._match_id(url) + + webpage = self._download_webpage(url, article_id) + + entries = [] + for ooyala_id in re.findall( + r'<[^>]+\b(?:id\s*=\s*"thesun-ooyala-player-|data-content-id\s*=\s*")([^"]+)', + webpage): + entries.append(OoyalaIE._build_url_result(ooyala_id)) + + return self.playlist_result( + entries, article_id, self._og_search_title(webpage, fatal=False)) diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py index 1c0be9f..efeb677 100644 --- a/youtube_dl/extractor/turner.py +++ b/youtube_dl/extractor/turner.py @@ -13,6 +13,7 @@ from ..utils import ( xpath_attr, update_url_query, ExtractorError, + strip_or_none, ) @@ -163,17 +164,21 @@ class TurnerBaseIE(AdobePassIE): 'height': int_or_none(image.get('height')), } for image in video_data.findall('images/image')] + is_live = xpath_text(video_data, 'isLive') == 'true' + return { 'id': video_id, - 'title': title, + 'title': self._live_title(title) if is_live else title, 'formats': formats, 'subtitles': subtitles, 'thumbnails': thumbnails, - 'description': xpath_text(video_data, 'description'), + 'thumbnail': xpath_text(video_data, 'poster'), + 'description': strip_or_none(xpath_text(video_data, 'description')), 'duration': parse_duration(xpath_text(video_data, 'length') or xpath_text(video_data, 'trt')), 'timestamp': self._extract_timestamp(video_data), 'upload_date': xpath_attr(video_data, 'metas', 'version'), 'series': xpath_text(video_data, 'showTitle'), 'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')), 'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')), + 'is_live': is_live, } diff --git a/youtube_dl/extractor/tv2hu.py b/youtube_dl/extractor/tv2hu.py new file mode 100644 index 0000000..86017b7 --- /dev/null +++ b/youtube_dl/extractor/tv2hu.py @@ -0,0 +1,62 @@ +# encoding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import int_or_none + + +class TV2HuIE(InfoExtractor): + IE_NAME = 'tv2.hu' + _VALID_URL = r'https?://(?:www\.)?tv2\.hu/(?:[^/]+/)+(?P<id>\d+)_[^/?#]+?\.html' + _TESTS = [{ + 'url': 'http://tv2.hu/ezek_megorultek/217679_ezek-megorultek---1.-adas-1.-resz.html', + 'md5': '585e58e2e090f34603804bb2c48e98d8', + 'info_dict': { + 'id': '217679', + 'ext': 'mp4', + 'title': 'Ezek megőrültek! - 1. adás 1. rész', + 'upload_date': '20160826', + 'thumbnail': r're:^https?://.*\.jpg$' + } + }, { + 'url': 'http://tv2.hu/ezek_megorultek/teljes_adasok/217677_ezek-megorultek---1.-adas-2.-resz.html', + 'only_matching': True + }, { + 'url': 'http://tv2.hu/musoraink/aktiv/aktiv_teljes_adas/217963_aktiv-teljes-adas---2016.08.30..html', + 'only_matching': True + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + json_url = self._search_regex( + r'jsonUrl\s*=\s*"([^"]+)"', webpage, 'json url') + json_data = self._download_json(json_url, video_id) + + formats = [] + for b in ('bitrates', 'backupBitrates'): + bitrates = json_data.get(b, {}) + m3u8_url = bitrates.get('hls') + if m3u8_url: + formats.extend(self._extract_wowza_formats( + m3u8_url, video_id, skip_protocols=['rtmp', 'rtsp'])) + + for mp4_url in bitrates.get('mp4', []): + height = int_or_none(self._search_regex( + r'\.(\d+)p\.mp4', mp4_url, 'height', default=None)) + formats.append({ + 'format_id': 'http' + ('-%d' % height if height else ''), + 'url': mp4_url, + 'height': height, + 'width': int_or_none(height / 9.0 * 16.0 if height else None), + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': self._og_search_title(webpage).strip(), + 'thumbnail': self._og_search_thumbnail(webpage), + 'upload_date': self._search_regex( + r'/vod/(\d{8})/', json_url, 'upload_date', default=None), + 'formats': formats, + } diff --git a/youtube_dl/extractor/tv5mondeplus.py b/youtube_dl/extractor/tv5mondeplus.py new file mode 100644 index 0000000..88b6baa --- /dev/null +++ b/youtube_dl/extractor/tv5mondeplus.py @@ -0,0 +1,79 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + determine_ext, + extract_attributes, + get_element_by_class, + int_or_none, + parse_duration, + parse_iso8601, +) + + +class TV5MondePlusIE(InfoExtractor): + IE_DESC = 'TV5MONDE+' + _VALID_URL = r'https?://(?:www\.)?tv5mondeplus\.com/toutes-les-videos/[^/]+/(?P<id>[^/?#]+)' + _TEST = { + 'url': 'http://www.tv5mondeplus.com/toutes-les-videos/documentaire/tdah-mon-amour-tele-quebec-tdah-mon-amour-ep001-enfants', + 'md5': '12130fc199f020673138a83466542ec6', + 'info_dict': { + 'id': 'tdah-mon-amour-tele-quebec-tdah-mon-amour-ep001-enfants', + 'ext': 'mp4', + 'title': 'Tdah, mon amour - Enfants', + 'description': 'md5:230e3aca23115afcf8006d1bece6df74', + 'upload_date': '20170401', + 'timestamp': 1491022860, + } + } + _GEO_BYPASS = False + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + if ">Ce programme n'est malheureusement pas disponible pour votre zone géographique.<" in webpage: + self.raise_geo_restricted(countries=['FR']) + + series = get_element_by_class('video-detail__title', webpage) + title = episode = get_element_by_class( + 'video-detail__subtitle', webpage) or series + if series and series != title: + title = '%s - %s' % (series, title) + vpl_data = extract_attributes(self._search_regex( + r'(<[^>]+class="video_player_loader"[^>]+>)', + webpage, 'video player loader')) + + video_files = self._parse_json( + vpl_data['data-broadcast'], display_id).get('files', []) + formats = [] + for video_file in video_files: + v_url = video_file.get('url') + if not v_url: + continue + video_format = video_file.get('format') or determine_ext(v_url) + if video_format == 'm3u8': + formats.extend(self._extract_m3u8_formats( + v_url, display_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': v_url, + 'format_id': video_format, + }) + self._sort_formats(formats) + + return { + 'id': display_id, + 'display_id': display_id, + 'title': title, + 'description': clean_html(get_element_by_class('video-detail__description', webpage)), + 'thumbnail': vpl_data.get('data-image'), + 'duration': int_or_none(vpl_data.get('data-duration')) or parse_duration(self._html_search_meta('duration', webpage)), + 'timestamp': parse_iso8601(self._html_search_meta('uploadDate', webpage)), + 'formats': formats, + 'episode': episode, + 'series': series, + } diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 06ea2b4..c5b3288 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -150,8 +150,7 @@ class TVPEmbedIE(InfoExtractor): 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False) self._sort_formats(m3u8_formats) m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - m3u8_formats)) + lambda f: f.get('vcodec') != 'none', m3u8_formats)) formats.extend(m3u8_formats) for i, m3u8_format in enumerate(m3u8_formats, 2): http_url = '%s-%d.mp4' % (video_url_base, i) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index 3eda0a3..99ff82a 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -225,7 +225,11 @@ class TVPlayIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - + geo_country = self._search_regex( + r'https?://[^/]+\.([a-z]{2})', url, + 'geo country', default=None) + if geo_country: + self._initialize_geo_bypass([geo_country.upper()]) video = self._download_json( 'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON') diff --git a/youtube_dl/extractor/tvplayer.py b/youtube_dl/extractor/tvplayer.py index b653714..ebde605 100644 --- a/youtube_dl/extractor/tvplayer.py +++ b/youtube_dl/extractor/tvplayer.py @@ -2,9 +2,13 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_HTTPError +from ..compat import ( + compat_HTTPError, + compat_str, +) from ..utils import ( extract_attributes, + try_get, urlencode_postdata, ExtractorError, ) @@ -34,25 +38,32 @@ class TVPlayerIE(InfoExtractor): webpage, 'channel element')) title = current_channel['data-name'] - resource_id = self._search_regex( - r'resourceId\s*=\s*"(\d+)"', webpage, 'resource id') - platform = self._search_regex( - r'platform\s*=\s*"([^"]+)"', webpage, 'platform') + resource_id = current_channel['data-id'] + token = self._search_regex( - r'token\s*=\s*"([^"]+)"', webpage, 'token', default='null') - validate = self._search_regex( - r'validate\s*=\s*"([^"]+)"', webpage, 'validate', default='null') + r'data-token=(["\'])(?P<token>(?!\1).+)\1', webpage, + 'token', group='token') + + context = self._download_json( + 'https://tvplayer.com/watch/context', display_id, + 'Downloading JSON context', query={ + 'resource': resource_id, + 'nonce': token, + }) + + validate = context['validate'] + platform = try_get( + context, lambda x: x['platform']['key'], compat_str) or 'firefox' try: response = self._download_json( 'http://api.tvplayer.com/api/v2/stream/live', - resource_id, headers={ + display_id, 'Downloading JSON stream', headers={ 'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8', }, data=urlencode_postdata({ + 'id': resource_id, 'service': 1, 'platform': platform, - 'id': resource_id, - 'token': token, 'validate': validate, }))['tvplayer']['response'] except ExtractorError as e: @@ -63,7 +74,7 @@ class TVPlayerIE(InfoExtractor): '%s said: %s' % (self.IE_NAME, response['error']), expected=True) raise - formats = self._extract_m3u8_formats(response['stream'], resource_id, 'mp4') + formats = self._extract_m3u8_formats(response['stream'], display_id, 'mp4') self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py index cce29c6..dae1aa3 100644 --- a/youtube_dl/extractor/udemy.py +++ b/youtube_dl/extractor/udemy.py @@ -212,12 +212,15 @@ class UdemyIE(InfoExtractor): thumbnail = asset.get('thumbnail_url') or asset.get('thumbnailUrl') duration = float_or_none(asset.get('data', {}).get('duration')) + subtitles = {} + automatic_captions = {} + formats = [] - def extract_output_format(src): + def extract_output_format(src, f_id): return { 'url': src['url'], - 'format_id': '%sp' % (src.get('height') or format_id), + 'format_id': '%sp' % (src.get('height') or f_id), 'width': int_or_none(src.get('width')), 'height': int_or_none(src.get('height')), 'vbr': int_or_none(src.get('video_bitrate_in_kbps')), @@ -237,30 +240,33 @@ class UdemyIE(InfoExtractor): def add_output_format_meta(f, key): output = outputs.get(key) if isinstance(output, dict): - output_format = extract_output_format(output) + output_format = extract_output_format(output, key) output_format.update(f) return output_format return f + def extract_formats(source_list): + if not isinstance(source_list, list): + return + for source in source_list: + video_url = source.get('file') or source.get('src') + if not video_url or not isinstance(video_url, compat_str): + continue + format_id = source.get('label') + f = { + 'url': video_url, + 'format_id': '%sp' % format_id, + 'height': int_or_none(format_id), + } + if format_id: + # Some videos contain additional metadata (e.g. + # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) + f = add_output_format_meta(f, format_id) + formats.append(f) + download_urls = asset.get('download_urls') if isinstance(download_urls, dict): - video = download_urls.get('Video') - if isinstance(video, list): - for format_ in video: - video_url = format_.get('file') - if not video_url: - continue - format_id = format_.get('label') - f = { - 'url': format_['file'], - 'format_id': '%sp' % format_id, - 'height': int_or_none(format_id), - } - if format_id: - # Some videos contain additional metadata (e.g. - # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208) - f = add_output_format_meta(f, format_id) - formats.append(f) + extract_formats(download_urls.get('Video')) view_html = lecture.get('view_html') if view_html: @@ -294,6 +300,35 @@ class UdemyIE(InfoExtractor): 'height': height, }, res)) + # react rendition since 2017.04.15 (see + # https://github.com/rg3/youtube-dl/issues/12744) + data = self._parse_json( + self._search_regex( + r'videojs-setup-data=(["\'])(?P<data>{.+?})\1', view_html, + 'setup data', default='{}', group='data'), video_id, + transform_source=unescapeHTML, fatal=False) + if data and isinstance(data, dict): + extract_formats(data.get('sources')) + if not duration: + duration = int_or_none(data.get('duration')) + tracks = data.get('tracks') + if isinstance(tracks, list): + for track in tracks: + if not isinstance(track, dict): + continue + if track.get('kind') != 'captions': + continue + src = track.get('src') + if not src or not isinstance(src, compat_str): + continue + lang = track.get('language') or track.get( + 'srclang') or track.get('label') + sub_dict = automatic_captions if track.get( + 'autogenerated') is True else subtitles + sub_dict.setdefault(lang, []).append({ + 'url': src, + }) + self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id')) return { @@ -302,7 +337,9 @@ class UdemyIE(InfoExtractor): 'description': description, 'thumbnail': thumbnail, 'duration': duration, - 'formats': formats + 'formats': formats, + 'subtitles': subtitles, + 'automatic_captions': automatic_captions, } diff --git a/youtube_dl/extractor/upskill.py b/youtube_dl/extractor/upskill.py new file mode 100644 index 0000000..30297b4 --- /dev/null +++ b/youtube_dl/extractor/upskill.py @@ -0,0 +1,176 @@ +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .wistia import WistiaIE +from ..compat import compat_str +from ..utils import ( + clean_html, + ExtractorError, + get_element_by_class, + urlencode_postdata, + urljoin, +) + + +class UpskillBaseIE(InfoExtractor): + _LOGIN_URL = 'http://upskillcourses.com/sign_in' + _NETRC_MACHINE = 'upskill' + + def _real_initialize(self): + self._login() + + def _login(self): + username, password = self._get_login_info() + if username is None: + return + + login_page, urlh = self._download_webpage_handle( + self._LOGIN_URL, None, 'Downloading login page') + + login_url = compat_str(urlh.geturl()) + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'user[email]': username, + 'user[password]': password, + }) + + post_url = self._search_regex( + r'<form[^>]+action=(["\'])(?P<url>(?:(?!\1).)+)\1', login_page, + 'post url', default=login_url, group='url') + + if not post_url.startswith('http'): + post_url = urljoin(login_url, post_url) + + response = self._download_webpage( + post_url, None, 'Logging in', + data=urlencode_postdata(login_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': login_url, + }) + + # Successful login + if any(re.search(p, response) for p in ( + r'class=["\']user-signout', + r'<a[^>]+\bhref=["\']/sign_out', + r'>\s*Log out\s*<')): + return + + message = get_element_by_class('alert', response) + if message is not None: + raise ExtractorError( + 'Unable to login: %s' % clean_html(message), expected=True) + + raise ExtractorError('Unable to log in') + + +class UpskillIE(UpskillBaseIE): + _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/[^/]+/lectures/(?P<id>\d+)' + + _TESTS = [{ + 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100', + 'info_dict': { + 'id': 'uzw6zw58or', + 'ext': 'mp4', + 'title': 'Welcome to the Course!', + 'description': 'md5:8d66c13403783370af62ca97a7357bdd', + 'duration': 138.763, + 'timestamp': 1479846621, + 'upload_date': '20161122', + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://upskillcourses.com/courses/119763/lectures/1747100', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + wistia_url = WistiaIE._extract_url(webpage) + if not wistia_url: + if any(re.search(p, webpage) for p in ( + r'class=["\']lecture-contents-locked', + r'>\s*Lecture contents locked', + r'id=["\']lecture-locked')): + self.raise_login_required('Lecture contents locked') + + title = self._og_search_title(webpage, default=None) + + return { + '_type': 'url_transparent', + 'url': wistia_url, + 'ie_key': WistiaIE.ie_key(), + 'title': title, + } + + +class UpskillCourseIE(UpskillBaseIE): + _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/(?:enrolled/)?(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/', + 'info_dict': { + 'id': '119763', + 'title': 'The Essential Web Developer Course (Free)', + }, + 'playlist_count': 192, + }, { + 'url': 'http://upskillcourses.com/courses/119763/', + 'only_matching': True, + }, { + 'url': 'http://upskillcourses.com/courses/enrolled/119763', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if UpskillIE.suitable(url) else super( + UpskillCourseIE, cls).suitable(url) + + def _real_extract(self, url): + course_id = self._match_id(url) + + webpage = self._download_webpage(url, course_id) + + course_id = self._search_regex( + r'data-course-id=["\'](\d+)', webpage, 'course id', + default=course_id) + + entries = [] + + for mobj in re.finditer( + r'(?s)(?P<li><li[^>]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?</li>)', + webpage): + li = mobj.group('li') + if 'fa-youtube-play' not in li: + continue + lecture_url = self._search_regex( + r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', li, + 'lecture url', default=None, group='url') + if not lecture_url: + continue + lecture_id = self._search_regex( + r'/lectures/(\d+)', lecture_url, 'lecture id', default=None) + title = self._html_search_regex( + r'<span[^>]+class=["\']lecture-name[^>]+>([^<]+)', li, + 'title', default=None) + entries.append( + self.url_result( + urljoin('http://upskillcourses.com/', lecture_url), + ie=UpskillIE.ie_key(), video_id=lecture_id, + video_title=clean_html(title))) + + course_title = self._html_search_regex( + (r'(?s)<img[^>]+class=["\']course-image[^>]+>\s*<h\d>(.+?)</h', + r'(?s)<h\d[^>]+class=["\']course-title[^>]+>(.+?)</h'), + webpage, 'course title', fatal=False) + + return self.playlist_result(entries, course_id, course_title) diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py index 9aa38bc..890a149 100644 --- a/youtube_dl/extractor/vevo.py +++ b/youtube_dl/extractor/vevo.py @@ -1,6 +1,7 @@ from __future__ import unicode_literals import re +import json from .common import InfoExtractor from ..compat import ( @@ -11,7 +12,6 @@ from ..compat import ( from ..utils import ( ExtractorError, int_or_none, - sanitized_Request, parse_iso8601, ) @@ -154,19 +154,24 @@ class VevoIE(VevoBaseIE): } def _initialize_api(self, video_id): - req = sanitized_Request( - 'http://www.vevo.com/auth', data=b'') webpage = self._download_webpage( - req, None, + 'https://accounts.vevo.com/token', None, note='Retrieving oauth token', - errnote='Unable to retrieve oauth token') + errnote='Unable to retrieve oauth token', + data=json.dumps({ + 'client_id': 'SPupX1tvqFEopQ1YS6SS', + 'grant_type': 'urn:vevo:params:oauth:grant-type:anonymous', + }).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + }) if re.search(r'(?i)THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION', webpage): self.raise_geo_restricted( '%s said: This page is currently unavailable in your region' % self.IE_NAME) auth_info = self._parse_json(webpage, video_id) - self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token'] + self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['legacy_token'] def _call_api(self, path, *args, **kwargs): try: diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py index f0a7fd7..54e207b 100644 --- a/youtube_dl/extractor/vice.py +++ b/youtube_dl/extractor/vice.py @@ -20,7 +20,7 @@ from ..utils import ( class ViceBaseIE(AdobePassIE): - def _extract_preplay_video(self, url, webpage): + def _extract_preplay_video(self, url, locale, webpage): watch_hub_data = extract_attributes(self._search_regex( r'(?s)(<watch-hub\s*.+?</watch-hub>)', webpage, 'watch hub')) video_id = watch_hub_data['vms-id'] @@ -32,7 +32,8 @@ class ViceBaseIE(AdobePassIE): resource = self._get_mvpd_resource( 'VICELAND', title, video_id, watch_hub_data.get('video-rating')) - query['tvetoken'] = self._extract_mvpd_auth(url, video_id, 'VICELAND', resource) + query['tvetoken'] = self._extract_mvpd_auth( + url, video_id, 'VICELAND', resource) # signature generation algorithm is reverse engineered from signatureGenerator in # webpack:///../shared/~/vice-player/dist/js/vice-player.js in @@ -45,11 +46,14 @@ class ViceBaseIE(AdobePassIE): try: host = 'www.viceland' if is_locked else self._PREPLAY_HOST - preplay = self._download_json('https://%s.com/en_us/preplay/%s' % (host, video_id), video_id, query=query) + preplay = self._download_json( + 'https://%s.com/%s/preplay/%s' % (host, locale, video_id), + video_id, query=query) except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: error = json.loads(e.cause.read().decode()) - raise ExtractorError('%s said: %s' % (self.IE_NAME, error['details']), expected=True) + raise ExtractorError('%s said: %s' % ( + self.IE_NAME, error['details']), expected=True) raise video_data = preplay['video'] @@ -88,41 +92,30 @@ class ViceBaseIE(AdobePassIE): class ViceIE(ViceBaseIE): - _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?videos?/(?P<id>[^/?#&]+)' + IE_NAME = 'vice' + _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:(?P<locale>[^/]+)/)?videos?/(?P<id>[^/?#&]+)' _TESTS = [{ - 'url': 'http://www.vice.com/video/cowboy-capitalists-part-1', - 'md5': 'e9d77741f9e42ba583e683cd170660f7', + 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', + 'md5': '7d3ae2f9ba5f196cdd9f9efd43657ac2', 'info_dict': { - 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp', + 'id': 'N2bzkydjraWDGwnt8jAttCF6Y0PDv4Zj', 'ext': 'flv', - 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov', - 'duration': 725.983, + 'title': 'Monkey Labs of Holland', + 'description': 'md5:92b3c7dcbfe477f772dd4afa496c9149', }, 'add_ie': ['Ooyala'], }, { - 'url': 'http://www.vice.com/video/how-to-hack-a-car', - 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', - 'info_dict': { - 'id': '3jstaBeXgAs', - 'ext': 'mp4', - 'title': 'How to Hack a Car: Phreaked Out (Episode 2)', - 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30', - 'uploader_id': 'MotherboardTV', - 'uploader': 'Motherboard', - 'upload_date': '20140529', - }, - 'add_ie': ['Youtube'], - }, { 'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56', - 'md5': '', 'info_dict': { 'id': '5816510690b70e6c5fd39a56', 'ext': 'mp4', 'uploader': 'Waypoint', 'title': 'The Signal From Tölva', + 'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5', 'uploader_id': '57f7d621e05ca860fa9ccaf9', - 'timestamp': 1477941983938, + 'timestamp': 1477941983, + 'upload_date': '20161031', }, 'params': { # m3u8 download @@ -130,19 +123,31 @@ class ViceIE(ViceBaseIE): }, 'add_ie': ['UplynkPreplay'], }, { - 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab', - 'only_matching': True, - }, { - 'url': 'http://www.vice.com/ru/video/big-night-out-ibiza-clive-martin-229', - 'only_matching': True, + 'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f', + 'info_dict': { + 'id': '581b12b60a0e1f4c0fb6ea2f', + 'ext': 'mp4', + 'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1', + 'description': '<p>Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.</p>', + 'uploader': 'VICE', + 'uploader_id': '57a204088cb727dec794c67b', + 'timestamp': 1485368119, + 'upload_date': '20170125', + 'age_limit': 14, + }, + 'params': { + # AES-encrypted m3u8 + 'skip_download': True, + }, + 'add_ie': ['UplynkPreplay'], }, { - 'url': 'https://munchies.vice.com/en/videos/watch-the-trailer-for-our-new-series-the-pizza-show', + 'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4', 'only_matching': True, }] _PREPLAY_HOST = 'video.vice' def _real_extract(self, url): - video_id = self._match_id(url) + locale, video_id = re.match(self._VALID_URL, url).groups() webpage, urlh = self._download_webpage_handle(url, video_id) embed_code = self._search_regex( r'embedCode=([^&\'"]+)', webpage, @@ -153,10 +158,11 @@ class ViceIE(ViceBaseIE): r'data-youtube-id="([^"]+)"', webpage, 'youtube id', default=None) if youtube_id: return self.url_result(youtube_id, 'Youtube') - return self._extract_preplay_video(urlh.geturl(), webpage) + return self._extract_preplay_video(urlh.geturl(), locale, webpage) class ViceShowIE(InfoExtractor): + IE_NAME = 'vice:show' _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)' _TEST = { @@ -183,6 +189,86 @@ class ViceShowIE(InfoExtractor): r'<title>(.+?)</title>', webpage, 'title', default=None) if title: title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip() - description = self._html_search_meta('description', webpage, 'description') + description = self._html_search_meta( + 'description', webpage, 'description') return self.playlist_result(entries, show_id, title, description) + + +class ViceArticleIE(InfoExtractor): + IE_NAME = 'vice:article' + _VALID_URL = r'https://www.vice.com/[^/]+/article/(?P<id>[^?#]+)' + + _TESTS = [{ + 'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah', + 'info_dict': { + 'id': '58dc0a3dee202d2a0ccfcbd8', + 'ext': 'mp4', + 'title': 'Mormon War on Porn ', + 'description': 'md5:ad396a2481e7f8afb5ed486878421090', + 'uploader': 'VICE', + 'uploader_id': '57a204088cb727dec794c693', + 'timestamp': 1489160690, + 'upload_date': '20170310', + }, + 'params': { + # AES-encrypted m3u8 + 'skip_download': True, + }, + 'add_ie': ['UplynkPreplay'], + }, { + 'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car', + 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2', + 'info_dict': { + 'id': '3jstaBeXgAs', + 'ext': 'mp4', + 'title': 'How to Hack a Car: Phreaked Out (Episode 2)', + 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30', + 'uploader_id': 'MotherboardTV', + 'uploader': 'Motherboard', + 'upload_date': '20140529', + }, + 'add_ie': ['Youtube'], + }, { + 'url': 'https://www.vice.com/en_us/article/cowboy-capitalists-part-1', + 'only_matching': True, + }, { + 'url': 'https://www.vice.com/ru/article/big-night-out-ibiza-clive-martin-229', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + prefetch_data = self._parse_json(self._search_regex( + r'window\.__PREFETCH_DATA\s*=\s*({.*});', + webpage, 'prefetch data'), display_id) + body = prefetch_data['body'] + + def _url_res(video_url, ie_key): + return { + '_type': 'url_transparent', + 'url': video_url, + 'display_id': display_id, + 'ie_key': ie_key, + } + + embed_code = self._search_regex( + r'embedCode=([^&\'"]+)', body, + 'ooyala embed code', default=None) + if embed_code: + return _url_res('ooyala:%s' % embed_code, 'Ooyala') + + youtube_url = self._html_search_regex( + r'<iframe[^>]+src="(.*youtube\.com/.*)"', + body, 'YouTube URL', default=None) + if youtube_url: + return _url_res(youtube_url, 'Youtube') + + video_url = self._html_search_regex( + r'data-video-url="([^"]+)"', + prefetch_data['embed_code'], 'video URL') + + return _url_res(video_url, ViceIE.ie_key()) diff --git a/youtube_dl/extractor/viceland.py b/youtube_dl/extractor/viceland.py index 87f9216..bd60235 100644 --- a/youtube_dl/extractor/viceland.py +++ b/youtube_dl/extractor/viceland.py @@ -1,11 +1,13 @@ # coding: utf-8 from __future__ import unicode_literals +import re + from .vice import ViceBaseIE class VicelandIE(ViceBaseIE): - _VALID_URL = r'https?://(?:www\.)?viceland\.com/[^/]+/video/[^/]+/(?P<id>[a-f0-9]+)' + _VALID_URL = r'https?://(?:www\.)?viceland\.com/(?P<locale>[^/]+)/video/[^/]+/(?P<id>[a-f0-9]+)' _TEST = { 'url': 'https://www.viceland.com/en_us/video/trapped/588a70d0dba8a16007de7316', 'info_dict': { @@ -24,10 +26,13 @@ class VicelandIE(ViceBaseIE): 'skip_download': True, }, 'add_ie': ['UplynkPreplay'], + 'skip': '404', } _PREPLAY_HOST = 'www.viceland' def _real_extract(self, url): - video_id = self._match_id(url) + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + locale = mobj.group('locale') webpage = self._download_webpage(url, video_id) - return self._extract_preplay_video(url, webpage) + return self._extract_preplay_video(url, locale, webpage) diff --git a/youtube_dl/extractor/videopress.py b/youtube_dl/extractor/videopress.py index 049db25..e5f964d 100644 --- a/youtube_dl/extractor/videopress.py +++ b/youtube_dl/extractor/videopress.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import random import re from .common import InfoExtractor @@ -11,6 +10,7 @@ from ..utils import ( float_or_none, parse_age_limit, qualities, + random_birthday, try_get, unified_timestamp, urljoin, @@ -47,13 +47,10 @@ class VideoPressIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) + query = random_birthday('birth_year', 'birth_month', 'birth_day') video = self._download_json( 'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id, - video_id, query={ - 'birth_month': random.randint(1, 12), - 'birth_day': random.randint(1, 31), - 'birth_year': random.randint(1950, 1995), - }) + video_id, query=query) title = video['title'] diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py index 4e4b4e3..701bb1d 100644 --- a/youtube_dl/extractor/vidio.py +++ b/youtube_dl/extractor/vidio.py @@ -49,8 +49,11 @@ class VidioIE(InfoExtractor): thumbnail = clip.get('image') m3u8_url = m3u8_url or self._search_regex( - r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>.+?)\1', webpage, 'hls url') - formats = self._extract_m3u8_formats(m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native') + r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>(?!\1).+)\1', + webpage, 'hls url') + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native') + self._sort_formats(formats) duration = int_or_none(duration or self._search_regex( r'data-video-duration=(["\'])(?P<duartion>\d+)\1', webpage, 'duration')) diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index d055629..e64873b 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -42,14 +42,15 @@ class VidziIE(InfoExtractor): title = self._html_search_regex( r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') - packed_codes = [mobj.group(0) for mobj in re.finditer( - PACKED_CODES_RE, webpage)] - for num, pc in enumerate(packed_codes, 1): - code = decode_packed_codes(pc).replace('\\\'', '\'') + codes = [webpage] + codes.extend([ + decode_packed_codes(mobj.group(0)).replace('\\\'', '\'') + for mobj in re.finditer(PACKED_CODES_RE, webpage)]) + for num, code in enumerate(codes, 1): jwplayer_data = self._parse_json( self._search_regex( r'setup\(([^)]+)\)', code, 'jwplayer data', - default=NO_DEFAULT if num == len(packed_codes) else '{}'), + default=NO_DEFAULT if num == len(codes) else '{}'), video_id, transform_source=js_to_json) if jwplayer_data: break diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py index 5086f59..3e67eb8 100644 --- a/youtube_dl/extractor/vier.py +++ b/youtube_dl/extractor/vier.py @@ -5,23 +5,30 @@ import re import itertools from .common import InfoExtractor +from ..utils import ( + urlencode_postdata, + int_or_none, + unified_strdate, +) class VierIE(InfoExtractor): IE_NAME = 'vier' + IE_DESC = 'vier.be and vijf.be' _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))' + _NETRC_MACHINE = 'vier' _TESTS = [{ 'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129', + 'md5': 'e4ae2054a6b040ef1e289e20d111b46e', 'info_dict': { 'id': '16129', 'display_id': 'het-wordt-warm-de-moestuin', 'ext': 'mp4', 'title': 'Het wordt warm in De Moestuin', 'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...', - }, - 'params': { - # m3u8 download - 'skip_download': True, + 'upload_date': '20121025', + 'series': 'Plan B', + 'tags': ['De Moestuin', 'Moestuin', 'meisjes', 'Tomaat', 'Wim', 'Droom'], }, }, { 'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614', @@ -29,32 +36,103 @@ class VierIE(InfoExtractor): 'id': '2561614', 'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas', 'ext': 'mp4', - 'title': 'ZO grappig: Temptation Island hosts moeten kiezen tussen onmogelijke dilemma\'s', - 'description': 'Het spel is simpel: Annelien Coorevits en Rick Brandsteder krijgen telkens 2 dilemma\'s voorgeschoteld en ze MOETEN een keuze maken.', + 'title': 'md5:84f45fe48b8c1fa296a7f6d208d080a7', + 'description': 'md5:0356d4981e58b8cbee19355cbd51a8fe', + 'upload_date': '20170228', + 'series': 'Temptation Island', + 'tags': list, + }, + 'params': { + 'skip_download': True, + }, + }, { + 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', + 'info_dict': { + 'id': '2674839', + 'display_id': 'jani-gaat-naar-tokio-aflevering-4', + 'ext': 'mp4', + 'title': 'Jani gaat naar Tokio - Aflevering 4', + 'description': 'md5:aa8d611541db6ae9e863125704511f88', + 'upload_date': '20170501', + 'series': 'Jani gaat', + 'episode_number': 4, + 'tags': ['Jani Gaat', 'Volledige Aflevering'], + }, + 'params': { + 'skip_download': True, + }, + 'skip': 'Requires account credentials', + }, { + # Requires account credentials but bypassed extraction via v3/embed page + # without metadata + 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839', + 'info_dict': { + 'id': '2674839', + 'display_id': 'jani-gaat-naar-tokio-aflevering-4', + 'ext': 'mp4', + 'title': 'jani-gaat-naar-tokio-aflevering-4', }, 'params': { - # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Log in to extract metadata'], }, { - 'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen', + # Without video id in URL + 'url': 'http://www.vier.be/planb/videos/dit-najaar-plan-b', 'only_matching': True, }, { 'url': 'http://www.vier.be/video/v3/embed/16129', 'only_matching': True, }] + def _real_initialize(self): + self._logged_in = False + + def _login(self, site): + username, password = self._get_login_info() + if username is None or password is None: + return + + login_page = self._download_webpage( + 'http://www.%s.be/user/login' % site, + None, note='Logging in', errnote='Unable to log in', + data=urlencode_postdata({ + 'form_id': 'user_login', + 'name': username, + 'pass': password, + }), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + + login_error = self._html_search_regex( + r'(?s)<div class="messages error">\s*<div>\s*<h2.+?</h2>(.+?)<', + login_page, 'login error', default=None) + if login_error: + self.report_warning('Unable to log in: %s' % login_error) + else: + self._logged_in = True + def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) embed_id = mobj.group('embed_id') display_id = mobj.group('display_id') or embed_id + video_id = mobj.group('id') or embed_id site = mobj.group('site') + if not self._logged_in: + self._login(site) + webpage = self._download_webpage(url, display_id) + if r'id="user-login"' in webpage: + self.report_warning( + 'Log in to extract metadata', video_id=display_id) + webpage = self._download_webpage( + 'http://www.%s.be/video/v3/embed/%s' % (site, video_id), + display_id) + video_id = self._search_regex( [r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'], - webpage, 'video id') + webpage, 'video id', default=video_id or display_id) application = self._search_regex( [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'], webpage, 'application', default=site + '_vod') @@ -63,12 +141,25 @@ class VierIE(InfoExtractor): webpage, 'filename') playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename) - formats = self._extract_wowza_formats(playlist_url, display_id, skip_protocols=['dash']) + formats = self._extract_wowza_formats( + playlist_url, display_id, skip_protocols=['dash']) self._sort_formats(formats) title = self._og_search_title(webpage, default=display_id) - description = self._og_search_description(webpage, default=None) + description = self._html_search_regex( + r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-type-text-with-summary\b[^>]*?\1[^>]*>.*?<p>(?P<value>.+?)</p>', + webpage, 'description', default=None, group='value') thumbnail = self._og_search_thumbnail(webpage, default=None) + upload_date = unified_strdate(self._html_search_regex( + r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-name-post-date\b[^>]*?\1[^>]*>.*?(?P<value>\d{2}/\d{2}/\d{4})', + webpage, 'upload date', default=None, group='value')) + + series = self._search_regex( + r'data-program=(["\'])(?P<value>(?:(?!\1).)+)\1', webpage, + 'series', default=None, group='value') + episode_number = int_or_none(self._search_regex( + r'(?i)aflevering (\d+)', title, 'episode number', default=None)) + tags = re.findall(r'<a\b[^>]+\bhref=["\']/tags/[^>]+>([^<]+)<', webpage) return { 'id': video_id, @@ -76,6 +167,10 @@ class VierIE(InfoExtractor): 'title': title, 'description': description, 'thumbnail': thumbnail, + 'upload_date': upload_date, + 'series': series, + 'episode_number': episode_number, + 'tags': tags, 'formats': formats, } diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py index 18735cf..1f29c27 100644 --- a/youtube_dl/extractor/viewlift.py +++ b/youtube_dl/extractor/viewlift.py @@ -68,7 +68,7 @@ class ViewLiftEmbedIE(ViewLiftBaseIE): type_ = source.get('type') ext = determine_ext(file_) format_id = source.get('label') or ext - if all(v == 'm3u8' or v == 'hls' for v in (type_, ext)): + if all(v in ('m3u8', 'hls') for v in (type_, ext)): formats.extend(self._extract_m3u8_formats( file_, video_id, 'mp4', m3u8_id='hls')) else: diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py index fcf0cb1..d5d5b4c 100644 --- a/youtube_dl/extractor/viewster.py +++ b/youtube_dl/extractor/viewster.py @@ -176,8 +176,7 @@ class ViewsterIE(InfoExtractor): if m3u8_formats: self._sort_formats(m3u8_formats) m3u8_formats = list(filter( - lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple', - m3u8_formats)) + lambda f: f.get('vcodec') != 'none', m3u8_formats)) if len(qualities) == len(m3u8_formats): for q, m3u8_format in zip(qualities, m3u8_formats): f = m3u8_format.copy() diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py index b971890..e589406 100644 --- a/youtube_dl/extractor/vlive.py +++ b/youtube_dl/extractor/vlive.py @@ -70,9 +70,9 @@ class VLiveIE(InfoExtractor): status, long_video_id, key = params[2], params[5], params[6] status = remove_start(status, 'PRODUCT_') - if status == 'LIVE_ON_AIR' or status == 'BIG_EVENT_ON_AIR': + if status in ('LIVE_ON_AIR', 'BIG_EVENT_ON_AIR'): return self._live(video_id, webpage) - elif status == 'VOD_ON_AIR' or status == 'BIG_EVENT_INTRO': + elif status in ('VOD_ON_AIR', 'BIG_EVENT_INTRO'): if long_video_id and key: return self._replay(video_id, webpage, long_video_id, key) else: diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py index 00c72e3..444295d 100644 --- a/youtube_dl/extractor/vrt.py +++ b/youtube_dl/extractor/vrt.py @@ -10,6 +10,7 @@ from ..utils import ( class VRTIE(InfoExtractor): + IE_DESC = 'deredactie.be, sporza.be, cobra.be and cobra.canvas.be' _VALID_URL = r'https?://(?:deredactie|sporza|cobra(?:\.canvas)?)\.be/cm/(?:[^/]+/)+(?P<id>[^/]+)/*' _TESTS = [ # deredactie.be diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py new file mode 100644 index 0000000..9959627 --- /dev/null +++ b/youtube_dl/extractor/vrv.py @@ -0,0 +1,212 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import base64 +import json +import hashlib +import hmac +import random +import string +import time + +from .common import InfoExtractor +from ..compat import ( + compat_urllib_parse_urlencode, + compat_urlparse, +) +from ..utils import ( + float_or_none, + int_or_none, +) + + +class VRVBaseIE(InfoExtractor): + _API_DOMAIN = None + _API_PARAMS = {} + _CMS_SIGNING = {} + + def _call_api(self, path, video_id, note, data=None): + base_url = self._API_DOMAIN + '/core/' + path + encoded_query = compat_urllib_parse_urlencode({ + 'oauth_consumer_key': self._API_PARAMS['oAuthKey'], + 'oauth_nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]), + 'oauth_signature_method': 'HMAC-SHA1', + 'oauth_timestamp': int(time.time()), + 'oauth_version': '1.0', + }) + headers = self.geo_verification_headers() + if data: + data = json.dumps(data).encode() + headers['Content-Type'] = 'application/json' + method = 'POST' if data else 'GET' + base_string = '&'.join([method, compat_urlparse.quote(base_url, ''), compat_urlparse.quote(encoded_query, '')]) + oauth_signature = base64.b64encode(hmac.new( + (self._API_PARAMS['oAuthSecret'] + '&').encode('ascii'), + base_string.encode(), hashlib.sha1).digest()).decode() + encoded_query += '&oauth_signature=' + compat_urlparse.quote(oauth_signature, '') + return self._download_json( + '?'.join([base_url, encoded_query]), video_id, + note='Downloading %s JSON metadata' % note, headers=headers, data=data) + + def _call_cms(self, path, video_id, note): + if not self._CMS_SIGNING: + self._CMS_SIGNING = self._call_api('index', video_id, 'CMS Signing')['cms_signing'] + return self._download_json( + self._API_DOMAIN + path, video_id, query=self._CMS_SIGNING, + note='Downloading %s JSON metadata' % note, headers=self.geo_verification_headers()) + + def _set_api_params(self, webpage, video_id): + if not self._API_PARAMS: + self._API_PARAMS = self._parse_json(self._search_regex( + r'window\.__APP_CONFIG__\s*=\s*({.+?})</script>', + webpage, 'api config'), video_id)['cxApiParams'] + self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co') + + def _get_cms_resource(self, resource_key, video_id): + return self._call_api( + 'cms_resource', video_id, 'resource path', data={ + 'resource_key': resource_key, + })['__links__']['cms_resource']['href'] + + +class VRVIE(VRVBaseIE): + IE_NAME = 'vrv' + _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P<id>[A-Z0-9]+)' + _TEST = { + 'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT', + 'info_dict': { + 'id': 'GR9PNZ396', + 'ext': 'mp4', + 'title': 'BOSTON: WHERE THE PAST IS THE PRESENT', + 'description': 'md5:4ec8844ac262ca2df9e67c0983c6b83f', + 'uploader_id': 'seeso', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage( + url, video_id, + headers=self.geo_verification_headers()) + media_resource = self._parse_json(self._search_regex( + r'window\.__INITIAL_STATE__\s*=\s*({.+?})</script>', + webpage, 'inital state'), video_id).get('watch', {}).get('mediaResource') or {} + + video_data = media_resource.get('json') + if not video_data: + self._set_api_params(webpage, video_id) + episode_path = self._get_cms_resource( + 'cms:/episodes/' + video_id, video_id) + video_data = self._call_cms(episode_path, video_id, 'video') + title = video_data['title'] + + streams_json = media_resource.get('streams', {}).get('json', {}) + if not streams_json: + self._set_api_params(webpage, video_id) + streams_path = video_data['__links__']['streams']['href'] + streams_json = self._call_cms(streams_path, video_id, 'streams') + + audio_locale = streams_json.get('audio_locale') + formats = [] + for stream_type, streams in streams_json.get('streams', {}).items(): + if stream_type in ('adaptive_hls', 'adaptive_dash'): + for stream in streams.values(): + stream_url = stream.get('url') + if not stream_url: + continue + stream_id = stream.get('hardsub_locale') or audio_locale + format_id = '%s-%s' % (stream_type.split('_')[1], stream_id) + if stream_type == 'adaptive_hls': + adaptive_formats = self._extract_m3u8_formats( + stream_url, video_id, 'mp4', m3u8_id=format_id, + note='Downloading %s m3u8 information' % stream_id, + fatal=False) + else: + adaptive_formats = self._extract_mpd_formats( + stream_url, video_id, mpd_id=format_id, + note='Downloading %s MPD information' % stream_id, + fatal=False) + if audio_locale: + for f in adaptive_formats: + if f.get('acodec') != 'none': + f['language'] = audio_locale + formats.extend(adaptive_formats) + self._sort_formats(formats) + + subtitles = {} + for subtitle in streams_json.get('subtitles', {}).values(): + subtitle_url = subtitle.get('url') + if not subtitle_url: + continue + subtitles.setdefault(subtitle.get('locale', 'en-US'), []).append({ + 'url': subtitle_url, + 'ext': subtitle.get('format', 'ass'), + }) + + thumbnails = [] + for thumbnail in video_data.get('images', {}).get('thumbnails', []): + thumbnail_url = thumbnail.get('source') + if not thumbnail_url: + continue + thumbnails.append({ + 'url': thumbnail_url, + 'width': int_or_none(thumbnail.get('width')), + 'height': int_or_none(thumbnail.get('height')), + }) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + 'subtitles': subtitles, + 'thumbnails': thumbnails, + 'description': video_data.get('description'), + 'duration': float_or_none(video_data.get('duration_ms'), 1000), + 'uploader_id': video_data.get('channel_id'), + 'series': video_data.get('series_title'), + 'season': video_data.get('season_title'), + 'season_number': int_or_none(video_data.get('season_number')), + 'season_id': video_data.get('season_id'), + 'episode': title, + 'episode_number': int_or_none(video_data.get('episode_number')), + 'episode_id': video_data.get('production_episode_id'), + } + + +class VRVSeriesIE(VRVBaseIE): + IE_NAME = 'vrv:series' + _VALID_URL = r'https?://(?:www\.)?vrv\.co/series/(?P<id>[A-Z0-9]+)' + _TEST = { + 'url': 'https://vrv.co/series/G68VXG3G6/The-Perfect-Insider', + 'info_dict': { + 'id': 'G68VXG3G6', + }, + 'playlist_mincount': 11, + } + + def _real_extract(self, url): + series_id = self._match_id(url) + webpage = self._download_webpage( + url, series_id, + headers=self.geo_verification_headers()) + + self._set_api_params(webpage, series_id) + seasons_path = self._get_cms_resource( + 'cms:/seasons?series_id=' + series_id, series_id) + seasons_data = self._call_cms(seasons_path, series_id, 'seasons') + + entries = [] + for season in seasons_data.get('items', []): + episodes_path = season['__links__']['season/episodes']['href'] + episodes = self._call_cms(episodes_path, series_id, 'episodes') + for episode in episodes.get('items', []): + episode_id = episode['id'] + entries.append(self.url_result( + 'https://vrv.co/watch/' + episode_id, + 'VRV', episode_id, episode.get('title'))) + + return self.playlist_result(entries, series_id) diff --git a/youtube_dl/extractor/vshare.py b/youtube_dl/extractor/vshare.py new file mode 100644 index 0000000..5addbc2 --- /dev/null +++ b/youtube_dl/extractor/vshare.py @@ -0,0 +1,38 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class VShareIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vshare\.io/[dv]/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'https://vshare.io/d/0f64ce6', + 'md5': '16d7b8fef58846db47419199ff1ab3e7', + 'info_dict': { + 'id': '0f64ce6', + 'title': 'vl14062007715967', + 'ext': 'mp4', + } + }, { + 'url': 'https://vshare.io/v/0f64ce6/width-650/height-430/1', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + 'https://vshare.io/d/%s' % video_id, video_id) + + title = self._html_search_regex( + r'(?s)<div id="root-container">(.+?)<br/>', webpage, 'title') + video_url = self._search_regex( + r'<a[^>]+href=(["\'])(?P<url>(?:https?:)?//.+?)\1[^>]*>[Cc]lick\s+here', + webpage, 'video url', group='url') + + return { + 'id': video_id, + 'title': title, + 'url': video_url, + } diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py index 839cad9..625d0a1 100644 --- a/youtube_dl/extractor/washingtonpost.py +++ b/youtube_dl/extractor/washingtonpost.py @@ -13,6 +13,7 @@ from ..utils import ( class WashingtonPostIE(InfoExtractor): IE_NAME = 'washingtonpost' _VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})' + _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}' _TEST = { 'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d', 'md5': '6f537e1334b714eb15f9563bd4b9cdfa', @@ -27,6 +28,11 @@ class WashingtonPostIE(InfoExtractor): }, } + @classmethod + def _extract_urls(cls, webpage): + return re.findall( + r'<iframe[^>]+\bsrc=["\'](%s)' % cls._EMBED_URL, webpage) + def _real_extract(self, url): video_id = self._match_id(url) video_data = self._download_json( diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py index c634b8d..2182d6f 100644 --- a/youtube_dl/extractor/wistia.py +++ b/youtube_dl/extractor/wistia.py @@ -1,10 +1,13 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, float_or_none, + unescapeHTML, ) @@ -34,6 +37,25 @@ class WistiaIE(InfoExtractor): 'only_matching': True, }] + @staticmethod + def _extract_url(webpage): + match = re.search( + r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage) + if match: + return unescapeHTML(match.group('url')) + + match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage) + if match: + return 'wistia:%s' % match.group('id') + + match = re.search( + r'''(?sx) + <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*? + <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2 + ''', webpage) + if match: + return 'wistia:%s' % match.group('id') + def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py index 09415b5..82587b4 100644 --- a/youtube_dl/extractor/worldstarhiphop.py +++ b/youtube_dl/extractor/worldstarhiphop.py @@ -1,12 +1,10 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor class WorldStarHipHopIE(InfoExtractor): - _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/(?:videos|android)/video\.php\?v=(?P<id>.*)' + _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/(?:videos|android)/video\.php\?.*?\bv=(?P<id>[^&]+)' _TESTS = [{ 'url': 'http://www.worldstarhiphop.com/videos/video.php?v=wshh6a7q1ny0G34ZwuIO', 'md5': '9d04de741161603bf7071bbf4e883186', @@ -17,48 +15,26 @@ class WorldStarHipHopIE(InfoExtractor): } }, { 'url': 'http://m.worldstarhiphop.com/android/video.php?v=wshh6a7q1ny0G34ZwuIO', - 'md5': 'dc1c76c83ecc4190bb1eb143899b87d3', - 'info_dict': { - 'id': 'wshh6a7q1ny0G34ZwuIO', - 'ext': 'mp4', - 'title': 'KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!' - } + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) - m_vevo_id = re.search(r'videoId=(.*?)&?', webpage) - if m_vevo_id is not None: - return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo') - - video_url = self._search_regex( - [r'so\.addVariable\("file","(.*?)"\)', - r'<div class="artlist">\s*<a[^>]+href="([^"]+)">'], - webpage, 'video URL') + entries = self._parse_html5_media_entries(url, webpage, video_id) - if 'youtube' in video_url: - return self.url_result(video_url, ie='Youtube') + if not entries: + return self.url_result(url, 'Generic') - video_title = self._html_search_regex( + title = self._html_search_regex( [r'(?s)<div class="content-heading">\s*<h1>(.*?)</h1>', r'<span[^>]+class="tc-sp-pinned-title">(.*)</span>'], webpage, 'title') - # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video. - thumbnail = self._html_search_regex( - r'rel="image_src" href="(.*)" />', webpage, 'thumbnail', - default=None) - if not thumbnail: - _title = r'candytitles.*>(.*)</span>' - mobj = re.search(_title, webpage) - if mobj is not None: - video_title = mobj.group(1) - - return { + info = entries[0] + info.update({ 'id': video_id, - 'url': video_url, - 'title': video_title, - 'thumbnail': thumbnail, - } + 'title': title, + }) + return info diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py index deb7483..45cfca7 100644 --- a/youtube_dl/extractor/wsj.py +++ b/youtube_dl/extractor/wsj.py @@ -10,12 +10,14 @@ from ..utils import ( class WSJIE(InfoExtractor): - _VALID_URL = r'''(?x)https?:// - (?: - video-api\.wsj\.com/api-video/player/iframe\.html\?guid=| - (?:www\.)?wsj\.com/video/[^/]+/ - ) - (?P<id>[a-zA-Z0-9-]+)''' + _VALID_URL = r'''(?x) + (?: + https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=| + https?://(?:www\.)?wsj\.com/video/[^/]+/| + wsj: + ) + (?P<id>[a-fA-F0-9-]{36}) + ''' IE_DESC = 'Wall Street Journal' _TESTS = [{ 'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A', @@ -38,12 +40,17 @@ class WSJIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - api_url = ( - 'http://video-api.wsj.com/api-video/find_all_videos.asp?' - 'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,' - 'thumbnailList,author,description,name,duration,videoURL,' - 'titletag,formattedCreationDate,keywords,editor' % video_id) - info = self._download_json(api_url, video_id)['items'][0] + info = self._download_json( + 'http://video-api.wsj.com/api-video/find_all_videos.asp', video_id, + query={ + 'type': 'guid', + 'count': 1, + 'query': video_id, + 'fields': ','.join(( + 'type', 'hls', 'videoMP4List', 'thumbnailList', 'author', + 'description', 'name', 'duration', 'videoURL', 'titletag', + 'formattedCreationDate', 'keywords', 'editor')), + })['items'][0] title = info.get('name', info.get('titletag')) formats = [] @@ -87,3 +94,24 @@ class WSJIE(InfoExtractor): 'title': title, 'categories': info.get('keywords'), } + + +class WSJArticleIE(InfoExtractor): + _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?', + 'info_dict': { + 'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362', + 'ext': 'mp4', + 'upload_date': '20170221', + 'uploader_id': 'ralcaraz', + 'title': 'Bao Bao the Panda Leaves for China', + } + } + + def _real_extract(self, url): + article_id = self._match_id(url) + webpage = self._download_webpage(url, article_id) + video_id = self._search_regex( + r'data-src=["\']([a-fA-F0-9-]{36})', webpage, 'video id') + return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id) diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py index e616adc..13f8be6 100644 --- a/youtube_dl/extractor/xfileshare.py +++ b/youtube_dl/extractor/xfileshare.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( decode_packed_codes, + determine_ext, ExtractorError, int_or_none, NO_DEFAULT, @@ -16,21 +17,24 @@ from ..utils import ( class XFileShareIE(InfoExtractor): _SITES = ( - ('daclips.in', 'DaClips'), - ('filehoot.com', 'FileHoot'), - ('gorillavid.in', 'GorillaVid'), - ('movpod.in', 'MovPod'), - ('powerwatch.pw', 'PowerWatch'), - ('rapidvideo.ws', 'Rapidvideo.ws'), - ('thevideobee.to', 'TheVideoBee'), - ('vidto.me', 'Vidto'), - ('streamin.to', 'Streamin.To'), - ('xvidstage.com', 'XVIDSTAGE'), + (r'daclips\.(?:in|com)', 'DaClips'), + (r'filehoot\.com', 'FileHoot'), + (r'gorillavid\.(?:in|com)', 'GorillaVid'), + (r'movpod\.in', 'MovPod'), + (r'powerwatch\.pw', 'PowerWatch'), + (r'rapidvideo\.ws', 'Rapidvideo.ws'), + (r'thevideobee\.to', 'TheVideoBee'), + (r'vidto\.me', 'Vidto'), + (r'streamin\.to', 'Streamin.To'), + (r'xvidstage\.com', 'XVIDSTAGE'), + (r'vidabc\.com', 'Vid ABC'), + (r'vidbom\.com', 'VidBom'), + (r'vidlo\.us', 'vidlo'), ) IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1]) _VALID_URL = (r'https?://(?P<host>(?:www\.)?(?:%s))/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' - % '|'.join(re.escape(site) for site in list(zip(*_SITES))[0])) + % '|'.join(site for site in list(zip(*_SITES))[0])) _FILE_NOT_FOUND_REGEXES = ( r'>(?:404 - )?File Not Found<', @@ -95,6 +99,16 @@ class XFileShareIE(InfoExtractor): # removed by administrator 'url': 'http://xvidstage.com/amfy7atlkx25', 'only_matching': True, + }, { + 'url': 'http://vidabc.com/i8ybqscrphfv', + 'info_dict': { + 'id': 'i8ybqscrphfv', + 'ext': 'mp4', + 'title': 're:Beauty and the Beast 2017', + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): @@ -133,31 +147,45 @@ class XFileShareIE(InfoExtractor): webpage, 'title', default=None) or self._og_search_title( webpage, default=None) or video_id).strip() - def extract_video_url(default=NO_DEFAULT): - return self._search_regex( - (r'file\s*:\s*(["\'])(?P<url>http.+?)\1,', - r'file_link\s*=\s*(["\'])(?P<url>http.+?)\1', - r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http.+?)\2\)', - r'<embed[^>]+src=(["\'])(?P<url>http.+?)\1'), - webpage, 'file url', default=default, group='url') - - video_url = extract_video_url(default=None) - - if not video_url: + def extract_formats(default=NO_DEFAULT): + urls = [] + for regex in ( + r'file\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1', + r'file_link\s*=\s*(["\'])(?P<url>http(?:(?!\1).)+)\1', + r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http(?:(?!\2).)+)\2\)', + r'<embed[^>]+src=(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'): + for mobj in re.finditer(regex, webpage): + video_url = mobj.group('url') + if video_url not in urls: + urls.append(video_url) + formats = [] + for video_url in urls: + if determine_ext(video_url) == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', + fatal=False)) + else: + formats.append({ + 'url': video_url, + 'format_id': 'sd', + }) + if not formats and default is not NO_DEFAULT: + return default + self._sort_formats(formats) + return formats + + formats = extract_formats(default=None) + + if not formats: webpage = decode_packed_codes(self._search_regex( r"(}\('(.+)',(\d+),(\d+),'[^']*\b(?:file|embed)\b[^']*'\.split\('\|'\))", webpage, 'packed code')) - video_url = extract_video_url() + formats = extract_formats() thumbnail = self._search_regex( r'image\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'thumbnail', default=None) - formats = [{ - 'format_id': 'sd', - 'url': video_url, - 'quality': 1, - }] - return { 'id': video_id, 'title': title, diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 5584674..bea9b87 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -6,6 +6,7 @@ import re from .common import InfoExtractor from ..utils import ( int_or_none, + js_to_json, orderedSet, parse_duration, sanitized_Request, @@ -38,6 +39,22 @@ class XTubeIE(InfoExtractor): 'age_limit': 18, } }, { + # FLV videos with duplicated formats + 'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752', + 'md5': 'a406963eb349dd43692ec54631efd88b', + 'info_dict': { + 'id': '9299752', + 'display_id': 'A-Super-Run-Part-1-YT', + 'ext': 'flv', + 'title': 'A Super Run - Part 1 (YT)', + 'description': 'md5:ca0d47afff4a9b2942e4b41aa970fd93', + 'uploader': 'tshirtguy59', + 'duration': 579, + 'view_count': int, + 'comment_count': int, + 'age_limit': 18, + }, + }, { # new URL schema 'url': 'http://www.xtube.com/video-watch/strange-erotica-625837', 'only_matching': True, @@ -68,8 +85,9 @@ class XTubeIE(InfoExtractor): }) sources = self._parse_json(self._search_regex( - r'(["\'])sources\1\s*:\s*(?P<sources>{.+?}),', - webpage, 'sources', group='sources'), video_id) + r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),', + webpage, 'sources', group='sources'), video_id, + transform_source=js_to_json) formats = [] for format_id, format_url in sources.items(): @@ -78,6 +96,7 @@ class XTubeIE(InfoExtractor): 'format_id': format_id, 'height': int_or_none(format_id), }) + self._remove_duplicate_formats(formats) self._sort_formats(formats) title = self._search_regex( diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 30825da..eca6030 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -6,8 +6,10 @@ from .common import InfoExtractor from ..compat import compat_urllib_parse_unquote from ..utils import ( clean_html, - ExtractorError, determine_ext, + ExtractorError, + int_or_none, + parse_duration, ) @@ -20,6 +22,7 @@ class XVideosIE(InfoExtractor): 'id': '4588838', 'ext': 'mp4', 'title': 'Biker Takes his Girl', + 'duration': 108, 'age_limit': 18, } } @@ -36,6 +39,11 @@ class XVideosIE(InfoExtractor): r'<title>(.*?)\s+-\s+XVID', webpage, 'title') video_thumbnail = self._search_regex( r'url_bigthumb=(.+?)&', webpage, 'thumbnail', fatal=False) + video_duration = int_or_none(self._og_search_property( + 'duration', webpage, default=None)) or parse_duration( + self._search_regex( + r'<span[^>]+class=["\']duration["\'][^>]*>.*?(\d[^<]+)', + webpage, 'duration', fatal=False)) formats = [] @@ -67,6 +75,7 @@ class XVideosIE(InfoExtractor): 'id': video_id, 'formats': formats, 'title': video_title, + 'duration': video_duration, 'thumbnail': video_thumbnail, 'age_limit': 18, } diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 4951414..38f82bf 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -258,7 +258,7 @@ class YahooIE(InfoExtractor): return self.url_result(bc_url, BrightcoveLegacyIE.ie_key()) # Look for Brightcove New Studio embeds - bc_url = BrightcoveNewIE._extract_url(webpage) + bc_url = BrightcoveNewIE._extract_url(self, webpage) if bc_url: return self.url_result(bc_url, BrightcoveNewIE.ie_key()) diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index fd6268b..eb10621 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -234,7 +234,8 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE): 'overembed': 'false', })['playlist'] - tracks, track_ids = playlist['tracks'], map(compat_str, playlist['trackIds']) + tracks = playlist['tracks'] + track_ids = [compat_str(track_id) for track_id in playlist['trackIds']] # tracks dictionary shipped with playlist.jsx API is limited to 150 tracks, # missing tracks should be retrieved manually. diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index e37f237..73ebe57 100644 --- a/youtube_dl/extractor/youku.py +++ b/youtube_dl/extractor/youku.py @@ -10,12 +10,14 @@ import time from .common import InfoExtractor from ..compat import ( - compat_urllib_parse_urlencode, compat_ord, + compat_str, + compat_urllib_parse_urlencode, ) from ..utils import ( ExtractorError, get_element_by_attribute, + try_get, ) @@ -105,7 +107,9 @@ class YoukuIE(InfoExtractor): if stream.get('channel_type') == 'tail': continue format = stream.get('stream_type') - fileid = stream['stream_fileid'] + fileid = try_get( + stream, lambda x: x['segs'][0]['fileid'], + compat_str) or stream['stream_fileid'] fileid_dict[format] = fileid def get_fileid(format, n): diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index ca40de5..44a3928 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -38,7 +38,6 @@ from ..utils import ( parse_duration, remove_quotes, remove_start, - sanitized_Request, smuggle_url, str_to_int, try_get, @@ -54,7 +53,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' - _PASSWORD_CHALLENGE_URL = 'https://accounts.google.com/signin/challenge/sl/password' + + _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup' + _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge' + _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}' + _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False @@ -96,72 +99,150 @@ class YoutubeBaseInfoExtractor(InfoExtractor): login_form = self._hidden_inputs(login_page) - login_form.update({ - 'checkConnection': 'youtube', - 'Email': username, - 'Passwd': password, - }) + def req(url, f_req, note, errnote): + data = login_form.copy() + data.update({ + 'pstMsg': 1, + 'checkConnection': 'youtube', + 'checkedDomains': 'youtube', + 'hl': 'en', + 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]', + 'f.req': json.dumps(f_req), + 'flowName': 'GlifWebSignIn', + 'flowEntry': 'ServiceLogin', + }) + return self._download_json( + url, None, note=note, errnote=errnote, + transform_source=lambda s: re.sub(r'^[^[]*', '', s), + fatal=False, + data=urlencode_postdata(data), headers={ + 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8', + 'Google-Accounts-XSRF': 1, + }) - login_results = self._download_webpage( - self._PASSWORD_CHALLENGE_URL, None, - note='Logging in', errnote='unable to log in', fatal=False, - data=urlencode_postdata(login_form)) - if login_results is False: - return False + def warn(message): + self._downloader.report_warning(message) + + lookup_req = [ + username, + None, [], None, 'US', None, None, 2, False, True, + [ + None, None, + [2, 1, None, 1, + 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', + None, [], 4], + 1, [None, None, []], None, None, None, True + ], + username, + ] - error_msg = self._html_search_regex( - r'<[^>]+id="errormsg_0_Passwd"[^>]*>([^<]+)<', - login_results, 'error message', default=None) - if error_msg: - raise ExtractorError('Unable to login: %s' % error_msg, expected=True) + lookup_results = req( + self._LOOKUP_URL, lookup_req, + 'Looking up account info', 'Unable to look up account info') - if re.search(r'id="errormsg_0_Passwd"', login_results) is not None: - raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True) + if lookup_results is False: + return False - # Two-Factor - # TODO add SMS and phone call support - these require making a request and then prompting the user + user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str) + if not user_hash: + warn('Unable to extract user hash') + return False - if re.search(r'(?i)<form[^>]+id="challenge"', login_results) is not None: - tfa_code = self._get_tfa_info('2-step verification code') + challenge_req = [ + user_hash, + None, 1, None, [1, None, None, None, [password, None, True]], + [ + None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4], + 1, [None, None, []], None, None, None, True + ]] - if not tfa_code: - self._downloader.report_warning( - 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>' - '(Note that only TOTP (Google Authenticator App) codes work at this time.)') - return False + challenge_results = req( + self._CHALLENGE_URL, challenge_req, + 'Logging in', 'Unable to log in') - tfa_code = remove_start(tfa_code, 'G-') + if challenge_results is False: + return - tfa_form_strs = self._form_hidden_inputs('challenge', login_results) + login_res = try_get(challenge_results, lambda x: x[0][5], list) + if login_res: + login_msg = try_get(login_res, lambda x: x[5], compat_str) + warn( + 'Unable to login: %s' % 'Invalid password' + if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg) + return False - tfa_form_strs.update({ - 'Pin': tfa_code, - 'TrustDevice': 'on', - }) + res = try_get(challenge_results, lambda x: x[0][-1], list) + if not res: + warn('Unable to extract result entry') + return False - tfa_data = urlencode_postdata(tfa_form_strs) + tfa = try_get(res, lambda x: x[0][0], list) + if tfa: + tfa_str = try_get(tfa, lambda x: x[2], compat_str) + if tfa_str == 'TWO_STEP_VERIFICATION': + # SEND_SUCCESS - TFA code has been successfully sent to phone + # QUOTA_EXCEEDED - reached the limit of TFA codes + status = try_get(tfa, lambda x: x[5], compat_str) + if status == 'QUOTA_EXCEEDED': + warn('Exceeded the limit of TFA codes, try later') + return False + + tl = try_get(challenge_results, lambda x: x[1][2], compat_str) + if not tl: + warn('Unable to extract TL') + return False + + tfa_code = self._get_tfa_info('2-step verification code') + + if not tfa_code: + warn( + 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>' + '(Note that only TOTP (Google Authenticator App) codes work at this time.)') + return False + + tfa_code = remove_start(tfa_code, 'G-') + + tfa_req = [ + user_hash, None, 2, None, + [ + 9, None, None, None, None, None, None, None, + [None, tfa_code, True, 2] + ]] + + tfa_results = req( + self._TFA_URL.format(tl), tfa_req, + 'Submitting TFA code', 'Unable to submit TFA code') + + if tfa_results is False: + return False + + tfa_res = try_get(tfa_results, lambda x: x[0][5], list) + if tfa_res: + tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str) + warn( + 'Unable to finish TFA: %s' % 'Invalid TFA code' + if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg) + return False + + check_cookie_url = try_get( + tfa_results, lambda x: x[0][-1][2], compat_str) + else: + check_cookie_url = try_get(res, lambda x: x[2], compat_str) - tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data) - tfa_results = self._download_webpage( - tfa_req, None, - note='Submitting TFA code', errnote='unable to submit tfa', fatal=False) + if not check_cookie_url: + warn('Unable to extract CheckCookie URL') + return False - if tfa_results is False: - return False + check_cookie_results = self._download_webpage( + check_cookie_url, None, 'Checking cookie', fatal=False) - if re.search(r'(?i)<form[^>]+id="challenge"', tfa_results) is not None: - self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.') - return False - if re.search(r'(?i)<form[^>]+id="gaia_loginform"', tfa_results) is not None: - self._downloader.report_warning('unable to log in - did the page structure change?') - return False - if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None: - self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.') - return False + if check_cookie_results is False: + return False - if re.search(r'(?i)<form[^>]+id="gaia_loginform"', login_results) is not None: - self._downloader.report_warning('unable to log in: bad username or password') + if 'https://myaccount.google.com/' not in check_cookie_results: + warn('Unable to log in') return False + return True def _real_initialize(self): @@ -317,60 +398,60 @@ class YoutubeIE(YoutubeBaseInfoExtractor): '151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10}, # DASH mp4 video - '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559) - '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, - '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40}, - '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40}, - '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, + '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559) + '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'}, + '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60}, + '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'}, # Dash mp4 audio - '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'}, - '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'}, - '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'}, - '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'}, - '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'}, - '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'preference': -50, 'container': 'm4a_dash'}, - '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'preference': -50, 'container': 'm4a_dash'}, + '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'}, + '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'}, + '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'}, + '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'}, + '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'}, + '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'}, # Dash webm - '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, - '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, - '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, - '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, - '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, - '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40}, - '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9', 'preference': -40}, - '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, + '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'}, + '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'}, + '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'}, # itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug) - '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, - '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, - '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, - '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40}, - '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40}, + '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, + '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'}, + '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60}, # Dash webm audio - '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50}, - '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50}, + '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128}, + '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256}, # Dash webm audio with opus inside - '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50}, - '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50}, - '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50}, + '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50}, + '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70}, + '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160}, # RTMP (unnamed) '_rtmp': {'protocol': 'rtmp'}, @@ -963,7 +1044,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): def _extract_signature_function(self, video_id, player_url, example_sig): id_m = re.match( - r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$', + r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$', player_url) if not id_m: raise ExtractorError('Cannot identify player %r' % player_url) @@ -1253,25 +1334,39 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_id = mobj.group(2) return video_id - def _extract_from_m3u8(self, manifest_url, video_id): - url_map = {} - - def _get_urls(_manifest): - lines = _manifest.split('\n') - urls = filter(lambda l: l and not l.startswith('#'), - lines) - return urls - manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest') - formats_urls = _get_urls(manifest) - for format_url in formats_urls: - itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag') - url_map[itag] = format_url - return url_map - def _extract_annotations(self, video_id): url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.') + @staticmethod + def _extract_chapters(description, duration): + if not description: + return None + chapter_lines = re.findall( + r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)', + description) + if not chapter_lines: + return None + chapters = [] + for next_num, (chapter_line, time_point) in enumerate( + chapter_lines, start=1): + start_time = parse_duration(time_point) + if start_time is None: + continue + end_time = (duration if next_num == len(chapter_lines) + else parse_duration(chapter_lines[next_num][1])) + if end_time is None: + continue + chapter_title = re.sub( + r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-') + chapter_title = re.sub(r'\s+', ' ', chapter_title) + chapters.append({ + 'start_time': start_time, + 'end_time': end_time, + 'title': chapter_title, + }) + return chapters + def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -1414,9 +1509,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): video_title = '_' # description - video_description = get_element_by_id("eow-description", video_webpage) + description_original = video_description = get_element_by_id("eow-description", video_webpage) if video_description: - video_description = re.sub(r'''(?x) + description_original = video_description = re.sub(r'''(?x) <a\s+ (?:[a-zA-Z-]+="[^"]*"\s+)*? (?:title|href)="([^"]+)"\s+ @@ -1573,18 +1668,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): if self._downloader.params.get('writeannotations', False): video_annotations = self._extract_annotations(video_id) - def _map_to_format_list(urlmap): - formats = [] - for itag, video_real_url in urlmap.items(): - dct = { - 'format_id': itag, - 'url': video_real_url, - 'player_url': player_url, - } - if itag in self._formats: - dct.update(self._formats[itag]) - formats.append(dct) - return formats + chapters = self._extract_chapters(description_original, video_duration) if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'): self.report_rtmp_download() @@ -1657,7 +1741,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor): player_desc = 'flash player %s' % player_version else: player_version = self._search_regex( - [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'], + [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', + r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'], player_url, 'html5 player', fatal=False) player_desc = 'html5 player %s' % player_version @@ -1718,11 +1803,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor): formats.append(dct) elif video_info.get('hlsvp'): manifest_url = video_info['hlsvp'][0] - url_map = self._extract_from_m3u8(manifest_url, video_id) - formats = _map_to_format_list(url_map) - # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming - for a_format in formats: + formats = [] + m3u8_formats = self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', fatal=False) + for a_format in m3u8_formats: + itag = self._search_regex( + r'/itag/(\d+)/', a_format['url'], 'itag', default=None) + if itag: + a_format['format_id'] = itag + if itag in self._formats: + dct = self._formats[itag].copy() + dct.update(a_format) + a_format = dct + a_format['player_url'] = player_url + # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True' + formats.append(a_format) else: unavailable_message = self._html_search_regex( r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>', @@ -1806,6 +1902,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): 'duration': video_duration, 'age_limit': 18 if age_gate else 0, 'annotations': video_annotations, + 'chapters': chapters, 'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id, 'view_count': view_count, 'like_count': like_count, diff --git a/youtube_dl/extractor/zaq1.py b/youtube_dl/extractor/zaq1.py new file mode 100644 index 0000000..889aff5 --- /dev/null +++ b/youtube_dl/extractor/zaq1.py @@ -0,0 +1,101 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + unified_timestamp, +) + + +class Zaq1IE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?zaq1\.pl/video/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://zaq1.pl/video/xev0e', + 'md5': '24a5eb3f052e604ae597c4d0d19b351e', + 'info_dict': { + 'id': 'xev0e', + 'title': 'DJ NA WESELE. TANIEC Z FIGURAMI.węgrów/sokołów podlaski/siedlce/mińsk mazowiecki/warszawa', + 'description': 'www.facebook.com/weseledjKontakt: 728 448 199 / 505 419 147', + 'ext': 'mp4', + 'duration': 511, + 'timestamp': 1490896361, + 'uploader': 'Anonim', + 'upload_date': '20170330', + 'view_count': int, + } + }, { + # malformed JSON-LD + 'url': 'http://zaq1.pl/video/x81vn', + 'info_dict': { + 'id': 'x81vn', + 'title': 'SEKRETNE ŻYCIE WALTERA MITTY', + 'ext': 'mp4', + 'duration': 6234, + 'timestamp': 1493494860, + 'uploader': 'Anonim', + 'upload_date': '20170429', + 'view_count': int, + }, + 'params': { + 'skip_download': True, + }, + 'expected_warnings': ['Failed to parse JSON'], + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_url = self._search_regex( + r'data-video-url=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage, + 'video url', group='url') + + info = self._search_json_ld(webpage, video_id, fatal=False) + + def extract_data(field, name, fatal=False): + return self._search_regex( + r'data-%s=(["\'])(?P<field>(?:(?!\1).)+)\1' % field, + webpage, field, fatal=fatal, group='field') + + if not info.get('title'): + info['title'] = extract_data('file-name', 'title', fatal=True) + + if not info.get('duration'): + info['duration'] = int_or_none(extract_data('duration', 'duration')) + + if not info.get('thumbnail'): + info['thumbnail'] = extract_data('photo-url', 'thumbnail') + + if not info.get('timestamp'): + info['timestamp'] = unified_timestamp(self._html_search_meta( + 'uploadDate', webpage, 'timestamp')) + + if not info.get('interactionCount'): + info['view_count'] = int_or_none(self._html_search_meta( + 'interactionCount', webpage, 'view count')) + + uploader = self._html_search_regex( + r'Wideo dodał:\s*<a[^>]*>([^<]+)</a>', webpage, 'uploader', + fatal=False) + + width = int_or_none(self._html_search_meta( + 'width', webpage, fatal=False)) + height = int_or_none(self._html_search_meta( + 'height', webpage, fatal=False)) + + info.update({ + 'id': video_id, + 'formats': [{ + 'url': video_url, + 'width': width, + 'height': height, + 'http_headers': { + 'Referer': url, + }, + }], + 'uploader': uploader, + }) + + return info |