diff options
author | Rogério Brito <rbrito@ime.usp.br> | 2016-08-17 09:19:41 -0300 |
---|---|---|
committer | Rogério Brito <rbrito@ime.usp.br> | 2016-08-17 09:19:41 -0300 |
commit | ced7488f6d3a519b2c1b1cbd31048743fb8285bd (patch) | |
tree | 868396b5d0031b626ea3e2ef822dad6430d70c67 /youtube_dl | |
parent | 9dc487f48b50767cf540fa36c3de2c386fd74c04 (diff) | |
download | youtube-dl-ced7488f6d3a519b2c1b1cbd31048743fb8285bd.zip youtube-dl-ced7488f6d3a519b2c1b1cbd31048743fb8285bd.tar.gz youtube-dl-ced7488f6d3a519b2c1b1cbd31048743fb8285bd.tar.bz2 |
Imported Upstream version 2016.08.17
Diffstat (limited to 'youtube_dl')
203 files changed, 8422 insertions, 4148 deletions
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py index 5036289..e844dc9 100755 --- a/youtube_dl/YoutubeDL.py +++ b/youtube_dl/YoutubeDL.py @@ -5,6 +5,7 @@ from __future__ import absolute_import, unicode_literals import collections import contextlib +import copy import datetime import errno import fileinput @@ -196,8 +197,8 @@ class YoutubeDL(object): prefer_insecure: Use HTTP instead of HTTPS to retrieve information. At the moment, this is only supported by YouTube. proxy: URL of the proxy server to use - cn_verification_proxy: URL of the proxy to use for IP address verification - on Chinese sites. (Experimental) + geo_verification_proxy: URL of the proxy to use for IP address verification + on geo-restricted sites. (Experimental) socket_timeout: Time to wait for unresponsive hosts, in seconds bidi_workaround: Work around buggy terminals without bidirectional text support, using fridibi @@ -248,7 +249,16 @@ class YoutubeDL(object): source_address: (Experimental) Client-side IP address to bind to. call_home: Boolean, true iff we are allowed to contact the youtube-dl servers for debugging. - sleep_interval: Number of seconds to sleep before each download. + sleep_interval: Number of seconds to sleep before each download when + used alone or a lower bound of a range for randomized + sleep before each download (minimum possible number + of seconds to sleep) when used along with + max_sleep_interval. + max_sleep_interval:Upper bound of a range for randomized sleep before each + download (maximum possible number of seconds to sleep). + Must only be used along with sleep_interval. + Actual sleep time will be a random float from range + [sleep_interval; max_sleep_interval]. listformats: Print an overview of available video formats and exit. list_thumbnails: Print a table of all thumbnails and exit. 
match_filter: A function that gets called with the info_dict of @@ -304,6 +314,11 @@ class YoutubeDL(object): self.params.update(params) self.cache = Cache(self) + if self.params.get('cn_verification_proxy') is not None: + self.report_warning('--cn-verification-proxy is deprecated. Use --geo-verification-proxy instead.') + if self.params.get('geo_verification_proxy') is None: + self.params['geo_verification_proxy'] = self.params['cn_verification_proxy'] + if params.get('bidi_workaround', False): try: import pty @@ -1046,9 +1061,9 @@ class YoutubeDL(object): if isinstance(selector, list): fs = [_build_selector_function(s) for s in selector] - def selector_function(formats): + def selector_function(ctx): for f in fs: - for format in f(formats): + for format in f(ctx): yield format return selector_function elif selector.type == GROUP: @@ -1056,17 +1071,17 @@ class YoutubeDL(object): elif selector.type == PICKFIRST: fs = [_build_selector_function(s) for s in selector.selector] - def selector_function(formats): + def selector_function(ctx): for f in fs: - picked_formats = list(f(formats)) + picked_formats = list(f(ctx)) if picked_formats: return picked_formats return [] elif selector.type == SINGLE: format_spec = selector.selector - def selector_function(formats): - formats = list(formats) + def selector_function(ctx): + formats = list(ctx['formats']) if not formats: return if format_spec == 'all': @@ -1079,9 +1094,10 @@ class YoutubeDL(object): if f.get('vcodec') != 'none' and f.get('acodec') != 'none'] if audiovideo_formats: yield audiovideo_formats[format_idx] - # for audio only (soundcloud) or video only (imgur) urls, select the best/worst audio format - elif (all(f.get('acodec') != 'none' for f in formats) or - all(f.get('vcodec') != 'none' for f in formats)): + # for extractors with incomplete formats (audio only (soundcloud) + # or video only (imgur)) we will fallback to best/worst + # {video,audio}-only format + elif ctx['incomplete_formats']: yield 
formats[format_idx] elif format_spec == 'bestaudio': audio_formats = [ @@ -1155,17 +1171,18 @@ class YoutubeDL(object): } video_selector, audio_selector = map(_build_selector_function, selector.selector) - def selector_function(formats): - formats = list(formats) - for pair in itertools.product(video_selector(formats), audio_selector(formats)): + def selector_function(ctx): + for pair in itertools.product( + video_selector(copy.deepcopy(ctx)), audio_selector(copy.deepcopy(ctx))): yield _merge(pair) filters = [self._build_format_filter(f) for f in selector.filters] - def final_selector(formats): + def final_selector(ctx): + ctx_copy = copy.deepcopy(ctx) for _filter in filters: - formats = list(filter(_filter, formats)) - return selector_function(formats) + ctx_copy['formats'] = list(filter(_filter, ctx_copy['formats'])) + return selector_function(ctx_copy) return final_selector stream = io.BytesIO(format_spec.encode('utf-8')) @@ -1372,7 +1389,34 @@ class YoutubeDL(object): req_format_list.append('best') req_format = '/'.join(req_format_list) format_selector = self.build_format_selector(req_format) - formats_to_download = list(format_selector(formats)) + + # While in format selection we may need to have an access to the original + # format set in order to calculate some metrics or do some processing. + # For now we need to be able to guess whether original formats provided + # by extractor are incomplete or not (i.e. whether extractor provides only + # video-only or audio-only formats) for proper formats selection for + # extractors with such incomplete formats (see + # https://github.com/rg3/youtube-dl/pull/5556). + # Since formats may be filtered during format selection and may not match + # the original formats the results may be incorrect. Thus original formats + # or pre-calculated metrics should be passed to format selection routines + # as well. + # We will pass a context object containing all necessary additional data + # instead of just formats. 
+ # This fixes incorrect format selection issue (see + # https://github.com/rg3/youtube-dl/issues/10083). + incomplete_formats = ( + # All formats are video-only or + all(f.get('vcodec') != 'none' and f.get('acodec') == 'none' for f in formats) or + # all formats are audio-only + all(f.get('vcodec') == 'none' and f.get('acodec') != 'none' for f in formats)) + + ctx = { + 'formats': formats, + 'incomplete_formats': incomplete_formats, + } + + formats_to_download = list(format_selector(ctx)) if not formats_to_download: raise ExtractorError('requested format not available', expected=True) @@ -1559,7 +1603,9 @@ class YoutubeDL(object): self.to_screen('[info] Video subtitle %s.%s is already_present' % (sub_lang, sub_format)) else: self.to_screen('[info] Writing video subtitles to: ' + sub_filename) - with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8') as subfile: + # Use newline='' to prevent conversion of newline characters + # See https://github.com/rg3/youtube-dl/issues/10268 + with io.open(encodeFilename(sub_filename), 'w', encoding='utf-8', newline='') as subfile: subfile.write(sub_data) except (OSError, IOError): self.report_error('Cannot write subtitles file ' + sub_filename) diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py index 4905674..a973029 100644 --- a/youtube_dl/__init__.py +++ b/youtube_dl/__init__.py @@ -145,6 +145,16 @@ def _real_main(argv=None): if numeric_limit is None: parser.error('invalid max_filesize specified') opts.max_filesize = numeric_limit + if opts.sleep_interval is not None: + if opts.sleep_interval < 0: + parser.error('sleep interval must be positive or 0') + if opts.max_sleep_interval is not None: + if opts.max_sleep_interval < 0: + parser.error('max sleep interval must be positive or 0') + if opts.max_sleep_interval < opts.sleep_interval: + parser.error('max sleep interval must be greater than or equal to min sleep interval') + else: + opts.max_sleep_interval = opts.sleep_interval def parse_retries(retries): 
if retries in ('inf', 'infinite'): @@ -370,6 +380,7 @@ def _real_main(argv=None): 'source_address': opts.source_address, 'call_home': opts.call_home, 'sleep_interval': opts.sleep_interval, + 'max_sleep_interval': opts.max_sleep_interval, 'external_downloader': opts.external_downloader, 'list_thumbnails': opts.list_thumbnails, 'playlist_items': opts.playlist_items, @@ -382,6 +393,8 @@ def _real_main(argv=None): 'external_downloader_args': external_downloader_args, 'postprocessor_args': postprocessor_args, 'cn_verification_proxy': opts.cn_verification_proxy, + 'geo_verification_proxy': opts.geo_verification_proxy, + } with YoutubeDL(ydl_opts) as ydl: diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py index 67db1c7..b8aaf5a 100644 --- a/youtube_dl/compat.py +++ b/youtube_dl/compat.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import binascii @@ -2594,15 +2595,19 @@ except ImportError: # Python < 3.3 return "'" + s.replace("'", "'\"'\"'") + "'" -if sys.version_info >= (2, 7, 3): +try: + args = shlex.split('中文') + assert (isinstance(args, list) and + isinstance(args[0], compat_str) and + args[0] == '中文') compat_shlex_split = shlex.split -else: +except (AssertionError, UnicodeEncodeError): # Working around shlex issue with unicode strings on some python 2 # versions (see http://bugs.python.org/issue1548891) def compat_shlex_split(s, comments=False, posix=True): if isinstance(s, compat_str): s = s.encode('utf-8') - return shlex.split(s, comments, posix) + return list(map(lambda s: s.decode('utf-8'), shlex.split(s, comments, posix))) def compat_ord(c): diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py index 1dba9f4..8482cbd 100644 --- a/youtube_dl/downloader/common.py +++ b/youtube_dl/downloader/common.py @@ -4,6 +4,7 @@ import os import re import sys import time +import random from ..compat import compat_os_name from ..utils import ( @@ -342,8 +343,11 @@ class FileDownloader(object): }) return True - 
sleep_interval = self.params.get('sleep_interval') - if sleep_interval: + min_sleep_interval = self.params.get('sleep_interval') + if min_sleep_interval: + max_sleep_interval = self.params.get('max_sleep_interval', min_sleep_interval) + print(min_sleep_interval, max_sleep_interval) + sleep_interval = random.uniform(min_sleep_interval, max_sleep_interval) self.to_screen('[download] Sleeping %s seconds...' % sleep_interval) time.sleep(sleep_interval) diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py index fae2450..cf45562 100644 --- a/youtube_dl/downloader/external.py +++ b/youtube_dl/downloader/external.py @@ -96,6 +96,12 @@ class CurlFD(ExternalFD): cmd = [self.exe, '--location', '-o', tmpfilename] for key, val in info_dict['http_headers'].items(): cmd += ['--header', '%s: %s' % (key, val)] + cmd += self._bool_option('--continue-at', 'continuedl', '-', '0') + cmd += self._valueless_option('--silent', 'noprogress') + cmd += self._valueless_option('--verbose', 'verbose') + cmd += self._option('--limit-rate', 'ratelimit') + cmd += self._option('--retry', 'retries') + cmd += self._option('--max-filesize', 'max_filesize') cmd += self._option('--interface', 'source_address') cmd += self._option('--proxy', 'proxy') cmd += self._valueless_option('--insecure', 'nocheckcertificate') @@ -103,6 +109,16 @@ class CurlFD(ExternalFD): cmd += ['--', info_dict['url']] return cmd + def _call_downloader(self, tmpfilename, info_dict): + cmd = [encodeArgument(a) for a in self._make_cmd(tmpfilename, info_dict)] + + self._debug_cmd(cmd) + + # curl writes the progress to stderr so don't capture it. 
+ p = subprocess.Popen(cmd) + p.communicate() + return p.returncode + class AxelFD(ExternalFD): AVAILABLE_OPT = '-V' diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py index 8f88b02..80c21d4 100644 --- a/youtube_dl/downloader/f4m.py +++ b/youtube_dl/downloader/f4m.py @@ -196,6 +196,11 @@ def build_fragments_list(boot_info): first_frag_number = fragment_run_entry_table[0]['first'] fragments_counter = itertools.count(first_frag_number) for segment, fragments_count in segment_run_table['segment_run']: + # In some live HDS streams (for example Rai), `fragments_count` is + # abnormal and causing out-of-memory errors. It's OK to change the + # number of fragments for live streams as they are updated periodically + if fragments_count == 4294967295 and boot_info['live']: + fragments_count = 2 for _ in range(fragments_count): res.append((segment, next(fragments_counter))) @@ -329,7 +334,11 @@ class F4mFD(FragmentFD): base_url = compat_urlparse.urljoin(man_url, media.attrib['url']) bootstrap_node = doc.find(_add_ns('bootstrapInfo')) - boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, base_url) + # From Adobe F4M 3.0 spec: + # The <baseURL> element SHALL be the base URL for all relative + # (HTTP-based) URLs in the manifest. If <baseURL> is not present, said + # URLs should be relative to the location of the containing document. 
+ boot_info, bootstrap_url = self._parse_bootstrap_node(bootstrap_node, man_url) live = boot_info['live'] metadata_node = media.find(_add_ns('metadata')) if metadata_node is not None: diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py index 3b7bb35..8d7971e 100644 --- a/youtube_dl/downloader/hls.py +++ b/youtube_dl/downloader/hls.py @@ -20,6 +20,7 @@ from ..utils import ( encodeFilename, sanitize_open, parse_m3u8_attributes, + update_url_query, ) @@ -82,6 +83,7 @@ class HlsFD(FragmentFD): self._prepare_and_start_frag_download(ctx) + extra_param_to_segment_url = info_dict.get('extra_param_to_segment_url') i = 0 media_sequence = 0 decrypt_info = {'METHOD': 'NONE'} @@ -95,6 +97,8 @@ class HlsFD(FragmentFD): if re.match(r'^https?://', line) else compat_urlparse.urljoin(man_url, line)) frag_filename = '%s-Frag%d' % (ctx['tmpfilename'], i) + if extra_param_to_segment_url: + frag_url = update_url_query(frag_url, extra_param_to_segment_url) success = ctx['dl'].download(frag_filename, {'url': frag_url}) if not success: return False @@ -120,6 +124,8 @@ class HlsFD(FragmentFD): if not re.match(r'^https?://', decrypt_info['URI']): decrypt_info['URI'] = compat_urlparse.urljoin( man_url, decrypt_info['URI']) + if extra_param_to_segment_url: + decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_param_to_segment_url) decrypt_info['KEY'] = self.ydl.urlopen(decrypt_info['URI']).read() elif line.startswith('#EXT-X-MEDIA-SEQUENCE'): media_sequence = int(line[22:]) diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py new file mode 100644 index 0000000..9e3a3e3 --- /dev/null +++ b/youtube_dl/extractor/adobepass.py @@ -0,0 +1,134 @@ +# -*- coding: utf-8 -*- +from __future__ import unicode_literals + +import re +import time +import xml.etree.ElementTree as etree + +from .common import InfoExtractor +from ..utils import ( + unescapeHTML, + urlencode_postdata, + unified_timestamp, +) + + +class AdobePassIE(InfoExtractor): + 
_SERVICE_PROVIDER_TEMPLATE = 'https://sp.auth.adobe.com/adobe-services/%s' + _USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0' + + @staticmethod + def _get_mvpd_resource(provider_id, title, guid, rating): + channel = etree.Element('channel') + channel_title = etree.SubElement(channel, 'title') + channel_title.text = provider_id + item = etree.SubElement(channel, 'item') + resource_title = etree.SubElement(item, 'title') + resource_title.text = title + resource_guid = etree.SubElement(item, 'guid') + resource_guid.text = guid + resource_rating = etree.SubElement(item, 'media:rating') + resource_rating.attrib = {'scheme': 'urn:v-chip'} + resource_rating.text = rating + return '<rss version="2.0" xmlns:media="http://search.yahoo.com/mrss/">' + etree.tostring(channel).decode() + '</rss>' + + def _extract_mvpd_auth(self, url, video_id, requestor_id, resource): + def xml_text(xml_str, tag): + return self._search_regex( + '<%s>(.+?)</%s>' % (tag, tag), xml_str, tag) + + mvpd_headers = { + 'ap_42': 'anonymous', + 'ap_11': 'Linux i686', + 'ap_z': self._USER_AGENT, + 'User-Agent': self._USER_AGENT, + } + + guid = xml_text(resource, 'guid') + requestor_info = self._downloader.cache.load('mvpd', requestor_id) or {} + authn_token = requestor_info.get('authn_token') + if authn_token: + token_expires = unified_timestamp(re.sub(r'[_ ]GMT', '', xml_text(authn_token, 'simpleTokenExpires'))) + if token_expires and token_expires <= int(time.time()): + authn_token = None + requestor_info = {} + if not authn_token: + # TODO add support for other TV Providers + mso_id = 'DTV' + username, password = self._get_netrc_login_info(mso_id) + if not username or not password: + return '' + + def post_form(form_page, note, data={}): + post_url = self._html_search_regex(r'<form[^>]+action=(["\'])(?P<url>.+?)\1', form_page, 'post url', group='url') + return self._download_webpage( + post_url, video_id, note, data=urlencode_postdata(data or 
self._hidden_inputs(form_page)), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) + + provider_redirect_page = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'authenticate/saml', video_id, + 'Downloading Provider Redirect Page', query={ + 'noflash': 'true', + 'mso_id': mso_id, + 'requestor_id': requestor_id, + 'no_iframe': 'false', + 'domain_name': 'adobe.com', + 'redirect_url': url, + }) + provider_login_page = post_form( + provider_redirect_page, 'Downloading Provider Login Page') + mvpd_confirm_page = post_form(provider_login_page, 'Logging in', { + 'username': username, + 'password': password, + }) + post_form(mvpd_confirm_page, 'Confirming Login') + + session = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'session', video_id, + 'Retrieving Session', data=urlencode_postdata({ + '_method': 'GET', + 'requestor_id': requestor_id, + }), headers=mvpd_headers) + if '<pendingLogout' in session: + self._downloader.cache.store('mvpd', requestor_id, {}) + return self._extract_mvpd_auth(url, video_id, requestor_id, resource) + authn_token = unescapeHTML(xml_text(session, 'authnToken')) + requestor_info['authn_token'] = authn_token + self._downloader.cache.store('mvpd', requestor_id, requestor_info) + + authz_token = requestor_info.get(guid) + if not authz_token: + authorize = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'authorize', video_id, + 'Retrieving Authorization Token', data=urlencode_postdata({ + 'resource_id': resource, + 'requestor_id': requestor_id, + 'authentication_token': authn_token, + 'mso_id': xml_text(authn_token, 'simpleTokenMsoID'), + 'userMeta': '1', + }), headers=mvpd_headers) + if '<pendingLogout' in authorize: + self._downloader.cache.store('mvpd', requestor_id, {}) + return self._extract_mvpd_auth(url, video_id, requestor_id, resource) + authz_token = unescapeHTML(xml_text(authorize, 'authzToken')) + requestor_info[guid] = authz_token + self._downloader.cache.store('mvpd', 
requestor_id, requestor_info) + + mvpd_headers.update({ + 'ap_19': xml_text(authn_token, 'simpleSamlNameID'), + 'ap_23': xml_text(authn_token, 'simpleSamlSessionIndex'), + }) + + short_authorize = self._download_webpage( + self._SERVICE_PROVIDER_TEMPLATE % 'shortAuthorize', + video_id, 'Retrieving Media Token', data=urlencode_postdata({ + 'authz_token': authz_token, + 'requestor_id': requestor_id, + 'session_guid': xml_text(authn_token, 'simpleTokenAuthenticationGuid'), + 'hashed_guid': 'false', + }), headers=mvpd_headers) + if '<pendingLogout' in short_authorize: + self._downloader.cache.store('mvpd', requestor_id, {}) + return self._extract_mvpd_auth(url, video_id, requestor_id, resource) + return short_authorize diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py index 8157da2..3f7f8c0 100644 --- a/youtube_dl/extractor/adultswim.py +++ b/youtube_dl/extractor/adultswim.py @@ -83,6 +83,20 @@ class AdultSwimIE(InfoExtractor): # m3u8 download 'skip_download': True, } + }, { + # heroMetadata.trailer + 'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/', + 'info_dict': { + 'id': 'I0LQFQkaSUaFp8PnAWHhoQ', + 'ext': 'mp4', + 'title': 'Decker - Inside Decker: A New Hero', + 'description': 'md5:c916df071d425d62d70c86d4399d3ee0', + 'duration': 249.008, + }, + 'params': { + # m3u8 download + 'skip_download': True, + } }] @staticmethod @@ -133,20 +147,26 @@ class AdultSwimIE(InfoExtractor): if video_info is None: if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path: video_info = bootstrapped_data['slugged_video'] - else: - raise ExtractorError('Unable to find video info') + if not video_info: + video_info = bootstrapped_data.get('heroMetadata', {}).get('trailer').get('video') + if not video_info: + raise ExtractorError('Unable to find video info') show = bootstrapped_data['show'] show_title = show['title'] stream = video_info.get('stream') - clips = [stream] if stream else video_info.get('clips') - if 
not clips: + if stream and stream.get('videoPlaybackID'): + segment_ids = [stream['videoPlaybackID']] + elif video_info.get('clips'): + segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']] + elif video_info.get('videoPlaybackID'): + segment_ids = [video_info['videoPlaybackID']] + else: raise ExtractorError( 'This video is only available via cable service provider subscription that' ' is not currently supported. You may want to use --cookies.' if video_info.get('auth') is True else 'Unable to find stream or clips', expected=True) - segment_ids = [clip['videoPlaybackID'] for clip in clips] episode_id = video_info['id'] episode_title = video_info['title'] diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py index 1bbfe26..6adb6d8 100644 --- a/youtube_dl/extractor/aenetworks.py +++ b/youtube_dl/extractor/aenetworks.py @@ -2,41 +2,33 @@ from __future__ import unicode_literals import re -from .common import InfoExtractor +from .theplatform import ThePlatformIE from ..utils import ( smuggle_url, update_url_query, unescapeHTML, + extract_attributes, + get_element_by_attribute, ) +from ..compat import ( + compat_urlparse, +) + +class AENetworksBaseIE(ThePlatformIE): + _THEPLATFORM_KEY = 'crazyjava' + _THEPLATFORM_SECRET = 's3cr3t' -class AENetworksIE(InfoExtractor): + +class AENetworksIE(AENetworksBaseIE): IE_NAME = 'aenetworks' IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network' - _VALID_URL = r'https?://(?:www\.)?(?:(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?P<type>[^/]+)/(?:[^/]+/)+(?P<id>[^/]+?)(?:$|[?#])' - + _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|aetv|mylifetime)\.com|fyi\.tv)/(?:shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})|movies/(?P<movie_display_id>[^/]+)/full-movie)' _TESTS = [{ - 'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false', - 'info_dict': { - 'id': 'g12m5Gyt3fdR', - 
'ext': 'mp4', - 'title': "Bet You Didn't Know: Valentine's Day", - 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', - 'timestamp': 1375819729, - 'upload_date': '20130806', - 'uploader': 'AENE-NEW', - }, - 'params': { - # m3u8 download - 'skip_download': True, - }, - 'add_ie': ['ThePlatform'], - 'expected_warnings': ['JSON-LD'], - }, { 'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1', 'md5': '8ff93eb073449f151d6b90c0ae1ef0c7', 'info_dict': { - 'id': 'eg47EERs_JsZ', + 'id': '22253814', 'ext': 'mp4', 'title': 'Winter Is Coming', 'description': 'md5:641f424b7a19d8e24f26dea22cf59d74', @@ -46,42 +38,171 @@ class AENetworksIE(InfoExtractor): }, 'add_ie': ['ThePlatform'], }, { - 'url': 'http://www.aetv.com/shows/duck-dynasty/video/inlawful-entry', + 'url': 'http://www.history.com/shows/ancient-aliens/season-1', + 'info_dict': { + 'id': '71889446852', + }, + 'playlist_mincount': 5, + }, { + 'url': 'http://www.mylifetime.com/shows/atlanta-plastic', + 'info_dict': { + 'id': 'SERIES4317', + 'title': 'Atlanta Plastic', + }, + 'playlist_mincount': 2, + }, { + 'url': 'http://www.aetv.com/shows/duck-dynasty/season-9/episode-1', 'only_matching': True }, { - 'url': 'http://www.fyi.tv/shows/tiny-house-nation/videos/207-sq-ft-minnesota-prairie-cottage', + 'url': 'http://www.fyi.tv/shows/tiny-house-nation/season-1/episode-8', 'only_matching': True }, { - 'url': 'http://www.mylifetime.com/shows/project-runway-junior/video/season-1/episode-6/superstar-clients', + 'url': 'http://www.mylifetime.com/shows/project-runway-junior/season-1/episode-6', + 'only_matching': True + }, { + 'url': 'http://www.mylifetime.com/movies/center-stage-on-pointe/full-movie', 'only_matching': True }] + _DOMAIN_TO_REQUESTOR_ID = { + 'history.com': 'HISTORY', + 'aetv.com': 'AETV', + 'mylifetime.com': 'LIFETIME', + 'fyi.tv': 'FYI', + } def _real_extract(self, url): - page_type, video_id = re.match(self._VALID_URL, url).groups() + domain, show_path, movie_display_id = 
re.match(self._VALID_URL, url).groups() + display_id = show_path or movie_display_id + webpage = self._download_webpage(url, display_id) + if show_path: + url_parts = show_path.split('/') + url_parts_len = len(url_parts) + if url_parts_len == 1: + entries = [] + for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage): + entries.append(self.url_result( + compat_urlparse.urljoin(url, season_url_path), 'AENetworks')) + return self.playlist_result( + entries, self._html_search_meta('aetn:SeriesId', webpage), + self._html_search_meta('aetn:SeriesTitle', webpage)) + elif url_parts_len == 2: + entries = [] + for episode_item in re.findall(r'(?s)<div[^>]+class="[^"]*episode-item[^"]*"[^>]*>', webpage): + episode_attributes = extract_attributes(episode_item) + episode_url = compat_urlparse.urljoin( + url, episode_attributes['data-canonical']) + entries.append(self.url_result( + episode_url, 'AENetworks', + episode_attributes['data-videoid'])) + return self.playlist_result( + entries, self._html_search_meta('aetn:SeasonId', webpage)) + + query = { + 'mbr': 'true', + 'assetTypes': 'medium_video_s3' + } + video_id = self._html_search_meta('aetn:VideoID', webpage) + media_url = self._search_regex( + r"media_url\s*=\s*'([^']+)'", webpage, 'video url') + theplatform_metadata = self._download_theplatform_metadata(self._search_regex( + r'https?://link.theplatform.com/s/([^?]+)', media_url, 'theplatform_path'), video_id) + info = self._parse_theplatform_metadata(theplatform_metadata) + if theplatform_metadata.get('AETN$isBehindWall'): + requestor_id = self._DOMAIN_TO_REQUESTOR_ID[domain] + resource = self._get_mvpd_resource( + requestor_id, theplatform_metadata['title'], + theplatform_metadata.get('AETN$PPL_pplProgramId') or theplatform_metadata.get('AETN$PPL_pplProgramId_OLD'), + theplatform_metadata['ratings'][0]['rating']) + query['auth'] = self._extract_mvpd_auth( + url, video_id, requestor_id, resource) + 
info.update(self._search_json_ld(webpage, video_id, fatal=False)) + media_url = update_url_query(media_url, query) + media_url = self._sign_url(media_url, self._THEPLATFORM_KEY, self._THEPLATFORM_SECRET) + formats, subtitles = self._extract_theplatform_smil(media_url, video_id) + self._sort_formats(formats) + info.update({ + 'id': video_id, + 'formats': formats, + 'subtitles': subtitles, + }) + return info - webpage = self._download_webpage(url, video_id) - video_url_re = [ - r'data-href="[^"]*/%s"[^>]+data-release-url="([^"]+)"' % video_id, - r"media_url\s*=\s*'([^']+)'" - ] - video_url = unescapeHTML(self._search_regex(video_url_re, webpage, 'video url')) - query = {'mbr': 'true'} - if page_type == 'shows': - query['assetTypes'] = 'medium_video_s3' - if 'switch=hds' in video_url: - query['switch'] = 'hls' +class HistoryTopicIE(AENetworksBaseIE): + IE_NAME = 'history:topic' + IE_DESC = 'History.com Topic' + _VALID_URL = r'https?://(?:www\.)?history\.com/topics/(?:[^/]+/)?(?P<topic_id>[^/]+)(?:/[^/]+(?:/(?P<video_display_id>[^/?#]+))?)?' 
+ _TESTS = [{ + 'url': 'http://www.history.com/topics/valentines-day/history-of-valentines-day/videos/bet-you-didnt-know-valentines-day?m=528e394da93ae&s=undefined&f=1&free=false', + 'info_dict': { + 'id': '40700995724', + 'ext': 'mp4', + 'title': "Bet You Didn't Know: Valentine's Day", + 'description': 'md5:7b57ea4829b391995b405fa60bd7b5f7', + 'timestamp': 1375819729, + 'upload_date': '20130806', + 'uploader': 'AENE-NEW', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['ThePlatform'], + }, { + 'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/videos', + 'info_dict': + { + 'id': 'world-war-i-history', + 'title': 'World War I History', + }, + 'playlist_mincount': 24, + }, { + 'url': 'http://www.history.com/topics/world-war-i-history/videos', + 'only_matching': True, + }, { + 'url': 'http://www.history.com/topics/world-war-i/world-war-i-history', + 'only_matching': True, + }, { + 'url': 'http://www.history.com/topics/world-war-i/world-war-i-history/speeches', + 'only_matching': True, + }] - info = self._search_json_ld(webpage, video_id, fatal=False) - info.update({ + def theplatform_url_result(self, theplatform_url, video_id, query): + return { '_type': 'url_transparent', + 'id': video_id, 'url': smuggle_url( - update_url_query(video_url, query), + update_url_query(theplatform_url, query), { 'sig': { - 'key': 'crazyjava', - 'secret': 's3cr3t'}, + 'key': self._THEPLATFORM_KEY, + 'secret': self._THEPLATFORM_SECRET, + }, 'force_smil_url': True }), - }) - return info + 'ie_key': 'ThePlatform', + } + + def _real_extract(self, url): + topic_id, video_display_id = re.match(self._VALID_URL, url).groups() + if video_display_id: + webpage = self._download_webpage(url, video_display_id) + release_url, video_id = re.search(r"_videoPlayer.play\('([^']+)'\s*,\s*'[^']+'\s*,\s*'(\d+)'\)", webpage).groups() + release_url = unescapeHTML(release_url) + + return self.theplatform_url_result( + release_url, video_id, { + 'mbr': 'true', + 
'switch': 'hls' + }) + else: + webpage = self._download_webpage(url, topic_id) + entries = [] + for episode_item in re.findall(r'<a.+?data-release-url="[^"]+"[^>]*>', webpage): + video_attributes = extract_attributes(episode_item) + entries.append(self.theplatform_url_result( + video_attributes['data-release-url'], video_attributes['data-id'], { + 'mbr': 'true', + 'switch': 'hls' + })) + return self.playlist_result(entries, topic_id, get_element_by_attribute('class', 'show-title', webpage)) diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py new file mode 100644 index 0000000..c739d2c --- /dev/null +++ b/youtube_dl/extractor/amcnetworks.py @@ -0,0 +1,91 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .theplatform import ThePlatformIE +from ..utils import ( + update_url_query, + parse_age_limit, + int_or_none, +) + + +class AMCNetworksIE(ThePlatformIE): + _VALID_URL = r'https?://(?:www\.)?(?:amc|bbcamerica|ifc|wetv)\.com/(?:movies/|shows/[^/]+/(?:full-episodes/)?season-\d+/episode-\d+(?:-(?:[^/]+/)?|/))(?P<id>[^/?#]+)' + _TESTS = [{ + 'url': 'http://www.ifc.com/shows/maron/season-04/episode-01/step-1', + 'md5': '', + 'info_dict': { + 'id': 's3MX01Nl4vPH', + 'ext': 'mp4', + 'title': 'Maron - Season 4 - Step 1', + 'description': 'In denial about his current situation, Marc is reluctantly convinced by his friends to enter rehab. 
Starring Marc Maron and Constance Zimmer.', + 'age_limit': 17, + 'upload_date': '20160505', + 'timestamp': 1462468831, + 'uploader': 'AMCN', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { + 'url': 'http://www.bbcamerica.com/shows/the-hunt/full-episodes/season-1/episode-01-the-hardest-challenge', + 'only_matching': True, + }, { + 'url': 'http://www.amc.com/shows/preacher/full-episodes/season-01/episode-00/pilot', + 'only_matching': True, + }, { + 'url': 'http://www.wetv.com/shows/million-dollar-matchmaker/season-01/episode-06-the-dumped-dj-and-shallow-hal', + 'only_matching': True, + }, { + 'url': 'http://www.ifc.com/movies/chaos', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + media_url = self._search_regex(r'window\.platformLinkURL\s*=\s*[\'"]([^\'"]+)', webpage, 'media url') + theplatform_metadata = self._download_theplatform_metadata(self._search_regex( + r'https?://link.theplatform.com/s/([^?]+)', media_url, 'theplatform_path'), display_id) + info = self._parse_theplatform_metadata(theplatform_metadata) + video_id = theplatform_metadata['pid'] + title = theplatform_metadata['title'] + rating = theplatform_metadata['ratings'][0]['rating'] + auth_required = self._search_regex(r'window\.authRequired\s*=\s*(true|false);', webpage, 'auth required') + if auth_required == 'true': + requestor_id = self._search_regex(r'window\.requestor_id\s*=\s*[\'"]([^\'"]+)', webpage, 'requestor id') + resource = self._get_mvpd_resource(requestor_id, title, video_id, rating) + query['auth'] = self._extract_mvpd_auth(url, video_id, requestor_id, resource) + media_url = update_url_query(media_url, query) + formats, subtitles = self._extract_theplatform_smil(media_url, video_id) + self._sort_formats(formats) + info.update({ + 'id': video_id, + 'subtitles': subtitles, + 'formats': formats, + 'age_limit': 
parse_age_limit(parse_age_limit(rating)), + }) + ns_keys = theplatform_metadata.get('$xmlns', {}).keys() + if ns_keys: + ns = list(ns_keys)[0] + series = theplatform_metadata.get(ns + '$show') + season_number = int_or_none(theplatform_metadata.get(ns + '$season')) + episode = theplatform_metadata.get(ns + '$episodeTitle') + episode_number = int_or_none(theplatform_metadata.get(ns + '$episode')) + if season_number: + title = 'Season %d - %s' % (season_number, title) + if series: + title = '%s - %s' % (series, title) + info.update({ + 'title': title, + 'series': series, + 'season_number': season_number, + 'episode': episode, + 'episode_number': episode_number, + }) + return info diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py index 8545681..e8e4012 100644 --- a/youtube_dl/extractor/amp.py +++ b/youtube_dl/extractor/amp.py @@ -5,6 +5,8 @@ from .common import InfoExtractor from ..utils import ( int_or_none, parse_iso8601, + mimetype2ext, + determine_ext, ) @@ -50,21 +52,25 @@ class AMPIE(InfoExtractor): if isinstance(media_content, dict): media_content = [media_content] for media_data in media_content: - media = media_data['@attributes'] - media_type = media['type'] - if media_type in ('video/f4m', 'application/f4m+xml'): + media = media_data.get('@attributes', {}) + media_url = media.get('url') + if not media_url: + continue + ext = mimetype2ext(media.get('type')) or determine_ext(media_url) + if ext == 'f4m': formats.extend(self._extract_f4m_formats( - media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', + media_url + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124', video_id, f4m_id='hds', fatal=False)) - elif media_type == 'application/x-mpegURL': + elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False)) + media_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) else: formats.append({ 'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'), 'url': 
media['url'], 'tbr': int_or_none(media.get('bitrate')), 'filesize': int_or_none(media.get('fileSize')), + 'ext': ext, }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py index 9b01e38..9e28f25 100644 --- a/youtube_dl/extractor/animeondemand.py +++ b/youtube_dl/extractor/animeondemand.py @@ -22,6 +22,7 @@ class AnimeOnDemandIE(InfoExtractor): _APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply' _NETRC_MACHINE = 'animeondemand' _TESTS = [{ + # jap, OmU 'url': 'https://www.anime-on-demand.de/anime/161', 'info_dict': { 'id': '161', @@ -30,17 +31,21 @@ class AnimeOnDemandIE(InfoExtractor): }, 'playlist_mincount': 4, }, { - # Film wording is used instead of Episode + # Film wording is used instead of Episode, ger/jap, Dub/OmU 'url': 'https://www.anime-on-demand.de/anime/39', 'only_matching': True, }, { - # Episodes without titles + # Episodes without titles, jap, OmU 'url': 'https://www.anime-on-demand.de/anime/162', 'only_matching': True, }, { # ger/jap, Dub/OmU, account required 'url': 'https://www.anime-on-demand.de/anime/169', 'only_matching': True, + }, { + # Full length film, non-series, ger/jap, Dub/OmU, account required + 'url': 'https://www.anime-on-demand.de/anime/185', + 'only_matching': True, }] def _login(self): @@ -110,35 +115,12 @@ class AnimeOnDemandIE(InfoExtractor): entries = [] - for num, episode_html in enumerate(re.findall( - r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', webpage), 1): - episodebox_title = self._search_regex( - (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1', - r'class="episodebox-title"[^>]+>(?P<title>.+?)<'), - episode_html, 'episodebox title', default=None, group='title') - if not episodebox_title: - continue - - episode_number = int(self._search_regex( - r'(?:Episode|Film)\s*(\d+)', - episodebox_title, 'episode number', default=num)) - episode_title = self._search_regex( - r'(?:Episode|Film)\s*\d+\s*-\s*(.+)', - 
episodebox_title, 'episode title', default=None) - - video_id = 'episode-%d' % episode_number - - common_info = { - 'id': video_id, - 'series': anime_title, - 'episode': episode_title, - 'episode_number': episode_number, - } - + def extract_info(html, video_id, num=None): + title, description = [None] * 2 formats = [] for input_ in re.findall( - r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', episode_html): + r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', html): attributes = extract_attributes(input_) playlist_urls = [] for playlist_key in ('data-playlist', 'data-otherplaylist'): @@ -161,7 +143,7 @@ class AnimeOnDemandIE(InfoExtractor): format_id_list.append(lang) if kind: format_id_list.append(kind) - if not format_id_list: + if not format_id_list and num is not None: format_id_list.append(compat_str(num)) format_id = '-'.join(format_id_list) format_note = ', '.join(filter(None, (kind, lang_note))) @@ -215,28 +197,74 @@ class AnimeOnDemandIE(InfoExtractor): }) formats.extend(file_formats) - if formats: - self._sort_formats(formats) + return { + 'title': title, + 'description': description, + 'formats': formats, + } + + def extract_entries(html, video_id, common_info, num=None): + info = extract_info(html, video_id, num) + + if info['formats']: + self._sort_formats(info['formats']) f = common_info.copy() - f.update({ - 'title': title, - 'description': description, - 'formats': formats, - }) + f.update(info) entries.append(f) - # Extract teaser only when full episode is not available - if not formats: + # Extract teaser/trailer only when full episode is not available + if not info['formats']: m = re.search( - r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>Teaser<', - episode_html) + r'data-dialog-header=(["\'])(?P<title>.+?)\1[^>]+href=(["\'])(?P<href>.+?)\3[^>]*>(?P<kind>Teaser|Trailer)<', + html) if m: f = common_info.copy() f.update({ - 'id': '%s-teaser' % f['id'], + 'id': '%s-%s' % (f['id'], 
m.group('kind').lower()), 'title': m.group('title'), 'url': compat_urlparse.urljoin(url, m.group('href')), }) entries.append(f) + def extract_episodes(html): + for num, episode_html in enumerate(re.findall( + r'(?s)<h3[^>]+class="episodebox-title".+?>Episodeninhalt<', html), 1): + episodebox_title = self._search_regex( + (r'class="episodebox-title"[^>]+title=(["\'])(?P<title>.+?)\1', + r'class="episodebox-title"[^>]+>(?P<title>.+?)<'), + episode_html, 'episodebox title', default=None, group='title') + if not episodebox_title: + continue + + episode_number = int(self._search_regex( + r'(?:Episode|Film)\s*(\d+)', + episodebox_title, 'episode number', default=num)) + episode_title = self._search_regex( + r'(?:Episode|Film)\s*\d+\s*-\s*(.+)', + episodebox_title, 'episode title', default=None) + + video_id = 'episode-%d' % episode_number + + common_info = { + 'id': video_id, + 'series': anime_title, + 'episode': episode_title, + 'episode_number': episode_number, + } + + extract_entries(episode_html, video_id, common_info) + + def extract_film(html, video_id): + common_info = { + 'id': anime_id, + 'title': anime_title, + 'description': anime_description, + } + extract_entries(html, video_id, common_info) + + extract_episodes(webpage) + + if not entries: + extract_film(webpage, anime_id) + return self.playlist_result(entries, anime_id, anime_title, anime_description) diff --git a/youtube_dl/extractor/aol.py b/youtube_dl/extractor/aol.py index 42c21bf..2cdee33 100644 --- a/youtube_dl/extractor/aol.py +++ b/youtube_dl/extractor/aol.py @@ -123,6 +123,10 @@ class AolFeaturesIE(InfoExtractor): 'title': 'What To Watch - February 17, 2016', }, 'add_ie': ['FiveMin'], + 'params': { + # encrypted m3u8 download + 'skip_download': True, + }, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py index 6342978..025e29a 100644 --- a/youtube_dl/extractor/aparat.py +++ b/youtube_dl/extractor/aparat.py @@ -1,8 +1,6 @@ # coding: utf-8 
from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( ExtractorError, @@ -15,7 +13,7 @@ class AparatIE(InfoExtractor): _TEST = { 'url': 'http://www.aparat.com/v/wP8On', - 'md5': '6714e0af7e0d875c5a39c4dc4ab46ad1', + 'md5': '131aca2e14fe7c4dcb3c4877ba300c89', 'info_dict': { 'id': 'wP8On', 'ext': 'mp4', @@ -31,13 +29,13 @@ class AparatIE(InfoExtractor): # Note: There is an easier-to-parse configuration at # http://www.aparat.com/video/video/config/videohash/%video_id # but the URL in there does not work - embed_url = ('http://www.aparat.com/video/video/embed/videohash/' + - video_id + '/vt/frame') + embed_url = 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id webpage = self._download_webpage(embed_url, video_id) - video_urls = [video_url.replace('\\/', '/') for video_url in re.findall( - r'(?:fileList\[[0-9]+\]\s*=|"file"\s*:)\s*"([^"]+)"', webpage)] - for i, video_url in enumerate(video_urls): + file_list = self._parse_json(self._search_regex( + r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, 'file list'), video_id) + for i, item in enumerate(file_list[0]): + video_url = item['file'] req = HEADRequest(video_url) res = self._request_webpage( req, video_id, note='Testing video URL %d' % i, errnote=False) diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py index 8feb7cb..486dff8 100644 --- a/youtube_dl/extractor/archiveorg.py +++ b/youtube_dl/extractor/archiveorg.py @@ -1,67 +1,65 @@ from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import unified_strdate +from .jwplatform import JWPlatformBaseIE +from ..utils import ( + unified_strdate, + clean_html, +) -class ArchiveOrgIE(InfoExtractor): +class ArchiveOrgIE(JWPlatformBaseIE): IE_NAME = 'archive.org' IE_DESC = 'archive.org videos' - _VALID_URL = r'https?://(?:www\.)?archive\.org/details/(?P<id>[^?/]+)(?:[?].*)?$' + _VALID_URL = 
r'https?://(?:www\.)?archive\.org/(?:details|embed)/(?P<id>[^/?#]+)(?:[?].*)?$' _TESTS = [{ 'url': 'http://archive.org/details/XD300-23_68HighlightsAResearchCntAugHumanIntellect', 'md5': '8af1d4cf447933ed3c7f4871162602db', 'info_dict': { 'id': 'XD300-23_68HighlightsAResearchCntAugHumanIntellect', - 'ext': 'ogv', + 'ext': 'ogg', 'title': '1968 Demo - FJCC Conference Presentation Reel #1', - 'description': 'md5:1780b464abaca9991d8968c877bb53ed', + 'description': 'md5:da45c349df039f1cc8075268eb1b5c25', 'upload_date': '19681210', 'uploader': 'SRI International' } }, { 'url': 'https://archive.org/details/Cops1922', - 'md5': '18f2a19e6d89af8425671da1cf3d4e04', + 'md5': 'bc73c8ab3838b5a8fc6c6651fa7b58ba', 'info_dict': { 'id': 'Cops1922', - 'ext': 'ogv', + 'ext': 'mp4', 'title': 'Buster Keaton\'s "Cops" (1922)', - 'description': 'md5:70f72ee70882f713d4578725461ffcc3', + 'description': 'md5:b4544662605877edd99df22f9620d858', } + }, { + 'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) + webpage = self._download_webpage( + 'http://archive.org/embed/' + video_id, video_id) + jwplayer_playlist = self._parse_json(self._search_regex( + r"(?s)Play\('[^']+'\s*,\s*(\[.+\])\s*,\s*{.*?}\);", + webpage, 'jwplayer playlist'), video_id) + info = self._parse_jwplayer_data( + {'playlist': jwplayer_playlist}, video_id, base_url=url) - json_url = url + ('&' if '?' 
in url else '?') + 'output=json' - data = self._download_json(json_url, video_id) - - def get_optional(data_dict, field): - return data_dict['metadata'].get(field, [None])[0] - - title = get_optional(data, 'title') - description = get_optional(data, 'description') - uploader = get_optional(data, 'creator') - upload_date = unified_strdate(get_optional(data, 'date')) + def get_optional(metadata, field): + return metadata.get(field, [None])[0] - formats = [ - { - 'format': fdata['format'], - 'url': 'http://' + data['server'] + data['dir'] + fn, - 'file_size': int(fdata['size']), - } - for fn, fdata in data['files'].items() - if 'Video' in fdata['format']] - - self._sort_formats(formats) - - return { - '_type': 'video', - 'id': video_id, - 'title': title, - 'formats': formats, - 'description': description, - 'uploader': uploader, - 'upload_date': upload_date, - 'thumbnail': data.get('misc', {}).get('image'), - } + metadata = self._download_json( + 'http://archive.org/details/' + video_id, video_id, query={ + 'output': 'json', + })['metadata'] + info.update({ + 'title': get_optional(metadata, 'title') or info.get('title'), + 'description': clean_html(get_optional(metadata, 'description')), + }) + if info.get('_type') != 'playlist': + info.update({ + 'uploader': get_optional(metadata, 'creator'), + 'upload_date': unified_strdate(get_optional(metadata, 'date')), + }) + return info diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py index fd45b3e..07e67dd 100644 --- a/youtube_dl/extractor/ard.py +++ b/youtube_dl/extractor/ard.py @@ -13,13 +13,14 @@ from ..utils import ( parse_duration, unified_strdate, xpath_text, + update_url_query, ) from ..compat import compat_etree_fromstring class ARDMediathekIE(InfoExtractor): IE_NAME = 'ARD:mediathek' - _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.daserste\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' 
+ _VALID_URL = r'^https?://(?:(?:www\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' _TESTS = [{ 'url': 'http://www.ardmediathek.de/tv/Dokumentation-und-Reportage/Ich-liebe-das-Leben-trotzdem/rbb-Fernsehen/Video?documentId=29582122&bcastId=3822114', @@ -34,6 +35,7 @@ class ARDMediathekIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://www.ardmediathek.de/tv/Tatort/Tatort-Scheinwelten-H%C3%B6rfassung-Video/Das-Erste/Video?documentId=29522730&bcastId=602916', 'md5': 'f4d98b10759ac06c0072bbcd1f0b9e3e', @@ -44,6 +46,7 @@ class ARDMediathekIE(InfoExtractor): 'description': 'md5:196392e79876d0ac94c94e8cdb2875f1', 'duration': 5252, }, + 'skip': 'HTTP Error 404: Not Found', }, { # audio 'url': 'http://www.ardmediathek.de/tv/WDR-H%C3%B6rspiel-Speicher/Tod-eines-Fu%C3%9Fballers/WDR-3/Audio-Podcast?documentId=28488308&bcastId=23074086', @@ -55,9 +58,22 @@ class ARDMediathekIE(InfoExtractor): 'description': 'md5:f6e39f3461f0e1f54bfa48c8875c86ef', 'duration': 3240, }, + 'skip': 'HTTP Error 404: Not Found', }, { 'url': 'http://mediathek.daserste.de/sendungen_a-z/328454_anne-will/22429276_vertrauen-ist-gut-spionieren-ist-besser-geht', 'only_matching': True, + }, { + # audio + 'url': 'http://mediathek.rbb-online.de/radio/Hörspiel/Vor-dem-Fest/kulturradio/Audio?documentId=30796318&topRessort=radio&bcastId=9839158', + 'md5': '4e8f00631aac0395fee17368ac0e9867', + 'info_dict': { + 'id': '30796318', + 'ext': 'mp3', + 'title': 'Vor dem Fest', + 'description': 'md5:c0c1c8048514deaed2a73b3a60eecacb', + 'duration': 3287, + }, + 'skip': 'Video is no longer available', }] def _extract_media_info(self, media_info_url, webpage, video_id): @@ -113,11 +129,14 @@ class ARDMediathekIE(InfoExtractor): continue if ext == 'f4m': formats.extend(self._extract_f4m_formats( - stream_url + '?hdcore=3.1.1&plugin=aasp-3.1.1.69.124', - video_id, preference=-1, 
f4m_id='hds', fatal=False)) + update_url_query(stream_url, { + 'hdcore': '3.1.1', + 'plugin': 'aasp-3.1.1.69.124' + }), + video_id, f4m_id='hds', fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - stream_url, video_id, 'mp4', preference=1, m3u8_id='hls', fatal=False)) + stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) else: if server and server.startswith('rtmp'): f = { @@ -231,7 +250,8 @@ class ARDIE(InfoExtractor): 'title': 'Die Story im Ersten: Mission unter falscher Flagge', 'upload_date': '20140804', 'thumbnail': 're:^https?://.*\.jpg$', - } + }, + 'skip': 'HTTP Error 404: Not Found', } def _real_extract(self, url): diff --git a/youtube_dl/extractor/arkena.py b/youtube_dl/extractor/arkena.py new file mode 100644 index 0000000..d45cae3 --- /dev/null +++ b/youtube_dl/extractor/arkena.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + float_or_none, + int_or_none, + mimetype2ext, + parse_iso8601, + strip_jsonp, +) + + +class ArkenaIE(InfoExtractor): + _VALID_URL = r'https?://play\.arkena\.com/(?:config|embed)/avp/v\d/player/media/(?P<id>[^/]+)/[^/]+/(?P<account_id>\d+)' + _TESTS = [{ + 'url': 'https://play.arkena.com/embed/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411', + 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', + 'info_dict': { + 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', + 'ext': 'mp4', + 'title': 'Big Buck Bunny', + 'description': 'Royalty free test video', + 'timestamp': 1432816365, + 'upload_date': '20150528', + 'is_live': False, + }, + }, { + 'url': 'https://play.arkena.com/config/avp/v2/player/media/b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe/1/129411/?callbackMethod=jQuery1111023664739129262213_1469227693893', + 'only_matching': True, + }, { + 'url': 
'http://play.arkena.com/config/avp/v1/player/media/327336/darkmatter/131064/?callbackMethod=jQuery1111002221189684892677_1469227595972', + 'only_matching': True, + }, { + 'url': 'http://play.arkena.com/embed/avp/v1/player/media/327336/darkmatter/131064/', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + # See https://support.arkena.com/display/PLAY/Ways+to+embed+your+video + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//play\.arkena\.com/embed/avp/.+?)\1', + webpage) + if mobj: + return mobj.group('url') + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + account_id = mobj.group('account_id') + + playlist = self._download_json( + 'https://play.arkena.com/config/avp/v2/player/media/%s/0/%s/?callbackMethod=_' + % (video_id, account_id), + video_id, transform_source=strip_jsonp)['Playlist'][0] + + media_info = playlist['MediaInfo'] + title = media_info['Title'] + media_files = playlist['MediaFiles'] + + is_live = False + formats = [] + for kind_case, kind_formats in media_files.items(): + kind = kind_case.lower() + for f in kind_formats: + f_url = f.get('Url') + if not f_url: + continue + is_live = f.get('Live') == 'true' + exts = (mimetype2ext(f.get('Type')), determine_ext(f_url, None)) + if kind == 'm3u8' or 'm3u8' in exts: + formats.extend(self._extract_m3u8_formats( + f_url, video_id, 'mp4', + entry_protocol='m3u8' if is_live else 'm3u8_native', + m3u8_id=kind, fatal=False, live=is_live)) + elif kind == 'flash' or 'f4m' in exts: + formats.extend(self._extract_f4m_formats( + f_url, video_id, f4m_id=kind, fatal=False)) + elif kind == 'dash' or 'mpd' in exts: + formats.extend(self._extract_mpd_formats( + f_url, video_id, mpd_id=kind, fatal=False)) + elif kind == 'silverlight': + # TODO: process when ism is supported (see + # https://github.com/rg3/youtube-dl/issues/8118) + continue + else: + tbr = float_or_none(f.get('Bitrate'), 1000) + formats.append({ + 'url': 
f_url, + 'format_id': '%s-%d' % (kind, tbr) if tbr else kind, + 'tbr': tbr, + }) + self._sort_formats(formats) + + description = media_info.get('Description') + video_id = media_info.get('VideoId') or video_id + timestamp = parse_iso8601(media_info.get('PublishDate')) + thumbnails = [{ + 'url': thumbnail['Url'], + 'width': int_or_none(thumbnail.get('Size')), + } for thumbnail in (media_info.get('Poster') or []) if thumbnail.get('Url')] + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'timestamp': timestamp, + 'is_live': is_live, + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py index 049f1fa..e0c5c18 100644 --- a/youtube_dl/extractor/arte.py +++ b/youtube_dl/extractor/arte.py @@ -419,6 +419,7 @@ class ArteTVPlaylistIE(ArteTVBaseIE): 'info_dict': { 'id': 'PL-013263', 'title': 'Areva & Uramin', + 'description': 'md5:a1dc0312ce357c262259139cfd48c9bf', }, 'playlist_mincount': 6, }, { diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py index 4b3cd8c..deb9cc1 100644 --- a/youtube_dl/extractor/bbc.py +++ b/youtube_dl/extractor/bbc.py @@ -2,19 +2,23 @@ from __future__ import unicode_literals import re +import itertools from .common import InfoExtractor from ..utils import ( + dict_get, ExtractorError, float_or_none, int_or_none, parse_duration, parse_iso8601, + try_get, unescapeHTML, ) from ..compat import ( compat_etree_fromstring, compat_HTTPError, + compat_urlparse, ) @@ -229,51 +233,6 @@ class BBCCoUkIE(InfoExtractor): asx = self._download_xml(connection.get('href'), programme_id, 'Downloading ASX playlist') return [ref.get('href') for ref in asx.findall('./Entry/ref')] - def _extract_connection(self, connection, programme_id): - formats = [] - kind = connection.get('kind') - protocol = connection.get('protocol') - supplier = connection.get('supplier') - if protocol == 'http': - href = connection.get('href') - transfer_format = 
connection.get('transferFormat') - # ASX playlist - if supplier == 'asx': - for i, ref in enumerate(self._extract_asx_playlist(connection, programme_id)): - formats.append({ - 'url': ref, - 'format_id': 'ref%s_%s' % (i, supplier), - }) - # Skip DASH until supported - elif transfer_format == 'dash': - pass - elif transfer_format == 'hls': - formats.extend(self._extract_m3u8_formats( - href, programme_id, ext='mp4', entry_protocol='m3u8_native', - m3u8_id=supplier, fatal=False)) - # Direct link - else: - formats.append({ - 'url': href, - 'format_id': supplier or kind or protocol, - }) - elif protocol == 'rtmp': - application = connection.get('application', 'ondemand') - auth_string = connection.get('authString') - identifier = connection.get('identifier') - server = connection.get('server') - formats.append({ - 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string), - 'play_path': identifier, - 'app': '%s?%s' % (application, auth_string), - 'page_url': 'http://www.bbc.co.uk', - 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf', - 'rtmp_live': False, - 'ext': 'flv', - 'format_id': supplier, - }) - return formats - def _extract_items(self, playlist): return playlist.findall('./{%s}item' % self._EMP_PLAYLIST_NS) @@ -294,46 +253,6 @@ class BBCCoUkIE(InfoExtractor): def _extract_connections(self, media): return self._findall_ns(media, './{%s}connection') - def _extract_video(self, media, programme_id): - formats = [] - vbr = int_or_none(media.get('bitrate')) - vcodec = media.get('encoding') - service = media.get('service') - width = int_or_none(media.get('width')) - height = int_or_none(media.get('height')) - file_size = int_or_none(media.get('media_file_size')) - for connection in self._extract_connections(media): - conn_formats = self._extract_connection(connection, programme_id) - for format in conn_formats: - format.update({ - 'width': width, - 'height': height, - 'vbr': vbr, - 'vcodec': vcodec, 
- 'filesize': file_size, - }) - if service: - format['format_id'] = '%s_%s' % (service, format['format_id']) - formats.extend(conn_formats) - return formats - - def _extract_audio(self, media, programme_id): - formats = [] - abr = int_or_none(media.get('bitrate')) - acodec = media.get('encoding') - service = media.get('service') - for connection in self._extract_connections(media): - conn_formats = self._extract_connection(connection, programme_id) - for format in conn_formats: - format.update({ - 'format_id': '%s_%s' % (service, format['format_id']), - 'abr': abr, - 'acodec': acodec, - 'vcodec': 'none', - }) - formats.extend(conn_formats) - return formats - def _get_subtitles(self, media, programme_id): subtitles = {} for connection in self._extract_connections(media): @@ -379,13 +298,87 @@ class BBCCoUkIE(InfoExtractor): def _process_media_selector(self, media_selection, programme_id): formats = [] subtitles = None + urls = [] for media in self._extract_medias(media_selection): kind = media.get('kind') - if kind == 'audio': - formats.extend(self._extract_audio(media, programme_id)) - elif kind == 'video': - formats.extend(self._extract_video(media, programme_id)) + if kind in ('video', 'audio'): + bitrate = int_or_none(media.get('bitrate')) + encoding = media.get('encoding') + service = media.get('service') + width = int_or_none(media.get('width')) + height = int_or_none(media.get('height')) + file_size = int_or_none(media.get('media_file_size')) + for connection in self._extract_connections(media): + href = connection.get('href') + if href in urls: + continue + if href: + urls.append(href) + conn_kind = connection.get('kind') + protocol = connection.get('protocol') + supplier = connection.get('supplier') + transfer_format = connection.get('transferFormat') + format_id = supplier or conn_kind or protocol + if service: + format_id = '%s_%s' % (service, format_id) + # ASX playlist + if supplier == 'asx': + for i, ref in 
enumerate(self._extract_asx_playlist(connection, programme_id)): + formats.append({ + 'url': ref, + 'format_id': 'ref%s_%s' % (i, format_id), + }) + elif transfer_format == 'dash': + formats.extend(self._extract_mpd_formats( + href, programme_id, mpd_id=format_id, fatal=False)) + elif transfer_format == 'hls': + formats.extend(self._extract_m3u8_formats( + href, programme_id, ext='mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) + elif transfer_format == 'hds': + formats.extend(self._extract_f4m_formats( + href, programme_id, f4m_id=format_id, fatal=False)) + else: + if not service and not supplier and bitrate: + format_id += '-%d' % bitrate + fmt = { + 'format_id': format_id, + 'filesize': file_size, + } + if kind == 'video': + fmt.update({ + 'width': width, + 'height': height, + 'vbr': bitrate, + 'vcodec': encoding, + }) + else: + fmt.update({ + 'abr': bitrate, + 'acodec': encoding, + 'vcodec': 'none', + }) + if protocol == 'http': + # Direct link + fmt.update({ + 'url': href, + }) + elif protocol == 'rtmp': + application = connection.get('application', 'ondemand') + auth_string = connection.get('authString') + identifier = connection.get('identifier') + server = connection.get('server') + fmt.update({ + 'url': '%s://%s/%s?%s' % (protocol, server, application, auth_string), + 'play_path': identifier, + 'app': '%s?%s' % (application, auth_string), + 'page_url': 'http://www.bbc.co.uk', + 'player_url': 'http://www.bbc.co.uk/emp/releases/iplayer/revisions/617463_618125_4/617463_618125_4_emp.swf', + 'rtmp_live': False, + 'ext': 'flv', + }) + formats.append(fmt) elif kind == 'captions': subtitles = self.extract_subtitles(media, programme_id) return formats, subtitles @@ -590,6 +583,7 @@ class BBCIE(BBCCoUkIE): 'id': '150615_telabyad_kentin_cogu', 'ext': 'mp4', 'title': "YPG: Tel Abyad'ın tamamı kontrolümüzde", + 'description': 'md5:33a4805a855c9baf7115fcbde57e7025', 'timestamp': 1434397334, 'upload_date': '20150615', }, @@ -603,6 +597,7 @@ class 
BBCIE(BBCCoUkIE): 'id': '150619_video_honduras_militares_hospitales_corrupcion_aw', 'ext': 'mp4', 'title': 'Honduras militariza sus hospitales por nuevo escándalo de corrupción', + 'description': 'md5:1525f17448c4ee262b64b8f0c9ce66c8', 'timestamp': 1434713142, 'upload_date': '20150619', }, @@ -653,6 +648,23 @@ class BBCIE(BBCCoUkIE): 'skip_download': True, } }, { + # single video embedded with Morph + 'url': 'http://www.bbc.co.uk/sport/live/olympics/36895975', + 'info_dict': { + 'id': 'p041vhd0', + 'ext': 'mp4', + 'title': "Nigeria v Japan - Men's First Round", + 'description': 'Live coverage of the first round from Group B at the Amazonia Arena.', + 'duration': 7980, + 'uploader': 'BBC Sport', + 'uploader_id': 'bbc_sport', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'skip': 'Georestricted to UK', + }, { # single video with playlist.sxml URL in playlist param 'url': 'http://www.bbc.com/sport/0/football/33653409', 'info_dict': { @@ -749,7 +761,7 @@ class BBCIE(BBCCoUkIE): webpage = self._download_webpage(url, playlist_id) - json_ld_info = self._search_json_ld(webpage, playlist_id, default=None) + json_ld_info = self._search_json_ld(webpage, playlist_id, default={}) timestamp = json_ld_info.get('timestamp') playlist_title = json_ld_info.get('title') @@ -818,8 +830,29 @@ class BBCIE(BBCCoUkIE): # http://www.bbc.com/turkce/multimedya/2015/10/151010_vid_ankara_patlama_ani) playlist = data_playable.get('otherSettings', {}).get('playlist', {}) if playlist: - entries.append(self._extract_from_playlist_sxml( - playlist.get('progressiveDownloadUrl'), playlist_id, timestamp)) + entry = None + for key in ('streaming', 'progressiveDownload'): + playlist_url = playlist.get('%sUrl' % key) + if not playlist_url: + continue + try: + info = self._extract_from_playlist_sxml( + playlist_url, playlist_id, timestamp) + if not entry: + entry = info + else: + entry['title'] = info['title'] + entry['formats'].extend(info['formats']) + except Exception as e: + # 
Some playlist URL may fail with 500, at the same time + # the other one may work fine (e.g. + # http://www.bbc.com/turkce/haberler/2015/06/150615_telabyad_kentin_cogu) + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 500: + continue + raise + if entry: + self._sort_formats(entry['formats']) + entries.append(entry) if entries: return self.playlist_result(entries, playlist_id, playlist_title, playlist_description) @@ -852,6 +885,50 @@ class BBCIE(BBCCoUkIE): 'subtitles': subtitles, } + # Morph based embed (e.g. http://www.bbc.co.uk/sport/live/olympics/36895975) + # There are several setPayload calls may be present but the video + # seems to be always related to the first one + morph_payload = self._parse_json( + self._search_regex( + r'Morph\.setPayload\([^,]+,\s*({.+?})\);', + webpage, 'morph payload', default='{}'), + playlist_id, fatal=False) + if morph_payload: + components = try_get(morph_payload, lambda x: x['body']['components'], list) or [] + for component in components: + if not isinstance(component, dict): + continue + lead_media = try_get(component, lambda x: x['props']['leadMedia'], dict) + if not lead_media: + continue + identifiers = lead_media.get('identifiers') + if not identifiers or not isinstance(identifiers, dict): + continue + programme_id = identifiers.get('vpid') or identifiers.get('playablePid') + if not programme_id: + continue + title = lead_media.get('title') or self._og_search_title(webpage) + formats, subtitles = self._download_media_selector(programme_id) + self._sort_formats(formats) + description = lead_media.get('summary') + uploader = lead_media.get('masterBrand') + uploader_id = lead_media.get('mid') + duration = None + duration_d = lead_media.get('duration') + if isinstance(duration_d, dict): + duration = parse_duration(dict_get( + duration_d, ('rawDuration', 'formattedDuration', 'spokenDuration'))) + return { + 'id': programme_id, + 'title': title, + 'description': description, + 'duration': duration, + 'uploader': 
uploader, + 'uploader_id': uploader_id, + 'formats': formats, + 'subtitles': subtitles, + } + def extract_all(pattern): return list(filter(None, map( lambda s: self._parse_json(s, playlist_id, fatal=False), @@ -869,7 +946,7 @@ class BBCIE(BBCCoUkIE): r'setPlaylist\("(%s)"\)' % EMBED_URL, webpage)) if entries: return self.playlist_result( - [self.url_result(entry, 'BBCCoUk') for entry in entries], + [self.url_result(entry_, 'BBCCoUk') for entry_ in entries], playlist_id, playlist_title, playlist_description) # Multiple video article (e.g. http://www.bbc.com/news/world-europe-32668511) @@ -981,27 +1058,43 @@ class BBCCoUkArticleIE(InfoExtractor): class BBCCoUkPlaylistBaseIE(InfoExtractor): + def _entries(self, webpage, url, playlist_id): + single_page = 'page' in compat_urlparse.parse_qs( + compat_urlparse.urlparse(url).query) + for page_num in itertools.count(2): + for video_id in re.findall( + self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage): + yield self.url_result( + self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) + if single_page: + return + next_page = self._search_regex( + r'<li[^>]+class=(["\'])pagination_+next\1[^>]*><a[^>]+href=(["\'])(?P<url>(?:(?!\2).)+)\2', + webpage, 'next page url', default=None, group='url') + if not next_page: + break + webpage = self._download_webpage( + compat_urlparse.urljoin(url, next_page), playlist_id, + 'Downloading page %d' % page_num, page_num) + def _real_extract(self, url): playlist_id = self._match_id(url) webpage = self._download_webpage(url, playlist_id) - entries = [ - self.url_result(self._URL_TEMPLATE % video_id, BBCCoUkIE.ie_key()) - for video_id in re.findall( - self._VIDEO_ID_TEMPLATE % BBCCoUkIE._ID_REGEX, webpage)] - title, description = self._extract_title_and_description(webpage) - return self.playlist_result(entries, playlist_id, title, description) + return self.playlist_result( + self._entries(webpage, url, playlist_id), + playlist_id, title, description) class 
BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): IE_NAME = 'bbc.co.uk:iplayer:playlist' - _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/episodes/(?P<id>%s)' % BBCCoUkIE._ID_REGEX + _VALID_URL = r'https?://(?:www\.)?bbc\.co\.uk/iplayer/(?:episodes|group)/(?P<id>%s)' % BBCCoUkIE._ID_REGEX _URL_TEMPLATE = 'http://www.bbc.co.uk/iplayer/episode/%s' _VIDEO_ID_TEMPLATE = r'data-ip-id=["\'](%s)' - _TEST = { + _TESTS = [{ 'url': 'http://www.bbc.co.uk/iplayer/episodes/b05rcz9v', 'info_dict': { 'id': 'b05rcz9v', @@ -1009,7 +1102,17 @@ class BBCCoUkIPlayerPlaylistIE(BBCCoUkPlaylistBaseIE): 'description': 'French thriller serial about a missing teenager.', }, 'playlist_mincount': 6, - } + 'skip': 'This programme is not currently available on BBC iPlayer', + }, { + # Available for over a year unlike 30 days for most other programmes + 'url': 'http://www.bbc.co.uk/iplayer/group/p02tcc32', + 'info_dict': { + 'id': 'p02tcc32', + 'title': 'Bohemian Icons', + 'description': 'md5:683e901041b2fe9ba596f2ab04c4dbe7', + }, + 'playlist_mincount': 10, + }] def _extract_title_and_description(self, webpage): title = self._search_regex(r'<h1>([^<]+)</h1>', webpage, 'title', fatal=False) @@ -1033,6 +1136,24 @@ class BBCCoUkPlaylistIE(BBCCoUkPlaylistBaseIE): }, 'playlist_mincount': 7, }, { + # multipage playlist, explicit page + 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips?page=1', + 'info_dict': { + 'id': 'b00mfl7n', + 'title': 'Frozen Planet - Clips - BBC One', + 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c', + }, + 'playlist_mincount': 24, + }, { + # multipage playlist, all pages + 'url': 'http://www.bbc.co.uk/programmes/b00mfl7n/clips', + 'info_dict': { + 'id': 'b00mfl7n', + 'title': 'Frozen Planet - Clips - BBC One', + 'description': 'md5:65dcbf591ae628dafe32aa6c4a4a0d8c', + }, + 'playlist_mincount': 142, + }, { 'url': 'http://www.bbc.co.uk/programmes/b05rcz9v/broadcasts/2016/06', 'only_matching': True, }, { diff --git a/youtube_dl/extractor/bigflix.py 
b/youtube_dl/extractor/bigflix.py index 33762ad..b4ce767 100644 --- a/youtube_dl/extractor/bigflix.py +++ b/youtube_dl/extractor/bigflix.py @@ -11,22 +11,13 @@ from ..compat import compat_urllib_parse_unquote class BigflixIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?bigflix\.com/.+/(?P<id>[0-9]+)' _TESTS = [{ - 'url': 'http://www.bigflix.com/Hindi-movies/Action-movies/Singham-Returns/16537', - 'md5': 'ec76aa9b1129e2e5b301a474e54fab74', - 'info_dict': { - 'id': '16537', - 'ext': 'mp4', - 'title': 'Singham Returns', - 'description': 'md5:3d2ba5815f14911d5cc6a501ae0cf65d', - } - }, { # 2 formats 'url': 'http://www.bigflix.com/Tamil-movies/Drama-movies/Madarasapatinam/16070', 'info_dict': { 'id': '16070', 'ext': 'mp4', 'title': 'Madarasapatinam', - 'description': 'md5:63b9b8ed79189c6f0418c26d9a3452ca', + 'description': 'md5:9f0470b26a4ba8e824c823b5d95c2f6b', 'formats': 'mincount:2', }, 'params': { diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py index b17047b..d8eb718 100644 --- a/youtube_dl/extractor/bilibili.py +++ b/youtube_dl/extractor/bilibili.py @@ -25,13 +25,13 @@ class BiliBiliIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.bilibili.tv/video/av1074402/', - 'md5': '5f7d29e1a2872f3df0cf76b1f87d3788', + 'md5': '9fa226fe2b8a9a4d5a69b4c6a183417e', 'info_dict': { 'id': '1554319', - 'ext': 'flv', + 'ext': 'mp4', 'title': '【金坷垃】金泡沫', 'description': 'md5:ce18c2a2d2193f0df2917d270f2e5923', - 'duration': 308.067, + 'duration': 308.315, 'timestamp': 1398012660, 'upload_date': '20140420', 'thumbnail': 're:^https?://.+\.jpg', @@ -41,73 +41,33 @@ class BiliBiliIE(InfoExtractor): }, { 'url': 'http://www.bilibili.com/video/av1041170/', 'info_dict': { - 'id': '1041170', + 'id': '1507019', + 'ext': 'mp4', 'title': '【BD1080P】刀语【诸神&异域】', 'description': '这是个神奇的故事~每个人不留弹幕不给走哦~切利哦!~', + 'timestamp': 1396530060, + 'upload_date': '20140403', + 'uploader': '枫叶逝去', + 'uploader_id': '520116', }, - 'playlist_count': 9, }, { 'url': 
'http://www.bilibili.com/video/av4808130/', 'info_dict': { - 'id': '4808130', + 'id': '7802182', + 'ext': 'mp4', 'title': '【长篇】哆啦A梦443【钉铛】', 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', + 'timestamp': 1464564180, + 'upload_date': '20160529', + 'uploader': '喜欢拉面', + 'uploader_id': '151066', }, - 'playlist': [{ - 'md5': '55cdadedf3254caaa0d5d27cf20a8f9c', - 'info_dict': { - 'id': '4808130_part1', - 'ext': 'flv', - 'title': '【长篇】哆啦A梦443【钉铛】', - 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', - 'timestamp': 1464564180, - 'upload_date': '20160529', - 'uploader': '喜欢拉面', - 'uploader_id': '151066', - }, - }, { - 'md5': '926f9f67d0c482091872fbd8eca7ea3d', - 'info_dict': { - 'id': '4808130_part2', - 'ext': 'flv', - 'title': '【长篇】哆啦A梦443【钉铛】', - 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', - 'timestamp': 1464564180, - 'upload_date': '20160529', - 'uploader': '喜欢拉面', - 'uploader_id': '151066', - }, - }, { - 'md5': '4b7b225b968402d7c32348c646f1fd83', - 'info_dict': { - 'id': '4808130_part3', - 'ext': 'flv', - 'title': '【长篇】哆啦A梦443【钉铛】', - 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', - 'timestamp': 1464564180, - 'upload_date': '20160529', - 'uploader': '喜欢拉面', - 'uploader_id': '151066', - }, - }, { - 'md5': '7b795e214166501e9141139eea236e91', - 'info_dict': { - 'id': '4808130_part4', - 'ext': 'flv', - 'title': '【长篇】哆啦A梦443【钉铛】', - 'description': '(2016.05.27)来组合客人的脸吧&amp;寻母六千里锭 抱歉,又轮到周日上班现在才到家 封面www.pixiv.net/member_illust.php?mode=medium&amp;illust_id=56912929', - 'timestamp': 1464564180, - 'upload_date': '20160529', - 'uploader': '喜欢拉面', - 'uploader_id': '151066', - }, - }], }, { # Missing upload time 'url': 
'http://www.bilibili.com/video/av1867637/', 'info_dict': { 'id': '2880301', - 'ext': 'flv', + 'ext': 'mp4', 'title': '【HDTV】【喜剧】岳父岳母真难当 (2014)【法国票房冠军】', 'description': '一个信奉天主教的法国旧式传统资产阶级家庭中有四个女儿。三个女儿却分别找了阿拉伯、犹太、中国丈夫,老夫老妻唯独期盼剩下未嫁的小女儿能找一个信奉天主教的法国白人,结果没想到小女儿找了一位非裔黑人……【这次应该不会跳帧了】', 'uploader': '黑夜为猫', diff --git a/youtube_dl/extractor/biobiochiletv.py b/youtube_dl/extractor/biobiochiletv.py index 1332281..7608c0a 100644 --- a/youtube_dl/extractor/biobiochiletv.py +++ b/youtube_dl/extractor/biobiochiletv.py @@ -2,11 +2,15 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import remove_end +from ..utils import ( + ExtractorError, + remove_end, +) +from .rudo import RudoIE class BioBioChileTVIE(InfoExtractor): - _VALID_URL = r'https?://tv\.biobiochile\.cl/notas/(?:[^/]+/)+(?P<id>[^/]+)\.shtml' + _VALID_URL = r'https?://(?:tv|www)\.biobiochile\.cl/(?:notas|noticias)/(?:[^/]+/)+(?P<id>[^/]+)\.shtml' _TESTS = [{ 'url': 'http://tv.biobiochile.cl/notas/2015/10/21/sobre-camaras-y-camarillas-parlamentarias.shtml', @@ -18,6 +22,7 @@ class BioBioChileTVIE(InfoExtractor): 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'Fernando Atria', }, + 'skip': 'URL expired and redirected to http://www.biobiochile.cl/portada/bbtv/index.html', }, { # different uploader layout 'url': 'http://tv.biobiochile.cl/notas/2016/03/18/natalia-valdebenito-repasa-a-diputado-hasbun-paso-a-la-categoria-de-hablar-brutalidades.shtml', @@ -32,6 +37,16 @@ class BioBioChileTVIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'URL expired and redirected to http://www.biobiochile.cl/portada/bbtv/index.html', + }, { + 'url': 'http://www.biobiochile.cl/noticias/bbtv/comentarios-bio-bio/2016/07/08/edecanes-del-congreso-figuras-decorativas-que-le-cuestan-muy-caro-a-los-chilenos.shtml', + 'info_dict': { + 'id': 'edecanes-del-congreso-figuras-decorativas-que-le-cuestan-muy-caro-a-los-chilenos', + 'ext': 'mp4', + 'uploader': '(none)', + 'upload_date': '20160708', 
+ 'title': 'Edecanes del Congreso: Figuras decorativas que le cuestan muy caro a los chilenos', + }, }, { 'url': 'http://tv.biobiochile.cl/notas/2015/10/22/ninos-transexuales-de-quien-es-la-decision.shtml', 'only_matching': True, @@ -45,42 +60,22 @@ class BioBioChileTVIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - title = remove_end(self._og_search_title(webpage), ' - BioBioChile TV') + rudo_url = RudoIE._extract_url(webpage) + if not rudo_url: + raise ExtractorError('No videos found') - file_url = self._search_regex( - r'loadFWPlayerVideo\([^,]+,\s*(["\'])(?P<url>.+?)\1', - webpage, 'file url', group='url') - - base_url = self._search_regex( - r'file\s*:\s*(["\'])(?P<url>.+?)\1\s*\+\s*fileURL', webpage, - 'base url', default='http://unlimited2-cl.digitalproserver.com/bbtv/', - group='url') - - formats = self._extract_m3u8_formats( - '%s%s/playlist.m3u8' % (base_url, file_url), video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False) - f = { - 'url': '%s%s' % (base_url, file_url), - 'format_id': 'http', - 'protocol': 'http', - 'preference': 1, - } - if formats: - f_copy = formats[-1].copy() - f_copy.update(f) - f = f_copy - formats.append(f) - self._sort_formats(formats) + title = remove_end(self._og_search_title(webpage), ' - BioBioChile TV') thumbnail = self._og_search_thumbnail(webpage) uploader = self._html_search_regex( - r'<a[^>]+href=["\']https?://busca\.biobiochile\.cl/author[^>]+>(.+?)</a>', + r'<a[^>]+href=["\']https?://(?:busca|www)\.biobiochile\.cl/(?:lista/)?(?:author|autor)[^>]+>(.+?)</a>', webpage, 'uploader', fatal=False) return { + '_type': 'url_transparent', + 'url': rudo_url, 'id': video_id, 'title': title, 'thumbnail': thumbnail, 'uploader': uploader, - 'formats': formats, } diff --git a/youtube_dl/extractor/biqle.py b/youtube_dl/extractor/biqle.py index ae4579b..beaebfd 100644 --- a/youtube_dl/extractor/biqle.py +++ b/youtube_dl/extractor/biqle.py @@ -24,7 +24,8 @@ class BIQLEIE(InfoExtractor): 'ext': 
'mp4', 'title': 'Ребенок в шоке от автоматической мойки', 'uploader': 'Dmitry Kotov', - } + }, + 'skip': ' This video was marked as adult. Embedding adult videos on external sites is prohibited.', }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/bloomberg.py b/youtube_dl/extractor/bloomberg.py index bd538be..2a8cd64 100644 --- a/youtube_dl/extractor/bloomberg.py +++ b/youtube_dl/extractor/bloomberg.py @@ -1,3 +1,4 @@ +# coding: utf-8 from __future__ import unicode_literals import re @@ -21,6 +22,18 @@ class BloombergIE(InfoExtractor): 'format': 'best[format_id^=hds]', }, }, { + # video ID in BPlayer(...) + 'url': 'http://www.bloomberg.com/features/2016-hello-world-new-zealand/', + 'info_dict': { + 'id': '938c7e72-3f25-4ddb-8b85-a9be731baa74', + 'ext': 'flv', + 'title': 'Meet the Real-Life Tech Wizards of Middle Earth', + 'description': 'Hello World, Episode 1: New Zealand’s freaky AI babies, robot exoskeletons, and a virtual you.', + }, + 'params': { + 'format': 'best[format_id^=hds]', + }, + }, { 'url': 'http://www.bloomberg.com/news/articles/2015-11-12/five-strange-things-that-have-been-happening-in-financial-markets', 'only_matching': True, }, { @@ -33,7 +46,11 @@ class BloombergIE(InfoExtractor): webpage = self._download_webpage(url, name) video_id = self._search_regex( r'["\']bmmrId["\']\s*:\s*(["\'])(?P<url>.+?)\1', - webpage, 'id', group='url') + webpage, 'id', group='url', default=None) + if not video_id: + bplayer_data = self._parse_json(self._search_regex( + r'BPlayer\(null,\s*({[^;]+})\);', webpage, 'id'), name) + video_id = bplayer_data['id'] title = re.sub(': Video$', '', self._og_search_title(webpage)) embed_info = self._download_json( diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py index ef560b5..aeb22be 100644 --- a/youtube_dl/extractor/brightcove.py +++ b/youtube_dl/extractor/brightcove.py @@ -26,6 +26,8 @@ from ..utils import ( unescapeHTML, unsmuggle_url, update_url_query, + clean_html, + 
mimetype2ext, ) @@ -90,6 +92,7 @@ class BrightcoveLegacyIE(InfoExtractor): 'description': 'md5:363109c02998fee92ec02211bd8000df', 'uploader': 'National Ballet of Canada', }, + 'skip': 'Video gone', }, { # test flv videos served by akamaihd.net @@ -108,7 +111,7 @@ class BrightcoveLegacyIE(InfoExtractor): }, }, { - # playlist test + # playlist with 'videoList' # from http://support.brightcove.com/en/video-cloud/docs/playlist-support-single-video-players 'url': 'http://c.brightcove.com/services/viewer/htmlFederated?playerID=3550052898001&playerKey=AQ%7E%7E%2CAAABmA9XpXk%7E%2C-Kp7jNgisre1fG5OdqpAFUTcs0lP_ZoL', 'info_dict': { @@ -117,6 +120,15 @@ class BrightcoveLegacyIE(InfoExtractor): }, 'playlist_mincount': 7, }, + { + # playlist with 'playlistTab' (https://github.com/rg3/youtube-dl/issues/9965) + 'url': 'http://c.brightcove.com/services/json/experience/runtime/?command=get_programming_for_experience&playerKey=AQ%7E%7E,AAABXlLMdok%7E,NJ4EoMlZ4rZdx9eU1rkMVd8EaYPBBUlg', + 'info_dict': { + 'id': '1522758701001', + 'title': 'Lesson 08', + }, + 'playlist_mincount': 10, + }, ] FLV_VCODECS = { 1: 'SORENSON', @@ -298,13 +310,19 @@ class BrightcoveLegacyIE(InfoExtractor): info_url, player_key, 'Downloading playlist information') json_data = json.loads(playlist_info) - if 'videoList' not in json_data: + if 'videoList' in json_data: + playlist_info = json_data['videoList'] + playlist_dto = playlist_info['mediaCollectionDTO'] + elif 'playlistTabs' in json_data: + playlist_info = json_data['playlistTabs'] + playlist_dto = playlist_info['lineupListDTO']['playlistDTOs'][0] + else: raise ExtractorError('Empty playlist') - playlist_info = json_data['videoList'] - videos = [self._extract_video_info(video_info) for video_info in playlist_info['mediaCollectionDTO']['videoDTOs']] + + videos = [self._extract_video_info(video_info) for video_info in playlist_dto['videoDTOs']] return self.playlist_result(videos, playlist_id='%s' % playlist_info['id'], - 
playlist_title=playlist_info['mediaCollectionDTO']['displayName']) + playlist_title=playlist_dto['displayName']) def _extract_video_info(self, video_info): video_id = compat_str(video_info['id']) @@ -528,14 +546,16 @@ class BrightcoveNewIE(InfoExtractor): formats = [] for source in json_data.get('sources', []): container = source.get('container') - source_type = source.get('type') + ext = mimetype2ext(source.get('type')) src = source.get('src') - if source_type == 'application/x-mpegURL' or container == 'M2TS': + if ext == 'ism': + continue + elif ext == 'm3u8' or container == 'M2TS': if not src: continue formats.extend(self._extract_m3u8_formats( src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) - elif source_type == 'application/dash+xml': + elif ext == 'mpd': if not src: continue formats.extend(self._extract_mpd_formats(src, video_id, 'dash', fatal=False)) @@ -551,7 +571,7 @@ class BrightcoveNewIE(InfoExtractor): 'tbr': tbr, 'filesize': int_or_none(source.get('size')), 'container': container, - 'ext': container.lower(), + 'ext': ext or container.lower(), } if width == 0 and height == 0: f.update({ @@ -585,6 +605,13 @@ class BrightcoveNewIE(InfoExtractor): 'format_id': build_format_id('rtmp'), }) formats.append(f) + + errors = json_data.get('errors') + if not formats and errors: + error = errors[0] + raise ExtractorError( + error.get('message') or error.get('error_subcode') or error['error_code'], expected=True) + self._sort_formats(formats) subtitles = {} @@ -597,7 +624,7 @@ class BrightcoveNewIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'description': json_data.get('description'), + 'description': clean_html(json_data.get('description')), 'thumbnail': json_data.get('thumbnail') or json_data.get('poster'), 'duration': float_or_none(json_data.get('duration'), 1000), 'timestamp': parse_iso8601(json_data.get('published_at')), diff --git a/youtube_dl/extractor/buzzfeed.py b/youtube_dl/extractor/buzzfeed.py index df503ec..75fa92d 
100644 --- a/youtube_dl/extractor/buzzfeed.py +++ b/youtube_dl/extractor/buzzfeed.py @@ -5,6 +5,7 @@ import json import re from .common import InfoExtractor +from .facebook import FacebookIE class BuzzFeedIE(InfoExtractor): @@ -20,11 +21,11 @@ class BuzzFeedIE(InfoExtractor): 'info_dict': { 'id': 'aVCR29aE_OQ', 'ext': 'mp4', + 'title': 'Angry Ram destroys a punching bag..', + 'description': 'md5:c59533190ef23fd4458a5e8c8c872345', 'upload_date': '20141024', 'uploader_id': 'Buddhanz1', - 'description': 'He likes to stay in shape with his heavy bag, he wont stop until its on the ground\n\nFollow Angry Ram on Facebook for regular updates -\nhttps://www.facebook.com/pages/Angry-Ram/1436897249899558?ref=hl', - 'uploader': 'Buddhanz', - 'title': 'Angry Ram destroys a punching bag', + 'uploader': 'Angry Ram', } }] }, { @@ -41,13 +42,30 @@ class BuzzFeedIE(InfoExtractor): 'info_dict': { 'id': 'mVmBL8B-In0', 'ext': 'mp4', + 'title': 're:Munchkin the Teddy Bear gets her exercise', + 'description': 'md5:28faab95cda6e361bcff06ec12fc21d8', 'upload_date': '20141124', 'uploader_id': 'CindysMunchkin', - 'description': 're:© 2014 Munchkin the', 'uploader': 're:^Munchkin the', - 'title': 're:Munchkin the Teddy Bear gets her exercise', }, }] + }, { + 'url': 'http://www.buzzfeed.com/craigsilverman/the-most-adorable-crash-landing-ever#.eq7pX0BAmK', + 'info_dict': { + 'id': 'the-most-adorable-crash-landing-ever', + 'title': 'Watch This Baby Goose Make The Most Adorable Crash Landing', + 'description': 'This gosling knows how to stick a landing.', + }, + 'playlist': [{ + 'md5': '763ca415512f91ca62e4621086900a23', + 'info_dict': { + 'id': '971793786185728', + 'ext': 'mp4', + 'title': 'We set up crash pads so that the goslings on our roof would have a safe landi...', + 'uploader': 'Calgary Outdoor Centre-University of Calgary', + }, + }], + 'add_ie': ['Facebook'], }] def _real_extract(self, url): @@ -66,6 +84,10 @@ class BuzzFeedIE(InfoExtractor): continue 
entries.append(self.url_result(video['url'])) + facebook_url = FacebookIE._extract_url(webpage) + if facebook_url: + entries.append(self.url_result(facebook_url)) + return { '_type': 'playlist', 'id': playlist_id, diff --git a/youtube_dl/extractor/camdemy.py b/youtube_dl/extractor/camdemy.py index 6ffbeab..268c343 100644 --- a/youtube_dl/extractor/camdemy.py +++ b/youtube_dl/extractor/camdemy.py @@ -1,7 +1,6 @@ # coding: utf-8 from __future__ import unicode_literals -import datetime import re from .common import InfoExtractor @@ -10,8 +9,10 @@ from ..compat import ( compat_urlparse, ) from ..utils import ( - parse_iso8601, + clean_html, + parse_duration, str_to_int, + unified_strdate, ) @@ -26,14 +27,14 @@ class CamdemyIE(InfoExtractor): 'ext': 'mp4', 'title': 'Ch1-1 Introduction, Signals (02-23-2012)', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': '', 'creator': 'ss11spring', + 'duration': 1591, 'upload_date': '20130114', - 'timestamp': 1358154556, 'view_count': int, } }, { # With non-empty description + # webpage returns "No permission or not login" 'url': 'http://www.camdemy.com/media/13885', 'md5': '4576a3bb2581f86c61044822adbd1249', 'info_dict': { @@ -41,64 +42,71 @@ class CamdemyIE(InfoExtractor): 'ext': 'mp4', 'title': 'EverCam + Camdemy QuickStart', 'thumbnail': 're:^https?://.*\.jpg$', - 'description': 'md5:050b62f71ed62928f8a35f1a41e186c9', + 'description': 'md5:2a9f989c2b153a2342acee579c6e7db6', 'creator': 'evercam', - 'upload_date': '20140620', - 'timestamp': 1403271569, + 'duration': 318, } }, { - # External source + # External source (YouTube) 'url': 'http://www.camdemy.com/media/14842', - 'md5': '50e1c3c3aa233d3d7b7daa2fa10b1cf7', 'info_dict': { 'id': '2vsYQzNIsJo', 'ext': 'mp4', + 'title': 'Excel 2013 Tutorial - How to add Password Protection', + 'description': 'Excel 2013 Tutorial for Beginners - How to add Password Protection', 'upload_date': '20130211', 'uploader': 'Hun Kim', - 'description': 'Excel 2013 Tutorial for Beginners - How to 
add Password Protection', 'uploader_id': 'hunkimtutorials', - 'title': 'Excel 2013 Tutorial - How to add Password Protection', - } + }, + 'params': { + 'skip_download': True, + }, }] def _real_extract(self, url): video_id = self._match_id(url) - page = self._download_webpage(url, video_id) + + webpage = self._download_webpage(url, video_id) src_from = self._html_search_regex( - r"<div class='srcFrom'>Source: <a title='([^']+)'", page, - 'external source', default=None) + r"class=['\"]srcFrom['\"][^>]*>Sources?(?:\s+from)?\s*:\s*<a[^>]+(?:href|title)=(['\"])(?P<url>(?:(?!\1).)+)\1", + webpage, 'external source', default=None, group='url') if src_from: return self.url_result(src_from) oembed_obj = self._download_json( 'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id) + title = oembed_obj['title'] thumb_url = oembed_obj['thumbnail_url'] video_folder = compat_urlparse.urljoin(thumb_url, 'video/') file_list_doc = self._download_xml( compat_urlparse.urljoin(video_folder, 'fileList.xml'), - video_id, 'Filelist XML') + video_id, 'Downloading filelist XML') file_name = file_list_doc.find('./video/item/fileName').text video_url = compat_urlparse.urljoin(video_folder, file_name) - timestamp = parse_iso8601(self._html_search_regex( - r"<div class='title'>Posted\s*:</div>\s*<div class='value'>([^<>]+)<", - page, 'creation time', fatal=False), - delimiter=' ', timezone=datetime.timedelta(hours=8)) - view_count = str_to_int(self._html_search_regex( - r"<div class='title'>Views\s*:</div>\s*<div class='value'>([^<>]+)<", - page, 'view count', fatal=False)) + # Some URLs return "No permission or not login" in a webpage despite being + # freely available via oembed JSON URL (e.g. 
http://www.camdemy.com/media/13885) + upload_date = unified_strdate(self._search_regex( + r'>published on ([^<]+)<', webpage, + 'upload date', default=None)) + view_count = str_to_int(self._search_regex( + r'role=["\']viewCnt["\'][^>]*>([\d,.]+) views', + webpage, 'view count', default=None)) + description = self._html_search_meta( + 'description', webpage, default=None) or clean_html( + oembed_obj.get('description')) return { 'id': video_id, 'url': video_url, - 'title': oembed_obj['title'], + 'title': title, 'thumbnail': thumb_url, - 'description': self._html_search_meta('description', page), - 'creator': oembed_obj['author_name'], - 'duration': oembed_obj['duration'], - 'timestamp': timestamp, + 'description': description, + 'creator': oembed_obj.get('author_name'), + 'duration': parse_duration(oembed_obj.get('duration')), + 'upload_date': upload_date, 'view_count': view_count, } diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py index ff663d0..a87e971 100644 --- a/youtube_dl/extractor/cbc.py +++ b/youtube_dl/extractor/cbc.py @@ -4,9 +4,11 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( js_to_json, smuggle_url, + try_get, ) @@ -25,8 +27,22 @@ class CBCIE(InfoExtractor): 'upload_date': '20160203', 'uploader': 'CBCC-NEW', }, + 'skip': 'Geo-restricted to Canada', }, { - # with clipId + # with clipId, feed available via tpfeed.cbc.ca and feed.theplatform.com + 'url': 'http://www.cbc.ca/22minutes/videos/22-minutes-update/22-minutes-update-episode-4', + 'md5': '162adfa070274b144f4fdc3c3b8207db', + 'info_dict': { + 'id': '2414435309', + 'ext': 'mp4', + 'title': '22 Minutes Update: What Not To Wear Quebec', + 'description': "This week's latest Canadian top political story is What Not To Wear Quebec.", + 'upload_date': '20131025', + 'uploader': 'CBCC-NEW', + 'timestamp': 1382717907, + }, + }, { + # with clipId, feed only available via tpfeed.cbc.ca 'url': 
'http://www.cbc.ca/archives/entry/1978-robin-williams-freestyles-on-90-minutes-live', 'md5': '0274a90b51a9b4971fe005c63f592f12', 'info_dict': { @@ -64,6 +80,7 @@ class CBCIE(InfoExtractor): 'uploader': 'CBCC-NEW', }, }], + 'skip': 'Geo-restricted to Canada', }] @classmethod @@ -81,9 +98,15 @@ class CBCIE(InfoExtractor): media_id = player_info.get('mediaId') if not media_id: clip_id = player_info['clipId'] - media_id = self._download_json( - 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, - clip_id)['entries'][0]['id'].split('/')[-1] + feed = self._download_json( + 'http://tpfeed.cbc.ca/f/ExhSPC/vms_5akSXx4Ng_Zn?byCustomValue={:mpsReleases}{%s}' % clip_id, + clip_id, fatal=False) + if feed: + media_id = try_get(feed, lambda x: x['entries'][0]['guid'], compat_str) + if not media_id: + media_id = self._download_json( + 'http://feed.theplatform.com/f/h9dtGB/punlNGjMlc1F?fields=id&byContent=byReleases%3DbyId%253D' + clip_id, + clip_id)['entries'][0]['id'].split('/')[-1] return self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) else: entries = [self.url_result('cbcplayer:%s' % media_id, 'CBCPlayer', media_id) for media_id in re.findall(r'<iframe[^>]+src="[^"]+?mediaId=(\d+)"', webpage)] @@ -104,6 +127,7 @@ class CBCPlayerIE(InfoExtractor): 'upload_date': '20160210', 'uploader': 'CBCC-NEW', }, + 'skip': 'Geo-restricted to Canada', }, { # Redirected from http://www.cbc.ca/player/AudioMobile/All%20in%20a%20Weekend%20Montreal/ID/2657632011/ 'url': 'http://www.cbc.ca/player/play/2657631896', diff --git a/youtube_dl/extractor/cbsinteractive.py b/youtube_dl/extractor/cbsinteractive.py index 0011c30..821db20 100644 --- a/youtube_dl/extractor/cbsinteractive.py +++ b/youtube_dl/extractor/cbsinteractive.py @@ -80,9 +80,6 @@ class CBSInteractiveIE(ThePlatformIE): media_guid_path = 'media/guid/%d/%s' % (self.MPX_ACCOUNTS[site], vdata['mpxRefId']) formats, subtitles = [], {} - if site == 'cnet': - formats, 
subtitles = self._extract_theplatform_smil( - self.TP_RELEASE_URL_TEMPLATE % media_guid_path, video_id) for (fkey, vid) in vdata['files'].items(): if fkey == 'hls_phone' and 'hls_tablet' in vdata['files']: continue @@ -94,7 +91,7 @@ class CBSInteractiveIE(ThePlatformIE): subtitles = self._merge_subtitles(subtitles, tp_subtitles) self._sort_formats(formats) - info = self.get_metadata('kYEXFC/%s' % media_guid_path, video_id) + info = self._extract_theplatform_metadata('kYEXFC/%s' % media_guid_path, video_id) info.update({ 'id': video_id, 'display_id': display_id, diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py index 74adb38..4bcd104 100644 --- a/youtube_dl/extractor/cbslocal.py +++ b/youtube_dl/extractor/cbslocal.py @@ -1,12 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals -import calendar -import datetime - from .anvato import AnvatoIE from .sendtonews import SendtoNewsIE from ..compat import compat_urlparse +from ..utils import unified_timestamp class CBSLocalIE(AnvatoIE): @@ -43,13 +41,8 @@ class CBSLocalIE(AnvatoIE): 'url': 'http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/', 'info_dict': { 'id': 'GxfCe0Zo7D-175909-5588', - 'ext': 'mp4', - 'title': 'Recap: CLE 15, CIN 6', - 'description': '5/16/16: Indians\' bats explode for 15 runs in a win', - 'upload_date': '20160516', - 'timestamp': 1463433840, - 'duration': 49, }, + 'playlist_count': 9, 'params': { # m3u8 download 'skip_download': True, @@ -62,19 +55,15 @@ class CBSLocalIE(AnvatoIE): sendtonews_url = SendtoNewsIE._extract_url(webpage) if sendtonews_url: - info_dict = { - '_type': 'url_transparent', - 'url': compat_urlparse.urljoin(url, sendtonews_url), - } - else: - info_dict = self._extract_anvato_videos(webpage, display_id) + return self.url_result( + compat_urlparse.urljoin(url, sendtonews_url), + ie=SendtoNewsIE.ie_key()) + + info_dict = self._extract_anvato_videos(webpage, display_id) 
time_str = self._html_search_regex( r'class="entry-date">([^<]+)<', webpage, 'released date', fatal=False) - timestamp = None - if time_str: - timestamp = calendar.timegm(datetime.datetime.strptime( - time_str, '%b %d, %Y %I:%M %p').timetuple()) + timestamp = unified_timestamp(time_str) info_dict.update({ 'display_id': display_id, diff --git a/youtube_dl/extractor/cbsnews.py b/youtube_dl/extractor/cbsnews.py index 387537e..9d3b755 100644 --- a/youtube_dl/extractor/cbsnews.py +++ b/youtube_dl/extractor/cbsnews.py @@ -26,6 +26,7 @@ class CBSNewsIE(CBSBaseIE): # rtmp download 'skip_download': True, }, + 'skip': 'Subscribers only', }, { 'url': 'http://www.cbsnews.com/videos/fort-hood-shooting-army-downplays-mental-illness-as-cause-of-attack/', @@ -69,6 +70,7 @@ class CBSNewsLiveVideoIE(InfoExtractor): IE_DESC = 'CBS News Live Videos' _VALID_URL = r'https?://(?:www\.)?cbsnews\.com/live/video/(?P<id>[\da-z_-]+)' + # Live videos get deleted soon. See http://www.cbsnews.com/live/ for the latest examples _TEST = { 'url': 'http://www.cbsnews.com/live/video/clinton-sanders-prepare-to-face-off-in-nh/', 'info_dict': { @@ -77,6 +79,7 @@ class CBSNewsLiveVideoIE(InfoExtractor): 'title': 'Clinton, Sanders Prepare To Face Off In NH', 'duration': 334, }, + 'skip': 'Video gone', } def _real_extract(self, url): diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py index b223454..29a8820 100644 --- a/youtube_dl/extractor/chaturbate.py +++ b/youtube_dl/extractor/chaturbate.py @@ -17,7 +17,8 @@ class ChaturbateIE(InfoExtractor): }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Room is offline', }, { 'url': 'https://en.chaturbate.com/siswet19/', 'only_matching': True, diff --git a/youtube_dl/extractor/chirbit.py b/youtube_dl/extractor/chirbit.py index b1eeaf1..b435186 100644 --- a/youtube_dl/extractor/chirbit.py +++ b/youtube_dl/extractor/chirbit.py @@ -1,30 +1,33 @@ # coding: utf-8 from __future__ import unicode_literals +import base64 + from 
.common import InfoExtractor -from ..utils import ( - parse_duration, - int_or_none, -) +from ..utils import parse_duration class ChirbitIE(InfoExtractor): IE_NAME = 'chirbit' _VALID_URL = r'https?://(?:www\.)?chirb\.it/(?:(?:wp|pl)/|fb_chirbit_player\.swf\?key=)?(?P<id>[\da-zA-Z]+)' _TESTS = [{ - 'url': 'http://chirb.it/PrIPv5', - 'md5': '9847b0dad6ac3e074568bf2cfb197de8', + 'url': 'http://chirb.it/be2abG', 'info_dict': { - 'id': 'PrIPv5', + 'id': 'be2abG', 'ext': 'mp3', - 'title': 'Фасадстрой', - 'duration': 52, - 'view_count': int, - 'comment_count': int, + 'title': 'md5:f542ea253f5255240be4da375c6a5d7e', + 'description': 'md5:f24a4e22a71763e32da5fed59e47c770', + 'duration': 306, + }, + 'params': { + 'skip_download': True, } }, { 'url': 'https://chirb.it/fb_chirbit_player.swf?key=PrIPv5', 'only_matching': True, + }, { + 'url': 'https://chirb.it/wp/MN58c2', + 'only_matching': True, }] def _real_extract(self, url): @@ -33,27 +36,30 @@ class ChirbitIE(InfoExtractor): webpage = self._download_webpage( 'http://chirb.it/%s' % audio_id, audio_id) - audio_url = self._search_regex( - r'"setFile"\s*,\s*"([^"]+)"', webpage, 'audio url') + data_fd = self._search_regex( + r'data-fd=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'data fd', group='url') + + # Reverse engineered from https://chirb.it/js/chirbit.player.js (look + # for soundURL) + audio_url = base64.b64decode( + data_fd[::-1].encode('ascii')).decode('utf-8') title = self._search_regex( - r'itemprop="name">([^<]+)', webpage, 'title') - duration = parse_duration(self._html_search_meta( - 'duration', webpage, 'duration', fatal=False)) - view_count = int_or_none(self._search_regex( - r'itemprop="playCount"\s*>(\d+)', webpage, - 'listen count', fatal=False)) - comment_count = int_or_none(self._search_regex( - r'>(\d+) Comments?:', webpage, - 'comment count', fatal=False)) + r'class=["\']chirbit-title["\'][^>]*>([^<]+)', webpage, 'title') + description = self._search_regex( + 
r'<h3>Description</h3>\s*<pre[^>]*>([^<]+)</pre>', + webpage, 'description', default=None) + duration = parse_duration(self._search_regex( + r'class=["\']c-length["\'][^>]*>([^<]+)', + webpage, 'duration', fatal=False)) return { 'id': audio_id, 'url': audio_url, 'title': title, + 'description': description, 'duration': duration, - 'view_count': view_count, - 'comment_count': comment_count, } diff --git a/youtube_dl/extractor/cliphunter.py b/youtube_dl/extractor/cliphunter.py index 19f8b39..252c2e8 100644 --- a/youtube_dl/extractor/cliphunter.py +++ b/youtube_dl/extractor/cliphunter.py @@ -23,7 +23,7 @@ class CliphunterIE(InfoExtractor): (?P<id>[0-9]+)/ (?P<seo>.+?)(?:$|[#\?]) ''' - _TEST = { + _TESTS = [{ 'url': 'http://www.cliphunter.com/w/1012420/Fun_Jynx_Maze_solo', 'md5': 'b7c9bbd4eb3a226ab91093714dcaa480', 'info_dict': { @@ -32,8 +32,19 @@ class CliphunterIE(InfoExtractor): 'title': 'Fun Jynx Maze solo', 'thumbnail': 're:^https?://.*\.jpg$', 'age_limit': 18, - } - } + }, + 'skip': 'Video gone', + }, { + 'url': 'http://www.cliphunter.com/w/2019449/ShesNew__My_booty_girlfriend_Victoria_Paradices_pussy_filled_with_jizz', + 'md5': '55a723c67bfc6da6b0cfa00d55da8a27', + 'info_dict': { + 'id': '2019449', + 'ext': 'mp4', + 'title': 'ShesNew - My booty girlfriend, Victoria Paradice\'s pussy filled with jizz', + 'thumbnail': 're:^https?://.*\.jpg$', + 'age_limit': 18, + }, + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/cliprs.py b/youtube_dl/extractor/cliprs.py index 4f9320e..d55b26d 100644 --- a/youtube_dl/extractor/cliprs.py +++ b/youtube_dl/extractor/cliprs.py @@ -1,16 +1,10 @@ # coding: utf-8 from __future__ import unicode_literals -from .common import InfoExtractor -from ..utils import ( - ExtractorError, - float_or_none, - int_or_none, - parse_iso8601, -) +from .onet import OnetBaseIE -class ClipRsIE(InfoExtractor): +class ClipRsIE(OnetBaseIE): _VALID_URL = r'https?://(?:www\.)?clip\.rs/(?P<id>[^/]+)/\d+' _TEST 
= { 'url': 'http://www.clip.rs/premijera-frajle-predstavljaju-novi-spot-za-pesmu-moli-me-moli/3732', @@ -27,64 +21,13 @@ class ClipRsIE(InfoExtractor): } def _real_extract(self, url): - video_id = self._match_id(url) + display_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, display_id) - video_id = self._search_regex( - r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id') + mvp_id = self._search_mvp_id(webpage) - response = self._download_json( - 'http://qi.ckm.onetapi.pl/', video_id, - query={ - 'body[id]': video_id, - 'body[jsonrpc]': '2.0', - 'body[method]': 'get_asset_detail', - 'body[params][ID_Publikacji]': video_id, - 'body[params][Service]': 'www.onet.pl', - 'content-type': 'application/jsonp', - 'x-onet-app': 'player.front.onetapi.pl', - }) + info_dict = self._extract_from_id(mvp_id, webpage) + info_dict['display_id'] = display_id - error = response.get('error') - if error: - raise ExtractorError( - '%s said: %s' % (self.IE_NAME, error['message']), expected=True) - - video = response['result'].get('0') - - formats = [] - for _, formats_dict in video['formats'].items(): - if not isinstance(formats_dict, dict): - continue - for format_id, format_list in formats_dict.items(): - if not isinstance(format_list, list): - continue - for f in format_list: - if not f.get('url'): - continue - formats.append({ - 'url': f['url'], - 'format_id': format_id, - 'height': int_or_none(f.get('vertical_resolution')), - 'width': int_or_none(f.get('horizontal_resolution')), - 'abr': float_or_none(f.get('audio_bitrate')), - 'vbr': float_or_none(f.get('video_bitrate')), - }) - self._sort_formats(formats) - - meta = video.get('meta', {}) - - title = self._og_search_title(webpage, default=None) or meta['title'] - description = self._og_search_description(webpage, default=None) or meta.get('description') - duration = meta.get('length') or meta.get('lenght') - timestamp = parse_iso8601(meta.get('addDate'), ' 
') - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'duration': duration, - 'timestamp': timestamp, - 'formats': formats, - } + return info_dict diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py index 9a28ef3..ae5ba00 100644 --- a/youtube_dl/extractor/cloudy.py +++ b/youtube_dl/extractor/cloudy.py @@ -6,7 +6,6 @@ import re from .common import InfoExtractor from ..compat import ( compat_parse_qs, - compat_urllib_parse_urlencode, compat_HTTPError, ) from ..utils import ( @@ -17,37 +16,26 @@ from ..utils import ( class CloudyIE(InfoExtractor): - _IE_DESC = 'cloudy.ec and videoraj.ch' + _IE_DESC = 'cloudy.ec' _VALID_URL = r'''(?x) - https?://(?:www\.)?(?P<host>cloudy\.ec|videoraj\.(?:ch|to))/ + https?://(?:www\.)?cloudy\.ec/ (?:v/|embed\.php\?id=) (?P<id>[A-Za-z0-9]+) ''' - _EMBED_URL = 'http://www.%s/embed.php?id=%s' - _API_URL = 'http://www.%s/api/player.api.php?%s' + _EMBED_URL = 'http://www.cloudy.ec/embed.php?id=%s' + _API_URL = 'http://www.cloudy.ec/api/player.api.php' _MAX_TRIES = 2 - _TESTS = [ - { - 'url': 'https://www.cloudy.ec/v/af511e2527aac', - 'md5': '5cb253ace826a42f35b4740539bedf07', - 'info_dict': { - 'id': 'af511e2527aac', - 'ext': 'flv', - 'title': 'Funny Cats and Animals Compilation june 2013', - } - }, - { - 'url': 'http://www.videoraj.to/v/47f399fd8bb60', - 'md5': '7d0f8799d91efd4eda26587421c3c3b0', - 'info_dict': { - 'id': '47f399fd8bb60', - 'ext': 'flv', - 'title': 'Burning a New iPhone 5 with Gasoline - Will it Survive?', - } + _TEST = { + 'url': 'https://www.cloudy.ec/v/af511e2527aac', + 'md5': '5cb253ace826a42f35b4740539bedf07', + 'info_dict': { + 'id': 'af511e2527aac', + 'ext': 'flv', + 'title': 'Funny Cats and Animals Compilation june 2013', } - ] + } - def _extract_video(self, video_host, video_id, file_key, error_url=None, try_num=0): + def _extract_video(self, video_id, file_key, error_url=None, try_num=0): if try_num > self._MAX_TRIES - 1: raise ExtractorError('Unable to extract 
video URL', expected=True) @@ -64,9 +52,8 @@ class CloudyIE(InfoExtractor): 'errorUrl': error_url, }) - data_url = self._API_URL % (video_host, compat_urllib_parse_urlencode(form)) player_data = self._download_webpage( - data_url, video_id, 'Downloading player data') + self._API_URL, video_id, 'Downloading player data', query=form) data = compat_parse_qs(player_data) try_num += 1 @@ -88,7 +75,7 @@ class CloudyIE(InfoExtractor): except ExtractorError as e: if isinstance(e.cause, compat_HTTPError) and e.cause.code in [404, 410]: self.report_warning('Invalid video URL, requesting another', video_id) - return self._extract_video(video_host, video_id, file_key, video_url, try_num) + return self._extract_video(video_id, file_key, video_url, try_num) return { 'id': video_id, @@ -98,14 +85,13 @@ class CloudyIE(InfoExtractor): def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - video_host = mobj.group('host') video_id = mobj.group('id') - url = self._EMBED_URL % (video_host, video_id) + url = self._EMBED_URL % video_id webpage = self._download_webpage(url, video_id) file_key = self._search_regex( [r'key\s*:\s*"([^"]+)"', r'filekey\s*=\s*"([^"]+)"'], webpage, 'file_key') - return self._extract_video(video_host, video_id, file_key) + return self._extract_video(video_id, file_key) diff --git a/youtube_dl/extractor/cmt.py b/youtube_dl/extractor/cmt.py index f1311b1..f24568d 100644 --- a/youtube_dl/extractor/cmt.py +++ b/youtube_dl/extractor/cmt.py @@ -1,5 +1,7 @@ from __future__ import unicode_literals + from .mtv import MTVIE +from ..utils import ExtractorError class CMTIE(MTVIE): @@ -16,7 +18,27 @@ class CMTIE(MTVIE): 'title': 'Garth Brooks - "The Call (featuring Trisha Yearwood)"', 'description': 'Blame It All On My Roots', }, + 'skip': 'Video not available', + }, { + 'url': 'http://www.cmt.com/videos/misc/1504699/still-the-king-ep-109-in-3-minutes.jhtml#id=1739908', + 'md5': 'e61a801ca4a183a466c08bd98dccbb1c', + 'info_dict': { + 'id': '1504699', + 'ext': 
'mp4', + 'title': 'Still The King Ep. 109 in 3 Minutes', + 'description': 'Relive or catch up with Still The King by watching this recap of season 1, episode 9. New episodes Sundays 9/8c.', + 'timestamp': 1469421000.0, + 'upload_date': '20160725', + }, }, { 'url': 'http://www.cmt.com/shows/party-down-south/party-down-south-ep-407-gone-girl/1738172/playlist/#id=1738172', 'only_matching': True, }] + + @classmethod + def _transform_rtmp_url(cls, rtmp_video_url): + if 'error_not_available.swf' in rtmp_video_url: + raise ExtractorError( + '%s said: video is not available' % cls.IE_NAME, expected=True) + + return super(CMTIE, cls)._transform_rtmp_url(rtmp_video_url) diff --git a/youtube_dl/extractor/comedycentral.py b/youtube_dl/extractor/comedycentral.py index 2b6aaa3..88346dd 100644 --- a/youtube_dl/extractor/comedycentral.py +++ b/youtube_dl/extractor/comedycentral.py @@ -1,17 +1,7 @@ from __future__ import unicode_literals -import re - from .mtv import MTVServicesInfoExtractor -from ..compat import ( - compat_str, - compat_urllib_parse_urlencode, -) -from ..utils import ( - ExtractorError, - float_or_none, - unified_strdate, -) +from .common import InfoExtractor class ComedyCentralIE(MTVServicesInfoExtractor): @@ -26,8 +16,10 @@ class ComedyCentralIE(MTVServicesInfoExtractor): 'info_dict': { 'id': 'cef0cbb3-e776-4bc9-b62e-8016deccb354', 'ext': 'mp4', - 'title': 'CC:Stand-Up|Greg Fitzsimmons: Life on Stage|Uncensored - Too Good of a Mother', + 'title': 'CC:Stand-Up|August 18, 2013|1|0101|Uncensored - Too Good of a Mother', 'description': 'After a certain point, breastfeeding becomes c**kblocking.', + 'timestamp': 1376798400, + 'upload_date': '20130818', }, }, { 'url': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/interviews/6yx39d/exclusive-rand-paul-extended-interview', @@ -35,241 +27,92 @@ class ComedyCentralIE(MTVServicesInfoExtractor): }] -class ComedyCentralShowsIE(MTVServicesInfoExtractor): - IE_DESC = 'The Daily Show / The Colbert Report' - # urls 
can be abbreviations like :thedailyshow - # urls for episodes like: - # or urls for clips like: http://www.thedailyshow.com/watch/mon-december-10-2012/any-given-gun-day - # or: http://www.colbertnation.com/the-colbert-report-videos/421667/november-29-2012/moon-shattering-news - # or: http://www.colbertnation.com/the-colbert-report-collections/422008/festival-of-lights/79524 - _VALID_URL = r'''(?x)^(:(?P<shortname>tds|thedailyshow) - |https?://(:www\.)? - (?P<showname>thedailyshow|thecolbertreport|tosh)\.(?:cc\.)?com/ - ((?:full-)?episodes/(?:[0-9a-z]{6}/)?(?P<episode>.*)| - (?P<clip> - (?:(?:guests/[^/]+|videos|video-(?:clips|playlists)|special-editions|news-team/[^/]+)/[^/]+/(?P<videotitle>[^/?#]+)) - |(the-colbert-report-(videos|collections)/(?P<clipID>[0-9]+)/[^/]*/(?P<cntitle>.*?)) - |(watch/(?P<date>[^/]*)/(?P<tdstitle>.*)) - )| - (?P<interview> - extended-interviews/(?P<interID>[0-9a-z]+)/ - (?:playlist_tds_extended_)?(?P<interview_title>[^/?#]*?) - (?:/[^/?#]?|[?#]|$)))) - ''' +class ToshIE(MTVServicesInfoExtractor): + IE_DESC = 'Tosh.0' + _VALID_URL = r'^https?://tosh\.cc\.com/video-(?:clips|collections)/[^/]+/(?P<videotitle>[^/?#]+)' + _FEED_URL = 'http://tosh.cc.com/feeds/mrss' + _TESTS = [{ - 'url': 'http://thedailyshow.cc.com/watch/thu-december-13-2012/kristen-stewart', - 'md5': '4e2f5cb088a83cd8cdb7756132f9739d', - 'info_dict': { - 'id': 'ab9ab3e7-5a98-4dbe-8b21-551dc0523d55', - 'ext': 'mp4', - 'upload_date': '20121213', - 'description': 'Kristen Stewart learns to let loose in "On the Road."', - 'uploader': 'thedailyshow', - 'title': 'thedailyshow kristen-stewart part 1', - } - }, { - 'url': 'http://thedailyshow.cc.com/extended-interviews/b6364d/sarah-chayes-extended-interview', + 'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans', 'info_dict': { - 'id': 'sarah-chayes-extended-interview', - 'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in 
her book "Thieves of State: Why Corruption Threatens Global Security."', - 'title': 'thedailyshow Sarah Chayes Extended Interview', + 'description': 'Tosh asked fans to share their summer plans.', + 'title': 'Twitter Users Share Summer Plans', }, - 'playlist': [ - { - 'info_dict': { - 'id': '0baad492-cbec-4ec1-9e50-ad91c291127f', - 'ext': 'mp4', - 'upload_date': '20150129', - 'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."', - 'uploader': 'thedailyshow', - 'title': 'thedailyshow sarah-chayes-extended-interview part 1', + 'playlist': [{ + 'md5': 'f269e88114c1805bb6d7653fecea9e06', + 'info_dict': { + 'id': '90498ec2-ed00-11e0-aca6-0026b9414f30', + 'ext': 'mp4', + 'title': 'Tosh.0|June 9, 2077|2|211|Twitter Users Share Summer Plans', + 'description': 'Tosh asked fans to share their summer plans.', + 'thumbnail': 're:^https?://.*\.jpg', + # It's really reported to be published on year 2077 + 'upload_date': '20770610', + 'timestamp': 3390510600, + 'subtitles': { + 'en': 'mincount:3', }, }, - { - 'info_dict': { - 'id': '1e4fb91b-8ce7-4277-bd7c-98c9f1bbd283', - 'ext': 'mp4', - 'upload_date': '20150129', - 'description': 'Carnegie Endowment Senior Associate Sarah Chayes discusses how corrupt institutions function throughout the world in her book "Thieves of State: Why Corruption Threatens Global Security."', - 'uploader': 'thedailyshow', - 'title': 'thedailyshow sarah-chayes-extended-interview part 2', - }, - }, - ], + }] + }, { + 'url': 'http://tosh.cc.com/video-collections/x2iz7k/just-plain-foul/m5q4fp', + 'only_matching': True, + }] + + @classmethod + def _transform_rtmp_url(cls, rtmp_video_url): + new_urls = super(ToshIE, cls)._transform_rtmp_url(rtmp_video_url) + new_urls['rtmp'] = rtmp_video_url.replace('viacomccstrm', 'viacommtvstrm') + return new_urls + + +class ComedyCentralTVIE(MTVServicesInfoExtractor): + 
_VALID_URL = r'https?://(?:www\.)?comedycentral\.tv/(?:staffeln|shows)/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.comedycentral.tv/staffeln/7436-the-mindy-project-staffel-4', + 'info_dict': { + 'id': 'local_playlist-f99b626bdfe13568579a', + 'ext': 'flv', + 'title': 'Episode_the-mindy-project_shows_season-4_episode-3_full-episode_part1', + }, 'params': { + # rtmp download 'skip_download': True, }, }, { - 'url': 'http://thedailyshow.cc.com/extended-interviews/xm3fnq/andrew-napolitano-extended-interview', - 'only_matching': True, - }, { - 'url': 'http://thecolbertreport.cc.com/videos/29w6fx/-realhumanpraise-for-fox-news', - 'only_matching': True, - }, { - 'url': 'http://thecolbertreport.cc.com/videos/gh6urb/neil-degrasse-tyson-pt--1?xrs=eml_col_031114', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/guests/michael-lewis/3efna8/exclusive---michael-lewis-extended-interview-pt--3', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/episodes/sy7yv0/april-8--2014---denis-leary', - 'only_matching': True, - }, { - 'url': 'http://thecolbertreport.cc.com/episodes/8ase07/april-8--2014---jane-goodall', + 'url': 'http://www.comedycentral.tv/shows/1074-workaholics', 'only_matching': True, }, { - 'url': 'http://thedailyshow.cc.com/video-playlists/npde3s/the-daily-show-19088-highlights', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/video-playlists/t6d9sg/the-daily-show-20038-highlights/be3cwo', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/special-editions/2l8fdb/special-edition---a-look-back-at-food', - 'only_matching': True, - }, { - 'url': 'http://thedailyshow.cc.com/news-team/michael-che/7wnfel/we-need-to-talk-about-israel', - 'only_matching': True, - }, { - 'url': 'http://tosh.cc.com/video-clips/68g93d/twitter-users-share-summer-plans', + 'url': 'http://www.comedycentral.tv/shows/1727-the-mindy-project/bonus', 'only_matching': True, }] - _available_formats = ['3500', '2200', '1700', 
'1200', '750', '400'] - - _video_extensions = { - '3500': 'mp4', - '2200': 'mp4', - '1700': 'mp4', - '1200': 'mp4', - '750': 'mp4', - '400': 'mp4', - } - _video_dimensions = { - '3500': (1280, 720), - '2200': (960, 540), - '1700': (768, 432), - '1200': (640, 360), - '750': (512, 288), - '400': (384, 216), - } - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - - if mobj.group('shortname'): - return self.url_result('http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes') - - if mobj.group('clip'): - if mobj.group('videotitle'): - epTitle = mobj.group('videotitle') - elif mobj.group('showname') == 'thedailyshow': - epTitle = mobj.group('tdstitle') - else: - epTitle = mobj.group('cntitle') - dlNewest = False - elif mobj.group('interview'): - epTitle = mobj.group('interview_title') - dlNewest = False - else: - dlNewest = not mobj.group('episode') - if dlNewest: - epTitle = mobj.group('showname') - else: - epTitle = mobj.group('episode') - show_name = mobj.group('showname') - - webpage, htmlHandle = self._download_webpage_handle(url, epTitle) - if dlNewest: - url = htmlHandle.geturl() - mobj = re.match(self._VALID_URL, url, re.VERBOSE) - if mobj is None: - raise ExtractorError('Invalid redirected URL: ' + url) - if mobj.group('episode') == '': - raise ExtractorError('Redirected URL is still not specific: ' + url) - epTitle = (mobj.group('episode') or mobj.group('videotitle')).rpartition('/')[-1] + video_id = self._match_id(url) - mMovieParams = re.findall('(?:<param name="movie" value="|var url = ")(http://media.mtvnservices.com/([^"]*(?:episode|video).*?:.*?))"', webpage) - if len(mMovieParams) == 0: - # The Colbert Report embeds the information in a without - # a URL prefix; so extract the alternate reference - # and then add the URL prefix manually. 
+ webpage = self._download_webpage(url, video_id) - altMovieParams = re.findall('data-mgid="([^"]*(?:episode|video|playlist).*?:.*?)"', webpage) - if len(altMovieParams) == 0: - raise ExtractorError('unable to find Flash URL in webpage ' + url) - else: - mMovieParams = [('http://media.mtvnservices.com/' + altMovieParams[0], altMovieParams[0])] + mrss_url = self._search_regex( + r'data-mrss=(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'mrss url', group='url') - uri = mMovieParams[0][1] - # Correct cc.com in uri - uri = re.sub(r'(episode:[^.]+)(\.cc)?\.com', r'\1.com', uri) + return self._get_videos_info_from_url(mrss_url, video_id) - index_url = 'http://%s.cc.com/feeds/mrss?%s' % (show_name, compat_urllib_parse_urlencode({'uri': uri})) - idoc = self._download_xml( - index_url, epTitle, - 'Downloading show index', 'Unable to download episode index') - title = idoc.find('./channel/title').text - description = idoc.find('./channel/description').text - - entries = [] - item_els = idoc.findall('.//item') - for part_num, itemEl in enumerate(item_els): - upload_date = unified_strdate(itemEl.findall('./pubDate')[0].text) - thumbnail = itemEl.find('.//{http://search.yahoo.com/mrss/}thumbnail').attrib.get('url') - - content = itemEl.find('.//{http://search.yahoo.com/mrss/}content') - duration = float_or_none(content.attrib.get('duration')) - mediagen_url = content.attrib['url'] - guid = itemEl.find('./guid').text.rpartition(':')[-1] - - cdoc = self._download_xml( - mediagen_url, epTitle, - 'Downloading configuration for segment %d / %d' % (part_num + 1, len(item_els))) - - turls = [] - for rendition in cdoc.findall('.//rendition'): - finfo = (rendition.attrib['bitrate'], rendition.findall('./src')[0].text) - turls.append(finfo) - - formats = [] - for format, rtmp_video_url in turls: - w, h = self._video_dimensions.get(format, (None, None)) - formats.append({ - 'format_id': 'vhttp-%s' % format, - 'url': self._transform_rtmp_url(rtmp_video_url), - 'ext': 
self._video_extensions.get(format, 'mp4'), - 'height': h, - 'width': w, - }) - formats.append({ - 'format_id': 'rtmp-%s' % format, - 'url': rtmp_video_url.replace('viacomccstrm', 'viacommtvstrm'), - 'ext': self._video_extensions.get(format, 'mp4'), - 'height': h, - 'width': w, - }) - self._sort_formats(formats) - - subtitles = self._extract_subtitles(cdoc, guid) - - virtual_id = show_name + ' ' + epTitle + ' part ' + compat_str(part_num + 1) - entries.append({ - 'id': guid, - 'title': virtual_id, - 'formats': formats, - 'uploader': show_name, - 'upload_date': upload_date, - 'duration': duration, - 'thumbnail': thumbnail, - 'description': description, - 'subtitles': subtitles, - }) +class ComedyCentralShortnameIE(InfoExtractor): + _VALID_URL = r'^:(?P<id>tds|thedailyshow)$' + _TESTS = [{ + 'url': ':tds', + 'only_matching': True, + }, { + 'url': ':thedailyshow', + 'only_matching': True, + }] - return { - '_type': 'playlist', - 'id': epTitle, - 'entries': entries, - 'title': show_name + ' ' + title, - 'description': description, + def _real_extract(self, url): + video_id = self._match_id(url) + shortcut_map = { + 'tds': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', + 'thedailyshow': 'http://www.cc.com/shows/the-daily-show-with-trevor-noah/full-episodes', } + return self.url_result(shortcut_map[video_id]) diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py index 5a2603b..9427ff4 100644 --- a/youtube_dl/extractor/common.py +++ b/youtube_dl/extractor/common.py @@ -44,6 +44,7 @@ from ..utils import ( sanitized_Request, unescapeHTML, unified_strdate, + unified_timestamp, url_basename, xpath_element, xpath_text, @@ -54,6 +55,8 @@ from ..utils import ( update_Request, update_url_query, parse_m3u8_attributes, + extract_attributes, + parse_codecs, ) @@ -161,6 +164,7 @@ class InfoExtractor(object): * "height" (optional, int) * "resolution" (optional, string "{width}x{height"}, deprecated) + * "filesize" (optional, int) 
thumbnail: Full URL to a video thumbnail image. description: Full video description. uploader: Full name of the video uploader. @@ -658,6 +662,24 @@ class InfoExtractor(object): else: return res + def _get_netrc_login_info(self, netrc_machine=None): + username = None + password = None + netrc_machine = netrc_machine or self._NETRC_MACHINE + + if self._downloader.params.get('usenetrc', False): + try: + info = netrc.netrc().authenticators(netrc_machine) + if info is not None: + username = info[0] + password = info[2] + else: + raise netrc.NetrcParseError('No authenticators for %s' % netrc_machine) + except (IOError, netrc.NetrcParseError) as err: + self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err)) + + return (username, password) + def _get_login_info(self): """ Get the login info as (username, password) @@ -675,16 +697,8 @@ class InfoExtractor(object): if downloader_params.get('username') is not None: username = downloader_params['username'] password = downloader_params['password'] - elif downloader_params.get('usenetrc', False): - try: - info = netrc.netrc().authenticators(self._NETRC_MACHINE) - if info is not None: - username = info[0] - password = info[2] - else: - raise netrc.NetrcParseError('No authenticators for %s' % self._NETRC_MACHINE) - except (IOError, netrc.NetrcParseError) as err: - self._downloader.report_warning('parsing .netrc: %s' % error_to_compat_str(err)) + else: + username, password = self._get_netrc_login_info() return (username, password) @@ -723,9 +737,14 @@ class InfoExtractor(object): [^>]+?content=(["\'])(?P<content>.*?)\2''' % re.escape(prop) def _og_search_property(self, prop, html, name=None, **kargs): + if not isinstance(prop, (list, tuple)): + prop = [prop] if name is None: - name = 'OpenGraph %s' % prop - escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs) + name = 'OpenGraph %s' % prop[0] + og_regexes = [] + for p in prop: + og_regexes.extend(self._og_regexes(p)) + 
escaped = self._search_regex(og_regexes, html, name, flags=re.DOTALL, **kargs) if escaped is None: return None return unescapeHTML(escaped) @@ -749,10 +768,12 @@ class InfoExtractor(object): return self._og_search_property('url', html, **kargs) def _html_search_meta(self, name, html, display_name=None, fatal=False, **kwargs): + if not isinstance(name, (list, tuple)): + name = [name] if display_name is None: - display_name = name + display_name = name[0] return self._html_search_regex( - self._meta_regex(name), + [self._meta_regex(n) for n in name], html, display_name, fatal=fatal, group='content', **kwargs) def _dc_search_uploader(self, html): @@ -801,40 +822,66 @@ class InfoExtractor(object): return self._html_search_meta('twitter:player', html, 'twitter card player') - def _search_json_ld(self, html, video_id, **kwargs): + def _search_json_ld(self, html, video_id, expected_type=None, **kwargs): json_ld = self._search_regex( r'(?s)<script[^>]+type=(["\'])application/ld\+json\1[^>]*>(?P<json_ld>.+?)</script>', html, 'JSON-LD', group='json_ld', **kwargs) + default = kwargs.get('default', NO_DEFAULT) if not json_ld: - return {} - return self._json_ld(json_ld, video_id, fatal=kwargs.get('fatal', True)) - - def _json_ld(self, json_ld, video_id, fatal=True): + return default if default is not NO_DEFAULT else {} + # JSON-LD may be malformed and thus `fatal` should be respected. + # At the same time `default` may be passed that assumes `fatal=False` + # for _search_regex. Let's simulate the same behavior here as well. 
+ fatal = kwargs.get('fatal', True) if default == NO_DEFAULT else False + return self._json_ld(json_ld, video_id, fatal=fatal, expected_type=expected_type) + + def _json_ld(self, json_ld, video_id, fatal=True, expected_type=None): if isinstance(json_ld, compat_str): json_ld = self._parse_json(json_ld, video_id, fatal=fatal) if not json_ld: return {} info = {} - if json_ld.get('@context') == 'http://schema.org': - item_type = json_ld.get('@type') - if item_type == 'TVEpisode': - info.update({ - 'episode': unescapeHTML(json_ld.get('name')), - 'episode_number': int_or_none(json_ld.get('episodeNumber')), - 'description': unescapeHTML(json_ld.get('description')), - }) - part_of_season = json_ld.get('partOfSeason') - if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': - info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) - part_of_series = json_ld.get('partOfSeries') - if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': - info['series'] = unescapeHTML(part_of_series.get('name')) - elif item_type == 'Article': - info.update({ - 'timestamp': parse_iso8601(json_ld.get('datePublished')), - 'title': unescapeHTML(json_ld.get('headline')), - 'description': unescapeHTML(json_ld.get('articleBody')), - }) + if not isinstance(json_ld, (list, tuple, dict)): + return info + if isinstance(json_ld, dict): + json_ld = [json_ld] + for e in json_ld: + if e.get('@context') == 'http://schema.org': + item_type = e.get('@type') + if expected_type is not None and expected_type != item_type: + return info + if item_type == 'TVEpisode': + info.update({ + 'episode': unescapeHTML(e.get('name')), + 'episode_number': int_or_none(e.get('episodeNumber')), + 'description': unescapeHTML(e.get('description')), + }) + part_of_season = e.get('partOfSeason') + if isinstance(part_of_season, dict) and part_of_season.get('@type') == 'TVSeason': + info['season_number'] = int_or_none(part_of_season.get('seasonNumber')) + 
part_of_series = e.get('partOfSeries') or e.get('partOfTVSeries') + if isinstance(part_of_series, dict) and part_of_series.get('@type') == 'TVSeries': + info['series'] = unescapeHTML(part_of_series.get('name')) + elif item_type == 'Article': + info.update({ + 'timestamp': parse_iso8601(e.get('datePublished')), + 'title': unescapeHTML(e.get('headline')), + 'description': unescapeHTML(e.get('articleBody')), + }) + elif item_type == 'VideoObject': + info.update({ + 'url': e.get('contentUrl'), + 'title': unescapeHTML(e.get('name')), + 'description': unescapeHTML(e.get('description')), + 'thumbnail': e.get('thumbnailUrl'), + 'duration': parse_duration(e.get('duration')), + 'timestamp': unified_timestamp(e.get('uploadDate')), + 'filesize': float_or_none(e.get('contentSize')), + 'tbr': int_or_none(e.get('bitrate')), + 'width': int_or_none(e.get('width')), + 'height': int_or_none(e.get('height')), + }) + break return dict((k, v) for k, v in info.items() if v is not None) @staticmethod @@ -876,7 +923,11 @@ class InfoExtractor(object): f['ext'] = determine_ext(f['url']) if isinstance(field_preference, (list, tuple)): - return tuple(f.get(field) if f.get(field) is not None else -1 for field in field_preference) + return tuple( + f.get(field) + if f.get(field) is not None + else ('' if field == 'format_id' else -1) + for field in field_preference) preference = f.get('preference') if preference is None: @@ -884,7 +935,8 @@ class InfoExtractor(object): if f.get('ext') in ['f4f', 'f4m']: # Not yet supported preference -= 0.5 - proto_preference = 0 if determine_protocol(f) in ['http', 'https'] else -0.1 + protocol = f.get('protocol') or determine_protocol(f) + proto_preference = 0 if protocol in ['http', 'https'] else (-0.5 if protocol == 'rtsp' else -0.1) if f.get('vcodec') == 'none': # audio only preference -= 50 @@ -1101,7 +1153,7 @@ class InfoExtractor(object): 'url': m3u8_url, 'ext': ext, 'protocol': 'm3u8', - 'preference': preference - 1 if preference else -1, + 
'preference': preference - 100 if preference else -100, 'resolution': 'multiple', 'format_note': 'Quality selection URL', } @@ -1180,6 +1232,7 @@ class InfoExtractor(object): 'url': format_url(line.strip()), 'tbr': tbr, 'ext': ext, + 'fps': float_or_none(last_info.get('FRAME-RATE')), 'protocol': entry_protocol, 'preference': preference, } @@ -1188,24 +1241,17 @@ class InfoExtractor(object): width_str, height_str = resolution.split('x') f['width'] = int(width_str) f['height'] = int(height_str) - codecs = last_info.get('CODECS') - if codecs: - vcodec, acodec = [None] * 2 - va_codecs = codecs.split(',') - if len(va_codecs) == 1: - # Audio only entries usually come with single codec and - # no resolution. For more robustness we also check it to - # be mp4 audio. - if not resolution and va_codecs[0].startswith('mp4a'): - vcodec, acodec = 'none', va_codecs[0] - else: - vcodec = va_codecs[0] - else: - vcodec, acodec = va_codecs[:2] + # Unified Streaming Platform + mobj = re.search( + r'audio.*?(?:%3D|=)(\d+)(?:-video.*?(?:%3D|=)(\d+))?', f['url']) + if mobj: + abr, vbr = mobj.groups() + abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000) f.update({ - 'acodec': acodec, - 'vcodec': vcodec, + 'vbr': vbr, + 'abr': abr, }) + f.update(parse_codecs(last_info.get('CODECS'))) if last_media is not None: f['m3u8_media'] = last_media last_media = None @@ -1460,6 +1506,13 @@ class InfoExtractor(object): compat_etree_fromstring(mpd.encode('utf-8')), mpd_id, mpd_base_url, formats_dict=formats_dict) def _parse_mpd_formats(self, mpd_doc, mpd_id=None, mpd_base_url='', formats_dict={}): + """ + Parse formats from MPD manifest. + References: + 1. MPEG-DASH Standard, ISO/IEC 23009-1:2014(E), + http://standards.iso.org/ittf/PubliclyAvailableStandards/c065274_ISO_IEC_23009-1_2014.zip + 2. 
https://en.wikipedia.org/wiki/Dynamic_Adaptive_Streaming_over_HTTP + """ if mpd_doc.get('type') == 'dynamic': return [] @@ -1492,8 +1545,16 @@ class InfoExtractor(object): s_e = segment_timeline.findall(_add_ns('S')) if s_e: ms_info['total_number'] = 0 + ms_info['s'] = [] for s in s_e: - ms_info['total_number'] += 1 + int(s.get('r', '0')) + r = int(s.get('r', 0)) + ms_info['total_number'] += 1 + r + ms_info['s'].append({ + 't': int(s.get('t', 0)), + # @d is mandatory (see [1, 5.3.9.6.2, Table 17, page 60]) + 'd': int(s.attrib['d']), + 'r': r, + }) else: timescale = segment_template.get('timescale') if timescale: @@ -1530,7 +1591,7 @@ class InfoExtractor(object): continue representation_attrib = adaptation_set.attrib.copy() representation_attrib.update(representation.attrib) - # According to page 41 of ISO/IEC 29001-1:2014, @mimeType is mandatory + # According to [1, 5.3.7.2, Table 9, page 41], @mimeType is mandatory mime_type = representation_attrib['mimeType'] content_type = mime_type.split('/')[0] if content_type == 'text': @@ -1574,16 +1635,40 @@ class InfoExtractor(object): representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration)) media_template = representation_ms_info['media_template'] media_template = media_template.replace('$RepresentationID$', representation_id) - media_template = re.sub(r'\$(Number|Bandwidth)\$', r'%(\1)d', media_template) - media_template = re.sub(r'\$(Number|Bandwidth)%([^$]+)\$', r'%(\1)\2', media_template) + media_template = re.sub(r'\$(Number|Bandwidth|Time)\$', r'%(\1)d', media_template) + media_template = re.sub(r'\$(Number|Bandwidth|Time)%([^$]+)\$', r'%(\1)\2', media_template) media_template.replace('$$', '$') - representation_ms_info['segment_urls'] = [ - media_template % { - 'Number': segment_number, - 'Bandwidth': representation_attrib.get('bandwidth')} - for segment_number in range( - representation_ms_info['start_number'], - representation_ms_info['total_number'] + 
representation_ms_info['start_number'])] + + # As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$ + # can't be used at the same time + if '%(Number' in media_template: + representation_ms_info['segment_urls'] = [ + media_template % { + 'Number': segment_number, + 'Bandwidth': representation_attrib.get('bandwidth'), + } + for segment_number in range( + representation_ms_info['start_number'], + representation_ms_info['total_number'] + representation_ms_info['start_number'])] + else: + representation_ms_info['segment_urls'] = [] + segment_time = 0 + + def add_segment_url(): + representation_ms_info['segment_urls'].append( + media_template % { + 'Time': segment_time, + 'Bandwidth': representation_attrib.get('bandwidth'), + } + ) + + for num, s in enumerate(representation_ms_info['s']): + segment_time = s.get('t') or segment_time + add_segment_url() + for r in range(s.get('r', 0)): + segment_time += s['d'] + add_segment_url() + segment_time += s['d'] if 'segment_urls' in representation_ms_info: f.update({ 'segment_urls': representation_ms_info['segment_urls'], @@ -1610,6 +1695,62 @@ class InfoExtractor(object): self.report_warning('Unknown MIME type %s in DASH manifest' % mime_type) return formats + def _parse_html5_media_entries(self, base_url, webpage): + def absolute_url(video_url): + return compat_urlparse.urljoin(base_url, video_url) + + def parse_content_type(content_type): + if not content_type: + return {} + ctr = re.search(r'(?P<mimetype>[^/]+/[^;]+)(?:;\s*codecs="?(?P<codecs>[^"]+))?', content_type) + if ctr: + mimetype, codecs = ctr.groups() + f = parse_codecs(codecs) + f['ext'] = mimetype2ext(mimetype) + return f + return {} + + entries = [] + for media_tag, media_type, media_content in re.findall(r'(?s)(<(?P<tag>video|audio)[^>]*>)(.*?)</(?P=tag)>', webpage): + media_info = { + 'formats': [], + 'subtitles': {}, + } + media_attributes = extract_attributes(media_tag) + src = media_attributes.get('src') + if src: + media_info['formats'].append({ + 
'url': absolute_url(src), + 'vcodec': 'none' if media_type == 'audio' else None, + }) + media_info['thumbnail'] = media_attributes.get('poster') + if media_content: + for source_tag in re.findall(r'<source[^>]+>', media_content): + source_attributes = extract_attributes(source_tag) + src = source_attributes.get('src') + if not src: + continue + f = parse_content_type(source_attributes.get('type')) + f.update({ + 'url': absolute_url(src), + 'vcodec': 'none' if media_type == 'audio' else None, + }) + media_info['formats'].append(f) + for track_tag in re.findall(r'<track[^>]+>', media_content): + track_attributes = extract_attributes(track_tag) + kind = track_attributes.get('kind') + if not kind or kind == 'subtitles': + src = track_attributes.get('src') + if not src: + continue + lang = track_attributes.get('srclang') or track_attributes.get('lang') or track_attributes.get('label') + media_info['subtitles'].setdefault(lang, []).append({ + 'url': absolute_url(src), + }) + if media_info['formats']: + entries.append(media_info) + return entries + def _live_title(self, name): """ Generate the title for a live video """ now = datetime.datetime.now() @@ -1670,7 +1811,7 @@ class InfoExtractor(object): any_restricted = False for tc in self.get_testcases(include_onlymatching=False): - if 'playlist' in tc: + if tc.get('playlist', []): tc = tc['playlist'][0] is_restricted = age_restricted( tc.get('info_dict', {}).get('age_limit'), age_limit) @@ -1723,6 +1864,13 @@ class InfoExtractor(object): def _mark_watched(self, *args, **kwargs): raise NotImplementedError('This method must be implemented by subclasses') + def geo_verification_headers(self): + headers = {} + geo_verification_proxy = self._downloader.params.get('geo_verification_proxy') + if geo_verification_proxy: + headers['Ytdl-request-proxy'] = geo_verification_proxy + return headers + class SearchInfoExtractor(InfoExtractor): """ diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py index 
e8f2b5a..8d8f605 100644 --- a/youtube_dl/extractor/condenast.py +++ b/youtube_dl/extractor/condenast.py @@ -5,13 +5,17 @@ import re from .common import InfoExtractor from ..compat import ( - compat_urllib_parse_urlencode, compat_urllib_parse_urlparse, compat_urlparse, ) from ..utils import ( orderedSet, remove_end, + extract_attributes, + mimetype2ext, + determine_ext, + int_or_none, + parse_iso8601, ) @@ -58,6 +62,9 @@ class CondeNastIE(InfoExtractor): 'ext': 'mp4', 'title': '3D Printed Speakers Lit With LED', 'description': 'Check out these beautiful 3D printed LED speakers. You can\'t actually buy them, but LumiGeek is working on a board that will let you make you\'re own.', + 'uploader': 'wired', + 'upload_date': '20130314', + 'timestamp': 1363219200, } }, { # JS embed @@ -67,70 +74,93 @@ class CondeNastIE(InfoExtractor): 'id': '55f9cf8b61646d1acf00000c', 'ext': 'mp4', 'title': '3D printed TSA Travel Sentry keys really do open TSA locks', + 'uploader': 'arstechnica', + 'upload_date': '20150916', + 'timestamp': 1442434955, } }] def _extract_series(self, url, webpage): - title = self._html_search_regex(r'<div class="cne-series-info">.*?<h1>(.+?)</h1>', - webpage, 'series title', flags=re.DOTALL) + title = self._html_search_regex( + r'(?s)<div class="cne-series-info">.*?<h1>(.+?)</h1>', + webpage, 'series title') url_object = compat_urllib_parse_urlparse(url) base_url = '%s://%s' % (url_object.scheme, url_object.netloc) - m_paths = re.finditer(r'<p class="cne-thumb-title">.*?<a href="(/watch/.+?)["\?]', - webpage, flags=re.DOTALL) + m_paths = re.finditer( + r'(?s)<p class="cne-thumb-title">.*?<a href="(/watch/.+?)["\?]', webpage) paths = orderedSet(m.group(1) for m in m_paths) build_url = lambda path: compat_urlparse.urljoin(base_url, path) entries = [self.url_result(build_url(path), 'CondeNast') for path in paths] return self.playlist_result(entries, playlist_title=title) def _extract_video(self, webpage, url_type): - if url_type != 'embed': - description = 
self._html_search_regex( - [ - r'<div class="cne-video-description">(.+?)</div>', - r'<div class="video-post-content">(.+?)</div>', - ], - webpage, 'description', fatal=False, flags=re.DOTALL) + query = {} + params = self._search_regex( + r'(?s)var params = {(.+?)}[;,]', webpage, 'player params', default=None) + if params: + query.update({ + 'videoId': self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id'), + 'playerId': self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id'), + 'target': self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target'), + }) else: - description = None - params = self._search_regex(r'var params = {(.+?)}[;,]', webpage, - 'player params', flags=re.DOTALL) - video_id = self._search_regex(r'videoId: [\'"](.+?)[\'"]', params, 'video id') - player_id = self._search_regex(r'playerId: [\'"](.+?)[\'"]', params, 'player id') - target = self._search_regex(r'target: [\'"](.+?)[\'"]', params, 'target') - data = compat_urllib_parse_urlencode({'videoId': video_id, - 'playerId': player_id, - 'target': target, - }) - base_info_url = self._search_regex(r'url = [\'"](.+?)[\'"][,;]', - webpage, 'base info url', - default='http://player.cnevids.com/player/loader.js?') - info_url = base_info_url + data - info_page = self._download_webpage(info_url, video_id, - 'Downloading video info') - video_info = self._search_regex(r'var\s+video\s*=\s*({.+?});', info_page, 'video info') - video_info = self._parse_json(video_info, video_id) - - formats = [{ - 'format_id': '%s-%s' % (fdata['type'].split('/')[-1], fdata['quality']), - 'url': fdata['src'], - 'ext': fdata['type'].split('/')[-1], - 'quality': 1 if fdata['quality'] == 'high' else 0, - } for fdata in video_info['sources'][0]] + params = extract_attributes(self._search_regex( + r'(<[^>]+data-js="video-player"[^>]+>)', + webpage, 'player params element')) + query.update({ + 'videoId': params['data-video'], + 'playerId': params['data-player'], + 'target': params['id'], + }) + 
video_id = query['videoId'] + video_info = None + info_page = self._download_webpage( + 'http://player.cnevids.com/player/video.js', + video_id, 'Downloading video info', query=query, fatal=False) + if info_page: + video_info = self._parse_json(self._search_regex( + r'loadCallback\(({.+})\)', info_page, 'video info'), video_id)['video'] + else: + info_page = self._download_webpage( + 'http://player.cnevids.com/player/loader.js', + video_id, 'Downloading loader info', query=query) + video_info = self._parse_json(self._search_regex( + r'var\s+video\s*=\s*({.+?});', info_page, 'video info'), video_id) + title = video_info['title'] + + formats = [] + for fdata in video_info.get('sources', [{}])[0]: + src = fdata.get('src') + if not src: + continue + ext = mimetype2ext(fdata.get('type')) or determine_ext(src) + quality = fdata.get('quality') + formats.append({ + 'format_id': ext + ('-%s' % quality if quality else ''), + 'url': src, + 'ext': ext, + 'quality': 1 if quality == 'high' else 0, + }) self._sort_formats(formats) - return { + info = self._search_json_ld( + webpage, video_id, fatal=False) if url_type != 'embed' else {} + info.update({ 'id': video_id, 'formats': formats, - 'title': video_info['title'], - 'thumbnail': video_info['poster_frame'], - 'description': description, - } + 'title': title, + 'thumbnail': video_info.get('poster_frame'), + 'uploader': video_info.get('brand'), + 'duration': int_or_none(video_info.get('duration')), + 'tags': video_info.get('tags'), + 'series': video_info.get('series_title'), + 'season': video_info.get('season_title'), + 'timestamp': parse_iso8601(video_info.get('premiere_date')), + }) + return info def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - site = mobj.group('site') - url_type = mobj.group('type') - item_id = mobj.group('id') + site, url_type, item_id = re.match(self._VALID_URL, url).groups() # Convert JS embed to regular embed if url_type == 'embedjs': diff --git 
a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py index 90a6430..6d3abb5 100644 --- a/youtube_dl/extractor/crunchyroll.py +++ b/youtube_dl/extractor/crunchyroll.py @@ -115,6 +115,21 @@ class CrunchyrollIE(CrunchyrollBaseIE): 'skip_download': True, }, }, { + 'url': 'http://www.crunchyroll.com/rezero-starting-life-in-another-world-/episode-5-the-morning-of-our-promise-is-still-distant-702409', + 'info_dict': { + 'id': '702409', + 'ext': 'mp4', + 'title': 'Re:ZERO -Starting Life in Another World- Episode 5 – The Morning of Our Promise Is Still Distant', + 'description': 'md5:97664de1ab24bbf77a9c01918cb7dca9', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'TV TOKYO', + 'upload_date': '20160508', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + }, { 'url': 'http://www.crunchyroll.fr/girl-friend-beta/episode-11-goodbye-la-mode-661697', 'only_matching': True, }, { @@ -336,9 +351,18 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text if video_encode_id in video_encode_ids: continue video_encode_ids.append(video_encode_id) + + video_file = xpath_text(stream_info, './file') + if not video_file: + continue + if video_file.startswith('http'): + formats.extend(self._extract_m3u8_formats( + video_file, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + continue + video_url = xpath_text(stream_info, './host') - video_play_path = xpath_text(stream_info, './file') - if not video_url or not video_play_path: + if not video_url: continue metadata = stream_info.find('./metadata') format_info = { @@ -353,7 +377,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text parsed_video_url = compat_urlparse.urlparse(video_url) direct_video_url = compat_urlparse.urlunparse(parsed_video_url._replace( netloc='v.lvlt.crcdn.net', - path='%s/%s' % (remove_end(parsed_video_url.path, '/'), video_play_path.split(':')[-1]))) + path='%s/%s' % 
(remove_end(parsed_video_url.path, '/'), video_file.split(':')[-1]))) if self._is_valid_url(direct_video_url, video_id, video_format): format_info.update({ 'url': direct_video_url, @@ -363,7 +387,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text format_info.update({ 'url': video_url, - 'play_path': video_play_path, + 'play_path': video_file, 'ext': 'flv', }) formats.append(format_info) diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py index 84b36f4..7e5d4f2 100644 --- a/youtube_dl/extractor/cspan.py +++ b/youtube_dl/extractor/cspan.py @@ -51,8 +51,11 @@ class CSpanIE(InfoExtractor): 'url': 'http://www.c-span.org/video/?104517-1/immigration-reforms-needed-protect-skilled-american-workers', 'info_dict': { 'id': 'judiciary031715', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Immigration Reforms Needed to Protect Skilled American Workers', + }, + 'params': { + 'skip_download': True, # m3u8 downloads } }] diff --git a/youtube_dl/extractor/ctsnews.py b/youtube_dl/extractor/ctsnews.py index 1622fc8..83ca90c 100644 --- a/youtube_dl/extractor/ctsnews.py +++ b/youtube_dl/extractor/ctsnews.py @@ -1,13 +1,12 @@ -# -*- coding: utf-8 -*- +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import parse_iso8601, ExtractorError +from ..utils import unified_timestamp class CtsNewsIE(InfoExtractor): IE_DESC = '華視新聞' - # https connection failed (Connection reset) _VALID_URL = r'https?://news\.cts\.com\.tw/[a-z]+/[a-z]+/\d+/(?P<id>\d+)\.html' _TESTS = [{ 'url': 'http://news.cts.com.tw/cts/international/201501/201501291578109.html', @@ -16,7 +15,7 @@ class CtsNewsIE(InfoExtractor): 'id': '201501291578109', 'ext': 'mp4', 'title': '以色列.真主黨交火 3人死亡', - 'description': 'md5:95e9b295c898b7ff294f09d450178d7d', + 'description': '以色列和黎巴嫩真主黨,爆發五年最嚴重衝突,雙方砲轟交火,兩名以軍死亡,還有一名西班牙籍的聯合國維和人...', 'timestamp': 1422528540, 'upload_date': '20150129', } @@ -28,7 +27,7 @@ class CtsNewsIE(InfoExtractor): 
'id': '201309031304098', 'ext': 'mp4', 'title': '韓國31歲童顏男 貌如十多歲小孩', - 'description': 'md5:f183feeba3752b683827aab71adad584', + 'description': '越有年紀的人,越希望看起來年輕一點,而南韓卻有一位31歲的男子,看起來像是11、12歲的小孩,身...', 'thumbnail': 're:^https?://.*\.jpg$', 'timestamp': 1378205880, 'upload_date': '20130903', @@ -36,8 +35,7 @@ class CtsNewsIE(InfoExtractor): }, { # With Youtube embedded video 'url': 'http://news.cts.com.tw/cts/money/201501/201501291578003.html', - 'md5': '1d842c771dc94c8c3bca5af2cc1db9c5', - 'add_ie': ['Youtube'], + 'md5': 'e4726b2ccd70ba2c319865e28f0a91d1', 'info_dict': { 'id': 'OVbfO7d0_hQ', 'ext': 'mp4', @@ -47,42 +45,37 @@ class CtsNewsIE(InfoExtractor): 'upload_date': '20150128', 'uploader_id': 'TBSCTS', 'uploader': '中華電視公司', - } + }, + 'add_ie': ['Youtube'], }] def _real_extract(self, url): news_id = self._match_id(url) page = self._download_webpage(url, news_id) - if self._search_regex(r'(CTSPlayer2)', page, 'CTSPlayer2 identifier', default=None): - feed_url = self._html_search_regex( - r'(http://news\.cts\.com\.tw/action/mp4feed\.php\?news_id=\d+)', - page, 'feed url') - video_url = self._download_webpage( - feed_url, news_id, note='Fetching feed') + news_id = self._hidden_inputs(page).get('get_id') + + if news_id: + mp4_feed = self._download_json( + 'http://news.cts.com.tw/action/test_mp4feed.php', + news_id, note='Fetching feed', query={'news_id': news_id}) + video_url = mp4_feed['source_url'] else: self.to_screen('Not CTSPlayer video, trying Youtube...') youtube_url = self._search_regex( - r'src="(//www\.youtube\.com/embed/[^"]+)"', page, 'youtube url', - default=None) - if not youtube_url: - raise ExtractorError('The news includes no videos!', expected=True) + r'src="(//www\.youtube\.com/embed/[^"]+)"', page, 'youtube url') - return { - '_type': 'url', - 'url': youtube_url, - 'ie_key': 'Youtube', - } + return self.url_result(youtube_url, ie='Youtube') description = self._html_search_meta('description', page) - title = self._html_search_meta('title', page) + 
title = self._html_search_meta('title', page, fatal=True) thumbnail = self._html_search_meta('image', page) datetime_str = self._html_search_regex( - r'(\d{4}/\d{2}/\d{2} \d{2}:\d{2})', page, 'date and time') - # Transform into ISO 8601 format with timezone info - datetime_str = datetime_str.replace('/', '-') + ':00+0800' - timestamp = parse_iso8601(datetime_str, delimiter=' ') + r'(\d{4}/\d{2}/\d{2} \d{2}:\d{2})', page, 'date and time', fatal=False) + timestamp = None + if datetime_str: + timestamp = unified_timestamp(datetime_str) - 8 * 3600 return { 'id': news_id, diff --git a/youtube_dl/extractor/ctv.py b/youtube_dl/extractor/ctv.py new file mode 100644 index 0000000..5807fba --- /dev/null +++ b/youtube_dl/extractor/ctv.py @@ -0,0 +1,30 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class CTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?ctv\.ca/video/player\?vid=(?P<id>[0-9.]+)' + _TESTS = [{ + 'url': 'http://www.ctv.ca/video/player?vid=706966', + 'md5': 'ff2ebbeae0aa2dcc32a830c3fd69b7b0', + 'info_dict': { + 'id': '706966', + 'ext': 'mp4', + 'title': 'Larry Day and Richard Jutras on the TIFF red carpet of \'Stonewall\'', + 'description': 'etalk catches up with Larry Day and Richard Jutras on the TIFF red carpet of "Stonewall”.', + 'upload_date': '20150919', + 'timestamp': 1442624700, + }, + 'expected_warnings': ['HTTP Error 404'], + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': '9c9media:ctv_web:%s' % video_id, + 'ie_key': 'NineCNineMedia', + } diff --git a/youtube_dl/extractor/ctvnews.py b/youtube_dl/extractor/ctvnews.py new file mode 100644 index 0000000..1023b61 --- /dev/null +++ b/youtube_dl/extractor/ctvnews.py @@ -0,0 +1,65 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import orderedSet + + +class CTVNewsIE(InfoExtractor): 
+ _VALID_URL = r'https?://(?:www\.)?ctvnews\.ca/(?:video\?(?:clip|playlist|bin)Id=|.*?)(?P<id>[0-9.]+)' + _TESTS = [{ + 'url': 'http://www.ctvnews.ca/video?clipId=901995', + 'md5': '10deb320dc0ccb8d01d34d12fc2ea672', + 'info_dict': { + 'id': '901995', + 'ext': 'mp4', + 'title': 'Extended: \'That person cannot be me\' Johnson says', + 'description': 'md5:958dd3b4f5bbbf0ed4d045c790d89285', + 'timestamp': 1467286284, + 'upload_date': '20160630', + } + }, { + 'url': 'http://www.ctvnews.ca/video?playlistId=1.2966224', + 'info_dict': + { + 'id': '1.2966224', + }, + 'playlist_mincount': 19, + }, { + 'url': 'http://www.ctvnews.ca/video?binId=1.2876780', + 'info_dict': + { + 'id': '1.2876780', + }, + 'playlist_mincount': 100, + }, { + 'url': 'http://www.ctvnews.ca/1.810401', + 'only_matching': True, + }, { + 'url': 'http://www.ctvnews.ca/canadiens-send-p-k-subban-to-nashville-in-blockbuster-trade-1.2967231', + 'only_matching': True, + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + + def ninecninemedia_url_result(clip_id): + return { + '_type': 'url_transparent', + 'id': clip_id, + 'url': '9c9media:ctvnews_web:%s' % clip_id, + 'ie_key': 'NineCNineMedia', + } + + if page_id.isdigit(): + return ninecninemedia_url_result(page_id) + else: + webpage = self._download_webpage('http://www.ctvnews.ca/%s' % page_id, page_id, query={ + 'ot': 'example.AjaxPageLayout.ot', + 'maxItemsPerPage': 1000000, + }) + entries = [ninecninemedia_url_result(clip_id) for clip_id in orderedSet( + re.findall(r'clip\.id\s*=\s*(\d+);', webpage))] + return self.playlist_result(entries, page_id) diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py index ebd14cb..1ab9333 100644 --- a/youtube_dl/extractor/cwtv.py +++ b/youtube_dl/extractor/cwtv.py @@ -9,7 +9,7 @@ from ..utils import ( class CWTVIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)?cw(?:tv|seed)\.com/(?:shows/)?(?:[^/]+/){2}\?.*\bplay=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})' + _VALID_URL = r'https?://(?:www\.)?cw(?:tv(?:pr)?|seed)\.com/(?:shows/)?(?:[^/]+/)+[^?]*\?.*\b(?:play|watch)=(?P<id>[a-z0-9]{8}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{4}-[a-z0-9]{12})' _TESTS = [{ 'url': 'http://cwtv.com/shows/arrow/legends-of-yesterday/?play=6b15e985-9345-4f60-baf8-56e96be57c63', 'info_dict': { @@ -28,7 +28,8 @@ class CWTVIE(InfoExtractor): 'params': { # m3u8 download 'skip_download': True, - } + }, + 'skip': 'redirect to http://cwtv.com/shows/arrow/', }, { 'url': 'http://www.cwseed.com/shows/whose-line-is-it-anyway/jeff-davis-4/?play=24282b12-ead2-42f2-95ad-26770c2c6088', 'info_dict': { @@ -44,22 +45,43 @@ class CWTVIE(InfoExtractor): 'upload_date': '20151006', 'timestamp': 1444107300, }, - 'params': { - # m3u8 download - 'skip_download': True, - } }, { 'url': 'http://cwtv.com/thecw/chroniclesofcisco/?play=8adebe35-f447-465f-ab52-e863506ff6d6', 'only_matching': True, + }, { + 'url': 'http://cwtvpr.com/the-cw/video?watch=9eee3f60-ef4e-440b-b3b2-49428ac9c54e', + 'only_matching': True, + }, { + 'url': 'http://cwtv.com/shows/arrow/legends-of-yesterday/?watch=6b15e985-9345-4f60-baf8-56e96be57c63', + 'only_matching': True, }] def _real_extract(self, url): video_id = self._match_id(url) - video_data = self._download_json( - 'http://metaframe.digitalsmiths.tv/v2/CWtv/assets/%s/partner/132?format=json' % video_id, video_id) - - formats = self._extract_m3u8_formats( - video_data['videos']['variantplaylist']['uri'], video_id, 'mp4') + video_data = None + formats = [] + for partner in (154, 213): + vdata = self._download_json( + 'http://metaframe.digitalsmiths.tv/v2/CWtv/assets/%s/partner/%d?format=json' % (video_id, partner), video_id, fatal=False) + if not vdata: + continue + video_data = vdata + for quality, quality_data in vdata.get('videos', {}).items(): + quality_url = quality_data.get('uri') + if not quality_url: 
+ continue + if quality == 'variantplaylist': + formats.extend(self._extract_m3u8_formats( + quality_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) + else: + tbr = int_or_none(quality_data.get('bitrate')) + format_id = 'http' + ('-%d' % tbr if tbr else '') + if self._is_valid_url(quality_url, video_id, format_id): + formats.append({ + 'format_id': format_id, + 'url': quality_url, + 'tbr': tbr, + }) self._sort_formats(formats) thumbnails = [{ diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py index b60a1d8..98c835b 100644 --- a/youtube_dl/extractor/dailymail.py +++ b/youtube_dl/extractor/dailymail.py @@ -5,19 +5,20 @@ from .common import InfoExtractor from ..utils import ( int_or_none, determine_protocol, + unescapeHTML, ) class DailyMailIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)' _TEST = { - 'url': 'http://www.dailymail.co.uk/video/sciencetech/video-1288527/Turn-video-impressionist-masterpiece.html', - 'md5': '2f639d446394f53f3a33658b518b6615', + 'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html', + 'md5': 'f6129624562251f628296c3a9ffde124', 'info_dict': { - 'id': '1288527', + 'id': '1295863', 'ext': 'mp4', - 'title': 'Turn any video into an impressionist masterpiece', - 'description': 'md5:88ddbcb504367987b2708bb38677c9d2', + 'title': 'The Mountain appears in sparkling water ad for \'Heavy Bubbles\'', + 'description': 'md5:a93d74b6da172dd5dc4d973e0b766a84', } } @@ -26,7 +27,7 @@ class DailyMailIE(InfoExtractor): webpage = self._download_webpage(url, video_id) video_data = self._parse_json(self._search_regex( r"data-opts='({.+?})'", webpage, 'video data'), video_id) - title = video_data['title'] + title = unescapeHTML(video_data['title']) video_sources = self._download_json(video_data.get( 'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, 
video_id) @@ -55,7 +56,7 @@ class DailyMailIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'description': video_data.get('descr'), + 'description': unescapeHTML(video_data.get('descr')), 'thumbnail': video_data.get('poster') or video_data.get('thumbnail'), 'formats': formats, } diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py index 2e6226e..496883d 100644 --- a/youtube_dl/extractor/dailymotion.py +++ b/youtube_dl/extractor/dailymotion.py @@ -16,6 +16,7 @@ from ..utils import ( sanitized_Request, str_to_int, unescapeHTML, + mimetype2ext, ) @@ -111,6 +112,13 @@ class DailymotionIE(DailymotionBaseInfoExtractor): } ] + @staticmethod + def _extract_urls(webpage): + # Look for embedded Dailymotion player + matches = re.findall( + r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage) + return list(map(lambda m: unescapeHTML(m[1]), matches)) + def _real_extract(self, url): video_id = self._match_id(url) @@ -153,18 +161,19 @@ class DailymotionIE(DailymotionBaseInfoExtractor): type_ = media.get('type') if type_ == 'application/vnd.lumberjack.manifest': continue - ext = determine_ext(media_url) - if type_ == 'application/x-mpegURL' or ext == 'm3u8': + ext = mimetype2ext(type_) or determine_ext(media_url) + if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( media_url, video_id, 'mp4', preference=-1, m3u8_id='hls', fatal=False)) - elif type_ == 'application/f4m' or ext == 'f4m': + elif ext == 'f4m': formats.extend(self._extract_f4m_formats( media_url, video_id, preference=-1, f4m_id='hds', fatal=False)) else: f = { 'url': media_url, 'format_id': 'http-%s' % quality, + 'ext': ext, } m = re.search(r'H264-(?P<width>\d+)x(?P<height>\d+)', media_url) if m: @@ -322,7 +331,9 @@ class DailymotionPlaylistIE(DailymotionBaseInfoExtractor): for video_id in re.findall(r'data-xid="(.+?)"', webpage): 
if video_id not in video_ids: - yield self.url_result('http://www.dailymotion.com/video/%s' % video_id, 'Dailymotion') + yield self.url_result( + 'http://www.dailymotion.com/video/%s' % video_id, + DailymotionIE.ie_key(), video_id) video_ids.add(video_id) if re.search(self._MORE_PAGES_INDICATOR, webpage) is None: diff --git a/youtube_dl/extractor/daum.py b/youtube_dl/extractor/daum.py index 86024a7..b5c310c 100644 --- a/youtube_dl/extractor/daum.py +++ b/youtube_dl/extractor/daum.py @@ -66,22 +66,32 @@ class DaumIE(InfoExtractor): 'view_count': int, 'comment_count': int, }, + }, { + # Requires dte_type=WEB (#9972) + 'url': 'http://tvpot.daum.net/v/s3794Uf1NZeZ1qMpGpeqeRU', + 'md5': 'a8917742069a4dd442516b86e7d66529', + 'info_dict': { + 'id': 's3794Uf1NZeZ1qMpGpeqeRU', + 'ext': 'mp4', + 'title': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny) [쇼! 음악중심] 508회 20160611', + 'description': '러블리즈 - Destiny (나의 지구) (Lovelyz - Destiny)\n\n[쇼! 음악중심] 20160611, 507회', + 'upload_date': '20160611', + }, }] def _real_extract(self, url): video_id = compat_urllib_parse_unquote(self._match_id(url)) - query = compat_urllib_parse_urlencode({'vid': video_id}) movie_data = self._download_json( - 'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json?' + query, - video_id, 'Downloading video formats info') + 'http://videofarm.daum.net/controller/api/closed/v1_2/IntegratedMovieData.json', + video_id, 'Downloading video formats info', query={'vid': video_id, 'dte_type': 'WEB'}) # For urls like http://m.tvpot.daum.net/v/65139429, where the video_id is really a clipid if not movie_data.get('output_list', {}).get('output_list') and re.match(r'^\d+$', video_id): return self.url_result('http://tvpot.daum.net/clip/ClipView.do?clipid=%s' % video_id) info = self._download_xml( - 'http://tvpot.daum.net/clip/ClipInfoXml.do?' 
+ query, video_id, - 'Downloading video info') + 'http://tvpot.daum.net/clip/ClipInfoXml.do', video_id, + 'Downloading video info', query={'vid': video_id}) formats = [] for format_el in movie_data['output_list']['output_list']: diff --git a/youtube_dl/extractor/dbtv.py b/youtube_dl/extractor/dbtv.py index 133cdc5..caff884 100644 --- a/youtube_dl/extractor/dbtv.py +++ b/youtube_dl/extractor/dbtv.py @@ -4,78 +4,47 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str -from ..utils import ( - float_or_none, - int_or_none, - clean_html, -) class DBTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?dbtv\.no/(?:(?:lazyplayer|player)/)?(?P<id>[0-9]+)(?:#(?P<display_id>.+))?' + _VALID_URL = r'https?://(?:www\.)?dbtv\.no/(?:[^/]+/)?(?P<id>[0-9]+)(?:#(?P<display_id>.+))?' _TESTS = [{ 'url': 'http://dbtv.no/3649835190001#Skulle_teste_ut_fornøyelsespark,_men_kollegaen_var_bare_opptatt_av_bikinikroppen', - 'md5': 'b89953ed25dacb6edb3ef6c6f430f8bc', + 'md5': '2e24f67936517b143a234b4cadf792ec', 'info_dict': { - 'id': '33100', + 'id': '3649835190001', 'display_id': 'Skulle_teste_ut_fornøyelsespark,_men_kollegaen_var_bare_opptatt_av_bikinikroppen', 'ext': 'mp4', 'title': 'Skulle teste ut fornøyelsespark, men kollegaen var bare opptatt av bikinikroppen', 'description': 'md5:1504a54606c4dde3e4e61fc97aa857e0', - 'thumbnail': 're:https?://.*\.jpg$', - 'timestamp': 1404039863.438, + 'thumbnail': 're:https?://.*\.jpg', + 'timestamp': 1404039863, 'upload_date': '20140629', 'duration': 69.544, - 'view_count': int, - 'categories': list, - } + 'uploader_id': '1027729757001', + }, + 'add_ie': ['BrightcoveNew'] }, { 'url': 'http://dbtv.no/3649835190001', 'only_matching': True, }, { 'url': 'http://www.dbtv.no/lazyplayer/4631135248001', 'only_matching': True, + }, { + 'url': 'http://dbtv.no/vice/5000634109001', + 'only_matching': True, + }, { + 'url': 'http://dbtv.no/filmtrailer/3359293614001', + 'only_matching': True, }] 
def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') or video_id - - data = self._download_json( - 'http://api.dbtv.no/discovery/%s' % video_id, display_id) - - video = data['playlist'][0] - - formats = [{ - 'url': f['URL'], - 'vcodec': f.get('container'), - 'width': int_or_none(f.get('width')), - 'height': int_or_none(f.get('height')), - 'vbr': float_or_none(f.get('rate'), 1000), - 'filesize': int_or_none(f.get('size')), - } for f in video['renditions'] if 'URL' in f] - - if not formats: - for url_key, format_id in [('URL', 'mp4'), ('HLSURL', 'hls')]: - if url_key in video: - formats.append({ - 'url': video[url_key], - 'format_id': format_id, - }) - - self._sort_formats(formats) + video_id, display_id = re.match(self._VALID_URL, url).groups() return { - 'id': compat_str(video['id']), + '_type': 'url_transparent', + 'url': 'http://players.brightcove.net/1027729757001/default_default/index.html?videoId=%s' % video_id, + 'id': video_id, 'display_id': display_id, - 'title': video['title'], - 'description': clean_html(video['desc']), - 'thumbnail': video.get('splash') or video.get('thumb'), - 'timestamp': float_or_none(video.get('publishedAt'), 1000), - 'duration': float_or_none(video.get('length'), 1000), - 'view_count': int_or_none(video.get('views')), - 'categories': video.get('tags'), - 'formats': formats, + 'ie_key': 'BrightcoveNew', } diff --git a/youtube_dl/extractor/dcn.py b/youtube_dl/extractor/dcn.py index efb8585..b854282 100644 --- a/youtube_dl/extractor/dcn.py +++ b/youtube_dl/extractor/dcn.py @@ -62,11 +62,9 @@ class DCNBaseIE(InfoExtractor): r'file\s*:\s*"https?(://[^"]+)/playlist.m3u8', r'<a[^>]+href="rtsp(://[^"]+)"' ], webpage, 'format url') - # TODO: Current DASH formats are broken - $Time$ pattern in - # <SegmentTemplate> not implemented yet - # formats.extend(self._extract_mpd_formats( - # format_url_base + '/manifest.mpd', - # video_id, mpd_id='dash', 
fatal=False)) + formats.extend(self._extract_mpd_formats( + format_url_base + '/manifest.mpd', + video_id, mpd_id='dash', fatal=False)) formats.extend(self._extract_m3u8_formats( format_url_base + '/playlist.m3u8', video_id, 'mp4', m3u8_entry_protocol, m3u8_id='hls', fatal=False)) diff --git a/youtube_dl/extractor/discoverygo.py b/youtube_dl/extractor/discoverygo.py new file mode 100644 index 0000000..adb68b9 --- /dev/null +++ b/youtube_dl/extractor/discoverygo.py @@ -0,0 +1,98 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + extract_attributes, + int_or_none, + parse_age_limit, + unescapeHTML, +) + + +class DiscoveryGoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?discoverygo\.com/(?:[^/]+/)*(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.discoverygo.com/love-at-first-kiss/kiss-first-ask-questions-later/', + 'info_dict': { + 'id': '57a33c536b66d1cd0345eeb1', + 'ext': 'mp4', + 'title': 'Kiss First, Ask Questions Later!', + 'description': 'md5:fe923ba34050eae468bffae10831cb22', + 'duration': 2579, + 'series': 'Love at First Kiss', + 'season_number': 1, + 'episode_number': 1, + 'age_limit': 14, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + container = extract_attributes( + self._search_regex( + r'(<div[^>]+class=["\']video-player-container[^>]+>)', + webpage, 'video container')) + + video = self._parse_json( + unescapeHTML(container.get('data-video') or container.get('data-json')), + display_id) + + title = video['name'] + + stream = video['stream'] + STREAM_URL_SUFFIX = 'streamUrl' + formats = [] + for stream_kind in ('', 'hds'): + suffix = STREAM_URL_SUFFIX.capitalize() if stream_kind else STREAM_URL_SUFFIX + stream_url = stream.get('%s%s' % (stream_kind, suffix)) + if not stream_url: + continue + if stream_kind == '': + formats.extend(self._extract_m3u8_formats( + stream_url, 
display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + elif stream_kind == 'hds': + formats.extend(self._extract_f4m_formats( + stream_url, display_id, f4m_id=stream_kind, fatal=False)) + self._sort_formats(formats) + + video_id = video.get('id') or display_id + description = video.get('description', {}).get('detailed') + duration = int_or_none(video.get('duration')) + + series = video.get('show', {}).get('name') + season_number = int_or_none(video.get('season', {}).get('number')) + episode_number = int_or_none(video.get('episodeNumber')) + + tags = video.get('tags') + age_limit = parse_age_limit(video.get('parental', {}).get('rating')) + + subtitles = {} + captions = stream.get('captions') + if isinstance(captions, list): + for caption in captions: + subtitle_url = caption.get('fileUrl') + if (not subtitle_url or not isinstance(subtitle_url, compat_str) or + not subtitle_url.startswith('http')): + continue + lang = caption.get('fileLang', 'en') + subtitles.setdefault(lang, []).append({'url': subtitle_url}) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'duration': duration, + 'series': series, + 'season_number': season_number, + 'episode_number': episode_number, + 'tags': tags, + 'age_limit': age_limit, + 'formats': formats, + 'subtitles': subtitles, + } diff --git a/youtube_dl/extractor/dreisat.py b/youtube_dl/extractor/dreisat.py index 0040e70..908c9e5 100644 --- a/youtube_dl/extractor/dreisat.py +++ b/youtube_dl/extractor/dreisat.py @@ -17,8 +17,12 @@ class DreiSatIE(ZDFIE): 'ext': 'mp4', 'title': 'Waidmannsheil', 'description': 'md5:cce00ca1d70e21425e72c86a98a56817', - 'uploader': '3sat', + 'uploader': 'SCHWEIZWEIT', + 'uploader_id': '100000210', 'upload_date': '20140913' + }, + 'params': { + 'skip_download': True, # m3u8 downloads } }, { diff --git a/youtube_dl/extractor/drtuber.py b/youtube_dl/extractor/drtuber.py index 639f918..e8870c4 100644 --- 
a/youtube_dl/extractor/drtuber.py +++ b/youtube_dl/extractor/drtuber.py @@ -3,7 +3,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..utils import str_to_int +from ..utils import ( + NO_DEFAULT, + str_to_int, +) class DrTuberIE(InfoExtractor): @@ -17,7 +20,6 @@ class DrTuberIE(InfoExtractor): 'ext': 'mp4', 'title': 'hot perky blonde naked golf', 'like_count': int, - 'dislike_count': int, 'comment_count': int, 'categories': ['Babe', 'Blonde', 'Erotic', 'Outdoor', 'Softcore', 'Solo'], 'thumbnail': 're:https?://.*\.jpg$', @@ -36,25 +38,29 @@ class DrTuberIE(InfoExtractor): r'<source src="([^"]+)"', webpage, 'video URL') title = self._html_search_regex( - [r'<p[^>]+class="title_substrate">([^<]+)</p>', r'<title>([^<]+) - \d+'], + (r'class="title_watch"[^>]*><p>([^<]+)<', + r'<p[^>]+class="title_substrate">([^<]+)</p>', + r'<title>([^<]+) - \d+'), webpage, 'title') thumbnail = self._html_search_regex( r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False) - def extract_count(id_, name): + def extract_count(id_, name, default=NO_DEFAULT): return str_to_int(self._html_search_regex( r'<span[^>]+(?:class|id)="%s"[^>]*>([\d,\.]+)</span>' % id_, - webpage, '%s count' % name, fatal=False)) + webpage, '%s count' % name, default=default, fatal=False)) like_count = extract_count('rate_likes', 'like') - dislike_count = extract_count('rate_dislikes', 'dislike') + dislike_count = extract_count('rate_dislikes', 'dislike', default=None) comment_count = extract_count('comments_count', 'comment') cats_str = self._search_regex( - r'<div[^>]+class="categories_list">(.+?)</div>', webpage, 'categories', fatal=False) - categories = [] if not cats_str else re.findall(r'<a title="([^"]+)"', cats_str) + r'<div[^>]+class="categories_list">(.+?)</div>', + webpage, 'categories', fatal=False) + categories = [] if not cats_str else re.findall( + r'<a title="([^"]+)"', cats_str) return { 'id': video_id, diff --git a/youtube_dl/extractor/eagleplatform.py 
b/youtube_dl/extractor/eagleplatform.py index 113a496..12d28d3 100644 --- a/youtube_dl/extractor/eagleplatform.py +++ b/youtube_dl/extractor/eagleplatform.py @@ -51,6 +51,14 @@ class EaglePlatformIE(InfoExtractor): }] @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//.+?\.media\.eagleplatform\.com/index/player\?.+?)\1', + webpage) + if mobj is not None: + return mobj.group('url') + + @staticmethod def _handle_error(response): status = int_or_none(response.get('status', 200)) if status != 200: diff --git a/youtube_dl/extractor/ellentv.py b/youtube_dl/extractor/ellentv.py index 4c8190d..74bbc5c 100644 --- a/youtube_dl/extractor/ellentv.py +++ b/youtube_dl/extractor/ellentv.py @@ -6,12 +6,13 @@ import json from .common import InfoExtractor from ..utils import ( ExtractorError, + NO_DEFAULT, ) class EllenTVIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?(?:ellentv|ellentube)\.com/videos/(?P<id>[a-z0-9_-]+)' - _TEST = { + _TESTS = [{ 'url': 'http://www.ellentv.com/videos/0-ipq1gsai/', 'md5': '4294cf98bc165f218aaa0b89e0fd8042', 'info_dict': { @@ -22,24 +23,47 @@ class EllenTVIE(InfoExtractor): 'timestamp': 1428035648, 'upload_date': '20150403', 'uploader_id': 'batchUser', - } - } + }, + }, { + # not available via http://widgets.ellentube.com/ + 'url': 'http://www.ellentv.com/videos/1-szkgu2m2/', + 'info_dict': { + 'id': '1_szkgu2m2', + 'ext': 'flv', + 'title': "Ellen's Amazingly Talented Audience", + 'description': 'md5:86ff1e376ff0d717d7171590e273f0a5', + 'timestamp': 1255140900, + 'upload_date': '20091010', + 'uploader_id': 'ellenkaltura@gmail.com', + }, + 'params': { + 'skip_download': True, + }, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://widgets.ellentube.com/videos/%s' % video_id, - video_id) + URLS = ('http://widgets.ellentube.com/videos/%s' % video_id, url) + + for num, url_ in enumerate(URLS, 1): + webpage = 
self._download_webpage( + url_, video_id, fatal=num == len(URLS)) + + default = NO_DEFAULT if num == len(URLS) else None + + partner_id = self._search_regex( + r"var\s+partnerId\s*=\s*'([^']+)", webpage, 'partner id', + default=default) - partner_id = self._search_regex( - r"var\s+partnerId\s*=\s*'([^']+)", webpage, 'partner id') + kaltura_id = self._search_regex( + [r'id="kaltura_player_([^"]+)"', + r"_wb_entry_id\s*:\s*'([^']+)", + r'data-kaltura-entry-id="([^"]+)'], + webpage, 'kaltura id', default=default) - kaltura_id = self._search_regex( - [r'id="kaltura_player_([^"]+)"', - r"_wb_entry_id\s*:\s*'([^']+)", - r'data-kaltura-entry-id="([^"]+)'], - webpage, 'kaltura id') + if partner_id and kaltura_id: + break return self.url_result('kaltura:%s:%s' % (partner_id, kaltura_id), 'Kaltura') diff --git a/youtube_dl/extractor/engadget.py b/youtube_dl/extractor/engadget.py index e5e57d4..a39e901 100644 --- a/youtube_dl/extractor/engadget.py +++ b/youtube_dl/extractor/engadget.py @@ -4,9 +4,10 @@ from .common import InfoExtractor class EngadgetIE(InfoExtractor): - _VALID_URL = r'https?://www.engadget.com/video/(?P<id>\d+)' + _VALID_URL = r'https?://www.engadget.com/video/(?P<id>[^/?#]+)' - _TEST = { + _TESTS = [{ + # video with 5min ID 'url': 'http://www.engadget.com/video/518153925/', 'md5': 'c6820d4828a5064447a4d9fc73f312c9', 'info_dict': { @@ -15,8 +16,12 @@ class EngadgetIE(InfoExtractor): 'title': 'Samsung Galaxy Tab Pro 8.4 Review', }, 'add_ie': ['FiveMin'], - } + }, { + # video with vidible ID + 'url': 'https://www.engadget.com/video/57a28462134aa15a39f0421a/', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - return self.url_result('5min:%s' % video_id) + return self.url_result('aol-video:%s' % video_id) diff --git a/youtube_dl/extractor/eporner.py b/youtube_dl/extractor/eporner.py index ac5d0fe..f3734e9 100644 --- a/youtube_dl/extractor/eporner.py +++ b/youtube_dl/extractor/eporner.py @@ -4,19 +4,23 @@ from __future__ 
import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( + encode_base_n, + ExtractorError, + int_or_none, parse_duration, str_to_int, ) class EpornerIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\w+)/(?P<display_id>[\w-]+)' + _VALID_URL = r'https?://(?:www\.)?eporner\.com/hd-porn/(?P<id>\w+)(?:/(?P<display_id>[\w-]+))?' _TESTS = [{ 'url': 'http://www.eporner.com/hd-porn/95008/Infamous-Tiffany-Teen-Strip-Tease-Video/', 'md5': '39d486f046212d8e1b911c52ab4691f8', 'info_dict': { - 'id': '95008', + 'id': 'qlDUmNsj6VS', 'display_id': 'Infamous-Tiffany-Teen-Strip-Tease-Video', 'ext': 'mp4', 'title': 'Infamous Tiffany Teen Strip Tease Video', @@ -28,34 +32,72 @@ class EpornerIE(InfoExtractor): # New (May 2016) URL layout 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0/Star-Wars-XXX-Parody/', 'only_matching': True, + }, { + 'url': 'http://www.eporner.com/hd-porn/3YRUtzMcWn0', + 'only_matching': True, }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id = mobj.group('display_id') or video_id + + webpage, urlh = self._download_webpage_handle(url, display_id) + + video_id = self._match_id(compat_str(urlh.geturl())) - webpage = self._download_webpage(url, display_id) - title = self._html_search_regex( - r'<title>(.*?) - EPORNER', webpage, 'title') + hash = self._search_regex( + r'hash\s*:\s*["\']([\da-f]{32})', webpage, 'hash') - redirect_url = 'http://www.eporner.com/config5/%s' % video_id - player_code = self._download_webpage( - redirect_url, display_id, note='Downloading player config') + title = self._og_search_title(webpage, default=None) or self._html_search_regex( + r'<title>(.+?) 
- EPORNER', webpage, 'title') - sources = self._search_regex( - r'(?s)sources\s*:\s*\[\s*({.+?})\s*\]', player_code, 'sources') + # Reverse engineered from vjs.js + def calc_hash(s): + return ''.join((encode_base_n(int(s[lb:lb + 8], 16), 36) for lb in range(0, 32, 8))) + + video = self._download_json( + 'http://www.eporner.com/xhr/video/%s' % video_id, + display_id, note='Downloading video JSON', + query={ + 'hash': calc_hash(hash), + 'device': 'generic', + 'domain': 'www.eporner.com', + 'fallback': 'false', + }) + + if video.get('available') is False: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, video['message']), expected=True) + + sources = video['sources'] formats = [] - for video_url, format_id in re.findall(r'file\s*:\s*"([^"]+)",\s*label\s*:\s*"([^"]+)"', sources): - fmt = { - 'url': video_url, - 'format_id': format_id, - } - m = re.search(r'^(\d+)', format_id) - if m: - fmt['height'] = int(m.group(1)) - formats.append(fmt) + for kind, formats_dict in sources.items(): + if not isinstance(formats_dict, dict): + continue + for format_id, format_dict in formats_dict.items(): + if not isinstance(format_dict, dict): + continue + src = format_dict.get('src') + if not isinstance(src, compat_str) or not src.startswith('http'): + continue + if kind == 'hls': + formats.extend(self._extract_m3u8_formats( + src, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=kind, fatal=False)) + else: + height = int_or_none(self._search_regex( + r'(\d+)[pP]', format_id, 'height', default=None)) + fps = int_or_none(self._search_regex( + r'(\d+)fps', format_id, 'fps', default=None)) + + formats.append({ + 'url': src, + 'format_id': format_id, + 'height': height, + 'fps': fps, + }) self._sort_formats(formats) duration = parse_duration(self._html_search_meta('duration', webpage)) diff --git a/youtube_dl/extractor/expotv.py b/youtube_dl/extractor/expotv.py index 1585a03..971c918 100644 --- a/youtube_dl/extractor/expotv.py +++ b/youtube_dl/extractor/expotv.py @@ -1,7 
+1,5 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( int_or_none, @@ -12,23 +10,22 @@ from ..utils import ( class ExpoTVIE(InfoExtractor): _VALID_URL = r'https?://www\.expotv\.com/videos/[^?#]*/(?P<id>[0-9]+)($|[?#])' _TEST = { - 'url': 'http://www.expotv.com/videos/reviews/1/24/LinneCardscom/17561', - 'md5': '2985e6d7a392b2f7a05e0ca350fe41d0', + 'url': 'http://www.expotv.com/videos/reviews/3/40/NYX-Butter-lipstick/667916', + 'md5': 'fe1d728c3a813ff78f595bc8b7a707a8', 'info_dict': { - 'id': '17561', + 'id': '667916', 'ext': 'mp4', - 'upload_date': '20060212', - 'title': 'My Favorite Online Scrapbook Store', - 'view_count': int, - 'description': 'You\'ll find most everything you need at this virtual store front.', - 'uploader': 'Anna T.', + 'title': 'NYX Butter Lipstick Little Susie', + 'description': 'Goes on like butter, but looks better!', 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'Stephanie S.', + 'upload_date': '20150520', + 'view_count': int, } } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') + video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) player_key = self._search_regex( @@ -66,7 +63,7 @@ class ExpoTVIE(InfoExtractor): fatal=False) upload_date = unified_strdate(self._search_regex( r'<h5>Reviewed on ([0-9/.]+)</h5>', webpage, 'upload date', - fatal=False)) + fatal=False), day_first=False) return { 'id': video_id, diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py index 6fc5a18..55c6391 100644 --- a/youtube_dl/extractor/extractors.py +++ b/youtube_dl/extractor/extractors.py @@ -20,12 +20,16 @@ from .adobetv import ( AdobeTVVideoIE, ) from .adultswim import AdultSwimIE -from .aenetworks import AENetworksIE +from .aenetworks import ( + AENetworksIE, + HistoryTopicIE, +) from .afreecatv import AfreecaTVIE from .aftonbladet import AftonbladetIE from .airmozilla import 
AirMozillaIE from .aljazeera import AlJazeeraIE from .alphaporno import AlphaPornoIE +from .amcnetworks import AMCNetworksIE from .animeondemand import AnimeOnDemandIE from .anitube import AnitubeIE from .anysex import AnySexIE @@ -41,6 +45,7 @@ from .appletrailers import ( AppleTrailersSectionIE, ) from .archiveorg import ArchiveOrgIE +from .arkena import ArkenaIE from .ard import ( ARDIE, ARDMediathekIE, @@ -136,9 +141,9 @@ from .chirbit import ( ChirbitProfileIE, ) from .cinchcast import CinchcastIE -from .cliprs import ClipRsIE from .clipfish import ClipfishIE from .cliphunter import CliphunterIE +from .cliprs import ClipRsIE from .clipsyndicate import ClipsyndicateIE from .closertotruth import CloserToTruthIE from .cloudy import CloudyIE @@ -153,7 +158,12 @@ from .cnn import ( ) from .coub import CoubIE from .collegerama import CollegeRamaIE -from .comedycentral import ComedyCentralIE, ComedyCentralShowsIE +from .comedycentral import ( + ComedyCentralIE, + ComedyCentralShortnameIE, + ComedyCentralTVIE, + ToshIE, +) from .comcarcoff import ComCarCoffIE from .commonmistakes import CommonMistakesIE, UnicodeBOMIE from .commonprotocols import RtmpIE @@ -168,6 +178,8 @@ from .crunchyroll import ( ) from .cspan import CSpanIE from .ctsnews import CtsNewsIE +from .ctv import CTVIE +from .ctvnews import CTVNewsIE from .cultureunplugged import CultureUnpluggedIE from .cwtv import CWTVIE from .dailymail import DailyMailIE @@ -210,6 +222,7 @@ from .dvtv import DVTVIE from .dumpert import DumpertIE from .defense import DefenseGouvFrIE from .discovery import DiscoveryIE +from .discoverygo import DiscoveryGoIE from .dispeak import DigitallySpeakingIE from .dropbox import DropboxIE from .dw import ( @@ -251,6 +264,7 @@ from .fivemin import FiveMinIE from .fivetv import FiveTVIE from .fktv import FKTVIE from .flickr import FlickrIE +from .flipagram import FlipagramIE from .folketinget import FolketingetIE from .footyroom import FootyRoomIE from .formula1 import Formula1IE @@ 
-259,10 +273,7 @@ from .fox import FOXIE from .foxgay import FoxgayIE from .foxnews import FoxNewsIE from .foxsports import FoxSportsIE -from .franceculture import ( - FranceCultureIE, - FranceCultureEmissionIE, -) +from .franceculture import FranceCultureIE from .franceinter import FranceInterIE from .francetv import ( PluzzIE, @@ -276,8 +287,9 @@ from .freespeech import FreespeechIE from .freevideo import FreeVideoIE from .funimation import FunimationIE from .funnyordie import FunnyOrDieIE +from .fusion import FusionIE +from .fxnetworks import FXNetworksIE from .gameinformer import GameInformerIE -from .gamekings import GamekingsIE from .gameone import ( GameOneIE, GameOnePlaylistIE, @@ -298,7 +310,6 @@ from .globo import ( ) from .godtube import GodTubeIE from .godtv import GodTVIE -from .goldenmoustache import GoldenMoustacheIE from .golem import GolemIE from .googledrive import GoogleDriveIE from .googleplus import GooglePlusIE @@ -313,6 +324,7 @@ from .heise import HeiseIE from .hellporno import HellPornoIE from .helsinki import HelsinkiIE from .hentaistigma import HentaiStigmaIE +from .hgtv import HGTVIE from .historicfilms import HistoricFilmsIE from .hitbox import HitboxIE, HitboxLiveIE from .hornbunny import HornBunnyIE @@ -320,6 +332,10 @@ from .hotnewhiphop import HotNewHipHopIE from .hotstar import HotStarIE from .howcast import HowcastIE from .howstuffworks import HowStuffWorksIE +from .hrti import ( + HRTiIE, + HRTiPlaylistIE, +) from .huffpost import HuffPostIE from .hypem import HypemIE from .iconosquare import IconosquareIE @@ -358,6 +374,7 @@ from .jove import JoveIE from .jwplatform import JWPlatformIE from .jpopsukitv import JpopsukiIE from .kaltura import KalturaIE +from .kamcord import KamcordIE from .kanalplay import KanalPlayIE from .kankan import KankanIE from .karaoketv import KaraoketvIE @@ -381,6 +398,10 @@ from .kuwo import ( ) from .la7 import LA7IE from .laola1tv import Laola1TvIE +from .lcp import ( + LcpPlayIE, + LcpIE, +) from 
.learnr import LearnrIE from .lecture2go import Lecture2GoIE from .lemonde import LemondeIE @@ -422,6 +443,7 @@ from .makerschannel import MakersChannelIE from .makertv import MakerTVIE from .matchtv import MatchTVIE from .mdr import MDRIE +from .meta import METAIE from .metacafe import MetacafeIE from .metacritic import MetacriticIE from .mgoon import MgoonIE @@ -454,10 +476,10 @@ from .motherless import MotherlessIE from .motorsport import MotorsportIE from .movieclips import MovieClipsIE from .moviezine import MoviezineIE +from .msn import MSNIE from .mtv import ( MTVIE, MTVServicesEmbeddedIE, - MTVIggyIE, MTVDEIE, ) from .muenchentv import MuenchenTVIE @@ -469,8 +491,9 @@ from .myvi import MyviIE from .myvideo import MyVideoIE from .myvidster import MyVidsterIE from .nationalgeographic import ( + NationalGeographicVideoIE, NationalGeographicIE, - NationalGeographicChannelIE, + NationalGeographicEpisodeGuideIE, ) from .naver import NaverIE from .nba import NBAIE @@ -507,7 +530,6 @@ from .nextmedia import ( NextMediaActionNewsIE, AppleDailyIE, ) -from .nextmovie import NextMovieIE from .nfb import NFBIE from .nfl import NFLIE from .nhl import ( @@ -521,7 +543,10 @@ from .nick import ( NickDeIE, ) from .niconico import NiconicoIE, NiconicoPlaylistIE +from .ninecninemedia import NineCNineMediaIE from .ninegag import NineGagIE +from .ninenow import NineNowIE +from .nintendo import NintendoIE from .noco import NocoIE from .normalboots import NormalbootsIE from .nosvideo import NosVideoIE @@ -566,8 +591,13 @@ from .nytimes import ( NYTimesArticleIE, ) from .nuvid import NuvidIE +from .odatv import OdaTVIE from .odnoklassniki import OdnoklassnikiIE from .oktoberfesttv import OktoberfestTVIE +from .onet import ( + OnetIE, + OnetChannelIE, +) from .onionstudios import OnionStudiosIE from .ooyala import ( OoyalaIE, @@ -606,6 +636,8 @@ from .pluralsight import ( PluralsightCourseIE, ) from .podomatic import PodomaticIE +from .pokemon import PokemonIE +from .polskieradio 
import PolskieRadioIE from .porn91 import Porn91IE from .pornhd import PornHdIE from .pornhub import ( @@ -660,16 +692,19 @@ from .rice import RICEIE from .ringtv import RingTVIE from .ro220 import Ro220IE from .rockstargames import RockstarGamesIE +from .roosterteeth import RoosterTeethIE from .rottentomatoes import RottenTomatoesIE from .roxwel import RoxwelIE +from .rozhlas import RozhlasIE from .rtbf import RTBFIE from .rte import RteIE, RteRadioIE from .rtlnl import RtlNlIE from .rtl2 import RTL2IE from .rtp import RTPIE from .rts import RTSIE -from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE +from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE from .rtvnh import RTVNHIE +from .rudo import RudoIE from .ruhd import RUHDIE from .ruleporn import RulePornIE from .rutube import ( @@ -704,10 +739,12 @@ from .shahid import ShahidIE from .shared import SharedIE from .sharesix import ShareSixIE from .sina import SinaIE +from .sixplay import SixPlayIE from .skynewsarabia import ( SkyNewsArabiaIE, SkyNewsArabiaArticleIE, ) +from .skysports import SkySportsIE from .slideshare import SlideshareIE from .slutload import SlutloadIE from .smotri import ( @@ -718,6 +755,7 @@ from .smotri import ( ) from .snotr import SnotrIE from .sohu import SohuIE +from .sonyliv import SonyLIVIE from .soundcloud import ( SoundcloudIE, SoundcloudSetIE, @@ -757,6 +795,7 @@ from .srmediathek import SRMediathekIE from .ssa import SSAIE from .stanfordoc import StanfordOpenClassroomIE from .steam import SteamIE +from .streamable import StreamableIE from .streamcloud import StreamcloudIE from .streamcz import StreamCZIE from .streetvoice import StreetVoiceIE @@ -772,7 +811,6 @@ from .tagesschau import ( TagesschauPlayerIE, TagesschauIE, ) -from .tapely import TapelyIE from .tass import TassIE from .tdslifeway import TDSLifewayIE from .teachertube import ( @@ -856,10 +894,14 @@ from .tvc import ( from .tvigle import TvigleIE from .tvland import 
TVLandIE from .tvp import ( + TVPEmbedIE, TVPIE, TVPSeriesIE, ) -from .tvplay import TVPlayIE +from .tvplay import ( + TVPlayIE, + ViafreeIE, +) from .tweakers import TweakersIE from .twentyfourvideo import TwentyFourVideoIE from .twentymin import TwentyMinutenIE @@ -888,7 +930,13 @@ from .udemy import ( from .udn import UDNEmbedIE from .digiteka import DigitekaIE from .unistra import UnistraIE +from .uol import UOLIE +from .uplynk import ( + UplynkIE, + UplynkPreplayIE, +) from .urort import UrortIE +from .urplay import URPlayIE from .usatoday import USATodayIE from .ustream import UstreamIE, UstreamChannelIE from .ustudio import ( @@ -915,6 +963,8 @@ from .vice import ( ViceIE, ViceShowIE, ) +from .viceland import VicelandIE +from .vidbit import VidbitIE from .viddler import ViddlerIE from .videodetective import VideoDetectiveIE from .videofyme import VideofyMeIE @@ -963,9 +1013,11 @@ from .viki import ( from .vk import ( VKIE, VKUserVideosIE, + VKWallPostIE, ) from .vlive import VLiveIE from .vodlocker import VodlockerIE +from .vodplatform import VODPlatformIE from .voicerepublic import VoiceRepublicIE from .voxmedia import VoxMediaIE from .vporn import VpornIE @@ -1048,6 +1100,7 @@ from .youtube import ( YoutubeSearchDateIE, YoutubeSearchIE, YoutubeSearchURLIE, + YoutubeSharedVideoIE, YoutubeShowIE, YoutubeSubscriptionsIE, YoutubeTruncatedIDIE, @@ -1061,4 +1114,3 @@ from .zingmp3 import ( ZingMp3SongIE, ZingMp3AlbumIE, ) -from .zippcast import ZippCastIE diff --git a/youtube_dl/extractor/extremetube.py b/youtube_dl/extractor/extremetube.py index 3403581..b4fd933 100644 --- a/youtube_dl/extractor/extremetube.py +++ b/youtube_dl/extractor/extremetube.py @@ -1,22 +1,17 @@ from __future__ import unicode_literals -import re +from ..utils import str_to_int +from .keezmovies import KeezMoviesIE -from .common import InfoExtractor -from ..utils import ( - int_or_none, - sanitized_Request, - str_to_int, -) - -class ExtremeTubeIE(InfoExtractor): - _VALID_URL = 
r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?P<id>[^/#?&]+)' +class ExtremeTubeIE(KeezMoviesIE): + _VALID_URL = r'https?://(?:www\.)?extremetube\.com/(?:[^/]+/)?video/(?:(?P<display_id>[^/]+)-)(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.extremetube.com/video/music-video-14-british-euro-brit-european-cumshots-swallow-652431', - 'md5': '344d0c6d50e2f16b06e49ca011d8ac69', + 'md5': '1fb9228f5e3332ec8c057d6ac36f33e0', 'info_dict': { - 'id': 'music-video-14-british-euro-brit-european-cumshots-swallow-652431', + 'id': '652431', + 'display_id': 'music-video-14-british-euro-brit-european-cumshots-swallow', 'ext': 'mp4', 'title': 'Music Video 14 british euro brit european cumshots swallow', 'uploader': 'unknown', @@ -35,58 +30,22 @@ class ExtremeTubeIE(InfoExtractor): }] def _real_extract(self, url): - video_id = self._match_id(url) + webpage, info = self._extract_info(url) - req = sanitized_Request(url) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) + if not info['title']: + info['title'] = self._search_regex( + r'<h1[^>]+title="([^"]+)"[^>]*>', webpage, 'title') - video_title = self._html_search_regex( - r'<h1 [^>]*?title="([^"]+)"[^>]*>', webpage, 'title') uploader = self._html_search_regex( r'Uploaded by:\s*</strong>\s*(.+?)\s*</div>', webpage, 'uploader', fatal=False) - view_count = str_to_int(self._html_search_regex( + view_count = str_to_int(self._search_regex( r'Views:\s*</strong>\s*<span>([\d,\.]+)</span>', webpage, 'view count', fatal=False)) - flash_vars = self._parse_json( - self._search_regex( - r'var\s+flashvars\s*=\s*({.+?});', webpage, 'flash vars'), - video_id) - - formats = [] - for quality_key, video_url in flash_vars.items(): - height = int_or_none(self._search_regex( - r'quality_(\d+)[pP]$', quality_key, 'height', default=None)) - if not height: - continue - f = { - 'url': video_url, - } - mobj = re.search( - r'/(?P<height>\d{3,4})[pP]_(?P<bitrate>\d+)[kK]_\d+', video_url) - if mobj: - height = 
int(mobj.group('height')) - bitrate = int(mobj.group('bitrate')) - f.update({ - 'format_id': '%dp-%dk' % (height, bitrate), - 'height': height, - 'tbr': bitrate, - }) - else: - f.update({ - 'format_id': '%dp' % height, - 'height': height, - }) - formats.append(f) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': video_title, - 'formats': formats, + info.update({ 'uploader': uploader, 'view_count': view_count, - 'age_limit': 18, - } + }) + + return info diff --git a/youtube_dl/extractor/facebook.py b/youtube_dl/extractor/facebook.py index 9b87b37..0fb781a 100644 --- a/youtube_dl/extractor/facebook.py +++ b/youtube_dl/extractor/facebook.py @@ -27,7 +27,7 @@ class FacebookIE(InfoExtractor): _VALID_URL = r'''(?x) (?: https?:// - (?:\w+\.)?facebook\.com/ + (?:[\w-]+\.)?facebook\.com/ (?:[^#]*?\#!/)? (?: (?: @@ -127,8 +127,26 @@ class FacebookIE(InfoExtractor): }, { 'url': 'https://www.facebook.com/groups/164828000315060/permalink/764967300301124/', 'only_matching': True, + }, { + 'url': 'https://zh-hk.facebook.com/peoplespower/videos/1135894589806027/', + 'only_matching': True, }] + @staticmethod + def _extract_url(webpage): + mobj = re.search( + r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage) + if mobj is not None: + return mobj.group('url') + + # Facebook API embed + # see https://developers.facebook.com/docs/plugins/embedded-video-player + mobj = re.search(r'''(?x)<div[^>]+ + class=(?P<q1>[\'"])[^\'"]*\bfb-(?:video|post)\b[^\'"]*(?P=q1)[^>]+ + data-href=(?P<q2>[\'"])(?P<url>(?:https?:)?//(?:www\.)?facebook.com/.+?)(?P=q2)''', webpage) + if mobj is not None: + return mobj.group('url') + def _login(self): (useremail, password) = self._get_login_info() if useremail is None: @@ -204,12 +222,25 @@ class FacebookIE(InfoExtractor): BEFORE = '{swf.addParam(param[0], param[1]);});' AFTER = '.forEach(function(variable) {swf.addVariable(variable[0], variable[1]);});' - m = re.search(re.escape(BEFORE) + 
'(?:\n|\\\\n)(.*?)' + re.escape(AFTER), webpage) - if m: - swf_params = m.group(1).replace('\\\\', '\\').replace('\\"', '"') + PATTERN = re.escape(BEFORE) + '(?:\n|\\\\n)(.*?)' + re.escape(AFTER) + + for m in re.findall(PATTERN, webpage): + swf_params = m.replace('\\\\', '\\').replace('\\"', '"') data = dict(json.loads(swf_params)) params_raw = compat_urllib_parse_unquote(data['params']) - video_data = json.loads(params_raw)['video_data'] + video_data_candidate = json.loads(params_raw)['video_data'] + for _, f in video_data_candidate.items(): + if not f: + continue + if isinstance(f, dict): + f = [f] + if not isinstance(f, list): + continue + if f[0].get('video_id') == video_id: + video_data = video_data_candidate + break + if video_data: + break def video_data_list2dict(video_data): ret = {} diff --git a/youtube_dl/extractor/fivemin.py b/youtube_dl/extractor/fivemin.py index 6b83454..f3f876e 100644 --- a/youtube_dl/extractor/fivemin.py +++ b/youtube_dl/extractor/fivemin.py @@ -1,24 +1,11 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import ( - compat_parse_qs, - compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, - compat_urlparse, -) -from ..utils import ( - ExtractorError, - parse_duration, - replace_extension, -) class FiveMinIE(InfoExtractor): IE_NAME = '5min' - _VALID_URL = r'(?:5min:(?P<id>\d+)(?::(?P<sid>\d+))?|https?://[^/]*?5min\.com/Scripts/PlayerSeed\.js\?(?P<query>.*))' + _VALID_URL = r'(?:5min:|https?://(?:[^/]*?5min\.com/|delivery\.vidible\.tv/aol)(?:(?:Scripts/PlayerSeed\.js|playerseed/?)?\?.*?playList=)?)(?P<id>\d+)' _TESTS = [ { @@ -29,8 +16,16 @@ class FiveMinIE(InfoExtractor): 'id': '518013791', 'ext': 'mp4', 'title': 'iPad Mini with Retina Display Review', + 'description': 'iPad mini with Retina Display review', 'duration': 177, + 'uploader': 'engadget', + 'upload_date': '20131115', + 'timestamp': 1384515288, }, + 'params': { + # m3u8 download + 'skip_download': True, + } }, 
{ # From http://on.aol.com/video/how-to-make-a-next-level-fruit-salad-518086247 @@ -44,108 +39,16 @@ class FiveMinIE(InfoExtractor): }, 'skip': 'no longer available', }, - ] - _ERRORS = { - 'ErrorVideoNotExist': 'We\'re sorry, but the video you are trying to watch does not exist.', - 'ErrorVideoNoLongerAvailable': 'We\'re sorry, but the video you are trying to watch is no longer available.', - 'ErrorVideoRejected': 'We\'re sorry, but the video you are trying to watch has been removed.', - 'ErrorVideoUserNotGeo': 'We\'re sorry, but the video you are trying to watch cannot be viewed from your current location.', - 'ErrorVideoLibraryRestriction': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.', - 'ErrorExposurePermission': 'We\'re sorry, but the video you are trying to watch is currently unavailable for viewing at this domain.', - } - _QUALITIES = { - 1: { - 'width': 640, - 'height': 360, - }, - 2: { - 'width': 854, - 'height': 480, - }, - 4: { - 'width': 1280, - 'height': 720, - }, - 8: { - 'width': 1920, - 'height': 1080, - }, - 16: { - 'width': 640, - 'height': 360, - }, - 32: { - 'width': 854, - 'height': 480, - }, - 64: { - 'width': 1280, - 'height': 720, - }, - 128: { - 'width': 640, - 'height': 360, + { + 'url': 'http://embed.5min.com/518726732/', + 'only_matching': True, }, - } + { + 'url': 'http://delivery.vidible.tv/aol?playList=518013791', + 'only_matching': True, + } + ] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - sid = mobj.group('sid') - - if mobj.group('query'): - qs = compat_parse_qs(mobj.group('query')) - if not qs.get('playList'): - raise ExtractorError('Invalid URL', expected=True) - video_id = qs['playList'][0] - if qs.get('sid'): - sid = qs['sid'][0] - - embed_url = 'https://embed.5min.com/playerseed/?playList=%s' % video_id - if not sid: - embed_page = self._download_webpage(embed_url, video_id, - 'Downloading embed page') - sid = 
self._search_regex(r'sid=(\d+)', embed_page, 'sid') - - response = self._download_json( - 'https://syn.5min.com/handlers/SenseHandler.ashx?' + - compat_urllib_parse_urlencode({ - 'func': 'GetResults', - 'playlist': video_id, - 'sid': sid, - 'isPlayerSeed': 'true', - 'url': embed_url, - }), - video_id) - if not response['success']: - raise ExtractorError( - '%s said: %s' % ( - self.IE_NAME, - self._ERRORS.get(response['errorMessage'], response['errorMessage'])), - expected=True) - info = response['binding'][0] - - formats = [] - parsed_video_url = compat_urllib_parse_urlparse(compat_parse_qs( - compat_urllib_parse_urlparse(info['EmbededURL']).query)['videoUrl'][0]) - for rendition in info['Renditions']: - if rendition['RenditionType'] == 'aac' or rendition['RenditionType'] == 'm3u8': - continue - else: - rendition_url = compat_urlparse.urlunparse(parsed_video_url._replace(path=replace_extension(parsed_video_url.path.replace('//', '/%s/' % rendition['ID']), rendition['RenditionType']))) - quality = self._QUALITIES.get(rendition['ID'], {}) - formats.append({ - 'format_id': '%s-%d' % (rendition['RenditionType'], rendition['ID']), - 'url': rendition_url, - 'width': quality.get('width'), - 'height': quality.get('height'), - }) - self._sort_formats(formats) - - return { - 'id': video_id, - 'title': info['Title'], - 'thumbnail': info.get('ThumbURL'), - 'duration': parse_duration(info.get('Duration')), - 'formats': formats, - } + video_id = self._match_id(url) + return self.url_result('aol-video:%s' % video_id) diff --git a/youtube_dl/extractor/flipagram.py b/youtube_dl/extractor/flipagram.py new file mode 100644 index 0000000..1902a23 --- /dev/null +++ b/youtube_dl/extractor/flipagram.py @@ -0,0 +1,115 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + float_or_none, + try_get, + unified_timestamp, +) + + +class FlipagramIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?flipagram\.com/f/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://flipagram.com/f/nyvTSJMKId', + 'md5': '888dcf08b7ea671381f00fab74692755', + 'info_dict': { + 'id': 'nyvTSJMKId', + 'ext': 'mp4', + 'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction', + 'description': 'md5:d55e32edc55261cae96a41fa85ff630e', + 'duration': 35.571, + 'timestamp': 1461244995, + 'upload_date': '20160421', + 'uploader': 'kitty juria', + 'uploader_id': 'sjuria101', + 'creator': 'kitty juria', + 'view_count': int, + 'like_count': int, + 'repost_count': int, + 'comment_count': int, + 'comments': list, + 'formats': 'mincount:2', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + video_data = self._parse_json( + self._search_regex( + r'window\.reactH2O\s*=\s*({.+});', webpage, 'video data'), + video_id) + + flipagram = video_data['flipagram'] + video = flipagram['video'] + + json_ld = self._search_json_ld(webpage, video_id, default={}) + title = json_ld.get('title') or flipagram['captionText'] + description = json_ld.get('description') or flipagram.get('captionText') + + formats = [{ + 'url': video['url'], + 'width': int_or_none(video.get('width')), + 'height': int_or_none(video.get('height')), + 'filesize': int_or_none(video_data.get('size')), + }] + + preview_url = try_get( + flipagram, lambda x: x['music']['track']['previewUrl'], compat_str) + if preview_url: + formats.append({ + 'url': preview_url, + 'ext': 'm4a', + 'vcodec': 'none', + }) + + self._sort_formats(formats) + + counts = flipagram.get('counts', {}) + user = flipagram.get('user', {}) + video_data = flipagram.get('video', {}) + + thumbnails = [{ + 'url': self._proto_relative_url(cover['url']), + 'width': int_or_none(cover.get('width')), + 'height': int_or_none(cover.get('height')), + 'filesize': int_or_none(cover.get('size')), + } for cover in flipagram.get('covers', []) if cover.get('url')] + + # Note that 
this only retrieves comments that are initally loaded. + # For videos with large amounts of comments, most won't be retrieved. + comments = [] + for comment in video_data.get('comments', {}).get(video_id, {}).get('items', []): + text = comment.get('comment') + if not text or not isinstance(text, list): + continue + comments.append({ + 'author': comment.get('user', {}).get('name'), + 'author_id': comment.get('user', {}).get('username'), + 'id': comment.get('id'), + 'text': text[0], + 'timestamp': unified_timestamp(comment.get('created')), + }) + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': float_or_none(flipagram.get('duration'), 1000), + 'thumbnails': thumbnails, + 'timestamp': unified_timestamp(flipagram.get('iso8601Created')), + 'uploader': user.get('name'), + 'uploader_id': user.get('username'), + 'creator': user.get('name'), + 'view_count': int_or_none(counts.get('plays')), + 'like_count': int_or_none(counts.get('likes')), + 'repost_count': int_or_none(counts.get('reflips')), + 'comment_count': int_or_none(counts.get('comments')), + 'comments': comments, + 'formats': formats, + } diff --git a/youtube_dl/extractor/formula1.py b/youtube_dl/extractor/formula1.py index 322c41e..8c417ab 100644 --- a/youtube_dl/extractor/formula1.py +++ b/youtube_dl/extractor/formula1.py @@ -5,8 +5,8 @@ from .common import InfoExtractor class Formula1IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?formula1\.com/content/fom-website/en/video/\d{4}/\d{1,2}/(?P<id>.+?)\.html' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?formula1\.com/(?:content/fom-website/)?en/video/\d{4}/\d{1,2}/(?P<id>.+?)\.html' + _TESTS = [{ 'url': 'http://www.formula1.com/content/fom-website/en/video/2016/5/Race_highlights_-_Spain_2016.html', 'md5': '8c79e54be72078b26b89e0e111c0502b', 'info_dict': { @@ -15,7 +15,10 @@ class Formula1IE(InfoExtractor): 'title': 'Race highlights - Spain 2016', }, 'add_ie': ['Ooyala'], - } + }, { + 'url': 
'http://www.formula1.com/en/video/2016/5/Race_highlights_-_Spain_2016.html', + 'only_matching': True, + }] def _real_extract(self, url): display_id = self._match_id(url) diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py index fc4a5a0..9776c84 100644 --- a/youtube_dl/extractor/fourtube.py +++ b/youtube_dl/extractor/fourtube.py @@ -43,14 +43,14 @@ class FourTubeIE(InfoExtractor): 'uploadDate', webpage)) thumbnail = self._html_search_meta('thumbnailUrl', webpage) uploader_id = self._html_search_regex( - r'<a class="img-avatar" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">', + r'<a class="item-to-subscribe" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">', webpage, 'uploader id', fatal=False) uploader = self._html_search_regex( - r'<a class="img-avatar" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">', + r'<a class="item-to-subscribe" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">', webpage, 'uploader', fatal=False) categories_html = self._search_regex( - r'(?s)><i class="icon icon-tag"></i>\s*Categories / Tags\s*.*?<ul class="list">(.*?)</ul>', + r'(?s)><i class="icon icon-tag"></i>\s*Categories / Tags\s*.*?<ul class="[^"]*?list[^"]*?">(.*?)</ul>', webpage, 'categories', fatal=False) categories = None if categories_html: @@ -59,10 +59,10 @@ class FourTubeIE(InfoExtractor): r'(?s)<li><a.*?>(.*?)</a>', categories_html)] view_count = str_to_int(self._search_regex( - r'<meta itemprop="interactionCount" content="UserPlays:([0-9,]+)">', + r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([0-9,]+)">', webpage, 'view count', fatal=False)) like_count = str_to_int(self._search_regex( - r'<meta itemprop="interactionCount" content="UserLikes:([0-9,]+)">', + r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserLikes:([0-9,]+)">', webpage, 'like count', fatal=False)) duration = parse_duration(self._html_search_meta('duration', webpage)) diff --git a/youtube_dl/extractor/fox.py 
b/youtube_dl/extractor/fox.py index 95c1abf..9f406b1 100644 --- a/youtube_dl/extractor/fox.py +++ b/youtube_dl/extractor/fox.py @@ -2,7 +2,10 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..utils import smuggle_url +from ..utils import ( + smuggle_url, + update_url_query, +) class FOXIE(InfoExtractor): @@ -29,11 +32,12 @@ class FOXIE(InfoExtractor): release_url = self._parse_json(self._search_regex( r'"fox_pdk_player"\s*:\s*({[^}]+?})', webpage, 'fox_pdk_player'), - video_id)['release_url'] + '&switch=http' + video_id)['release_url'] return { '_type': 'url_transparent', 'ie_key': 'ThePlatform', - 'url': smuggle_url(release_url, {'force_smil_url': True}), + 'url': smuggle_url(update_url_query( + release_url, {'switch': 'http'}), {'force_smil_url': True}), 'id': video_id, } diff --git a/youtube_dl/extractor/franceculture.py b/youtube_dl/extractor/franceculture.py index e2ca962..186da0d 100644 --- a/youtube_dl/extractor/franceculture.py +++ b/youtube_dl/extractor/franceculture.py @@ -2,104 +2,56 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import ( - compat_urlparse, -) from ..utils import ( determine_ext, - int_or_none, - ExtractorError, + unified_strdate, ) class FranceCultureIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/player/reecouter\?play=(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emissions/(?:[^/]+/)*(?P<id>[^/?#&]+)' _TEST = { - 'url': 'http://www.franceculture.fr/player/reecouter?play=4795174', + 'url': 'http://www.franceculture.fr/emissions/carnet-nomade/rendez-vous-au-pays-des-geeks', 'info_dict': { - 'id': '4795174', + 'id': 'rendez-vous-au-pays-des-geeks', + 'display_id': 'rendez-vous-au-pays-des-geeks', 'ext': 'mp3', 'title': 'Rendez-vous au pays des geeks', - 'alt_title': 'Carnet nomade | 13-14', - 'vcodec': 'none', + 'thumbnail': 're:^https?://.*\\.jpg$', 'upload_date': '20140301', - 'thumbnail': 
r're:^http://static\.franceculture\.fr/.*/images/player/Carnet-nomade\.jpg$', - 'description': 'startswith:Avec :Jean-Baptiste Péretié pour son documentaire sur Arte "La revanche', - 'timestamp': 1393700400, + 'vcodec': 'none', } } - def _extract_from_player(self, url, video_id): - webpage = self._download_webpage(url, video_id) + def _real_extract(self, url): + display_id = self._match_id(url) - video_path = self._search_regex( - r'<a id="player".*?href="([^"]+)"', webpage, 'video path') - video_url = compat_urlparse.urljoin(url, video_path) - timestamp = int_or_none(self._search_regex( - r'<a id="player".*?data-date="([0-9]+)"', - webpage, 'upload date', fatal=False)) - thumbnail = self._search_regex( - r'<a id="player".*?>\s+<img src="([^"]+)"', - webpage, 'thumbnail', fatal=False) + webpage = self._download_webpage(url, display_id) - display_id = self._search_regex( - r'<span class="path-diffusion">emission-(.*?)</span>', webpage, 'display_id') + video_url = self._search_regex( + r'(?s)<div[^>]+class="[^"]*?title-zone-diffusion[^"]*?"[^>]*>.*?<a[^>]+href="([^"]+)"', + webpage, 'video path') - title = self._html_search_regex( - r'<span class="title-diffusion">(.*?)</span>', webpage, 'title') - alt_title = self._html_search_regex( - r'<span class="title">(.*?)</span>', - webpage, 'alt_title', fatal=False) - description = self._html_search_regex( - r'<span class="description">(.*?)</span>', - webpage, 'description', fatal=False) + title = self._og_search_title(webpage) + upload_date = unified_strdate(self._search_regex( + '(?s)<div[^>]+class="date"[^>]*>.*?<span[^>]+class="inner"[^>]*>([^<]+)<', + webpage, 'upload date', fatal=False)) + thumbnail = self._search_regex( + r'(?s)<figure[^>]+itemtype="https://schema.org/ImageObject"[^>]*>.*?<img[^>]+data-pagespeed-(?:lazy|high-res)-src="([^"]+)"', + webpage, 'thumbnail', fatal=False) uploader = self._html_search_regex( r'(?s)<div id="emission".*?<span class="author">(.*?)</span>', webpage, 'uploader', default=None) 
vcodec = 'none' if determine_ext(video_url.lower()) == 'mp3' else None return { - 'id': video_id, + 'id': display_id, + 'display_id': display_id, 'url': video_url, - 'vcodec': vcodec, - 'uploader': uploader, - 'timestamp': timestamp, 'title': title, - 'alt_title': alt_title, 'thumbnail': thumbnail, - 'description': description, - 'display_id': display_id, + 'vcodec': vcodec, + 'uploader': uploader, + 'upload_date': upload_date, } - - def _real_extract(self, url): - video_id = self._match_id(url) - return self._extract_from_player(url, video_id) - - -class FranceCultureEmissionIE(FranceCultureIE): - _VALID_URL = r'https?://(?:www\.)?franceculture\.fr/emission-(?P<id>[^?#]+)' - _TEST = { - 'url': 'http://www.franceculture.fr/emission-les-carnets-de-la-creation-jean-gabriel-periot-cineaste-2015-10-13', - 'info_dict': { - 'title': 'Jean-Gabriel Périot, cinéaste', - 'alt_title': 'Les Carnets de la création', - 'id': '5093239', - 'display_id': 'les-carnets-de-la-creation-jean-gabriel-periot-cineaste-2015-10-13', - 'ext': 'mp3', - 'timestamp': 1444762500, - 'upload_date': '20151013', - 'description': 'startswith:Aujourd\'hui dans "Les carnets de la création", le cinéaste', - }, - } - - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - video_path = self._html_search_regex( - r'<a class="rf-player-open".*?href="([^"]+)"', webpage, 'video path', 'no_path_player') - if video_path == 'no_path_player': - raise ExtractorError('no player : no sound in this page.', expected=True) - new_id = self._search_regex('play=(?P<id>[0-9]+)', video_path, 'new_id', group='id') - video_url = compat_urlparse.urljoin(url, video_path) - return self._extract_from_player(video_url, new_id) diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py index ad94e31..3233f66 100644 --- a/youtube_dl/extractor/francetv.py +++ b/youtube_dl/extractor/francetv.py @@ -14,7 +14,10 @@ from ..utils import ( parse_duration, 
determine_ext, ) -from .dailymotion import DailymotionCloudIE +from .dailymotion import ( + DailymotionIE, + DailymotionCloudIE, +) class FranceTVBaseInfoExtractor(InfoExtractor): @@ -128,7 +131,7 @@ class PluzzIE(FranceTVBaseInfoExtractor): class FranceTvInfoIE(FranceTVBaseInfoExtractor): IE_NAME = 'francetvinfo.fr' - _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/.*/(?P<title>.+)\.html' + _VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<title>[^/?#&.]+)' _TESTS = [{ 'url': 'http://www.francetvinfo.fr/replay-jt/france-3/soir-3/jt-grand-soir-3-lundi-26-aout-2013_393427.html', @@ -188,6 +191,24 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): 'params': { 'skip_download': True, }, + }, { + # Dailymotion embed + 'url': 'http://www.francetvinfo.fr/politique/notre-dame-des-landes/video-sur-france-inter-cecile-duflot-denonce-le-regard-meprisant-de-patrick-cohen_1520091.html', + 'md5': 'ee7f1828f25a648addc90cb2687b1f12', + 'info_dict': { + 'id': 'x4iiko0', + 'ext': 'mp4', + 'title': 'NDDL, référendum, Brexit : Cécile Duflot répond à Patrick Cohen', + 'description': 'Au lendemain de la victoire du "oui" au référendum sur l\'aéroport de Notre-Dame-des-Landes, l\'ancienne ministre écologiste est l\'invitée de Patrick Cohen. 
Plus d\'info : https://www.franceinter.fr/emissions/le-7-9/le-7-9-27-juin-2016', + 'timestamp': 1467011958, + 'upload_date': '20160627', + 'uploader': 'France Inter', + 'uploader_id': 'x2q2ez', + }, + 'add_ie': ['Dailymotion'], + }, { + 'url': 'http://france3-regions.francetvinfo.fr/limousin/emissions/jt-1213-limousin', + 'only_matching': True, }] def _real_extract(self, url): @@ -197,7 +218,13 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor): dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage) if dmcloud_url: - return self.url_result(dmcloud_url, 'DailymotionCloud') + return self.url_result(dmcloud_url, DailymotionCloudIE.ie_key()) + + dailymotion_urls = DailymotionIE._extract_urls(webpage) + if dailymotion_urls: + return self.playlist_result([ + self.url_result(dailymotion_url, DailymotionIE.ie_key()) + for dailymotion_url in dailymotion_urls]) video_id, catalogue = self._search_regex( (r'id-video=([^@]+@[^"]+)', diff --git a/youtube_dl/extractor/fusion.py b/youtube_dl/extractor/fusion.py new file mode 100644 index 0000000..b4ab4cb --- /dev/null +++ b/youtube_dl/extractor/fusion.py @@ -0,0 +1,35 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from .ooyala import OoyalaIE + + +class FusionIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?fusion\.net/video/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://fusion.net/video/201781/u-s-and-panamanian-forces-work-together-to-stop-a-vessel-smuggling-drugs/', + 'info_dict': { + 'id': 'ZpcWNoMTE6x6uVIIWYpHh0qQDjxBuq5P', + 'ext': 'mp4', + 'title': 'U.S. 
and Panamanian forces work together to stop a vessel smuggling drugs', + 'description': 'md5:0cc84a9943c064c0f46b128b41b1b0d7', + 'duration': 140.0, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Ooyala'], + }, { + 'url': 'http://fusion.net/video/201781', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + + ooyala_code = self._search_regex( + r'data-video-id=(["\'])(?P<code>.+?)\1', + webpage, 'ooyala code', group='code') + + return OoyalaIE._build_url_result(ooyala_code) diff --git a/youtube_dl/extractor/fxnetworks.py b/youtube_dl/extractor/fxnetworks.py new file mode 100644 index 0000000..6298973 --- /dev/null +++ b/youtube_dl/extractor/fxnetworks.py @@ -0,0 +1,70 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .adobepass import AdobePassIE +from ..utils import ( + update_url_query, + extract_attributes, + parse_age_limit, + smuggle_url, +) + + +class FXNetworksIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?(?:fxnetworks|simpsonsworld)\.com/video/(?P<id>\d+)' + _TESTS = [{ + 'url': 'http://www.fxnetworks.com/video/719841347694', + 'md5': '1447d4722e42ebca19e5232ab93abb22', + 'info_dict': { + 'id': '719841347694', + 'ext': 'mp4', + 'title': 'Vanpage', + 'description': 'F*ck settling down. You\'re the Worst returns for an all new season August 31st on FXX.', + 'age_limit': 14, + 'uploader': 'NEWA-FNG-FX', + 'upload_date': '20160706', + 'timestamp': 1467844741, + }, + 'add_ie': ['ThePlatform'], + }, { + 'url': 'http://www.simpsonsworld.com/video/716094019682', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + if 'The content you are trying to access is not available in your region.' 
in webpage: + self.raise_geo_restricted() + video_data = extract_attributes(self._search_regex( + r'(<a.+?rel="http://link\.theplatform\.com/s/.+?</a>)', webpage, 'video data')) + player_type = self._search_regex(r'playerType\s*=\s*[\'"]([^\'"]+)', webpage, 'player type', default=None) + release_url = video_data['rel'] + title = video_data['data-title'] + rating = video_data.get('data-rating') + query = { + 'mbr': 'true', + } + if player_type == 'movies': + query.update({ + 'manifest': 'm3u', + }) + else: + query.update({ + 'switch': 'http', + }) + if video_data.get('data-req-auth') == '1': + resource = self._get_mvpd_resource( + video_data['data-channel'], title, + video_data.get('data-guid'), rating) + query['auth'] = self._extract_mvpd_auth(url, video_id, 'fx', resource) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'title': title, + 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}), + 'thumbnail': video_data.get('data-large-thumb'), + 'age_limit': parse_age_limit(rating), + 'ie_key': 'ThePlatform', + } diff --git a/youtube_dl/extractor/gamekings.py b/youtube_dl/extractor/gamekings.py deleted file mode 100644 index cbcddcb..0000000 --- a/youtube_dl/extractor/gamekings.py +++ /dev/null @@ -1,76 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .common import InfoExtractor -from ..utils import ( - xpath_text, - xpath_with_ns, -) -from .youtube import YoutubeIE - - -class GamekingsIE(InfoExtractor): - _VALID_URL = r'https?://www\.gamekings\.nl/(?:videos|nieuws)/(?P<id>[^/]+)' - _TESTS = [{ - # YouTube embed video - 'url': 'http://www.gamekings.nl/videos/phoenix-wright-ace-attorney-dual-destinies-review/', - 'md5': '5208d3a17adeaef829a7861887cb9029', - 'info_dict': { - 'id': 'HkSQKetlGOU', - 'ext': 'mp4', - 'title': 'Phoenix Wright: Ace Attorney - Dual Destinies Review', - 'description': 'md5:db88c0e7f47e9ea50df3271b9dc72e1d', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader_id': 
'UCJugRGo4STYMeFr5RoOShtQ', - 'uploader': 'Gamekings Vault', - 'upload_date': '20151123', - }, - 'add_ie': ['Youtube'], - }, { - # vimeo video - 'url': 'http://www.gamekings.nl/videos/the-legend-of-zelda-majoras-mask/', - 'md5': '12bf04dfd238e70058046937657ea68d', - 'info_dict': { - 'id': 'the-legend-of-zelda-majoras-mask', - 'ext': 'mp4', - 'title': 'The Legend of Zelda: Majora’s Mask', - 'description': 'md5:9917825fe0e9f4057601fe1e38860de3', - 'thumbnail': 're:^https?://.*\.jpg$', - }, - }, { - 'url': 'http://www.gamekings.nl/nieuws/gamekings-extra-shelly-en-david-bereiden-zich-voor-op-de-livestream/', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage(url, video_id) - - playlist_id = self._search_regex( - r'gogoVideo\([^,]+,\s*"([^"]+)', webpage, 'playlist id') - - # Check if a YouTube embed is used - if YoutubeIE.suitable(playlist_id): - return self.url_result(playlist_id, ie='Youtube') - - playlist = self._download_xml( - 'http://www.gamekings.tv/wp-content/themes/gk2010/rss_playlist.php?id=%s' % playlist_id, - video_id) - - NS_MAP = { - 'jwplayer': 'http://rss.jwpcdn.com/' - } - - item = playlist.find('./channel/item') - - thumbnail = xpath_text(item, xpath_with_ns('./jwplayer:image', NS_MAP), 'thumbnail') - video_url = item.find(xpath_with_ns('./jwplayer:source', NS_MAP)).get('file') - - return { - 'id': video_id, - 'url': video_url, - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'thumbnail': thumbnail, - } diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py index 621257c..4e859e0 100644 --- a/youtube_dl/extractor/gamespot.py +++ b/youtube_dl/extractor/gamespot.py @@ -28,10 +28,13 @@ class GameSpotIE(OnceIE): 'url': 'http://www.gamespot.com/videos/the-witcher-3-wild-hunt-xbox-one-now-playing/2300-6424837/', 'info_dict': { 'id': 'gs-2300-6424837', - 'ext': 'flv', - 'title': 'The Witcher 3: Wild 
Hunt [Xbox ONE] - Now Playing', + 'ext': 'mp4', + 'title': 'Now Playing - The Witcher 3: Wild Hunt', 'description': 'Join us as we take a look at the early hours of The Witcher 3: Wild Hunt and more.', }, + 'params': { + 'skip_download': True, # m3u8 downloads + }, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 4aa2406..197ab95 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -49,7 +49,10 @@ from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE from .tnaflix import TNAFlixNetworkEmbedIE from .vimeo import VimeoIE -from .dailymotion import DailymotionCloudIE +from .dailymotion import ( + DailymotionIE, + DailymotionCloudIE, +) from .onionstudios import OnionStudiosIE from .viewlift import ViewLiftEmbedIE from .screenwavemedia import ScreenwaveMediaIE @@ -59,11 +62,17 @@ from .videomore import VideomoreIE from .googledrive import GoogleDriveIE from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE +from .arkena import ArkenaIE from .instagram import InstagramIE from .liveleak import LiveLeakIE from .threeqsdn import ThreeQSDNIE from .theplatform import ThePlatformIE from .vessel import VesselIE +from .kaltura import KalturaIE +from .eagleplatform import EaglePlatformIE +from .facebook import FacebookIE +from .soundcloud import SoundcloudIE +from .vbox7 import Vbox7IE class GenericIE(InfoExtractor): @@ -467,7 +476,7 @@ class GenericIE(InfoExtractor): 'url': 'http://www.vestifinance.ru/articles/25753', 'info_dict': { 'id': '25753', - 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"', + 'title': 'Прямые трансляции с Форума-выставки "Госзаказ-2013"', }, 'playlist': [{ 'info_dict': { @@ -634,6 +643,8 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Key and Peele|October 10, 2012|2|203|Liam Neesons - Uncensored', 'description': 'Two valets share their love for movie star Liam Neesons.', + 
'timestamp': 1349922600, + 'upload_date': '20121011', }, }, # YouTube embed via <data-embed-url=""> @@ -775,6 +786,15 @@ class GenericIE(InfoExtractor): 'upload_date': '20141029', } }, + # Soundcloud multiple embeds + { + 'url': 'http://www.guitarplayer.com/lessons/1014/legato-workout-one-hour-to-more-fluid-performance---tab/52809', + 'info_dict': { + 'id': '52809', + 'title': 'Guitar Essentials: Legato Workout—One-Hour to Fluid Performance | TAB + AUDIO', + }, + 'playlist_mincount': 7, + }, # Livestream embed { 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast', @@ -850,6 +870,7 @@ class GenericIE(InfoExtractor): 'description': 'md5:601cb790edd05908957dae8aaa866465', 'upload_date': '20150220', }, + 'skip': 'All The Daily Show URLs now redirect to http://www.cc.com/shows/', }, # jwplayer YouTube { @@ -920,6 +941,24 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, + { + # Kaltura embedded via quoted entry_id + 'url': 'https://www.oreilly.com/ideas/my-cloud-makes-pretty-pictures', + 'info_dict': { + 'id': '0_utuok90b', + 'ext': 'mp4', + 'title': '06_matthew_brender_raj_dutt', + 'timestamp': 1466638791, + 'upload_date': '20160622', + }, + 'add_ie': ['Kaltura'], + 'expected_warnings': [ + 'Could not send HEAD request' + ], + 'params': { + 'skip_download': True, + } + }, # Eagle.Platform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', @@ -1091,12 +1130,17 @@ class GenericIE(InfoExtractor): # Dailymotion Cloud video { 'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910', - 'md5': '49444254273501a64675a7e68c502681', + 'md5': 'dcaf23ad0c67a256f4278bce6e0bae38', 'info_dict': { - 'id': '5585de919473990de4bee11b', + 'id': 'x2uy8t3', 'ext': 'mp4', - 'title': 'Le débat', + 'title': 'Sauvons les abeilles ! 
- Le débat', + 'description': 'md5:d9082128b1c5277987825d684939ca26', 'thumbnail': 're:^https?://.*\.jpe?g$', + 'timestamp': 1434970506, + 'upload_date': '20150622', + 'uploader': 'Public Sénat', + 'uploader_id': 'xa9gza', } }, # OnionStudios embed @@ -1220,6 +1264,145 @@ class GenericIE(InfoExtractor): 'uploader': 'www.hudl.com', }, }, + # twitter:player:stream embed + { + 'url': 'http://www.rtl.be/info/video/589263.aspx?CategoryID=288', + 'info_dict': { + 'id': 'master', + 'ext': 'mp4', + 'title': 'Une nouvelle espèce de dinosaure découverte en Argentine', + 'uploader': 'www.rtl.be', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + }, + # twitter:player embed + { + 'url': 'http://www.theatlantic.com/video/index/484130/what-do-black-holes-sound-like/', + 'md5': 'a3e0df96369831de324f0778e126653c', + 'info_dict': { + 'id': '4909620399001', + 'ext': 'mp4', + 'title': 'What Do Black Holes Sound Like?', + 'description': 'what do black holes sound like', + 'upload_date': '20160524', + 'uploader_id': '29913724001', + 'timestamp': 1464107587, + 'uploader': 'TheAtlantic', + }, + 'add_ie': ['BrightcoveLegacy'], + }, + # Facebook <iframe> embed + { + 'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html', + 'md5': 'fbcde74f534176ecb015849146dd3aee', + 'info_dict': { + 'id': '599637780109885', + 'ext': 'mp4', + 'title': 'Facebook video #599637780109885', + }, + }, + # Facebook API embed + { + 'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/', + 'md5': 'a47372ee61b39a7b90287094d447d94e', + 'info_dict': { + 'id': '10153467542406923', + 'ext': 'mp4', + 'title': 'Facebook video #10153467542406923', + }, + }, + # Wordpress "YouTube Video Importer" plugin + { + 'url': 'http://www.lothype.com/blue-devils-drumline-stanford-lot-2016/', + 'md5': 'd16797741b560b485194eddda8121b48', + 'info_dict': { + 'id': 'HNTXWDXV9Is', + 'ext': 'mp4', + 'title': 'Blue Devils Drumline Stanford lot 2016', + 'upload_date': 
'20160627', + 'uploader_id': 'GENOCIDE8GENERAL10', + 'uploader': 'cylus cyrus', + }, + }, + { + # video stored on custom kaltura server + 'url': 'http://www.expansion.com/multimedia/videos.html?media=EQcM30NHIPv', + 'md5': '537617d06e64dfed891fa1593c4b30cc', + 'info_dict': { + 'id': '0_1iotm5bh', + 'ext': 'mp4', + 'title': 'Elecciones británicas: 5 lecciones para Rajoy', + 'description': 'md5:435a89d68b9760b92ce67ed227055f16', + 'uploader_id': 'videos.expansion@el-mundo.net', + 'upload_date': '20150429', + 'timestamp': 1430303472, + }, + 'add_ie': ['Kaltura'], + }, + { + # Non-standard Vimeo embed + 'url': 'https://openclassrooms.com/courses/understanding-the-web', + 'md5': '64d86f1c7d369afd9a78b38cbb88d80a', + 'info_dict': { + 'id': '148867247', + 'ext': 'mp4', + 'title': 'Understanding the web - Teaser', + 'description': 'This is "Understanding the web - Teaser" by openclassrooms on Vimeo, the home for high quality videos and the people who love them.', + 'upload_date': '20151214', + 'uploader': 'OpenClassrooms', + 'uploader_id': 'openclassrooms', + }, + 'add_ie': ['Vimeo'], + }, + { + 'url': 'https://support.arkena.com/display/PLAY/Ways+to+embed+your+video', + 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', + 'info_dict': { + 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', + 'ext': 'mp4', + 'title': 'Big Buck Bunny', + 'description': 'Royalty free test video', + 'timestamp': 1432816365, + 'upload_date': '20150528', + 'is_live': False, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [ArkenaIE.ie_key()], + }, + { + 'url': 'http://nova.bg/news/view/2016/08/16/156543/%D0%BD%D0%B0-%D0%BA%D0%BE%D1%81%D1%8A%D0%BC-%D0%BE%D1%82-%D0%B2%D0%B7%D1%80%D0%B8%D0%B2-%D0%BE%D1%82%D1%86%D0%B5%D0%BF%D0%B8%D1%85%D0%B0-%D1%86%D1%8F%D0%BB-%D0%BA%D0%B2%D0%B0%D1%80%D1%82%D0%B0%D0%BB-%D0%B7%D0%B0%D1%80%D0%B0%D0%B4%D0%B8-%D0%B8%D0%B7%D1%82%D0%B8%D1%87%D0%B0%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%B3%D0%B0%D0%B7-%D0%B2-%D0%BF%D0%BB%D0%BE%D0%B2%D0%B4%D0%B8%D0%B2/', + 'info_dict': { + 'id': 
'1c7141f46c', + 'ext': 'mp4', + 'title': 'НА КОСЪМ ОТ ВЗРИВ: Изтичане на газ на бензиностанция в Пловдив', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [Vbox7IE.ie_key()], + }, + # { + # # TODO: find another test + # # http://schema.org/VideoObject + # 'url': 'https://flipagram.com/f/nyvTSJMKId', + # 'md5': '888dcf08b7ea671381f00fab74692755', + # 'info_dict': { + # 'id': 'nyvTSJMKId', + # 'ext': 'mp4', + # 'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction', + # 'description': '#love for cats.', + # 'timestamp': 1461244995, + # 'upload_date': '20160421', + # }, + # 'params': { + # 'force_generic_extractor': True, + # }, + # } ] def report_following_redirect(self, new_url): @@ -1576,12 +1759,16 @@ class GenericIE(InfoExtractor): if matches: return _playlist_from_matches(matches, lambda m: unescapeHTML(m)) - # Look for embedded Dailymotion player - matches = re.findall( - r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage) + # Look for Wordpress "YouTube Video Importer" plugin + matches = re.findall(r'''(?x)<div[^>]+ + class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ + data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) if matches: - return _playlist_from_matches( - matches, lambda m: unescapeHTML(m[1])) + return _playlist_from_matches(matches, lambda m: m[-1]) + + matches = DailymotionIE._extract_urls(webpage) + if matches: + return _playlist_from_matches(matches) # Look for embedded Dailymotion playlist player (#3822) m = re.search( @@ -1718,10 +1905,9 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url')) # Look for embedded Facebook player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Facebook') + 
facebook_url = FacebookIE._extract_url(webpage) + if facebook_url is not None: + return self.url_result(facebook_url, 'Facebook') # Look for embedded VK player mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage) @@ -1836,12 +2022,9 @@ class GenericIE(InfoExtractor): return self.url_result(myvi_url) # Look for embedded soundcloud player - mobj = re.search( - r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"', - webpage) - if mobj is not None: - url = unescapeHTML(mobj.group('url')) - return self.url_result(url) + soundcloud_urls = SoundcloudIE._extract_urls(webpage) + if soundcloud_urls: + return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) # Look for embedded mtvservices player mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) @@ -1903,18 +2086,14 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url'), 'Zapiks') # Look for Kaltura embeds - mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?(?P<q1>['\"])wid(?P=q1)\s*:\s*(?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*?(?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*(?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4),", webpage) or - re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage)) - if mobj is not None: - return self.url_result(smuggle_url( - 'kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), - {'source_url': url}), 'Kaltura') + kaltura_url = KalturaIE._extract_url(webpage) + if kaltura_url: + return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key()) # Look for Eagle.Platform embeds - mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'EaglePlatform') + 
eagleplatform_url = EaglePlatformIE._extract_url(webpage) + if eagleplatform_url: + return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key()) # Look for ClipYou (uses Eagle.Platform) embeds mobj = re.search( @@ -2008,6 +2187,11 @@ class GenericIE(InfoExtractor): if digiteka_url: return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key()) + # Look for Arkena embeds + arkena_url = ArkenaIE._extract_url(webpage) + if arkena_url: + return self.url_result(arkena_url, ArkenaIE.ie_key()) + # Look for Limelight embeds mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage) if mobj: @@ -2036,6 +2220,14 @@ class GenericIE(InfoExtractor): return self.url_result( self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine') + # Look for VODPlatform embeds + mobj = re.search( + r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vod-platform\.net/embed/[^/?#]+)', + webpage) + if mobj is not None: + return self.url_result( + self._proto_relative_url(unescapeHTML(mobj.group(1))), 'VODPlatform') + # Look for Instagram embeds instagram_embed_url = InstagramIE._extract_embed_url(webpage) if instagram_embed_url is not None: @@ -2060,6 +2252,24 @@ class GenericIE(InfoExtractor): 'uploader': video_uploader, } + # Look for VBOX7 embeds + vbox7_url = Vbox7IE._extract_url(webpage) + if vbox7_url: + return self.url_result(vbox7_url, Vbox7IE.ie_key()) + + # Looking for http://schema.org/VideoObject + json_ld = self._search_json_ld( + webpage, video_id, default={}, expected_type='VideoObject') + if json_ld.get('url'): + info_dict.update({ + 'title': video_title or info_dict['title'], + 'description': video_description, + 'thumbnail': video_thumbnail, + 'age_limit': age_limit + }) + info_dict.update(json_ld) + return info_dict + def check_video(vurl): if YoutubeIE.suitable(vurl): return True @@ -2103,6 +2313,9 @@ class GenericIE(InfoExtractor): r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage) if 
not found: # Try to find twitter cards info + # twitter:player:stream should be checked before twitter:player since + # it is expected to contain a raw stream (see + # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) found = filter_video(re.findall( r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)) if not found: @@ -2136,6 +2349,15 @@ class GenericIE(InfoExtractor): '_type': 'url', 'url': new_url, } + + if not found: + # twitter:player is a https URL to iframe player that may or may not + # be supported by youtube-dl thus this is checked the very last (see + # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) + embed_url = self._html_search_meta('twitter:player', webpage, default=None) + if embed_url: + return self.url_result(embed_url) + if not found: raise UnsupportedError(url) diff --git a/youtube_dl/extractor/goldenmoustache.py b/youtube_dl/extractor/goldenmoustache.py deleted file mode 100644 index 0fb5097..0000000 --- a/youtube_dl/extractor/goldenmoustache.py +++ /dev/null @@ -1,48 +0,0 @@ -from __future__ import unicode_literals - -from .common import InfoExtractor - - -class GoldenMoustacheIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?goldenmoustache\.com/(?P<display_id>[\w-]+)-(?P<id>\d+)' - _TESTS = [{ - 'url': 'http://www.goldenmoustache.com/suricate-le-poker-3700/', - 'md5': '0f904432fa07da5054d6c8beb5efb51a', - 'info_dict': { - 'id': '3700', - 'ext': 'mp4', - 'title': 'Suricate - Le Poker', - 'description': 'md5:3d1f242f44f8c8cb0a106f1fd08e5dc9', - 'thumbnail': 're:^https?://.*\.jpg$', - } - }, { - 'url': 'http://www.goldenmoustache.com/le-lab-tout-effacer-mc-fly-et-carlito-55249/', - 'md5': '27f0c50fb4dd5f01dc9082fc67cd5700', - 'info_dict': { - 'id': '55249', - 'ext': 'mp4', - 'title': 'Le LAB - Tout Effacer (Mc Fly et Carlito)', - 'description': 'md5:9b7fbf11023fb2250bd4b185e3de3b2a', - 'thumbnail': 're:^https?://.*\.(?:png|jpg)$', - } - }] - - 
def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - video_url = self._html_search_regex( - r'data-src-type="mp4" data-src="([^"]+)"', webpage, 'video URL') - title = self._html_search_regex( - r'<title>(.*?)(?: - Golden Moustache)?</title>', webpage, 'title') - thumbnail = self._og_search_thumbnail(webpage) - description = self._og_search_description(webpage) - - return { - 'id': video_id, - 'url': video_url, - 'ext': 'mp4', - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - } diff --git a/youtube_dl/extractor/hgtv.py b/youtube_dl/extractor/hgtv.py new file mode 100644 index 0000000..c3f0733 --- /dev/null +++ b/youtube_dl/extractor/hgtv.py @@ -0,0 +1,48 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + js_to_json, + smuggle_url, +) + + +class HGTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?hgtv\.ca/[^/]+/video/(?P<id>[^/]+)/video.html' + _TEST = { + 'url': 'http://www.hgtv.ca/homefree/video/overnight-success/video.html?v=738081859718&p=1&s=da#video', + 'md5': '', + 'info_dict': { + 'id': 'aFH__I_5FBOX', + 'ext': 'mp4', + 'title': 'Overnight Success', + 'description': 'After weeks of hard work, high stakes, breakdowns and pep talks, the final 2 contestants compete to win the ultimate dream.', + 'uploader': 'SHWM-NEW', + 'timestamp': 1470320034, + 'upload_date': '20160804', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + embed_vars = self._parse_json(self._search_regex( + r'(?s)embed_vars\s*=\s*({.*?});', + webpage, 'embed vars'), display_id, js_to_json) + return { + '_type': 'url_transparent', + 'url': smuggle_url( + 'http://link.theplatform.com/s/dtjsEC/%s?mbr=true&manifest=m3u' % embed_vars['pid'], { + 'force_smil_url': True + }), + 
'series': embed_vars.get('show'), + 'season_number': int_or_none(embed_vars.get('season')), + 'episode_number': int_or_none(embed_vars.get('episode')), + 'ie_key': 'ThePlatform', + } diff --git a/youtube_dl/extractor/hrti.py b/youtube_dl/extractor/hrti.py new file mode 100644 index 0000000..656ce6d --- /dev/null +++ b/youtube_dl/extractor/hrti.py @@ -0,0 +1,202 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import json +import re + +from .common import InfoExtractor +from ..compat import compat_HTTPError +from ..utils import ( + clean_html, + ExtractorError, + int_or_none, + parse_age_limit, + sanitized_Request, + try_get, +) + + +class HRTiBaseIE(InfoExtractor): + """ + Base Information Extractor for Croatian Radiotelevision + video on demand site https://hrti.hrt.hr + Reverse engineered from the JavaScript app in app.min.js + """ + _NETRC_MACHINE = 'hrti' + + _APP_LANGUAGE = 'hr' + _APP_VERSION = '1.1' + _APP_PUBLICATION_ID = 'all_in_one' + _API_URL = 'http://clientapi.hrt.hr/client_api.php/config/identify/format/json' + + def _initialize_api(self): + init_data = { + 'application_publication_id': self._APP_PUBLICATION_ID + } + + uuid = self._download_json( + self._API_URL, None, note='Downloading uuid', + errnote='Unable to download uuid', + data=json.dumps(init_data).encode('utf-8'))['uuid'] + + app_data = { + 'uuid': uuid, + 'application_publication_id': self._APP_PUBLICATION_ID, + 'application_version': self._APP_VERSION + } + + req = sanitized_Request(self._API_URL, data=json.dumps(app_data).encode('utf-8')) + req.get_method = lambda: 'PUT' + + resources = self._download_json( + req, None, note='Downloading session information', + errnote='Unable to download session information') + + self._session_id = resources['session_id'] + + modules = resources['modules'] + + self._search_url = modules['vod_catalog']['resources']['search']['uri'].format( + language=self._APP_LANGUAGE, + application_id=self._APP_PUBLICATION_ID) + + self._login_url = 
(modules['user']['resources']['login']['uri'] + + '/format/json').format(session_id=self._session_id) + + self._logout_url = modules['user']['resources']['logout']['uri'] + + def _login(self): + (username, password) = self._get_login_info() + # TODO: figure out authentication with cookies + if username is None or password is None: + self.raise_login_required() + + auth_data = { + 'username': username, + 'password': password, + } + + try: + auth_info = self._download_json( + self._login_url, None, note='Logging in', errnote='Unable to log in', + data=json.dumps(auth_data).encode('utf-8')) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 406: + auth_info = self._parse_json(e.cause.read().encode('utf-8'), None) + else: + raise + + error_message = auth_info.get('error', {}).get('message') + if error_message: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error_message), + expected=True) + + self._token = auth_info['secure_streaming_token'] + + def _real_initialize(self): + self._initialize_api() + self._login() + + +class HRTiIE(HRTiBaseIE): + _VALID_URL = r'''(?x) + (?: + hrti:(?P<short_id>[0-9]+)| + https?:// + hrti\.hrt\.hr/\#/video/show/(?P<id>[0-9]+)/(?P<display_id>[^/]+)? 
+ ) + ''' + _TESTS = [{ + 'url': 'https://hrti.hrt.hr/#/video/show/2181385/republika-dokumentarna-serija-16-hd', + 'info_dict': { + 'id': '2181385', + 'display_id': 'republika-dokumentarna-serija-16-hd', + 'ext': 'mp4', + 'title': 'REPUBLIKA, dokumentarna serija (1/6) (HD)', + 'description': 'md5:48af85f620e8e0e1df4096270568544f', + 'duration': 2922, + 'view_count': int, + 'average_rating': int, + 'episode_number': int, + 'season_number': int, + 'age_limit': 12, + }, + 'skip': 'Requires account credentials', + }, { + 'url': 'https://hrti.hrt.hr/#/video/show/2181385/', + 'only_matching': True, + }, { + 'url': 'hrti:2181385', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('short_id') or mobj.group('id') + display_id = mobj.group('display_id') or video_id + + video = self._download_json( + '%s/video_id/%s/format/json' % (self._search_url, video_id), + display_id, 'Downloading video metadata JSON')['video'][0] + + title_info = video['title'] + title = title_info['title_long'] + + movie = video['video_assets']['movie'][0] + m3u8_url = movie['url'].format(TOKEN=self._token) + formats = self._extract_m3u8_formats( + m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls') + self._sort_formats(formats) + + description = clean_html(title_info.get('summary_long')) + age_limit = parse_age_limit(video.get('parental_control', {}).get('rating')) + view_count = int_or_none(video.get('views')) + average_rating = int_or_none(video.get('user_rating')) + duration = int_or_none(movie.get('duration')) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'duration': duration, + 'view_count': view_count, + 'average_rating': average_rating, + 'age_limit': age_limit, + 'formats': formats, + } + + +class HRTiPlaylistIE(HRTiBaseIE): + _VALID_URL = r'https?://hrti.hrt.hr/#/video/list/category/(?P<id>[0-9]+)/(?P<display_id>[^/]+)?' 
+ _TESTS = [{ + 'url': 'https://hrti.hrt.hr/#/video/list/category/212/ekumena', + 'info_dict': { + 'id': '212', + 'title': 'ekumena', + }, + 'playlist_mincount': 8, + 'skip': 'Requires account credentials', + }, { + 'url': 'https://hrti.hrt.hr/#/video/list/category/212/', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + category_id = mobj.group('id') + display_id = mobj.group('display_id') or category_id + + response = self._download_json( + '%s/category_id/%s/format/json' % (self._search_url, category_id), + display_id, 'Downloading video metadata JSON') + + video_ids = try_get( + response, lambda x: x['video_listings'][0]['alternatives'][0]['list'], + list) or [video['id'] for video in response.get('videos', []) if video.get('id')] + + entries = [self.url_result('hrti:%s' % video_id) for video_id in video_ids] + + return self.playlist_result(entries, category_id, display_id) diff --git a/youtube_dl/extractor/imgur.py b/youtube_dl/extractor/imgur.py index 85e9344..d23489d 100644 --- a/youtube_dl/extractor/imgur.py +++ b/youtube_dl/extractor/imgur.py @@ -50,12 +50,10 @@ class ImgurIE(InfoExtractor): webpage = self._download_webpage( compat_urlparse.urljoin(url, video_id), video_id) - width = int_or_none(self._search_regex( - r'<param name="width" value="([0-9]+)"', - webpage, 'width', fatal=False)) - height = int_or_none(self._search_regex( - r'<param name="height" value="([0-9]+)"', - webpage, 'height', fatal=False)) + width = int_or_none(self._og_search_property( + 'video:width', webpage, default=None)) + height = int_or_none(self._og_search_property( + 'video:height', webpage, default=None)) video_elements = self._search_regex( r'(?s)<div class="video-elements">(.*?)</div>', diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py index fc0197a..8f7f232 100644 --- a/youtube_dl/extractor/instagram.py +++ b/youtube_dl/extractor/instagram.py @@ -36,7 +36,6 @@ class 
InstagramIE(InfoExtractor): 'info_dict': { 'id': 'BA-pQFBG8HZ', 'ext': 'mp4', - 'uploader_id': 'britneyspears', 'title': 'Video by britneyspears', 'thumbnail': 're:^https?://.*\.jpg', 'timestamp': 1453760977, diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py index ddcb3c9..01c7b30 100644 --- a/youtube_dl/extractor/iqiyi.py +++ b/youtube_dl/extractor/iqiyi.py @@ -3,28 +3,22 @@ from __future__ import unicode_literals import hashlib import itertools -import math -import os -import random import re import time -import uuid from .common import InfoExtractor from ..compat import ( - compat_parse_qs, compat_str, compat_urllib_parse_urlencode, - compat_urllib_parse_urlparse, ) from ..utils import ( + clean_html, decode_packed_codes, + get_element_by_id, + get_element_by_attribute, ExtractorError, ohdave_rsa_encrypt, remove_start, - sanitized_Request, - urlencode_postdata, - url_basename, ) @@ -171,70 +165,21 @@ class IqiyiIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.iqiyi.com/v_19rrojlavg.html', - 'md5': '2cb594dc2781e6c941a110d8f358118b', + # MD5 checksum differs on my machine and Travis CI 'info_dict': { 'id': '9c1fb1b99d192b21c559e5a1a2cb3c73', + 'ext': 'mp4', 'title': '美国德州空中惊现奇异云团 酷似UFO', - 'ext': 'f4v', } }, { 'url': 'http://www.iqiyi.com/v_19rrhnnclk.html', + 'md5': '667171934041350c5de3f5015f7f1152', 'info_dict': { 'id': 'e3f585b550a280af23c98b6cb2be19fb', - 'title': '名侦探柯南第752集', - }, - 'playlist': [{ - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part1', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part2', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part3', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part4', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 
'e3f585b550a280af23c98b6cb2be19fb_part5', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part6', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part7', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }, { - 'info_dict': { - 'id': 'e3f585b550a280af23c98b6cb2be19fb_part8', - 'ext': 'f4v', - 'title': '名侦探柯南第752集', - }, - }], - 'params': { - 'skip_download': True, + 'ext': 'mp4', + 'title': '名侦探柯南 国语版:第752集 迫近灰原秘密的黑影 下篇', }, + 'skip': 'Geo-restricted to China', }, { 'url': 'http://www.iqiyi.com/w_19rt6o8t9p.html', 'only_matching': True, @@ -250,22 +195,10 @@ class IqiyiIE(InfoExtractor): 'url': 'http://www.iqiyi.com/v_19rrny4w8w.html', 'info_dict': { 'id': 'f3cf468b39dddb30d676f89a91200dc1', + 'ext': 'mp4', 'title': '泰坦尼克号', }, - 'playlist': [{ - 'info_dict': { - 'id': 'f3cf468b39dddb30d676f89a91200dc1_part1', - 'ext': 'f4v', - 'title': '泰坦尼克号', - }, - }, { - 'info_dict': { - 'id': 'f3cf468b39dddb30d676f89a91200dc1_part2', - 'ext': 'f4v', - 'title': '泰坦尼克号', - }, - }], - 'expected_warnings': ['Needs a VIP account for full video'], + 'skip': 'Geo-restricted to China', }, { 'url': 'http://www.iqiyi.com/a_19rrhb8ce1.html', 'info_dict': { @@ -278,20 +211,15 @@ class IqiyiIE(InfoExtractor): 'only_matching': True, }] - _FORMATS_MAP = [ - ('1', 'h6'), - ('2', 'h5'), - ('3', 'h4'), - ('4', 'h3'), - ('5', 'h2'), - ('10', 'h1'), - ] - - AUTH_API_ERRORS = { - # No preview available (不允许试看鉴权失败) - 'Q00505': 'This video requires a VIP account', - # End of preview time (试看结束鉴权失败) - 'Q00506': 'Needs a VIP account for full video', + _FORMATS_MAP = { + '96': 1, # 216p, 240p + '1': 2, # 336p, 360p + '2': 3, # 480p, 504p + '21': 4, # 504p + '4': 5, # 720p + '17': 5, # 720p + '5': 6, # 1072p, 1080p + '18': 7, # 1080p } def _real_initialize(self): @@ -352,177 +280,23 @@ class IqiyiIE(InfoExtractor): return True - def _authenticate_vip_video(self, api_video_url, 
video_id, tvid, _uuid, do_report_warning): - auth_params = { - # version and platform hard-coded in com/qiyi/player/core/model/remote/AuthenticationRemote.as - 'version': '2.0', - 'platform': 'b6c13e26323c537d', - 'aid': tvid, - 'tvid': tvid, - 'uid': '', - 'deviceId': _uuid, - 'playType': 'main', # XXX: always main? - 'filename': os.path.splitext(url_basename(api_video_url))[0], - } + def get_raw_data(self, tvid, video_id): + tm = int(time.time() * 1000) - qd_items = compat_parse_qs(compat_urllib_parse_urlparse(api_video_url).query) - for key, val in qd_items.items(): - auth_params[key] = val[0] - - auth_req = sanitized_Request( - 'http://api.vip.iqiyi.com/services/ckn.action', - urlencode_postdata(auth_params)) - # iQiyi server throws HTTP 405 error without the following header - auth_req.add_header('Content-Type', 'application/x-www-form-urlencoded') - auth_result = self._download_json( - auth_req, video_id, - note='Downloading video authentication JSON', - errnote='Unable to download video authentication JSON') - - code = auth_result.get('code') - msg = self.AUTH_API_ERRORS.get(code) or auth_result.get('msg') or code - if code == 'Q00506': - if do_report_warning: - self.report_warning(msg) - return False - if 'data' not in auth_result: - if msg is not None: - raise ExtractorError('%s said: %s' % (self.IE_NAME, msg), expected=True) - raise ExtractorError('Unexpected error from Iqiyi auth API') - - return auth_result['data'] - - def construct_video_urls(self, data, video_id, _uuid, tvid): - def do_xor(x, y): - a = y % 3 - if a == 1: - return x ^ 121 - if a == 2: - return x ^ 72 - return x ^ 103 - - def get_encode_code(l): - a = 0 - b = l.split('-') - c = len(b) - s = '' - for i in range(c - 1, -1, -1): - a = do_xor(int(b[c - i - 1], 16), i) - s += chr(a) - return s[::-1] - - def get_path_key(x, format_id, segment_index): - mg = ')(*&^flash@#$%a' - tm = self._download_json( - 'http://data.video.qiyi.com/t?tn=' + str(random.random()), video_id, - note='Download 
path key of segment %d for format %s' % (segment_index + 1, format_id) - )['t'] - t = str(int(math.floor(int(tm) / (600.0)))) - return md5_text(t + mg + x) - - video_urls_dict = {} - need_vip_warning_report = True - for format_item in data['vp']['tkl'][0]['vs']: - if 0 < int(format_item['bid']) <= 10: - format_id = self.get_format(format_item['bid']) - else: - continue - - video_urls = [] - - video_urls_info = format_item['fs'] - if not format_item['fs'][0]['l'].startswith('/'): - t = get_encode_code(format_item['fs'][0]['l']) - if t.endswith('mp4'): - video_urls_info = format_item['flvs'] - - for segment_index, segment in enumerate(video_urls_info): - vl = segment['l'] - if not vl.startswith('/'): - vl = get_encode_code(vl) - is_vip_video = '/vip/' in vl - filesize = segment['b'] - base_url = data['vp']['du'].split('/') - if not is_vip_video: - key = get_path_key( - vl.split('/')[-1].split('.')[0], format_id, segment_index) - base_url.insert(-1, key) - base_url = '/'.join(base_url) - param = { - 'su': _uuid, - 'qyid': uuid.uuid4().hex, - 'client': '', - 'z': '', - 'bt': '', - 'ct': '', - 'tn': str(int(time.time())) - } - api_video_url = base_url + vl - if is_vip_video: - api_video_url = api_video_url.replace('.f4v', '.hml') - auth_result = self._authenticate_vip_video( - api_video_url, video_id, tvid, _uuid, need_vip_warning_report) - if auth_result is False: - need_vip_warning_report = False - break - param.update({ - 't': auth_result['t'], - # cid is hard-coded in com/qiyi/player/core/player/RuntimeData.as - 'cid': 'afbe8fd3d73448c9', - 'vid': video_id, - 'QY00001': auth_result['u'], - }) - api_video_url += '?' if '?' 
not in api_video_url else '&' - api_video_url += compat_urllib_parse_urlencode(param) - js = self._download_json( - api_video_url, video_id, - note='Download video info of segment %d for format %s' % (segment_index + 1, format_id)) - video_url = js['l'] - video_urls.append( - (video_url, filesize)) - - video_urls_dict[format_id] = video_urls - return video_urls_dict - - def get_format(self, bid): - matched_format_ids = [_format_id for _bid, _format_id in self._FORMATS_MAP if _bid == str(bid)] - return matched_format_ids[0] if len(matched_format_ids) else None - - def get_bid(self, format_id): - matched_bids = [_bid for _bid, _format_id in self._FORMATS_MAP if _format_id == format_id] - return matched_bids[0] if len(matched_bids) else None - - def get_raw_data(self, tvid, video_id, enc_key, _uuid): - tm = str(int(time.time())) - tail = tm + tvid - param = { - 'key': 'fvip', - 'src': md5_text('youtube-dl'), - 'tvId': tvid, + key = 'd5fb4bd9d50c4be6948c97edd7254b0e' + sc = md5_text(compat_str(tm) + key + tvid) + params = { + 'tvid': tvid, 'vid': video_id, - 'vinfo': 1, - 'tm': tm, - 'enc': md5_text(enc_key + tail), - 'qyid': _uuid, - 'tn': random.random(), - # In iQiyi's flash player, um is set to 1 if there's a logged user - # Some 1080P formats are only available with a logged user. - # Here force um=1 to trick the iQiyi server - 'um': 1, - 'authkey': md5_text(md5_text('') + tail), - 'k_tag': 1, + 'src': '76f90cbd92f94a2e925d83e8ccd22cb7', + 'sc': sc, + 't': tm, } - api_url = 'http://cache.video.qiyi.com/vms' + '?' 
+ \ - compat_urllib_parse_urlencode(param) - raw_data = self._download_json(api_url, video_id) - return raw_data - - def get_enc_key(self, video_id): - # TODO: automatic key extraction - # last update at 2016-01-22 for Zombie::bite - enc_key = '4a1caba4b4465345366f28da7c117d20' - return enc_key + return self._download_json( + 'http://cache.m.iqiyi.com/jp/tmts/%s/%s/' % (tvid, video_id), + video_id, transform_source=lambda s: remove_start(s, 'var tvInfoJs='), + query=params, headers=self.geo_verification_headers()) def _extract_playlist(self, webpage): PAGE_SIZE = 50 @@ -571,58 +345,41 @@ class IqiyiIE(InfoExtractor): r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid') video_id = self._search_regex( r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id') - _uuid = uuid.uuid4().hex - - enc_key = self.get_enc_key(video_id) - - raw_data = self.get_raw_data(tvid, video_id, enc_key, _uuid) - - if raw_data['code'] != 'A000000': - raise ExtractorError('Unable to load data. Error code: ' + raw_data['code']) - - data = raw_data['data'] - - title = data['vi']['vn'] - - # generate video_urls_dict - video_urls_dict = self.construct_video_urls( - data, video_id, _uuid, tvid) - - # construct info - entries = [] - for format_id in video_urls_dict: - video_urls = video_urls_dict[format_id] - for i, video_url_info in enumerate(video_urls): - if len(entries) < i + 1: - entries.append({'formats': []}) - entries[i]['formats'].append( - { - 'url': video_url_info[0], - 'filesize': video_url_info[-1], - 'format_id': format_id, - 'preference': int(self.get_bid(format_id)) - } - ) - - for i in range(len(entries)): - self._sort_formats(entries[i]['formats']) - entries[i].update( - { - 'id': '%s_part%d' % (video_id, i + 1), - 'title': title, - } - ) - - if len(entries) > 1: - info = { - '_type': 'multi_video', - 'id': video_id, - 'title': title, - 'entries': entries, - } - else: - info = entries[0] - info['id'] = video_id - info['title'] = title - - return info + + formats = [] 
+ for _ in range(5): + raw_data = self.get_raw_data(tvid, video_id) + + if raw_data['code'] != 'A00000': + if raw_data['code'] == 'A00111': + self.raise_geo_restricted() + raise ExtractorError('Unable to load data. Error code: ' + raw_data['code']) + + data = raw_data['data'] + + for stream in data['vidl']: + if 'm3utx' not in stream: + continue + vd = compat_str(stream['vd']) + formats.append({ + 'url': stream['m3utx'], + 'format_id': vd, + 'ext': 'mp4', + 'preference': self._FORMATS_MAP.get(vd, -1), + 'protocol': 'm3u8_native', + }) + + if formats: + break + + self._sleep(5, video_id) + + self._sort_formats(formats) + title = (get_element_by_id('widget-videotitle', webpage) or + clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage))) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } diff --git a/youtube_dl/extractor/jwplatform.py b/youtube_dl/extractor/jwplatform.py index e44e311..ce31269 100644 --- a/youtube_dl/extractor/jwplatform.py +++ b/youtube_dl/extractor/jwplatform.py @@ -4,10 +4,12 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( determine_ext, float_or_none, int_or_none, + mimetype2ext, ) @@ -28,74 +30,86 @@ class JWPlatformBaseIE(InfoExtractor): return self._parse_jwplayer_data( jwplayer_data, video_id, *args, **kwargs) - def _parse_jwplayer_data(self, jwplayer_data, video_id, require_title=True, m3u8_id=None, rtmp_params=None): + def _parse_jwplayer_data(self, jwplayer_data, video_id=None, require_title=True, m3u8_id=None, rtmp_params=None, base_url=None): # JWPlayer backward compatibility: flattened playlists # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/api/config.js#L81-L96 if 'playlist' not in jwplayer_data: jwplayer_data = {'playlist': [jwplayer_data]} - video_data = jwplayer_data['playlist'][0] - - # JWPlayer backward compatibility: flattened sources - # 
https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 - if 'sources' not in video_data: - video_data['sources'] = [video_data] - - formats = [] - for source in video_data['sources']: - source_url = self._proto_relative_url(source['file']) - source_type = source.get('type') or '' - if source_type in ('application/vnd.apple.mpegurl', 'hls') or determine_ext(source_url) == 'm3u8': - formats.extend(self._extract_m3u8_formats( - source_url, video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) - elif source_type.startswith('audio'): - formats.append({ - 'url': source_url, - 'vcodec': 'none', - }) - else: - a_format = { - 'url': source_url, - 'width': int_or_none(source.get('width')), - 'height': int_or_none(source.get('height')), - } - if source_url.startswith('rtmp'): - a_format['ext'] = 'flv', - - # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as - # of jwplayer.flash.swf - rtmp_url_parts = re.split( - r'((?:mp4|mp3|flv):)', source_url, 1) - if len(rtmp_url_parts) == 3: - rtmp_url, prefix, play_path = rtmp_url_parts - a_format.update({ - 'url': rtmp_url, - 'play_path': prefix + play_path, - }) - if rtmp_params: - a_format.update(rtmp_params) - formats.append(a_format) - self._sort_formats(formats) - - subtitles = {} - tracks = video_data.get('tracks') - if tracks and isinstance(tracks, list): - for track in tracks: - if track.get('file') and track.get('kind') == 'captions': - subtitles.setdefault(track.get('label') or 'en', []).append({ - 'url': self._proto_relative_url(track['file']) + entries = [] + for video_data in jwplayer_data['playlist']: + # JWPlayer backward compatibility: flattened sources + # https://github.com/jwplayer/jwplayer/blob/v7.4.3/src/js/playlist/item.js#L29-L35 + if 'sources' not in video_data: + video_data['sources'] = [video_data] + + this_video_id = video_id or video_data['mediaid'] + + formats = [] + for source in video_data['sources']: + source_url = self._proto_relative_url(source['file']) + if 
base_url: + source_url = compat_urlparse.urljoin(base_url, source_url) + source_type = source.get('type') or '' + ext = mimetype2ext(source_type) or determine_ext(source_url) + if source_type == 'hls' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, this_video_id, 'mp4', 'm3u8_native', m3u8_id=m3u8_id, fatal=False)) + # https://github.com/jwplayer/jwplayer/blob/master/src/js/providers/default.js#L67 + elif source_type.startswith('audio') or ext in ('oga', 'aac', 'mp3', 'mpeg', 'vorbis'): + formats.append({ + 'url': source_url, + 'vcodec': 'none', + 'ext': ext, }) + else: + a_format = { + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'height': int_or_none(source.get('height')), + 'ext': ext, + } + if source_url.startswith('rtmp'): + a_format['ext'] = 'flv' + + # See com/longtailvideo/jwplayer/media/RTMPMediaProvider.as + # of jwplayer.flash.swf + rtmp_url_parts = re.split( + r'((?:mp4|mp3|flv):)', source_url, 1) + if len(rtmp_url_parts) == 3: + rtmp_url, prefix, play_path = rtmp_url_parts + a_format.update({ + 'url': rtmp_url, + 'play_path': prefix + play_path, + }) + if rtmp_params: + a_format.update(rtmp_params) + formats.append(a_format) + self._sort_formats(formats) + + subtitles = {} + tracks = video_data.get('tracks') + if tracks and isinstance(tracks, list): + for track in tracks: + if track.get('file') and track.get('kind') == 'captions': + subtitles.setdefault(track.get('label') or 'en', []).append({ + 'url': self._proto_relative_url(track['file']) + }) - return { - 'id': video_id, - 'title': video_data['title'] if require_title else video_data.get('title'), - 'description': video_data.get('description'), - 'thumbnail': self._proto_relative_url(video_data.get('image')), - 'timestamp': int_or_none(video_data.get('pubdate')), - 'duration': float_or_none(jwplayer_data.get('duration')), - 'subtitles': subtitles, - 'formats': formats, - } + entries.append({ + 'id': this_video_id, + 'title': video_data['title'] if 
require_title else video_data.get('title'), + 'description': video_data.get('description'), + 'thumbnail': self._proto_relative_url(video_data.get('image')), + 'timestamp': int_or_none(video_data.get('pubdate')), + 'duration': float_or_none(jwplayer_data.get('duration')), + 'subtitles': subtitles, + 'formats': formats, + }) + if len(entries) == 1: + return entries[0] + else: + return self.playlist_result(entries) class JWPlatformIE(JWPlatformBaseIE): diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py index a65697f..ddf1165 100644 --- a/youtube_dl/extractor/kaltura.py +++ b/youtube_dl/extractor/kaltura.py @@ -6,7 +6,6 @@ import base64 from .common import InfoExtractor from ..compat import ( - compat_urllib_parse_urlencode, compat_urlparse, compat_parse_qs, ) @@ -15,6 +14,7 @@ from ..utils import ( ExtractorError, int_or_none, unsmuggle_url, + smuggle_url, ) @@ -34,7 +34,8 @@ class KalturaIE(InfoExtractor): )(?:/(?P<path>[^?]+))?(?:\?(?P<query>.*))? ) ''' - _API_BASE = 'http://cdnapi.kaltura.com/api_v3/index.php?' + _SERVICE_URL = 'http://cdnapi.kaltura.com' + _SERVICE_BASE = '/api_v3/index.php' _TESTS = [ { 'url': 'kaltura:269692:1_1jc2y3e4', @@ -61,19 +62,58 @@ class KalturaIE(InfoExtractor): { 'url': 'https://cdnapisec.kaltura.com/html5/html5lib/v2.30.2/mwEmbedFrame.php/p/1337/uiconf_id/20540612/entry_id/1_sf5ovm7u?wid=_243342', 'only_matching': True, + }, + { + # video with subtitles + 'url': 'kaltura:111032:1_cw786r8q', + 'only_matching': True, } ] - def _kaltura_api_call(self, video_id, actions, *args, **kwargs): + @staticmethod + def _extract_url(webpage): + mobj = ( + re.search( + r"""(?xs) + kWidget\.(?:thumb)?[Ee]mbed\( + \{.*? + (?P<q1>['\"])wid(?P=q1)\s*:\s* + (?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*? 
+ (?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s* + (?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4), + """, webpage) or + re.search( + r'''(?xs) + (?P<q1>["\']) + (?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*? + (?P=q1).*? + (?: + entry_?[Ii]d| + (?P<q2>["\'])entry_?[Ii]d(?P=q2) + )\s*:\s* + (?P<q3>["\'])(?P<id>.+?)(?P=q3) + ''', webpage)) + if mobj: + embed_info = mobj.groupdict() + url = 'kaltura:%(partner_id)s:%(id)s' % embed_info + escaped_pid = re.escape(embed_info['partner_id']) + service_url = re.search( + r'<script[^>]+src=["\']((?:https?:)?//.+?)/p/%s/sp/%s00/embedIframeJs' % (escaped_pid, escaped_pid), + webpage) + if service_url: + url = smuggle_url(url, {'service_url': service_url.group(1)}) + return url + + def _kaltura_api_call(self, video_id, actions, service_url=None, *args, **kwargs): params = actions[0] if len(actions) > 1: for i, a in enumerate(actions[1:], start=1): for k, v in a.items(): params['%d:%s' % (i, k)] = v - query = compat_urllib_parse_urlencode(params) - url = self._API_BASE + query - data = self._download_json(url, video_id, *args, **kwargs) + data = self._download_json( + (service_url or self._SERVICE_URL) + self._SERVICE_BASE, + video_id, query=params, *args, **kwargs) status = data if len(actions) == 1 else data[0] if status.get('objectType') == 'KalturaAPIException': @@ -82,7 +122,7 @@ class KalturaIE(InfoExtractor): return data - def _get_kaltura_signature(self, video_id, partner_id): + def _get_kaltura_signature(self, video_id, partner_id, service_url=None): actions = [{ 'apiVersion': '3.1', 'expiry': 86400, @@ -92,10 +132,9 @@ class KalturaIE(InfoExtractor): 'widgetId': '_%s' % partner_id, }] return self._kaltura_api_call( - video_id, actions, note='Downloading Kaltura signature')['ks'] + video_id, actions, service_url, note='Downloading Kaltura signature')['ks'] - def _get_video_info(self, video_id, partner_id): - signature = self._get_kaltura_signature(video_id, partner_id) + def _get_video_info(self, 
video_id, partner_id, service_url=None): actions = [ { 'action': 'null', @@ -103,22 +142,34 @@ class KalturaIE(InfoExtractor): 'clientTag': 'kdp:v3.8.5', 'format': 1, # JSON, 2 = XML, 3 = PHP 'service': 'multirequest', - 'ks': signature, + }, + { + 'expiry': 86400, + 'service': 'session', + 'action': 'startWidgetSession', + 'widgetId': '_%s' % partner_id, }, { 'action': 'get', 'entryId': video_id, 'service': 'baseentry', - 'version': '-1', + 'ks': '{1:result:ks}', }, { 'action': 'getbyentryid', 'entryId': video_id, 'service': 'flavorAsset', + 'ks': '{1:result:ks}', + }, + { + 'action': 'list', + 'filter:entryIdEqual': video_id, + 'service': 'caption_captionasset', + 'ks': '{1:result:ks}', }, ] return self._kaltura_api_call( - video_id, actions, note='Downloading video info JSON') + video_id, actions, service_url, note='Downloading video info JSON') def _real_extract(self, url): url, smuggled_data = unsmuggle_url(url, {}) @@ -126,8 +177,9 @@ class KalturaIE(InfoExtractor): mobj = re.match(self._VALID_URL, url) partner_id, entry_id = mobj.group('partner_id', 'id') ks = None + captions = None if partner_id and entry_id: - info, flavor_assets = self._get_video_info(entry_id, partner_id) + _, info, flavor_assets, captions = self._get_video_info(entry_id, partner_id, smuggled_data.get('service_url')) else: path, query = mobj.group('path', 'query') if not path and not query: @@ -146,7 +198,7 @@ class KalturaIE(InfoExtractor): raise ExtractorError('Invalid URL', expected=True) if 'entry_id' in params: entry_id = params['entry_id'][0] - info, flavor_assets = self._get_video_info(entry_id, partner_id) + _, info, flavor_assets, captions = self._get_video_info(entry_id, partner_id) elif 'uiconf_id' in params and 'flashvars[referenceId]' in params: reference_id = params['flashvars[referenceId]'][0] webpage = self._download_webpage(url, reference_id) @@ -175,12 +227,17 @@ class KalturaIE(InfoExtractor): unsigned_url += '?referrer=%s' % referrer return unsigned_url + data_url = 
info['dataUrl'] + if '/flvclipper/' in data_url: + data_url = re.sub(r'/flvclipper/.*', '/serveFlavor', data_url) + formats = [] for f in flavor_assets: # Continue if asset is not ready - if f['status'] != 2: + if f.get('status') != 2: continue - video_url = sign_url('%s/flavorId/%s' % (info['dataUrl'], f['id'])) + video_url = sign_url( + '%s/flavorId/%s' % (data_url, f['id'])) formats.append({ 'format_id': '%(fileExt)s-%(bitrate)s' % f, 'ext': f.get('fileExt'), @@ -193,17 +250,31 @@ class KalturaIE(InfoExtractor): 'width': int_or_none(f.get('width')), 'url': video_url, }) - m3u8_url = sign_url(info['dataUrl'].replace('format/url', 'format/applehttp')) - formats.extend(self._extract_m3u8_formats( - m3u8_url, entry_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + if '/playManifest/' in data_url: + m3u8_url = sign_url(data_url.replace( + 'format/url', 'format/applehttp')) + formats.extend(self._extract_m3u8_formats( + m3u8_url, entry_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) - self._check_formats(formats, entry_id) self._sort_formats(formats) + subtitles = {} + if captions: + for caption in captions.get('objects', []): + # Continue if caption is not ready + if f.get('status') != 2: + continue + subtitles.setdefault(caption.get('languageCode') or caption.get('language'), []).append({ + 'url': '%s/api_v3/service/caption_captionasset/action/serve/captionAssetId/%s' % (self._SERVICE_URL, caption['id']), + 'ext': caption.get('fileExt'), + }) + return { 'id': entry_id, 'title': info['name'], 'formats': formats, + 'subtitles': subtitles, 'description': clean_html(info.get('description')), 'thumbnail': info.get('thumbnailUrl'), 'duration': info.get('duration'), diff --git a/youtube_dl/extractor/kamcord.py b/youtube_dl/extractor/kamcord.py new file mode 100644 index 0000000..b50120d --- /dev/null +++ b/youtube_dl/extractor/kamcord.py @@ -0,0 +1,71 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import 
compat_str +from ..utils import ( + int_or_none, + qualities, +) + + +class KamcordIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?kamcord\.com/v/(?P<id>[^/?#&]+)' + _TEST = { + 'url': 'https://www.kamcord.com/v/hNYRduDgWb4', + 'md5': 'c3180e8a9cfac2e86e1b88cb8751b54c', + 'info_dict': { + 'id': 'hNYRduDgWb4', + 'ext': 'mp4', + 'title': 'Drinking Madness', + 'uploader': 'jacksfilms', + 'uploader_id': '3044562', + 'view_count': int, + 'like_count': int, + 'comment_count': int, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video = self._parse_json( + self._search_regex( + r'window\.__props\s*=\s*({.+?});?(?:\n|\s*</script)', + webpage, 'video'), + video_id)['video'] + + title = video['title'] + + formats = self._extract_m3u8_formats( + video['play']['hls'], video_id, 'mp4', entry_protocol='m3u8_native') + self._sort_formats(formats) + + uploader = video.get('user', {}).get('username') + uploader_id = video.get('user', {}).get('id') + + view_count = int_or_none(video.get('viewCount')) + like_count = int_or_none(video.get('heartCount')) + comment_count = int_or_none(video.get('messageCount')) + + preference_key = qualities(('small', 'medium', 'large')) + + thumbnails = [{ + 'url': thumbnail_url, + 'id': thumbnail_id, + 'preference': preference_key(thumbnail_id), + } for thumbnail_id, thumbnail_url in (video.get('thumbnail') or {}).items() + if isinstance(thumbnail_id, compat_str) and isinstance(thumbnail_url, compat_str)] + + return { + 'id': video_id, + 'title': title, + 'uploader': uploader, + 'uploader_id': uploader_id, + 'view_count': view_count, + 'like_count': like_count, + 'comment_count': comment_count, + 'thumbnails': thumbnails, + 'formats': formats, + } diff --git a/youtube_dl/extractor/keezmovies.py b/youtube_dl/extractor/keezmovies.py index 126ca13..ad2f8a8 100644 --- a/youtube_dl/extractor/keezmovies.py +++ b/youtube_dl/extractor/keezmovies.py @@ -3,64 +3,124 @@ 
from __future__ import unicode_literals import re from .common import InfoExtractor +from ..aes import aes_decrypt_text +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) from ..utils import ( - sanitized_Request, - url_basename, + determine_ext, + ExtractorError, + int_or_none, + str_to_int, + strip_or_none, ) class KeezMoviesIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/.+?(?P<id>[0-9]+)(?:[/?&]|$)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?keezmovies\.com/video/(?:(?P<display_id>[^/]+)-)?(?P<id>\d+)' + _TESTS = [{ 'url': 'http://www.keezmovies.com/video/petite-asian-lady-mai-playing-in-bathtub-1214711', 'md5': '1c1e75d22ffa53320f45eeb07bc4cdc0', 'info_dict': { 'id': '1214711', + 'display_id': 'petite-asian-lady-mai-playing-in-bathtub', 'ext': 'mp4', 'title': 'Petite Asian Lady Mai Playing In Bathtub', - 'age_limit': 18, 'thumbnail': 're:^https?://.*\.jpg$', + 'view_count': int, + 'age_limit': 18, } - } + }, { + 'url': 'http://www.keezmovies.com/video/1214711', + 'only_matching': True, + }] - def _real_extract(self, url): - video_id = self._match_id(url) + def _extract_info(self, url): + mobj = re.match(self._VALID_URL, url) + video_id = mobj.group('id') + display_id = mobj.group('display_id') or video_id - req = sanitized_Request(url) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) + webpage = self._download_webpage( + url, display_id, headers={'Cookie': 'age_verified=1'}) - # embedded video - mobj = re.search(r'href="([^"]+)"></iframe>', webpage) - if mobj: - embedded_url = mobj.group(1) - return self.url_result(embedded_url) + formats = [] + format_urls = set() - video_title = self._html_search_regex( - r'<h1 [^>]*>([^<]+)', webpage, 'title') - flashvars = self._parse_json(self._search_regex( - r'var\s+flashvars\s*=\s*([^;]+);', webpage, 'flashvars'), video_id) + title = None + thumbnail = None + duration = None + encrypted = False - formats = [] - for 
height in (180, 240, 480): - if flashvars.get('quality_%dp' % height): - video_url = flashvars['quality_%dp' % height] - a_format = { - 'url': video_url, - 'height': height, - 'format_id': '%dp' % height, - } - filename_parts = url_basename(video_url).split('_') - if len(filename_parts) >= 2 and re.match(r'\d+[Kk]', filename_parts[1]): - a_format['tbr'] = int(filename_parts[1][:-1]) - formats.append(a_format) - - age_limit = self._rta_search(webpage) - - return { + def extract_format(format_url, height=None): + if not isinstance(format_url, compat_str) or not format_url.startswith('http'): + return + if format_url in format_urls: + return + format_urls.add(format_url) + tbr = int_or_none(self._search_regex( + r'[/_](\d+)[kK][/_]', format_url, 'tbr', default=None)) + if not height: + height = int_or_none(self._search_regex( + r'[/_](\d+)[pP][/_]', format_url, 'height', default=None)) + if encrypted: + format_url = aes_decrypt_text( + video_url, title, 32).decode('utf-8') + formats.append({ + 'url': format_url, + 'format_id': '%dp' % height if height else None, + 'height': height, + 'tbr': tbr, + }) + + flashvars = self._parse_json( + self._search_regex( + r'flashvars\s*=\s*({.+?});', webpage, + 'flashvars', default='{}'), + display_id, fatal=False) + + if flashvars: + title = flashvars.get('video_title') + thumbnail = flashvars.get('image_url') + duration = int_or_none(flashvars.get('video_duration')) + encrypted = flashvars.get('encrypted') is True + for key, value in flashvars.items(): + mobj = re.search(r'quality_(\d+)[pP]', key) + if mobj: + extract_format(value, int(mobj.group(1))) + video_url = flashvars.get('video_url') + if video_url and determine_ext(video_url, None): + extract_format(video_url) + + video_url = self._html_search_regex( + r'flashvars\.video_url\s*=\s*(["\'])(?P<url>http.+?)\1', + webpage, 'video url', default=None, group='url') + if video_url: + extract_format(compat_urllib_parse_unquote(video_url)) + + if not formats: + if 'title="This 
video is no longer available"' in webpage: + raise ExtractorError( + 'Video %s is no longer available' % video_id, expected=True) + + self._sort_formats(formats) + + if not title: + title = self._html_search_regex( + r'<h1[^>]*>([^<]+)', webpage, 'title') + + return webpage, { 'id': video_id, - 'title': video_title, + 'display_id': display_id, + 'title': strip_or_none(title), + 'thumbnail': thumbnail, + 'duration': duration, + 'age_limit': 18, 'formats': formats, - 'age_limit': age_limit, - 'thumbnail': flashvars.get('image_url') } + + def _real_extract(self, url): + webpage, info = self._extract_info(url) + info['view_count'] = str_to_int(self._search_regex( + r'<b>([\d,.]+)</b> Views?', webpage, 'view count', fatal=False)) + return info diff --git a/youtube_dl/extractor/kuwo.py b/youtube_dl/extractor/kuwo.py index 0221fb9..0eeb9ff 100644 --- a/youtube_dl/extractor/kuwo.py +++ b/youtube_dl/extractor/kuwo.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import re from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( get_element_by_id, clean_html, @@ -26,11 +27,6 @@ class KuwoBaseIE(InfoExtractor): def _get_formats(self, song_id, tolerate_ip_deny=False): formats = [] for file_format in self._FORMATS: - headers = {} - cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') - if cn_verification_proxy: - headers['Ytdl-request-proxy'] = cn_verification_proxy - query = { 'format': file_format['ext'], 'br': file_format.get('br', ''), @@ -42,7 +38,7 @@ class KuwoBaseIE(InfoExtractor): song_url = self._download_webpage( 'http://antiserver.kuwo.cn/anti.s', song_id, note='Download %s url info' % file_format['format'], - query=query, headers=headers, + query=query, headers=self.geo_verification_headers(), ) if song_url == 'IPDeny' and not tolerate_ip_deny: @@ -247,8 +243,9 @@ class KuwoSingerIE(InfoExtractor): query={'artistId': artist_id, 'pn': page_num, 'rn': self.PAGE_SIZE}) return [ - 
self.url_result(song_url, 'Kuwo') for song_url in re.findall( - r'<div[^>]+class="name"><a[^>]+href="(http://www\.kuwo\.cn/yinyue/\d+)', + self.url_result(compat_urlparse.urljoin(url, song_url), 'Kuwo') + for song_url in re.findall( + r'<div[^>]+class="name"><a[^>]+href="(/yinyue/\d+)', webpage) ] diff --git a/youtube_dl/extractor/la7.py b/youtube_dl/extractor/la7.py index b08f6e3..da5a5de 100644 --- a/youtube_dl/extractor/la7.py +++ b/youtube_dl/extractor/la7.py @@ -1,60 +1,65 @@ +# coding: utf-8 from __future__ import unicode_literals from .common import InfoExtractor from ..utils import ( - parse_duration, + js_to_json, + smuggle_url, ) class LA7IE(InfoExtractor): - IE_NAME = 'la7.tv' - _VALID_URL = r'''(?x) - https?://(?:www\.)?la7\.tv/ - (?: - richplayer/\?assetid=| - \?contentId= - ) - (?P<id>[0-9]+)''' - - _TEST = { - 'url': 'http://www.la7.tv/richplayer/?assetid=50355319', - 'md5': 'ec7d1f0224d20ba293ab56cf2259651f', + IE_NAME = 'la7.it' + _VALID_URL = r'''(?x)(https?://)?(?: + (?:www\.)?la7\.it/([^/]+)/(?:rivedila7|video)/| + tg\.la7\.it/repliche-tgla7\?id= + )(?P<id>.+)''' + + _TESTS = [{ + # 'src' is a plain URL + 'url': 'http://www.la7.it/crozza/video/inccool8-02-10-2015-163722', + 'md5': '8b613ffc0c4bf9b9e377169fc19c214c', 'info_dict': { - 'id': '50355319', + 'id': 'inccool8-02-10-2015-163722', 'ext': 'mp4', - 'title': 'IL DIVO', - 'description': 'Un film di Paolo Sorrentino con Toni Servillo, Anna Bonaiuto, Giulio Bosetti e Flavio Bucci', - 'duration': 6254, + 'title': 'Inc.Cool8', + 'description': 'Benvenuti nell\'incredibile mondo della INC. COOL. 8. 
dove “INC.” sta per “Incorporated” “COOL” sta per “fashion” ed Eight sta per il gesto atletico', + 'thumbnail': 're:^https?://.*', + 'uploader_id': 'kdla7pillole@iltrovatore.it', + 'timestamp': 1443814869, + 'upload_date': '20151002', }, - 'skip': 'Blocked in the US', - } + }, { + # 'src' is a dictionary + 'url': 'http://tg.la7.it/repliche-tgla7?id=189080', + 'md5': '6b0d8888d286e39870208dfeceaf456b', + 'info_dict': { + 'id': '189080', + 'ext': 'mp4', + 'title': 'TG LA7', + }, + }, { + 'url': 'http://www.la7.it/omnibus/rivedila7/omnibus-news-02-07-2016-189077', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - xml_url = 'http://www.la7.tv/repliche/content/index.php?contentId=%s' % video_id - doc = self._download_xml(xml_url, video_id) - - video_title = doc.find('title').text - description = doc.find('description').text - duration = parse_duration(doc.find('duration').text) - thumbnail = doc.find('img').text - view_count = int(doc.find('views').text) - prefix = doc.find('.//fqdn').text.strip().replace('auto:', 'http:') + webpage = self._download_webpage(url, video_id) - formats = [{ - 'format': vnode.find('quality').text, - 'tbr': int(vnode.find('quality').text), - 'url': vnode.find('fms').text.strip().replace('mp4:', prefix), - } for vnode in doc.findall('.//videos/video')] - self._sort_formats(formats) + player_data = self._parse_json( + self._search_regex(r'videoLa7\(({[^;]+})\);', webpage, 'player data'), + video_id, transform_source=js_to_json) return { + '_type': 'url_transparent', + 'url': smuggle_url('kaltura:103:%s' % player_data['vid'], { + 'service_url': 'http://kdam.iltrovatore.it', + }), 'id': video_id, - 'title': video_title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - 'view_count': view_count, + 'title': player_data['title'], + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': player_data.get('poster'), + 'ie_key': 
'Kaltura', } diff --git a/youtube_dl/extractor/lcp.py b/youtube_dl/extractor/lcp.py new file mode 100644 index 0000000..ade27a9 --- /dev/null +++ b/youtube_dl/extractor/lcp.py @@ -0,0 +1,90 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .arkena import ArkenaIE + + +class LcpPlayIE(ArkenaIE): + _VALID_URL = r'https?://play\.lcp\.fr/embed/(?P<id>[^/]+)/(?P<account_id>[^/]+)/[^/]+/[^/]+' + _TESTS = [{ + 'url': 'http://play.lcp.fr/embed/327336/131064/darkmatter/0', + 'md5': 'b8bd9298542929c06c1c15788b1f277a', + 'info_dict': { + 'id': '327336', + 'ext': 'mp4', + 'title': '327336', + 'timestamp': 1456391602, + 'upload_date': '20160225', + }, + 'params': { + 'skip_download': True, + }, + }] + + +class LcpIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?lcp\.fr/(?:[^/]+/)*(?P<id>[^/]+)' + + _TESTS = [{ + # arkena embed + 'url': 'http://www.lcp.fr/la-politique-en-video/schwartzenberg-prg-preconise-francois-hollande-de-participer-une-primaire', + 'md5': 'b8bd9298542929c06c1c15788b1f277a', + 'info_dict': { + 'id': 'd56d03e9', + 'ext': 'mp4', + 'title': 'Schwartzenberg (PRG) préconise à François Hollande de participer à une primaire à gauche', + 'description': 'md5:96ad55009548da9dea19f4120c6c16a8', + 'timestamp': 1456488895, + 'upload_date': '20160226', + }, + 'params': { + 'skip_download': True, + }, + }, { + # dailymotion live stream + 'url': 'http://www.lcp.fr/le-direct', + 'info_dict': { + 'id': 'xji3qy', + 'ext': 'mp4', + 'title': 'La Chaine Parlementaire (LCP), Live TNT', + 'description': 'md5:5c69593f2de0f38bd9a949f2c95e870b', + 'uploader': 'LCP', + 'uploader_id': 'xbz33d', + 'timestamp': 1308923058, + 'upload_date': '20110624', + }, + 'params': { + # m3u8 live stream + 'skip_download': True, + }, + }, { + 'url': 'http://www.lcp.fr/emissions/277792-les-volontaires', + 'only_matching': True, + }] + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, 
display_id) + + play_url = self._search_regex( + r'<iframe[^>]+src=(["\'])(?P<url>%s?(?:(?!\1).)*)\1' % LcpPlayIE._VALID_URL, + webpage, 'play iframe', default=None, group='url') + + if not play_url: + return self.url_result(url, 'Generic') + + title = self._og_search_title(webpage, default=None) or self._html_search_meta( + 'twitter:title', webpage, fatal=True) + description = self._html_search_meta( + ('description', 'twitter:description'), webpage) + + return { + '_type': 'url_transparent', + 'ie_key': LcpPlayIE.ie_key(), + 'url': play_url, + 'display_id': display_id, + 'title': title, + 'description': description, + } diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py index 63f581c..e9cc9aa 100644 --- a/youtube_dl/extractor/leeco.py +++ b/youtube_dl/extractor/leeco.py @@ -20,9 +20,10 @@ from ..utils import ( int_or_none, orderedSet, parse_iso8601, - sanitized_Request, str_or_none, url_basename, + urshift, + update_url_query, ) @@ -74,15 +75,11 @@ class LeIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def urshift(val, n): - return val >> n if val >= 0 else (val + 0x100000000) >> n - # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf def ror(self, param1, param2): _loc3_ = 0 while _loc3_ < param2: - param1 = self.urshift(param1, 1) + ((param1 & 1) << 31) + param1 = urshift(param1, 1) + ((param1 & 1) << 31) _loc3_ += 1 return param1 @@ -93,6 +90,10 @@ class LeIE(InfoExtractor): _loc3_ = self.ror(_loc3_, _loc2_ % 17) return _loc3_ + # reversed from http://jstatic.letvcdn.com/sdk/player.js + def get_mms_key(self, time): + return self.ror(time, 8) ^ 185025305 + # see M3U8Encryption class in KLetvPlayer.swf @staticmethod def decrypt_m3u8(encrypted_data): @@ -113,28 +114,7 @@ class LeIE(InfoExtractor): return bytes(_loc7_) - def _real_extract(self, url): - media_id = self._match_id(url) - page = self._download_webpage(url, media_id) - params = { - 'id': media_id, - 'platid': 1, - 'splatid': 101, 
- 'format': 1, - 'tkey': self.calc_time_key(int(time.time())), - 'domain': 'www.le.com' - } - play_json_req = sanitized_Request( - 'http://api.le.com/mms/out/video/playJson?' + compat_urllib_parse_urlencode(params) - ) - cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') - if cn_verification_proxy: - play_json_req.add_header('Ytdl-request-proxy', cn_verification_proxy) - - play_json = self._download_json( - play_json_req, - media_id, 'Downloading playJson data') - + def _check_errors(self, play_json): # Check for errors playstatus = play_json['playstatus'] if playstatus['status'] == 0: @@ -145,43 +125,99 @@ class LeIE(InfoExtractor): msg = 'Generic error. flag = %d' % flag raise ExtractorError(msg, expected=True) - playurl = play_json['playurl'] - - formats = ['350', '1000', '1300', '720p', '1080p'] - dispatch = playurl['dispatch'] + def _real_extract(self, url): + media_id = self._match_id(url) + page = self._download_webpage(url, media_id) - urls = [] - for format_id in formats: - if format_id in dispatch: - media_url = playurl['domain'][0] + dispatch[format_id][0] - media_url += '&' + compat_urllib_parse_urlencode({ - 'm3v': 1, + play_json_h5 = self._download_json( + 'http://api.le.com/mms/out/video/playJsonH5', + media_id, 'Downloading html5 playJson data', query={ + 'id': media_id, + 'platid': 3, + 'splatid': 304, + 'format': 1, + 'tkey': self.get_mms_key(int(time.time())), + 'domain': 'www.le.com', + 'tss': 'no', + }, + headers=self.geo_verification_headers()) + self._check_errors(play_json_h5) + + play_json_flash = self._download_json( + 'http://api.le.com/mms/out/video/playJson', + media_id, 'Downloading flash playJson data', query={ + 'id': media_id, + 'platid': 1, + 'splatid': 101, + 'format': 1, + 'tkey': self.calc_time_key(int(time.time())), + 'domain': 'www.le.com', + }, + headers=self.geo_verification_headers()) + self._check_errors(play_json_flash) + + def get_h5_urls(media_url, format_id): + location = self._download_json( 
+ media_url, media_id, + 'Download JSON metadata for format %s' % format_id, query={ 'format': 1, 'expect': 3, - 'rateid': format_id, - }) + 'tss': 'no', + })['location'] + + return { + 'http': update_url_query(location, {'tss': 'no'}), + 'hls': update_url_query(location, {'tss': 'ios'}), + } - nodes_data = self._download_json( - media_url, media_id, - 'Download JSON metadata for format %s' % format_id) + def get_flash_urls(media_url, format_id): + media_url += '&' + compat_urllib_parse_urlencode({ + 'm3v': 1, + 'format': 1, + 'expect': 3, + 'rateid': format_id, + }) - req = self._request_webpage( - nodes_data['nodelist'][0]['location'], media_id, - note='Downloading m3u8 information for format %s' % format_id) + nodes_data = self._download_json( + media_url, media_id, + 'Download JSON metadata for format %s' % format_id) - m3u8_data = self.decrypt_m3u8(req.read()) + req = self._request_webpage( + nodes_data['nodelist'][0]['location'], media_id, + note='Downloading m3u8 information for format %s' % format_id) - url_info_dict = { - 'url': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'), - 'ext': determine_ext(dispatch[format_id][1]), - 'format_id': format_id, - 'protocol': 'm3u8', - } + m3u8_data = self.decrypt_m3u8(req.read()) - if format_id[-1:] == 'p': - url_info_dict['height'] = int_or_none(format_id[:-1]) + return { + 'hls': encode_data_uri(m3u8_data, 'application/vnd.apple.mpegurl'), + } - urls.append(url_info_dict) + extracted_formats = [] + formats = [] + for play_json, get_urls in ((play_json_h5, get_h5_urls), (play_json_flash, get_flash_urls)): + playurl = play_json['playurl'] + play_domain = playurl['domain'][0] + + for format_id, format_data in playurl.get('dispatch', []).items(): + if format_id in extracted_formats: + continue + extracted_formats.append(format_id) + + media_url = play_domain + format_data[0] + for protocol, format_url in get_urls(media_url, format_id).items(): + f = { + 'url': format_url, + 'ext': 
determine_ext(format_data[1]), + 'format_id': '%s-%s' % (protocol, format_id), + 'protocol': 'm3u8_native' if protocol == 'hls' else 'http', + 'quality': int_or_none(format_id), + } + + if format_id[-1:] == 'p': + f['height'] = int_or_none(format_id[:-1]) + + formats.append(f) + self._sort_formats(formats, ('height', 'quality', 'format_id')) publish_time = parse_iso8601(self._html_search_regex( r'发布时间 ([^<>]+) ', page, 'publish time', default=None), @@ -190,7 +226,7 @@ class LeIE(InfoExtractor): return { 'id': media_id, - 'formats': urls, + 'formats': formats, 'title': playurl['title'], 'thumbnail': playurl['pic'], 'description': description, diff --git a/youtube_dl/extractor/lifenews.py b/youtube_dl/extractor/lifenews.py index c2b4490..87120ec 100644 --- a/youtube_dl/extractor/lifenews.py +++ b/youtube_dl/extractor/lifenews.py @@ -4,7 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( determine_ext, ExtractorError, @@ -96,7 +99,7 @@ class LifeNewsIE(InfoExtractor): r'<video[^>]+><source[^>]+src=["\'](.+?)["\']', webpage) iframe_links = re.findall( - r'<iframe[^>]+src=["\']((?:https?:)?//embed\.life\.ru/embed/.+?)["\']', + r'<iframe[^>]+src=["\']((?:https?:)?//embed\.life\.ru/(?:embed|video)/.+?)["\']', webpage) if not video_urls and not iframe_links: @@ -164,9 +167,9 @@ class LifeNewsIE(InfoExtractor): class LifeEmbedIE(InfoExtractor): IE_NAME = 'life:embed' - _VALID_URL = r'https?://embed\.life\.ru/embed/(?P<id>[\da-f]{32})' + _VALID_URL = r'https?://embed\.life\.ru/(?:embed|video)/(?P<id>[\da-f]{32})' - _TEST = { + _TESTS = [{ 'url': 'http://embed.life.ru/embed/e50c2dec2867350528e2574c899b8291', 'md5': 'b889715c9e49cb1981281d0e5458fbbe', 'info_dict': { @@ -175,30 +178,57 @@ class LifeEmbedIE(InfoExtractor): 'title': 'e50c2dec2867350528e2574c899b8291', 'thumbnail': 're:http://.*\.jpg', } - } + }, { + # 
with 1080p + 'url': 'https://embed.life.ru/video/e50c2dec2867350528e2574c899b8291', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) + thumbnail = None formats = [] - for video_url in re.findall(r'"file"\s*:\s*"([^"]+)', webpage): - video_url = compat_urlparse.urljoin(url, video_url) - ext = determine_ext(video_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='m3u8')) - else: - formats.append({ - 'url': video_url, - 'format_id': ext, - 'preference': 1, - }) + + def extract_m3u8(manifest_url): + formats.extend(self._extract_m3u8_formats( + manifest_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='m3u8')) + + def extract_original(original_url): + formats.append({ + 'url': original_url, + 'format_id': determine_ext(original_url, None), + 'preference': 1, + }) + + playlist = self._parse_json( + self._search_regex( + r'options\s*=\s*({.+?});', webpage, 'options', default='{}'), + video_id).get('playlist', {}) + if playlist: + master = playlist.get('master') + if isinstance(master, compat_str) and determine_ext(master) == 'm3u8': + extract_m3u8(compat_urlparse.urljoin(url, master)) + original = playlist.get('original') + if isinstance(original, compat_str): + extract_original(original) + thumbnail = playlist.get('image') + + # Old rendition fallback + if not formats: + for video_url in re.findall(r'"file"\s*:\s*"([^"]+)', webpage): + video_url = compat_urlparse.urljoin(url, video_url) + if determine_ext(video_url) == 'm3u8': + extract_m3u8(video_url) + else: + extract_original(video_url) + self._sort_formats(formats) - thumbnail = self._search_regex( + thumbnail = thumbnail or self._search_regex( r'"image"\s*:\s*"([^"]+)', webpage, 'thumbnail', default=None) return { diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py index 5d2c3e2..a425baf 100644 
--- a/youtube_dl/extractor/limelight.py +++ b/youtube_dl/extractor/limelight.py @@ -37,11 +37,12 @@ class LimelightBaseIE(InfoExtractor): for stream in streams: stream_url = stream.get('url') - if not stream_url: + if not stream_url or stream.get('drmProtected'): continue - if '.f4m' in stream_url: + ext = determine_ext(stream_url) + if ext == 'f4m': formats.extend(self._extract_f4m_formats( - stream_url, video_id, fatal=False)) + stream_url, video_id, f4m_id='hds', fatal=False)) else: fmt = { 'url': stream_url, @@ -50,13 +51,19 @@ class LimelightBaseIE(InfoExtractor): 'fps': float_or_none(stream.get('videoFrameRate')), 'width': int_or_none(stream.get('videoWidthInPixels')), 'height': int_or_none(stream.get('videoHeightInPixels')), - 'ext': determine_ext(stream_url) + 'ext': ext, } - rtmp = re.search(r'^(?P<url>rtmpe?://[^/]+/(?P<app>.+))/(?P<playpath>mp4:.+)$', stream_url) + rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp4:.+)$', stream_url) if rtmp: format_id = 'rtmp' if stream.get('videoBitRate'): format_id += '-%d' % int_or_none(stream['videoBitRate']) + http_fmt = fmt.copy() + http_fmt.update({ + 'url': 'http://%s/%s' % (rtmp.group('host').replace('csl.', 'cpl.'), rtmp.group('playpath')[4:]), + 'format_id': format_id.replace('rtmp', 'http'), + }) + formats.append(http_fmt) fmt.update({ 'url': rtmp.group('url'), 'play_path': rtmp.group('playpath'), @@ -68,18 +75,23 @@ class LimelightBaseIE(InfoExtractor): for mobile_url in mobile_urls: media_url = mobile_url.get('mobileUrl') - if not media_url: - continue format_id = mobile_url.get('targetMediaPlatform') - if determine_ext(media_url) == 'm3u8': + if not media_url or format_id == 'Widevine': + continue + ext = determine_ext(media_url) + if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( media_url, video_id, 'mp4', 'm3u8_native', m3u8_id=format_id, fatal=False)) + elif ext == 'f4m': + formats.extend(self._extract_f4m_formats( + stream_url, video_id, f4m_id=format_id, 
fatal=False)) else: formats.append({ 'url': media_url, 'format_id': format_id, 'preference': -1, + 'ext': ext, }) self._sort_formats(formats) @@ -145,7 +157,7 @@ class LimelightMediaIE(LimelightBaseIE): 'url': 'http://link.videoplatform.limelight.com/media/?mediaId=3ffd040b522b4485b6d84effc750cd86', 'info_dict': { 'id': '3ffd040b522b4485b6d84effc750cd86', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'HaP and the HB Prince Trailer', 'description': 'md5:8005b944181778e313d95c1237ddb640', 'thumbnail': 're:^https?://.*\.jpeg$', @@ -154,27 +166,23 @@ class LimelightMediaIE(LimelightBaseIE): 'upload_date': '20090604', }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, }, }, { # video with subtitles 'url': 'limelight:media:a3e00274d4564ec4a9b29b9466432335', + 'md5': '2fa3bad9ac321e23860ca23bc2c69e3d', 'info_dict': { 'id': 'a3e00274d4564ec4a9b29b9466432335', - 'ext': 'flv', + 'ext': 'mp4', 'title': '3Play Media Overview Video', - 'description': '', 'thumbnail': 're:^https?://.*\.jpeg$', 'duration': 78.101, 'timestamp': 1338929955, 'upload_date': '20120605', 'subtitles': 'mincount:9', }, - 'params': { - # rtmp download - 'skip_download': True, - }, }, { 'url': 'https://assets.delvenetworks.com/player/loader.swf?mediaId=8018a574f08d416e95ceaccae4ba0452', 'only_matching': True, diff --git a/youtube_dl/extractor/lynda.py b/youtube_dl/extractor/lynda.py index 2d50400..a98c4c5 100644 --- a/youtube_dl/extractor/lynda.py +++ b/youtube_dl/extractor/lynda.py @@ -100,7 +100,7 @@ class LyndaIE(LyndaBaseIE): _TESTS = [{ 'url': 'http://www.lynda.com/Bootstrap-tutorials/Using-exercise-files/110885/114408-4.html', - 'md5': 'ecfc6862da89489161fb9cd5f5a6fac1', + # md5 is unstable 'info_dict': { 'id': '114408', 'ext': 'mp4', diff --git a/youtube_dl/extractor/m6.py b/youtube_dl/extractor/m6.py index d5945ad..39d2742 100644 --- a/youtube_dl/extractor/m6.py +++ b/youtube_dl/extractor/m6.py @@ -1,8 +1,6 @@ # encoding: utf-8 from __future__ import unicode_literals -import re - 
from .common import InfoExtractor @@ -23,34 +21,5 @@ class M6IE(InfoExtractor): } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - - rss = self._download_xml('http://ws.m6.fr/v1/video/info/m6/bonus/%s' % video_id, video_id, - 'Downloading video RSS') - - title = rss.find('./channel/item/title').text - description = rss.find('./channel/item/description').text - thumbnail = rss.find('./channel/item/visuel_clip_big').text - duration = int(rss.find('./channel/item/duration').text) - view_count = int(rss.find('./channel/item/nombre_vues').text) - - formats = [] - for format_id in ['lq', 'sd', 'hq', 'hd']: - video_url = rss.find('./channel/item/url_video_%s' % format_id) - if video_url is None: - continue - formats.append({ - 'url': video_url.text, - 'format_id': format_id, - }) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'view_count': view_count, - 'formats': formats, - } + video_id = self._match_id(url) + return self.url_result('6play:%s' % video_id, 'SixPlay', video_id) diff --git a/youtube_dl/extractor/meta.py b/youtube_dl/extractor/meta.py new file mode 100644 index 0000000..cdb46e1 --- /dev/null +++ b/youtube_dl/extractor/meta.py @@ -0,0 +1,73 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from .pladform import PladformIE +from ..utils import ( + unescapeHTML, + int_or_none, + ExtractorError, +) + + +class METAIE(InfoExtractor): + _VALID_URL = r'https?://video\.meta\.ua/(?:iframe/)?(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://video.meta.ua/5502115.video', + 'md5': '71b6f3ee274bef16f1ab410f7f56b476', + 'info_dict': { + 'id': '5502115', + 'ext': 'mp4', + 'title': 'Sony Xperia Z camera test [HQ]', + 'description': 'Xperia Z shoots video in FullHD HDR.', + 'uploader_id': 'nomobile', + 'uploader': 'CHЁZA.TV', + 'upload_date': '20130211', + }, + 'add_ie': ['Youtube'], + }, { + 
'url': 'http://video.meta.ua/iframe/5502115', + 'only_matching': True, + }, { + # pladform embed + 'url': 'http://video.meta.ua/7121015.video', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + st_html5 = self._search_regex( + r"st_html5\s*=\s*'#([^']+)'", webpage, 'uppod html5 st', default=None) + + if st_html5: + # uppod st decryption algorithm is reverse engineered from function un(s) at uppod.js + json_str = '' + for i in range(0, len(st_html5), 3): + json_str += '&#x0%s;' % st_html5[i:i + 3] + uppod_data = self._parse_json(unescapeHTML(json_str), video_id) + error = uppod_data.get('customnotfound') + if error: + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + video_url = uppod_data['file'] + info = { + 'id': video_id, + 'url': video_url, + 'title': uppod_data.get('comment') or self._og_search_title(webpage), + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': uppod_data.get('poster') or self._og_search_thumbnail(webpage), + 'duration': int_or_none(self._og_search_property( + 'video:duration', webpage, default=None)), + } + if 'youtube.com/' in video_url: + info.update({ + '_type': 'url_transparent', + 'ie_key': 'Youtube', + }) + return info + + pladform_url = PladformIE._extract_url(webpage) + if pladform_url: + return self.url_result(pladform_url) diff --git a/youtube_dl/extractor/metacafe.py b/youtube_dl/extractor/metacafe.py index b6f00cc..e6e7659 100644 --- a/youtube_dl/extractor/metacafe.py +++ b/youtube_dl/extractor/metacafe.py @@ -11,13 +11,14 @@ from ..utils import ( determine_ext, ExtractorError, int_or_none, - sanitized_Request, urlencode_postdata, + get_element_by_attribute, + mimetype2ext, ) class MetacafeIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?metacafe\.com/watch/([^/]+)/([^/]+)/.*' + _VALID_URL = 
r'https?://(?:www\.)?metacafe\.com/watch/(?P<video_id>[^/]+)/(?P<display_id>[^/?#]+)' _DISCLAIMER = 'http://www.metacafe.com/family_filter/' _FILTER_POST = 'http://www.metacafe.com/f/index.php?inputType=filter&controllerGroup=user' IE_NAME = 'metacafe' @@ -47,6 +48,7 @@ class MetacafeIE(InfoExtractor): 'uploader': 'ign', 'description': 'Sony released a massive FAQ on the PlayStation Blog detailing the PS4\'s capabilities and limitations.', }, + 'skip': 'Page is temporarily unavailable.', }, # AnyClip video { @@ -55,8 +57,8 @@ class MetacafeIE(InfoExtractor): 'id': 'an-dVVXnuY7Jh77J', 'ext': 'mp4', 'title': 'The Andromeda Strain (1971): Stop the Bomb Part 3', - 'uploader': 'anyclip', - 'description': 'md5:38c711dd98f5bb87acf973d573442e67', + 'uploader': 'AnyClip', + 'description': 'md5:cbef0460d31e3807f6feb4e7a5952e5b', }, }, # age-restricted video @@ -110,28 +112,25 @@ class MetacafeIE(InfoExtractor): def report_disclaimer(self): self.to_screen('Retrieving disclaimer') - def _real_initialize(self): + def _confirm_age(self): # Retrieve disclaimer self.report_disclaimer() self._download_webpage(self._DISCLAIMER, None, False, 'Unable to retrieve disclaimer') # Confirm age - disclaimer_form = { - 'filters': '0', - 'submit': "Continue - I'm over 18", - } - request = sanitized_Request(self._FILTER_POST, urlencode_postdata(disclaimer_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') self.report_age_confirmation() - self._download_webpage(request, None, False, 'Unable to confirm age') + self._download_webpage( + self._FILTER_POST, None, False, 'Unable to confirm age', + data=urlencode_postdata({ + 'filters': '0', + 'submit': "Continue - I'm over 18", + }), headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + }) def _real_extract(self, url): # Extract id and simplified title from URL - mobj = re.match(self._VALID_URL, url) - if mobj is None: - raise ExtractorError('Invalid URL: %s' % url) - - video_id = mobj.group(1) + video_id, 
display_id = re.match(self._VALID_URL, url).groups() # the video may come from an external site m_external = re.match('^(\w{2})-(.*)$', video_id) @@ -144,15 +143,24 @@ class MetacafeIE(InfoExtractor): if prefix == 'cb': return self.url_result('theplatform:%s' % ext_id, 'ThePlatform') - # Retrieve video webpage to extract further information - req = sanitized_Request('http://www.metacafe.com/watch/%s/' % video_id) + # self._confirm_age() # AnyClip videos require the flashversion cookie so that we get the link # to the mp4 file - mobj_an = re.match(r'^an-(.*?)$', video_id) - if mobj_an: - req.headers['Cookie'] = 'flashVersion=0;' - webpage = self._download_webpage(req, video_id) + headers = {} + if video_id.startswith('an-'): + headers['Cookie'] = 'flashVersion=0;' + + # Retrieve video webpage to extract further information + webpage = self._download_webpage(url, video_id, headers=headers) + + error = get_element_by_attribute( + 'class', 'notfound-page-title', webpage) + if error: + raise ExtractorError(error, expected=True) + + video_title = self._html_search_meta( + ['og:title', 'twitter:title'], webpage, 'title', default=None) or self._search_regex(r'<h1>(.*?)</h1>', webpage, 'title') # Extract URL, uploader and title from webpage self.report_extraction(video_id) @@ -216,20 +224,40 @@ class MetacafeIE(InfoExtractor): 'player_url': player_url, 'ext': play_path.partition(':')[0], }) + if video_url is None: + flashvars = self._parse_json(self._search_regex( + r'flashvars\s*=\s*({.*});', webpage, 'flashvars', + default=None), video_id, fatal=False) + if flashvars: + video_url = [] + for source in flashvars.get('sources'): + source_url = source.get('src') + if not source_url: + continue + ext = mimetype2ext(source.get('type')) or determine_ext(source_url) + if ext == 'm3u8': + video_url.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + else: + video_url.append({ + 'url': source_url, + 'ext': ext, + }) if 
video_url is None: raise ExtractorError('Unsupported video type') - video_title = self._html_search_regex( - r'(?im)<title>(.*) - Video</title>', webpage, 'title') - description = self._og_search_description(webpage) - thumbnail = self._og_search_thumbnail(webpage) + description = self._html_search_meta( + ['og:description', 'twitter:description', 'description'], + webpage, 'title', fatal=False) + thumbnail = self._html_search_meta( + ['og:image', 'twitter:image'], webpage, 'title', fatal=False) video_uploader = self._html_search_regex( r'submitter=(.*?);|googletag\.pubads\(\)\.setTargeting\("(?:channel|submiter)","([^"]+)"\);', webpage, 'uploader nickname', fatal=False) duration = int_or_none( - self._html_search_meta('video:duration', webpage)) - + self._html_search_meta('video:duration', webpage, default=None)) age_limit = ( 18 if re.search(r'(?:"contentRating":|"rating",)"restricted"', webpage) @@ -242,10 +270,11 @@ class MetacafeIE(InfoExtractor): 'url': video_url, 'ext': video_ext, }] - self._sort_formats(formats) + return { 'id': video_id, + 'display_id': display_id, 'description': description, 'uploader': video_uploader, 'title': video_title, diff --git a/youtube_dl/extractor/mgtv.py b/youtube_dl/extractor/mgtv.py index 9fbc74f..27bdff8 100644 --- a/youtube_dl/extractor/mgtv.py +++ b/youtube_dl/extractor/mgtv.py @@ -9,7 +9,7 @@ class MGTVIE(InfoExtractor): _VALID_URL = r'https?://www\.mgtv\.com/v/(?:[^/]+/)*(?P<id>\d+)\.html' IE_DESC = '芒果TV' - _TEST = { + _TESTS = [{ 'url': 'http://www.mgtv.com/v/1/290525/f/3116640.html', 'md5': '1bdadcf760a0b90946ca68ee9a2db41a', 'info_dict': { @@ -20,13 +20,18 @@ class MGTVIE(InfoExtractor): 'duration': 7461, 'thumbnail': 're:^https?://.*\.jpg$', }, - } + }, { + # no tbr extracted from stream_url + 'url': 'http://www.mgtv.com/v/1/1/f/3324755.html', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) api_data = self._download_json( 'http://v.api.mgtv.com/player/video', video_id, - 
query={'video_id': video_id})['data'] + query={'video_id': video_id}, + headers=self.geo_verification_headers())['data'] info = api_data['info'] formats = [] @@ -40,7 +45,8 @@ class MGTVIE(InfoExtractor): def extract_format(stream_url, format_id, idx, query={}): format_info = self._download_json( stream_url, video_id, - note='Download video info for format %s' % format_id or '#%d' % idx, query=query) + note='Download video info for format %s' % (format_id or '#%d' % idx), + query=query) return { 'format_id': format_id, 'url': format_info['info'], diff --git a/youtube_dl/extractor/miomio.py b/youtube_dl/extractor/miomio.py index 170ebd9..937ba0f 100644 --- a/youtube_dl/extractor/miomio.py +++ b/youtube_dl/extractor/miomio.py @@ -4,6 +4,7 @@ from __future__ import unicode_literals import random from .common import InfoExtractor +from ..compat import compat_urlparse from ..utils import ( xpath_text, int_or_none, @@ -18,13 +19,16 @@ class MioMioIE(InfoExtractor): _TESTS = [{ # "type=video" in flashvars 'url': 'http://www.miomio.tv/watch/cc88912/', - 'md5': '317a5f7f6b544ce8419b784ca8edae65', 'info_dict': { 'id': '88912', 'ext': 'flv', 'title': '【SKY】字幕 铠武昭和VS平成 假面骑士大战FEAT战队 魔星字幕组 字幕', 'duration': 5923, }, + 'params': { + # The server provides broken file + 'skip_download': True, + } }, { 'url': 'http://www.miomio.tv/watch/cc184024/', 'info_dict': { @@ -32,7 +36,7 @@ class MioMioIE(InfoExtractor): 'title': '《动漫同人插画绘制》', }, 'playlist_mincount': 86, - 'skip': 'This video takes time too long for retrieving the URL', + 'skip': 'Unable to load videos', }, { 'url': 'http://www.miomio.tv/watch/cc173113/', 'info_dict': { @@ -40,20 +44,23 @@ class MioMioIE(InfoExtractor): 'title': 'The New Macbook 2015 上手试玩与简评' }, 'playlist_mincount': 2, + 'skip': 'Unable to load videos', + }, { + # new 'h5' player + 'url': 'http://www.miomio.tv/watch/cc273295/', + 'md5': '', + 'info_dict': { + 'id': '273295', + 'ext': 'mp4', + 'title': 'アウト×デラックス 20160526', + }, + 'params': { + # intermittent 
HTTP 500 + 'skip_download': True, + }, }] - def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - title = self._html_search_meta( - 'description', webpage, 'title', fatal=True) - - mioplayer_path = self._search_regex( - r'src="(/mioplayer/[^"]+)"', webpage, 'ref_path') - - http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path} - + def _extract_mioplayer(self, webpage, video_id, title, http_headers): xml_config = self._search_regex( r'flashvars="type=(?:sina|video)&(.+?)&', webpage, 'xml config') @@ -92,10 +99,34 @@ class MioMioIE(InfoExtractor): 'http_headers': http_headers, }) + return entries + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = self._html_search_meta( + 'description', webpage, 'title', fatal=True) + + mioplayer_path = self._search_regex( + r'src="(/mioplayer(?:_h5)?/[^"]+)"', webpage, 'ref_path') + + if '_h5' in mioplayer_path: + player_url = compat_urlparse.urljoin(url, mioplayer_path) + player_webpage = self._download_webpage( + player_url, video_id, + note='Downloading player webpage', headers={'Referer': url}) + entries = self._parse_html5_media_entries(player_url, player_webpage) + http_headers = {'Referer': player_url} + else: + http_headers = {'Referer': 'http://www.miomio.tv%s' % mioplayer_path} + entries = self._extract_mioplayer(webpage, video_id, title, http_headers) + if len(entries) == 1: segment = entries[0] segment['id'] = video_id segment['title'] = title + segment['http_headers'] = http_headers return segment return { diff --git a/youtube_dl/extractor/mitele.py b/youtube_dl/extractor/mitele.py index 5a00cd3..cd169f3 100644 --- a/youtube_dl/extractor/mitele.py +++ b/youtube_dl/extractor/mitele.py @@ -12,12 +12,69 @@ from ..utils import ( get_element_by_attribute, int_or_none, remove_start, + extract_attributes, + determine_ext, ) -class MiTeleIE(InfoExtractor): +class 
MiTeleBaseIE(InfoExtractor): + def _get_player_info(self, url, webpage): + player_data = extract_attributes(self._search_regex( + r'(?s)(<ms-video-player.+?</ms-video-player>)', + webpage, 'ms video player')) + video_id = player_data['data-media-id'] + config_url = compat_urlparse.urljoin(url, player_data['data-config']) + config = self._download_json( + config_url, video_id, 'Downloading config JSON') + mmc_url = config['services']['mmc'] + + duration = None + formats = [] + for m_url in (mmc_url, mmc_url.replace('/flash.json', '/html5.json')): + mmc = self._download_json( + m_url, video_id, 'Downloading mmc JSON') + if not duration: + duration = int_or_none(mmc.get('duration')) + for location in mmc['locations']: + gat = self._proto_relative_url(location.get('gat'), 'http:') + bas = location.get('bas') + loc = location.get('loc') + ogn = location.get('ogn') + if None in (gat, bas, loc, ogn): + continue + token_data = { + 'bas': bas, + 'icd': loc, + 'ogn': ogn, + 'sta': '0', + } + media = self._download_json( + '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)), + video_id, 'Downloading %s JSON' % location['loc']) + file_ = media.get('file') + if not file_: + continue + ext = determine_ext(file_) + if ext == 'f4m': + formats.extend(self._extract_f4m_formats( + file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', + video_id, f4m_id='hds', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + file_, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + self._sort_formats(formats) + + return { + 'id': video_id, + 'formats': formats, + 'thumbnail': player_data.get('data-poster') or config.get('poster', {}).get('imageUrl'), + 'duration': duration, + } + + +class MiTeleIE(MiTeleBaseIE): IE_DESC = 'mitele.es' - _VALID_URL = r'https?://www\.mitele\.es/[^/]+/[^/]+/[^/]+/(?P<id>[^/]+)/' + _VALID_URL = r'https?://www\.mitele\.es/(?:[^/]+/){3}(?P<id>[^/]+)/' _TESTS = [{ 'url': 
'http://www.mitele.es/programas-tv/diario-de/la-redaccion/programa-144/', @@ -25,7 +82,7 @@ class MiTeleIE(InfoExtractor): 'info_dict': { 'id': '0NF1jJnxS1Wu3pHrmvFyw2', 'display_id': 'programa-144', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Tor, la web invisible', 'description': 'md5:3b6fce7eaa41b2d97358726378d9369f', 'series': 'Diario de', @@ -40,7 +97,7 @@ class MiTeleIE(InfoExtractor): 'info_dict': { 'id': 'eLZSwoEd1S3pVyUm8lc6F', 'display_id': 'programa-226', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Cuarto Milenio - Temporada 6 - Programa 226', 'description': 'md5:50daf9fadefa4e62d9fc866d0c015701', 'series': 'Cuarto Milenio', @@ -59,40 +116,7 @@ class MiTeleIE(InfoExtractor): webpage = self._download_webpage(url, display_id) - config_url = self._search_regex( - r'data-config\s*=\s*"([^"]+)"', webpage, 'data config url') - config_url = compat_urlparse.urljoin(url, config_url) - - config = self._download_json( - config_url, display_id, 'Downloading config JSON') - - mmc = self._download_json( - config['services']['mmc'], display_id, 'Downloading mmc JSON') - - formats = [] - for location in mmc['locations']: - gat = self._proto_relative_url(location.get('gat'), 'http:') - bas = location.get('bas') - loc = location.get('loc') - ogn = location.get('ogn') - if None in (gat, bas, loc, ogn): - continue - token_data = { - 'bas': bas, - 'icd': loc, - 'ogn': ogn, - 'sta': '0', - } - media = self._download_json( - '%s/?%s' % (gat, compat_urllib_parse_urlencode(token_data)), - display_id, 'Downloading %s JSON' % location['loc']) - file_ = media.get('file') - if not file_: - continue - formats.extend(self._extract_f4m_formats( - file_ + '&hdcore=3.2.0&plugin=aasp-3.2.0.77.18', - display_id, f4m_id=loc)) - self._sort_formats(formats) + info = self._get_player_info(url, webpage) title = self._search_regex( r'class="Destacado-text"[^>]*>\s*<strong>([^<]+)</strong>', @@ -112,21 +136,12 @@ class MiTeleIE(InfoExtractor): title = remove_start(self._search_regex( 
r'<title>([^<]+)</title>', webpage, 'title'), 'Ver online ') - video_id = self._search_regex( - r'data-media-id\s*=\s*"([^"]+)"', webpage, - 'data media id', default=None) or display_id - thumbnail = config.get('poster', {}).get('imageUrl') - duration = int_or_none(mmc.get('duration')) - - return { - 'id': video_id, + info.update({ 'display_id': display_id, 'title': title, 'description': get_element_by_attribute('class', 'text', webpage), 'series': series, 'season': season, 'episode': episode, - 'thumbnail': thumbnail, - 'duration': duration, - 'formats': formats, - } + }) + return info diff --git a/youtube_dl/extractor/mofosex.py b/youtube_dl/extractor/mofosex.py index e47c801..e3bbe5a 100644 --- a/youtube_dl/extractor/mofosex.py +++ b/youtube_dl/extractor/mofosex.py @@ -1,53 +1,56 @@ from __future__ import unicode_literals -import os -import re - -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_urlparse, +from ..utils import ( + int_or_none, + str_to_int, + unified_strdate, ) -from ..utils import sanitized_Request +from .keezmovies import KeezMoviesIE -class MofosexIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?P<url>mofosex\.com/videos/(?P<id>[0-9]+)/.*?\.html)' - _TEST = { - 'url': 'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html', - 'md5': '1b2eb47ac33cc75d4a80e3026b613c5a', +class MofosexIE(KeezMoviesIE): + _VALID_URL = r'https?://(?:www\.)?mofosex\.com/videos/(?P<id>\d+)/(?P<display_id>[^/?#&.]+)\.html' + _TESTS = [{ + 'url': 'http://www.mofosex.com/videos/318131/amateur-teen-playing-and-masturbating-318131.html', + 'md5': '39a15853632b7b2e5679f92f69b78e91', 'info_dict': { - 'id': '5018', + 'id': '318131', + 'display_id': 'amateur-teen-playing-and-masturbating-318131', 'ext': 'mp4', - 'title': 'Japanese Teen Music Video', + 'title': 'amateur teen playing and masturbating', + 'thumbnail': 're:^https?://.*\.jpg$', + 'upload_date': '20121114', + 'view_count': int, + 
'like_count': int, + 'dislike_count': int, 'age_limit': 18, } - } + }, { + # This video is no longer available + 'url': 'http://www.mofosex.com/videos/5018/japanese-teen-music-video.html', + 'only_matching': True, + }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - url = 'http://www.' + mobj.group('url') - - req = sanitized_Request(url) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, video_id) - - video_title = self._html_search_regex(r'<h1>(.+?)<', webpage, 'title') - video_url = compat_urllib_parse_unquote(self._html_search_regex(r'flashvars.video_url = \'([^\']+)', webpage, 'video_url')) - path = compat_urllib_parse_urlparse(video_url).path - extension = os.path.splitext(path)[1][1:] - format = path.split('/')[5].split('_')[:2] - format = '-'.join(format) - - age_limit = self._rta_search(webpage) - - return { - 'id': video_id, - 'title': video_title, - 'url': video_url, - 'ext': extension, - 'format': format, - 'format_id': format, - 'age_limit': age_limit, - } + webpage, info = self._extract_info(url) + + view_count = str_to_int(self._search_regex( + r'VIEWS:</span>\s*([\d,.]+)', webpage, 'view count', fatal=False)) + like_count = int_or_none(self._search_regex( + r'id=["\']amountLikes["\'][^>]*>(\d+)', webpage, + 'like count', fatal=False)) + dislike_count = int_or_none(self._search_regex( + r'id=["\']amountDislikes["\'][^>]*>(\d+)', webpage, + 'like count', fatal=False)) + upload_date = unified_strdate(self._html_search_regex( + r'Added:</span>([^<]+)', webpage, 'upload date', fatal=False)) + + info.update({ + 'view_count': view_count, + 'like_count': like_count, + 'dislike_count': dislike_count, + 'upload_date': upload_date, + 'thumbnail': self._og_search_thumbnail(webpage), + }) + + return info diff --git a/youtube_dl/extractor/msn.py b/youtube_dl/extractor/msn.py new file mode 100644 index 0000000..1ec8e0f --- /dev/null +++ b/youtube_dl/extractor/msn.py @@ -0,0 
+1,122 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, + unescapeHTML, +) + + +class MSNIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?msn\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/[a-z]{2}-(?P<id>[\da-zA-Z]+)' + _TESTS = [{ + 'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/criminal-minds-shemar-moore-shares-a-touching-goodbye-message/vp-BBqQYNE', + 'md5': '8442f66c116cbab1ff7098f986983458', + 'info_dict': { + 'id': 'BBqQYNE', + 'display_id': 'criminal-minds-shemar-moore-shares-a-touching-goodbye-message', + 'ext': 'mp4', + 'title': 'Criminal Minds - Shemar Moore Shares A Touching Goodbye Message', + 'description': 'md5:e8e89b897b222eb33a6b5067a8f1bc25', + 'duration': 104, + 'uploader': 'CBS Entertainment', + 'uploader_id': 'IT0X5aoJ6bJgYerJXSDCgFmYPB1__54v', + }, + }, { + 'url': 'http://www.msn.com/en-ae/news/offbeat/meet-the-nine-year-old-self-made-millionaire/ar-BBt6ZKf', + 'only_matching': True, + }, { + 'url': 'http://www.msn.com/en-ae/video/watch/obama-a-lot-of-people-will-be-disappointed/vi-AAhxUMH', + 'only_matching': True, + }, { + # geo restricted + 'url': 'http://www.msn.com/en-ae/foodanddrink/joinourtable/the-first-fart-makes-you-laugh-the-last-fart-makes-you-cry/vp-AAhzIBU', + 'only_matching': True, + }, { + 'url': 'http://www.msn.com/en-ae/entertainment/bollywood/watch-how-salman-khan-reacted-when-asked-if-he-would-apologize-for-his-‘raped-woman’-comment/vi-AAhvzW6', + 'only_matching': True, + }] + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + video_id, display_id = mobj.group('id', 'display_id') + + webpage = self._download_webpage(url, display_id) + + video = self._parse_json( + self._search_regex( + r'data-metadata\s*=\s*(["\'])(?P<data>.+?)\1', + webpage, 'video data', default='{}', group='data'), + display_id, 
transform_source=unescapeHTML) + + if not video: + error = unescapeHTML(self._search_regex( + r'data-error=(["\'])(?P<error>.+?)\1', + webpage, 'error', group='error')) + raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True) + + title = video['title'] + + formats = [] + for file_ in video.get('videoFiles', []): + format_url = file_.get('url') + if not format_url: + continue + ext = determine_ext(format_url) + # .ism is not yet supported (see + # https://github.com/rg3/youtube-dl/issues/8118) + if ext == 'ism': + continue + if 'm3u8' in format_url: + # m3u8_native should not be used here until + # https://github.com/rg3/youtube-dl/issues/9913 is fixed + m3u8_formats = self._extract_m3u8_formats( + format_url, display_id, 'mp4', + m3u8_id='hls', fatal=False) + # Despite metadata in m3u8 all video+audio formats are + # actually video-only (no audio) + for f in m3u8_formats: + if f.get('acodec') != 'none' and f.get('vcodec') != 'none': + f['acodec'] = 'none' + formats.extend(m3u8_formats) + else: + formats.append({ + 'url': format_url, + 'ext': 'mp4', + 'format_id': 'http', + 'width': int_or_none(file_.get('width')), + 'height': int_or_none(file_.get('height')), + }) + self._sort_formats(formats) + + subtitles = {} + for file_ in video.get('files', []): + format_url = file_.get('url') + format_code = file_.get('formatCode') + if not format_url or not format_code: + continue + if compat_str(format_code) == '3100': + subtitles.setdefault(file_.get('culture', 'en'), []).append({ + 'ext': determine_ext(format_url, 'ttml'), + 'url': format_url, + }) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': video.get('headlineImage', {}).get('url'), + 'duration': int_or_none(video.get('durationSecs')), + 'uploader': video.get('sourceFriendly'), + 'uploader_id': video.get('providerId'), + 'creator': video.get('creator'), + 'subtitles': subtitles, + 'formats': formats, + } diff 
--git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py index dd06395..2f45568 100644 --- a/youtube_dl/extractor/mtv.py +++ b/youtube_dl/extractor/mtv.py @@ -15,6 +15,8 @@ from ..utils import ( float_or_none, HEADRequest, sanitized_Request, + strip_or_none, + timeconvert, unescapeHTML, url_basename, RegexNotFoundError, @@ -35,13 +37,13 @@ class MTVServicesInfoExtractor(InfoExtractor): return uri.split(':')[-1] # This was originally implemented for ComedyCentral, but it also works here - @staticmethod - def _transform_rtmp_url(rtmp_video_url): + @classmethod + def _transform_rtmp_url(cls, rtmp_video_url): m = re.match(r'^rtmpe?://.*?/(?P<finalid>gsp\..+?/.*)$', rtmp_video_url) if not m: - return rtmp_video_url + return {'rtmp': rtmp_video_url} base = 'http://viacommtvstrmfs.fplive.net/' - return base + m.group('finalid') + return {'http': base + m.group('finalid')} def _get_feed_url(self, uri): return self._FEED_URL @@ -85,14 +87,14 @@ class MTVServicesInfoExtractor(InfoExtractor): rtmp_video_url = rendition.find('./src').text if rtmp_video_url.endswith('siteunavail.png'): continue - new_url = self._transform_rtmp_url(rtmp_video_url) - formats.append({ + new_urls = self._transform_rtmp_url(rtmp_video_url) + formats.extend([{ 'ext': 'flv' if new_url.startswith('rtmp') else ext, 'url': new_url, - 'format_id': rendition.get('bitrate'), + 'format_id': '-'.join(filter(None, [kind, rendition.get('bitrate')])), 'width': int(rendition.get('width')), 'height': int(rendition.get('height')), - }) + } for kind, new_url in new_urls.items()]) except (KeyError, TypeError): raise ExtractorError('Invalid rendition field.') self._sort_formats(formats) @@ -133,7 +135,9 @@ class MTVServicesInfoExtractor(InfoExtractor): message += item.text raise ExtractorError(message, expected=True) - description = xpath_text(itemdoc, 'description') + description = strip_or_none(xpath_text(itemdoc, 'description')) + + timestamp = timeconvert(xpath_text(itemdoc, 'pubDate')) title_el = None if 
title_el is None: @@ -167,6 +171,7 @@ class MTVServicesInfoExtractor(InfoExtractor): 'thumbnail': self._get_thumbnail_url(uri, itemdoc), 'description': description, 'duration': float_or_none(content_el.attrib.get('duration')), + 'timestamp': timestamp, } def _get_feed_query(self, uri): @@ -185,8 +190,13 @@ class MTVServicesInfoExtractor(InfoExtractor): idoc = self._download_xml( url, video_id, 'Downloading info', transform_source=fix_xml_ampersands) + + title = xpath_text(idoc, './channel/title') + description = xpath_text(idoc, './channel/description') + return self.playlist_result( - [self._get_video_info(item) for item in idoc.findall('.//item')]) + [self._get_video_info(item) for item in idoc.findall('.//item')], + playlist_title=title, playlist_description=description) def _extract_mgid(self, webpage): try: @@ -232,6 +242,8 @@ class MTVServicesEmbeddedIE(MTVServicesInfoExtractor): 'ext': 'mp4', 'title': 'Peter Dinklage Sums Up \'Game Of Thrones\' In 45 Seconds', 'description': '"Sexy sexy sexy, stabby stabby stabby, beautiful language," says Peter Dinklage as he tries summarizing "Game of Thrones" in under a minute.', + 'timestamp': 1400126400, + 'upload_date': '20140515', }, } @@ -274,6 +286,8 @@ class MTVIE(MTVServicesInfoExtractor): 'ext': 'mp4', 'title': 'Taylor Swift - "Ours (VH1 Storytellers)"', 'description': 'Album: Taylor Swift performs "Ours" for VH1 Storytellers at Harvey Mudd College.', + 'timestamp': 1352610000, + 'upload_date': '20121111', }, }, ] @@ -300,20 +314,6 @@ class MTVIE(MTVServicesInfoExtractor): return self._get_videos_info(uri) -class MTVIggyIE(MTVServicesInfoExtractor): - IE_NAME = 'mtviggy.com' - _VALID_URL = r'https?://www\.mtviggy\.com/videos/.+' - _TEST = { - 'url': 'http://www.mtviggy.com/videos/arcade-fire-behind-the-scenes-at-the-biggest-music-experiment-yet/', - 'info_dict': { - 'id': '984696', - 'ext': 'mp4', - 'title': 'Arcade Fire: Behind the Scenes at the Biggest Music Experiment Yet', - } - } - _FEED_URL = 
'http://all.mtvworldverticals.com/feed-xml/' - - class MTVDEIE(MTVServicesInfoExtractor): IE_NAME = 'mtv.de' _VALID_URL = r'https?://(?:www\.)?mtv\.de/(?:artists|shows|news)/(?:[^/]+/)*(?P<id>\d+)-[^/#?]+/*(?:[#?].*)?$' @@ -321,7 +321,7 @@ class MTVDEIE(MTVServicesInfoExtractor): 'url': 'http://www.mtv.de/artists/10571-cro/videos/61131-traum', 'info_dict': { 'id': 'music_video-a50bc5f0b3aa4b3190aa', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'MusicVideo_cro-traum', 'description': 'Cro - Traum', }, @@ -329,20 +329,21 @@ class MTVDEIE(MTVServicesInfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'Blocked at Travis CI', }, { # mediagen URL without query (e.g. http://videos.mtvnn.com/mediagen/e865da714c166d18d6f80893195fcb97) 'url': 'http://www.mtv.de/shows/933-teen-mom-2/staffeln/5353/folgen/63565-enthullungen', 'info_dict': { 'id': 'local_playlist-f5ae778b9832cc837189', - 'ext': 'mp4', + 'ext': 'flv', 'title': 'Episode_teen-mom-2_shows_season-5_episode-1_full-episode_part1', }, 'params': { # rtmp download 'skip_download': True, }, + 'skip': 'Blocked at Travis CI', }, { - # single video in pagePlaylist with different id 'url': 'http://www.mtv.de/news/77491-mtv-movies-spotlight-pixels-teil-3', 'info_dict': { 'id': 'local_playlist-4e760566473c4c8c5344', @@ -354,6 +355,7 @@ class MTVDEIE(MTVServicesInfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'Das Video kann zur Zeit nicht abgespielt werden.', }] def _real_extract(self, url): @@ -366,11 +368,14 @@ class MTVDEIE(MTVServicesInfoExtractor): r'window\.pagePlaylist\s*=\s*(\[.+?\]);\n', webpage, 'page playlist'), video_id) + def _mrss_url(item): + return item['mrss'] + item.get('mrssvars', '') + # news pages contain single video in playlist with different id if len(playlist) == 1: - return self._get_videos_info_from_url(playlist[0]['mrss'], video_id) + return self._get_videos_info_from_url(_mrss_url(playlist[0]), video_id) for item in playlist: item_id = item.get('id') if item_id and 
compat_str(item_id) == video_id: - return self._get_videos_info_from_url(item['mrss'], video_id) + return self._get_videos_info_from_url(_mrss_url(item), video_id) diff --git a/youtube_dl/extractor/muenchentv.py b/youtube_dl/extractor/muenchentv.py index b4e8ad1..d9f1761 100644 --- a/youtube_dl/extractor/muenchentv.py +++ b/youtube_dl/extractor/muenchentv.py @@ -36,7 +36,7 @@ class MuenchenTVIE(InfoExtractor): title = self._live_title(self._og_search_title(webpage)) data_js = self._search_regex( - r'(?s)\nplaylist:\s*(\[.*?}\]),related:', + r'(?s)\nplaylist:\s*(\[.*?}\]),', webpage, 'playlist configuration') data_json = js_to_json(data_js) data = json.loads(data_json)[0] diff --git a/youtube_dl/extractor/nationalgeographic.py b/youtube_dl/extractor/nationalgeographic.py index 7225186..1dcf27a 100644 --- a/youtube_dl/extractor/nationalgeographic.py +++ b/youtube_dl/extractor/nationalgeographic.py @@ -1,15 +1,19 @@ from __future__ import unicode_literals +import re + from .common import InfoExtractor +from .adobepass import AdobePassIE from ..utils import ( smuggle_url, url_basename, update_url_query, + get_element_by_class, ) -class NationalGeographicIE(InfoExtractor): - IE_NAME = 'natgeo' +class NationalGeographicVideoIE(InfoExtractor): + IE_NAME = 'natgeo:video' _VALID_URL = r'https?://video\.nationalgeographic\.com/.*?' 
_TESTS = [ @@ -61,16 +65,16 @@ class NationalGeographicIE(InfoExtractor): } -class NationalGeographicChannelIE(InfoExtractor): - IE_NAME = 'natgeo:channel' - _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/videos/(?P<id>[^/?]+)' +class NationalGeographicIE(AdobePassIE): + IE_NAME = 'natgeo' + _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?[^/]+/(?:videos|episodes)/(?P<id>[^/?]+)' _TESTS = [ { 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/videos/uncovering-a-universal-knowledge/', 'md5': '518c9aa655686cf81493af5cc21e2a04', 'info_dict': { - 'id': 'nB5vIAfmyllm', + 'id': 'vKInpacll2pC', 'ext': 'mp4', 'title': 'Uncovering a Universal Knowledge', 'description': 'md5:1a89148475bf931b3661fcd6ddb2ae3a', @@ -84,7 +88,7 @@ class NationalGeographicChannelIE(InfoExtractor): 'url': 'http://channel.nationalgeographic.com/wild/destination-wild/videos/the-stunning-red-bird-of-paradise/', 'md5': 'c4912f656b4cbe58f3e000c489360989', 'info_dict': { - 'id': '3TmMv9OvGwIR', + 'id': 'Pok5lWCkiEFA', 'ext': 'mp4', 'title': 'The Stunning Red Bird of Paradise', 'description': 'md5:7bc8cd1da29686be4d17ad1230f0140c', @@ -94,6 +98,10 @@ class NationalGeographicChannelIE(InfoExtractor): }, 'add_ie': ['ThePlatform'], }, + { + 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/episodes/the-power-of-miracles/', + 'only_matching': True, + } ] def _real_extract(self, url): @@ -102,12 +110,59 @@ class NationalGeographicChannelIE(InfoExtractor): release_url = self._search_regex( r'video_auth_playlist_url\s*=\s*"([^"]+)"', webpage, 'release url') + query = { + 'mbr': 'true', + 'switch': 'http', + } + is_auth = self._search_regex(r'video_is_auth\s*=\s*"([^"]+)"', webpage, 'is auth', fatal=False) + if is_auth == 'auth': + auth_resource_id = self._search_regex( + r"video_auth_resourceId\s*=\s*'([^']+)'", + webpage, 'auth resource id') + query['auth'] = self._extract_mvpd_auth(url, display_id, 
'natgeo', auth_resource_id) return { '_type': 'url_transparent', 'ie_key': 'ThePlatform', 'url': smuggle_url( - update_url_query(release_url, {'mbr': 'true', 'switch': 'http'}), + update_url_query(release_url, query), {'force_smil_url': True}), 'display_id': display_id, } + + +class NationalGeographicEpisodeGuideIE(InfoExtractor): + IE_NAME = 'natgeo:episodeguide' + _VALID_URL = r'https?://channel\.nationalgeographic\.com/(?:wild/)?(?P<id>[^/]+)/episode-guide' + _TESTS = [ + { + 'url': 'http://channel.nationalgeographic.com/the-story-of-god-with-morgan-freeman/episode-guide/', + 'info_dict': { + 'id': 'the-story-of-god-with-morgan-freeman-season-1', + 'title': 'The Story of God with Morgan Freeman - Season 1', + }, + 'playlist_mincount': 6, + }, + { + 'url': 'http://channel.nationalgeographic.com/underworld-inc/episode-guide/?s=2', + 'info_dict': { + 'id': 'underworld-inc-season-2', + 'title': 'Underworld, Inc. - Season 2', + }, + 'playlist_mincount': 7, + }, + ] + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + show = get_element_by_class('show', webpage) + selected_season = self._search_regex( + r'<div[^>]+class="select-seasons[^"]*".*?<a[^>]*>(.*?)</a>', + webpage, 'selected season') + entries = [ + self.url_result(self._proto_relative_url(entry_url), 'NationalGeographic') + for entry_url in re.findall('(?s)<div[^>]+class="col-inner"[^>]*?>.*?<a[^>]+href="([^"]+)"', webpage)] + return self.playlist_result( + entries, '%s-%s' % (display_id, selected_season.lower().replace(' ', '-')), + '%s - %s' % (show, selected_season)) diff --git a/youtube_dl/extractor/naver.py b/youtube_dl/extractor/naver.py index 6d6f69b..0891d27 100644 --- a/youtube_dl/extractor/naver.py +++ b/youtube_dl/extractor/naver.py @@ -4,12 +4,10 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_urlencode, - compat_urlparse, -) from ..utils 
import ( ExtractorError, + int_or_none, + update_url_query, ) @@ -51,48 +49,74 @@ class NaverIE(InfoExtractor): if error: raise ExtractorError(error, expected=True) raise ExtractorError('couldn\'t extract vid and key') - vid = m_id.group(1) - key = m_id.group(2) - query = compat_urllib_parse_urlencode({'vid': vid, 'inKey': key, }) - query_urls = compat_urllib_parse_urlencode({ - 'masterVid': vid, - 'protocol': 'p2p', - 'inKey': key, - }) - info = self._download_xml( - 'http://serviceapi.rmcnmv.naver.com/flash/videoInfo.nhn?' + query, - video_id, 'Downloading video info') - urls = self._download_xml( - 'http://serviceapi.rmcnmv.naver.com/flash/playableEncodingOption.nhn?' + query_urls, - video_id, 'Downloading video formats info') - + video_data = self._download_json( + 'http://play.rmcnmv.naver.com/vod/play/v2.0/' + m_id.group(1), + video_id, query={ + 'key': m_id.group(2), + }) + meta = video_data['meta'] + title = meta['subject'] formats = [] - for format_el in urls.findall('EncodingOptions/EncodingOption'): - domain = format_el.find('Domain').text - uri = format_el.find('uri').text - f = { - 'url': compat_urlparse.urljoin(domain, uri), - 'ext': 'mp4', - 'width': int(format_el.find('width').text), - 'height': int(format_el.find('height').text), - } - if domain.startswith('rtmp'): - # urlparse does not support custom schemes - # https://bugs.python.org/issue18828 - f.update({ - 'url': domain + uri, - 'ext': 'flv', - 'rtmp_protocol': '1', # rtmpt + + def extract_formats(streams, stream_type, query={}): + for stream in streams: + stream_url = stream.get('source') + if not stream_url: + continue + stream_url = update_url_query(stream_url, query) + encoding_option = stream.get('encodingOption', {}) + bitrate = stream.get('bitrate', {}) + formats.append({ + 'format_id': '%s_%s' % (stream.get('type') or stream_type, encoding_option.get('id') or encoding_option.get('name')), + 'url': stream_url, + 'width': int_or_none(encoding_option.get('width')), + 'height': 
int_or_none(encoding_option.get('height')), + 'vbr': int_or_none(bitrate.get('video')), + 'abr': int_or_none(bitrate.get('audio')), + 'filesize': int_or_none(stream.get('size')), + 'protocol': 'm3u8_native' if stream_type == 'HLS' else None, }) - formats.append(f) + + extract_formats(video_data.get('videos', {}).get('list', []), 'H264') + for stream_set in video_data.get('streams', []): + query = {} + for param in stream_set.get('keys', []): + query[param['name']] = param['value'] + stream_type = stream_set.get('type') + videos = stream_set.get('videos') + if videos: + extract_formats(videos, stream_type, query) + elif stream_type == 'HLS': + stream_url = stream_set.get('source') + if not stream_url: + continue + formats.extend(self._extract_m3u8_formats( + update_url_query(stream_url, query), video_id, + 'mp4', 'm3u8_native', m3u8_id=stream_type, fatal=False)) self._sort_formats(formats) + subtitles = {} + for caption in video_data.get('captions', {}).get('list', []): + caption_url = caption.get('source') + if not caption_url: + continue + subtitles.setdefault(caption.get('language') or caption.get('locale'), []).append({ + 'url': caption_url, + }) + + upload_date = self._search_regex( + r'<span[^>]+class="date".*?(\d{4}\.\d{2}\.\d{2})', + webpage, 'upload date', fatal=False) + if upload_date: + upload_date = upload_date.replace('.', '') + return { 'id': video_id, - 'title': info.find('Subject').text, + 'title': title, 'formats': formats, + 'subtitles': subtitles, 'description': self._og_search_description(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'upload_date': info.find('WriteDate').text.replace('.', ''), - 'view_count': int(info.find('PlayCount').text), + 'thumbnail': meta.get('cover', {}).get('source') or self._og_search_thumbnail(webpage), + 'view_count': int_or_none(meta.get('count')), + 'upload_date': upload_date, } diff --git a/youtube_dl/extractor/nextmovie.py b/youtube_dl/extractor/nextmovie.py deleted file mode 100644 index 
9ccd7d7..0000000 --- a/youtube_dl/extractor/nextmovie.py +++ /dev/null @@ -1,30 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -from .mtv import MTVServicesInfoExtractor -from ..compat import compat_urllib_parse_urlencode - - -class NextMovieIE(MTVServicesInfoExtractor): - IE_NAME = 'nextmovie.com' - _VALID_URL = r'https?://(?:www\.)?nextmovie\.com/shows/[^/]+/\d{4}-\d{2}-\d{2}/(?P<id>[^/?#]+)' - _FEED_URL = 'http://lite.dextr.mtvi.com/service1/dispatch.htm' - _TESTS = [{ - 'url': 'http://www.nextmovie.com/shows/exclusives/2013-03-10/mgid:uma:videolist:nextmovie.com:1715019/', - 'md5': '09a9199f2f11f10107d04fcb153218aa', - 'info_dict': { - 'id': '961726', - 'ext': 'mp4', - 'title': 'The Muppets\' Gravity', - }, - }] - - def _get_feed_query(self, uri): - return compat_urllib_parse_urlencode({ - 'feed': '1505', - 'mgid': uri, - }) - - def _real_extract(self, url): - mgid = self._match_id(url) - return self._get_videos_info(mgid) diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py index e960137..9c54846 100644 --- a/youtube_dl/extractor/nick.py +++ b/youtube_dl/extractor/nick.py @@ -7,8 +7,9 @@ from ..utils import update_url_query class NickIE(MTVServicesInfoExtractor): + # None of videos on the website are still alive? 
IE_NAME = 'nick.com' - _VALID_URL = r'https?://(?:www\.)?nick\.com/videos/clip/(?P<id>[^/?#.]+)' + _VALID_URL = r'https?://(?:www\.)?nick(?:jr)?\.com/(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)' _FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm' _TESTS = [{ 'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html', @@ -52,6 +53,9 @@ class NickIE(MTVServicesInfoExtractor): } }, ], + }, { + 'url': 'http://www.nickjr.com/paw-patrol/videos/pups-save-a-goldrush-s3-ep302-full-episode/', + 'only_matching': True, }] def _get_feed_query(self, uri): diff --git a/youtube_dl/extractor/ninecninemedia.py b/youtube_dl/extractor/ninecninemedia.py new file mode 100644 index 0000000..d889245 --- /dev/null +++ b/youtube_dl/extractor/ninecninemedia.py @@ -0,0 +1,55 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + parse_iso8601, + parse_duration, + ExtractorError +) + + +class NineCNineMediaIE(InfoExtractor): + _VALID_URL = r'9c9media:(?P<destination_code>[^:]+):(?P<id>\d+)' + + def _real_extract(self, url): + destination_code, video_id = re.match(self._VALID_URL, url).groups() + api_base_url = 'http://capi.9c9media.com/destinations/%s/platforms/desktop/contents/%s/' % (destination_code, video_id) + content = self._download_json(api_base_url, video_id, query={ + '$include': '[contentpackages]', + }) + title = content['Name'] + if len(content['ContentPackages']) > 1: + raise ExtractorError('multiple content packages') + content_package = content['ContentPackages'][0] + stacks_base_url = api_base_url + 'contentpackages/%s/stacks/' % content_package['Id'] + stacks = self._download_json(stacks_base_url, video_id)['Items'] + if len(stacks) > 1: + raise ExtractorError('multiple stacks') + stack = stacks[0] + stack_base_url = '%s%s/manifest.' 
% (stacks_base_url, stack['Id']) + formats = [] + formats.extend(self._extract_m3u8_formats( + stack_base_url + 'm3u8', video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + stack_base_url + 'f4m', video_id, + f4m_id='hds', fatal=False)) + mp4_url = self._download_webpage(stack_base_url + 'pd', video_id, fatal=False) + if mp4_url: + formats.append({ + 'url': mp4_url, + 'format_id': 'mp4', + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': content.get('Desc') or content.get('ShortDesc'), + 'timestamp': parse_iso8601(content.get('BroadcastDateTime')), + 'duration': parse_duration(content.get('BroadcastTime')), + 'formats': formats, + } diff --git a/youtube_dl/extractor/ninenow.py b/youtube_dl/extractor/ninenow.py new file mode 100644 index 0000000..faa5772 --- /dev/null +++ b/youtube_dl/extractor/ninenow.py @@ -0,0 +1,72 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_str +from ..utils import ( + int_or_none, + float_or_none, + ExtractorError, +) + + +class NineNowIE(InfoExtractor): + IE_NAME = '9now.com.au' + _VALID_URL = r'https?://(?:www\.)?9now\.com\.au/(?:[^/]+/){2}(?P<id>[^/?#]+)' + _TESTS = [{ + # clip + 'url': 'https://www.9now.com.au/afl-footy-show/2016/clip-ciql02091000g0hp5oktrnytc', + 'md5': '17cf47d63ec9323e562c9957a968b565', + 'info_dict': { + 'id': '16801', + 'ext': 'mp4', + 'title': 'St. 
Kilda\'s Joey Montagna on the potential for a player\'s strike', + 'description': 'Is a boycott of the NAB Cup "on the table"?', + 'uploader_id': '4460760524001', + 'upload_date': '20160713', + 'timestamp': 1468421266, + }, + 'skip': 'Only available in Australia', + }, { + # episode + 'url': 'https://www.9now.com.au/afl-footy-show/2016/episode-19', + 'only_matching': True, + }, { + # DRM protected + 'url': 'https://www.9now.com.au/andrew-marrs-history-of-the-world/season-1/episode-1', + 'only_matching': True, + }] + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4460760524001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + page_data = self._parse_json(self._search_regex( + r'window\.__data\s*=\s*({.*?});', webpage, + 'page data'), display_id) + common_data = page_data.get('episode', {}).get('episode') or page_data.get('clip', {}).get('clip') + video_data = common_data['video'] + + if video_data.get('drm'): + raise ExtractorError('This video is DRM protected.', expected=True) + + brightcove_id = video_data.get('brightcoveId') or 'ref:' + video_data['referenceId'] + video_id = compat_str(video_data.get('id') or brightcove_id) + title = common_data['name'] + + thumbnails = [{ + 'id': thumbnail_id, + 'url': thumbnail_url, + 'width': int_or_none(thumbnail_id[1:]) + } for thumbnail_id, thumbnail_url in common_data.get('image', {}).get('sizes', {}).items()] + + return { + '_type': 'url_transparent', + 'url': self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, + 'id': video_id, + 'title': title, + 'description': common_data.get('description'), + 'duration': float_or_none(video_data.get('duration'), 1000), + 'thumbnails': thumbnails, + 'ie_key': 'BrightcoveNew', + } diff --git a/youtube_dl/extractor/nintendo.py b/youtube_dl/extractor/nintendo.py new file mode 100644 index 0000000..4b4e66b --- /dev/null +++ b/youtube_dl/extractor/nintendo.py @@ -0,0 +1,46 @@ 
+# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from .ooyala import OoyalaIE +from ..utils import unescapeHTML + + +class NintendoIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?nintendo\.com/games/detail/(?P<id>[^/?#&]+)' + _TESTS = [{ + 'url': 'http://www.nintendo.com/games/detail/yEiAzhU2eQI1KZ7wOHhngFoAHc1FpHwj', + 'info_dict': { + 'id': 'MzMmticjp0VPzO3CCj4rmFOuohEuEWoW', + 'ext': 'flv', + 'title': 'Duck Hunt Wii U VC NES - Trailer', + 'duration': 60.326, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['Ooyala'], + }, { + 'url': 'http://www.nintendo.com/games/detail/tokyo-mirage-sessions-fe-wii-u', + 'info_dict': { + 'id': 'tokyo-mirage-sessions-fe-wii-u', + 'title': 'Tokyo Mirage Sessions ♯FE', + }, + 'playlist_count': 3, + }] + + def _real_extract(self, url): + page_id = self._match_id(url) + + webpage = self._download_webpage(url, page_id) + + entries = [ + OoyalaIE._build_url_result(m.group('code')) + for m in re.finditer( + r'class=(["\'])embed-video\1[^>]+data-video-code=(["\'])(?P<code>(?:(?!\2).)+)\2', + webpage)] + + return self.playlist_result( + entries, page_id, unescapeHTML(self._og_search_title(webpage, fatal=False))) diff --git a/youtube_dl/extractor/ntvru.py b/youtube_dl/extractor/ntvru.py index 0895d7e..e8702eb 100644 --- a/youtube_dl/extractor/ntvru.py +++ b/youtube_dl/extractor/ntvru.py @@ -11,70 +11,64 @@ from ..utils import ( class NTVRuIE(InfoExtractor): IE_NAME = 'ntv.ru' - _VALID_URL = r'https?://(?:www\.)?ntv\.ru/(?P<id>.+)' + _VALID_URL = r'https?://(?:www\.)?ntv\.ru/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TESTS = [ - { - 'url': 'http://www.ntv.ru/novosti/863142/', - 'md5': 'ba7ea172a91cb83eb734cad18c10e723', - 'info_dict': { - 'id': '746000', - 'ext': 'mp4', - 'title': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины', - 'description': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины', - 'thumbnail': 
're:^http://.*\.jpg', - 'duration': 136, - }, + _TESTS = [{ + 'url': 'http://www.ntv.ru/novosti/863142/', + 'md5': 'ba7ea172a91cb83eb734cad18c10e723', + 'info_dict': { + 'id': '746000', + 'ext': 'mp4', + 'title': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины', + 'description': 'Командующий Черноморским флотом провел переговоры в штабе ВМС Украины', + 'thumbnail': 're:^http://.*\.jpg', + 'duration': 136, }, - { - 'url': 'http://www.ntv.ru/video/novosti/750370/', - 'md5': 'adecff79691b4d71e25220a191477124', - 'info_dict': { - 'id': '750370', - 'ext': 'mp4', - 'title': 'Родные пассажиров пропавшего Boeing не верят в трагический исход', - 'description': 'Родные пассажиров пропавшего Boeing не верят в трагический исход', - 'thumbnail': 're:^http://.*\.jpg', - 'duration': 172, - }, + }, { + 'url': 'http://www.ntv.ru/video/novosti/750370/', + 'md5': 'adecff79691b4d71e25220a191477124', + 'info_dict': { + 'id': '750370', + 'ext': 'mp4', + 'title': 'Родные пассажиров пропавшего Boeing не верят в трагический исход', + 'description': 'Родные пассажиров пропавшего Boeing не верят в трагический исход', + 'thumbnail': 're:^http://.*\.jpg', + 'duration': 172, }, - { - 'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416', - 'md5': '82dbd49b38e3af1d00df16acbeab260c', - 'info_dict': { - 'id': '747480', - 'ext': 'mp4', - 'title': '«Сегодня». 21 марта 2014 года. 16:00', - 'description': '«Сегодня». 21 марта 2014 года. 16:00', - 'thumbnail': 're:^http://.*\.jpg', - 'duration': 1496, - }, + }, { + 'url': 'http://www.ntv.ru/peredacha/segodnya/m23700/o232416', + 'md5': '82dbd49b38e3af1d00df16acbeab260c', + 'info_dict': { + 'id': '747480', + 'ext': 'mp4', + 'title': '«Сегодня». 21 марта 2014 года. 16:00', + 'description': '«Сегодня». 21 марта 2014 года. 
16:00', + 'thumbnail': 're:^http://.*\.jpg', + 'duration': 1496, }, - { - 'url': 'http://www.ntv.ru/kino/Koma_film', - 'md5': 'f825770930937aa7e5aca0dc0d29319a', - 'info_dict': { - 'id': '1007609', - 'ext': 'mp4', - 'title': 'Остросюжетный фильм «Кома»', - 'description': 'Остросюжетный фильм «Кома»', - 'thumbnail': 're:^http://.*\.jpg', - 'duration': 5592, - }, + }, { + 'url': 'http://www.ntv.ru/kino/Koma_film', + 'md5': 'f825770930937aa7e5aca0dc0d29319a', + 'info_dict': { + 'id': '1007609', + 'ext': 'mp4', + 'title': 'Остросюжетный фильм «Кома»', + 'description': 'Остросюжетный фильм «Кома»', + 'thumbnail': 're:^http://.*\.jpg', + 'duration': 5592, }, - { - 'url': 'http://www.ntv.ru/serial/Delo_vrachey/m31760/o233916/', - 'md5': '9320cd0e23f3ea59c330dc744e06ff3b', - 'info_dict': { - 'id': '751482', - 'ext': 'mp4', - 'title': '«Дело врачей»: «Деревце жизни»', - 'description': '«Дело врачей»: «Деревце жизни»', - 'thumbnail': 're:^http://.*\.jpg', - 'duration': 2590, - }, + }, { + 'url': 'http://www.ntv.ru/serial/Delo_vrachey/m31760/o233916/', + 'md5': '9320cd0e23f3ea59c330dc744e06ff3b', + 'info_dict': { + 'id': '751482', + 'ext': 'mp4', + 'title': '«Дело врачей»: «Деревце жизни»', + 'description': '«Дело врачей»: «Деревце жизни»', + 'thumbnail': 're:^http://.*\.jpg', + 'duration': 2590, }, - ] + }] _VIDEO_ID_REGEXES = [ r'<meta property="og:url" content="http://www\.ntv\.ru/video/(\d+)', @@ -87,11 +81,21 @@ class NTVRuIE(InfoExtractor): webpage = self._download_webpage(url, video_id) - video_id = self._html_search_regex(self._VIDEO_ID_REGEXES, webpage, 'video id') + video_url = self._og_search_property( + ('video', 'video:iframe'), webpage, default=None) + if video_url: + video_id = self._search_regex( + r'https?://(?:www\.)?ntv\.ru/video/(?:embed/)?(\d+)', + video_url, 'video id', default=None) + + if not video_id: + video_id = self._html_search_regex( + self._VIDEO_ID_REGEXES, webpage, 'video id') player = self._download_xml( 'http://www.ntv.ru/vi%s/' % video_id, 
video_id, 'Downloading video XML') + title = clean_html(xpath_text(player, './data/title', 'title', fatal=True)) description = clean_html(xpath_text(player, './data/description', 'description')) diff --git a/youtube_dl/extractor/odatv.py b/youtube_dl/extractor/odatv.py new file mode 100644 index 0000000..314527f --- /dev/null +++ b/youtube_dl/extractor/odatv.py @@ -0,0 +1,50 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + NO_DEFAULT, + remove_start +) + + +class OdaTVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?odatv\.com/(?:mob|vid)_video\.php\?.*\bid=(?P<id>[^&]+)' + _TESTS = [{ + 'url': 'http://odatv.com/vid_video.php?id=8E388', + 'md5': 'dc61d052f205c9bf2da3545691485154', + 'info_dict': { + 'id': '8E388', + 'ext': 'mp4', + 'title': 'Artık Davutoğlu ile devam edemeyiz' + } + }, { + # mobile URL + 'url': 'http://odatv.com/mob_video.php?id=8E388', + 'only_matching': True, + }, { + # no video + 'url': 'http://odatv.com/mob_video.php?id=8E900', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + no_video = 'NO VIDEO!' 
in webpage + + video_url = self._search_regex( + r'mp4\s*:\s*(["\'])(?P<url>http.+?)\1', webpage, 'video url', + default=None if no_video else NO_DEFAULT, group='url') + + if no_video: + raise ExtractorError('Video %s does not exist' % video_id, expected=True) + + return { + 'id': video_id, + 'url': video_url, + 'title': remove_start(self._og_search_title(webpage), 'Video: '), + 'thumbnail': self._og_search_thumbnail(webpage), + } diff --git a/youtube_dl/extractor/onet.py b/youtube_dl/extractor/onet.py new file mode 100644 index 0000000..fc22ad5 --- /dev/null +++ b/youtube_dl/extractor/onet.py @@ -0,0 +1,169 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + determine_ext, + ExtractorError, + float_or_none, + get_element_by_class, + int_or_none, + js_to_json, + parse_iso8601, + remove_start, + strip_or_none, + url_basename, +) + + +class OnetBaseIE(InfoExtractor): + def _search_mvp_id(self, webpage): + return self._search_regex( + r'id=(["\'])mvp:(?P<id>.+?)\1', webpage, 'mvp id', group='id') + + def _extract_from_id(self, video_id, webpage): + response = self._download_json( + 'http://qi.ckm.onetapi.pl/', video_id, + query={ + 'body[id]': video_id, + 'body[jsonrpc]': '2.0', + 'body[method]': 'get_asset_detail', + 'body[params][ID_Publikacji]': video_id, + 'body[params][Service]': 'www.onet.pl', + 'content-type': 'application/jsonp', + 'x-onet-app': 'player.front.onetapi.pl', + }) + + error = response.get('error') + if error: + raise ExtractorError( + '%s said: %s' % (self.IE_NAME, error['message']), expected=True) + + video = response['result'].get('0') + + formats = [] + for _, formats_dict in video['formats'].items(): + if not isinstance(formats_dict, dict): + continue + for format_id, format_list in formats_dict.items(): + if not isinstance(format_list, list): + continue + for f in format_list: + video_url = f.get('url') + if not video_url: + continue + ext = 
determine_ext(video_url) + if format_id == 'ism': + # TODO: Support Microsoft Smooth Streaming + continue + elif ext == 'mpd': + formats.extend(self._extract_mpd_formats( + video_url, video_id, mpd_id='dash', fatal=False)) + else: + formats.append({ + 'url': video_url, + 'format_id': format_id, + 'height': int_or_none(f.get('vertical_resolution')), + 'width': int_or_none(f.get('horizontal_resolution')), + 'abr': float_or_none(f.get('audio_bitrate')), + 'vbr': float_or_none(f.get('video_bitrate')), + }) + self._sort_formats(formats) + + meta = video.get('meta', {}) + + title = self._og_search_title(webpage, default=None) or meta['title'] + description = self._og_search_description(webpage, default=None) or meta.get('description') + duration = meta.get('length') or meta.get('lenght') + timestamp = parse_iso8601(meta.get('addDate'), ' ') + + return { + 'id': video_id, + 'title': title, + 'description': description, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, + } + + +class OnetIE(OnetBaseIE): + _VALID_URL = 'https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/(?P<display_id>[0-9a-z-]+)/(?P<id>[0-9a-z]+)' + IE_NAME = 'onet.tv' + + _TEST = { + 'url': 'http://onet.tv/k/openerfestival/open-er-festival-2016-najdziwniejsze-wymagania-gwiazd/qbpyqc', + 'md5': 'e3ffbf47590032ac3f27249204173d50', + 'info_dict': { + 'id': 'qbpyqc', + 'display_id': 'open-er-festival-2016-najdziwniejsze-wymagania-gwiazd', + 'ext': 'mp4', + 'title': 'Open\'er Festival 2016: najdziwniejsze wymagania gwiazd', + 'description': 'Trzy samochody, których nigdy nie użyto, prywatne spa, hotel dekorowany czarnym suknem czy nielegalne używki. 
Organizatorzy koncertów i festiwali muszą stawać przed nie lada wyzwaniem zapraszając gwia...', + 'upload_date': '20160705', + 'timestamp': 1467721580, + }, + } + + def _real_extract(self, url): + mobj = re.match(self._VALID_URL, url) + display_id, video_id = mobj.group('display_id', 'id') + + webpage = self._download_webpage(url, display_id) + + mvp_id = self._search_mvp_id(webpage) + + info_dict = self._extract_from_id(mvp_id, webpage) + info_dict.update({ + 'id': video_id, + 'display_id': display_id, + }) + + return info_dict + + +class OnetChannelIE(OnetBaseIE): + _VALID_URL = r'https?://(?:www\.)?onet\.tv/[a-z]/(?P<id>[a-z]+)(?:[?#]|$)' + IE_NAME = 'onet.tv:channel' + + _TEST = { + 'url': 'http://onet.tv/k/openerfestival', + 'info_dict': { + 'id': 'openerfestival', + 'title': 'Open\'er Festival Live', + 'description': 'Dziękujemy, że oglądaliście transmisje. Zobaczcie nasze relacje i wywiady z artystami.', + }, + 'playlist_mincount': 46, + } + + def _real_extract(self, url): + channel_id = self._match_id(url) + + webpage = self._download_webpage(url, channel_id) + + current_clip_info = self._parse_json(self._search_regex( + r'var\s+currentClip\s*=\s*({[^}]+})', webpage, 'video info'), channel_id, + transform_source=lambda s: js_to_json(re.sub(r'\'\s*\+\s*\'', '', s))) + video_id = remove_start(current_clip_info['ckmId'], 'mvp:') + video_name = url_basename(current_clip_info['url']) + + if self._downloader.params.get('noplaylist'): + self.to_screen( + 'Downloading just video %s because of --no-playlist' % video_name) + return self._extract_from_id(video_id, webpage) + + self.to_screen( + 'Downloading channel %s - add --no-playlist to just download video %s' % ( + channel_id, video_name)) + matches = re.findall( + r'<a[^>]+href=[\'"](https?://(?:www\.)?onet\.tv/[a-z]/[a-z]+/[0-9a-z-]+/[0-9a-z]+)', + webpage) + entries = [ + self.url_result(video_link, OnetIE.ie_key()) + for video_link in matches] + + channel_title = 
strip_or_none(get_element_by_class('o_channelName', webpage)) + channel_description = strip_or_none(get_element_by_class('o_channelDesc', webpage)) + return self.playlist_result(entries, channel_id, channel_title, channel_description) diff --git a/youtube_dl/extractor/onionstudios.py b/youtube_dl/extractor/onionstudios.py index d7b13a0..6fb1a3f 100644 --- a/youtube_dl/extractor/onionstudios.py +++ b/youtube_dl/extractor/onionstudios.py @@ -7,6 +7,8 @@ from .common import InfoExtractor from ..utils import ( determine_ext, int_or_none, + float_or_none, + mimetype2ext, ) @@ -15,15 +17,14 @@ class OnionStudiosIE(InfoExtractor): _TESTS = [{ 'url': 'http://www.onionstudios.com/videos/hannibal-charges-forward-stops-for-a-cocktail-2937', - 'md5': 'd4851405d31adfadf71cd7a487b765bb', + 'md5': 'e49f947c105b8a78a675a0ee1bddedfe', 'info_dict': { 'id': '2937', 'ext': 'mp4', 'title': 'Hannibal charges forward, stops for a cocktail', - 'description': 'md5:e786add7f280b7f0fe237b64cc73df76', 'thumbnail': 're:^https?://.*\.jpg$', 'uploader': 'The A.V. 
Club', - 'uploader_id': 'TheAVClub', + 'uploader_id': 'the-av-club', }, }, { 'url': 'http://www.onionstudios.com/embed?id=2855&autoplay=true', @@ -40,50 +41,38 @@ class OnionStudiosIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage( - 'http://www.onionstudios.com/embed?id=%s' % video_id, video_id) + video_data = self._download_json( + 'http://www.onionstudios.com/video/%s.json' % video_id, video_id) + + title = video_data['title'] formats = [] - for src in re.findall(r'<source[^>]+src="([^"]+)"', webpage): - ext = determine_ext(src) + for source in video_data.get('sources', []): + source_url = source.get('url') + if not source_url: + continue + ext = mimetype2ext(source.get('content_type')) or determine_ext(source_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - src, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) + source_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)) else: - height = int_or_none(self._search_regex( - r'/(\d+)\.%s' % ext, src, 'height', default=None)) + tbr = int_or_none(source.get('bitrate')) formats.append({ - 'format_id': ext + ('-%sp' % height if height else ''), - 'url': src, - 'height': height, + 'format_id': ext + ('-%d' % tbr if tbr else ''), + 'url': source_url, + 'width': int_or_none(source.get('width')), + 'tbr': tbr, 'ext': ext, - 'preference': 1, }) self._sort_formats(formats) - title = self._search_regex( - r'share_title\s*=\s*(["\'])(?P<title>[^\1]+?)\1', - webpage, 'title', group='title') - description = self._search_regex( - r'share_description\s*=\s*(["\'])(?P<description>[^\'"]+?)\1', - webpage, 'description', default=None, group='description') - thumbnail = self._search_regex( - r'poster\s*=\s*(["\'])(?P<thumbnail>[^\1]+?)\1', - webpage, 'thumbnail', default=False, group='thumbnail') - - uploader_id = self._search_regex( - r'twitter_handle\s*=\s*(["\'])(?P<uploader_id>[^\1]+?)\1', - webpage, 'uploader id', 
fatal=False, group='uploader_id') - uploader = self._search_regex( - r'window\.channelName\s*=\s*(["\'])Embedded:(?P<uploader>[^\1]+?)\1', - webpage, 'uploader', default=False, group='uploader') - return { 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'uploader_id': uploader_id, + 'thumbnail': video_data.get('poster_url'), + 'uploader': video_data.get('channel_name'), + 'uploader_id': video_data.get('channel_slug'), + 'duration': float_or_none(video_data.get('duration', 1000)), + 'tags': video_data.get('tags'), 'formats': formats, } diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py index 6415b8f..4e80ca9 100644 --- a/youtube_dl/extractor/openload.py +++ b/youtube_dl/extractor/openload.py @@ -1,15 +1,14 @@ # coding: utf-8 -from __future__ import unicode_literals +from __future__ import unicode_literals, division -import re +import math from .common import InfoExtractor from ..compat import compat_chr from ..utils import ( + decode_png, determine_ext, - encode_base_n, ExtractorError, - mimetype2ext, ) @@ -41,60 +40,6 @@ class OpenloadIE(InfoExtractor): 'only_matching': True, }] - @staticmethod - def openload_level2_debase(m): - radix, num = int(m.group(1)) + 27, int(m.group(2)) - return '"' + encode_base_n(num, radix) + '"' - - @classmethod - def openload_level2(cls, txt): - # The function name is ǃ \u01c3 - # Using escaped unicode literals does not work in Python 3.2 - return re.sub(r'ǃ\((\d+),(\d+)\)', cls.openload_level2_debase, txt, re.UNICODE).replace('"+"', '') - - # Openload uses a variant of aadecode - # openload_decode and related functions are originally written by - # vitas@matfyz.cz and released with public domain - # See https://github.com/rg3/youtube-dl/issues/8489 - @classmethod - def openload_decode(cls, txt): - symbol_table = [ - ('_', '(゚Д゚) [゚Θ゚]'), - ('a', '(゚Д゚) [゚ω゚ノ]'), - ('b', '(゚Д゚) [゚Θ゚ノ]'), - ('c', '(゚Д゚) [\'c\']'), - ('d', '(゚Д゚) [゚ー゚ノ]'), - 
('e', '(゚Д゚) [゚Д゚ノ]'), - ('f', '(゚Д゚) [1]'), - - ('o', '(゚Д゚) [\'o\']'), - ('u', '(o゚ー゚o)'), - ('c', '(゚Д゚) [\'c\']'), - - ('7', '((゚ー゚) + (o^_^o))'), - ('6', '((o^_^o) +(o^_^o) +(c^_^o))'), - ('5', '((゚ー゚) + (゚Θ゚))'), - ('4', '(-~3)'), - ('3', '(-~-~1)'), - ('2', '(-~1)'), - ('1', '(-~0)'), - ('0', '((c^_^o)-(c^_^o))'), - ] - delim = '(゚Д゚)[゚ε゚]+' - ret = '' - for aachar in txt.split(delim): - for val, pat in symbol_table: - aachar = aachar.replace(pat, val) - aachar = aachar.replace('+ ', '') - m = re.match(r'^\d+', aachar) - if m: - ret += compat_chr(int(m.group(0), 8)) - else: - m = re.match(r'^u([\da-f]+)', aachar) - if m: - ret += compat_chr(int(m.group(1), 16)) - return cls.openload_level2(ret) - def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage(url, video_id) @@ -102,29 +47,77 @@ class OpenloadIE(InfoExtractor): if 'File not found' in webpage: raise ExtractorError('File not found', expected=True) - code = self._search_regex( - r'</video>\s*</div>\s*<script[^>]+>[^>]+</script>\s*<script[^>]+>([^<]+)</script>', - webpage, 'JS code') - - decoded = self.openload_decode(code) - - video_url = self._search_regex( - r'return\s+"(https?://[^"]+)"', decoded, 'video URL') + # The following extraction logic is proposed by @Belderak and @gdkchan + # and declared to be used freely in youtube-dl + # See https://github.com/rg3/youtube-dl/issues/9706 + + numbers_js = self._download_webpage( + 'https://openload.co/assets/js/obfuscator/n.js', video_id, + note='Downloading signature numbers') + signums = self._search_regex( + r'window\.signatureNumbers\s*=\s*[\'"](?P<data>[a-z]+)[\'"]', + numbers_js, 'signature numbers', group='data') + + linkimg_uri = self._search_regex( + r'<img[^>]+id="linkimg"[^>]+src="([^"]+)"', webpage, 'link image') + linkimg = self._request_webpage( + linkimg_uri, video_id, note=False).read() + + width, height, pixels = decode_png(linkimg) + + output = '' + for y in range(height): + for x in range(width): + 
r, g, b = pixels[y][3 * x:3 * x + 3] + if r == 0 and g == 0 and b == 0: + break + else: + output += compat_chr(r) + output += compat_chr(g) + output += compat_chr(b) + + img_str_length = len(output) // 200 + img_str = [[0 for x in range(img_str_length)] for y in range(10)] + + sig_str_length = len(signums) // 260 + sig_str = [[0 for x in range(sig_str_length)] for y in range(10)] + + for i in range(10): + for j in range(img_str_length): + begin = i * img_str_length * 20 + j * 20 + img_str[i][j] = output[begin:begin + 20] + for j in range(sig_str_length): + begin = i * sig_str_length * 26 + j * 26 + sig_str[i][j] = signums[begin:begin + 26] + + parts = [] + # TODO: find better names for str_, chr_ and sum_ + str_ = '' + for i in [2, 3, 5, 7]: + str_ = '' + sum_ = float(99) + for j in range(len(sig_str[i])): + for chr_idx in range(len(img_str[i][j])): + if sum_ > float(122): + sum_ = float(98) + chr_ = compat_chr(int(math.floor(sum_))) + if sig_str[i][j][chr_idx] == chr_ and j >= len(str_): + sum_ += float(2.5) + str_ += img_str[i][j][chr_idx] + parts.append(str_.replace(',', '')) + + video_url = 'https://openload.co/stream/%s~%s~%s~%s' % (parts[3], parts[1], parts[2], parts[0]) title = self._og_search_title(webpage, default=None) or self._search_regex( r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage, 'title', default=None) or self._html_search_meta( 'description', webpage, 'title', fatal=True) - ext = mimetype2ext(self._search_regex( - r'window\.vt\s*=\s*(["\'])(?P<mimetype>.+?)\1', decoded, - 'mimetype', default=None, group='mimetype')) or determine_ext( - video_url, 'mp4') - return { 'id': video_id, 'title': title, - 'ext': ext, 'thumbnail': self._og_search_thumbnail(webpage, default=None), 'url': video_url, + # Seems all videos have extensions in their titles + 'ext': determine_ext(title), } diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py index 4e3864f..6ae3067 100644 --- a/youtube_dl/extractor/orf.py +++ 
b/youtube_dl/extractor/orf.py @@ -40,16 +40,16 @@ class ORFTVthekIE(InfoExtractor): 'skip': 'Blocked outside of Austria / Germany', }, { 'url': 'http://tvthek.orf.at/topic/Im-Wandel-der-Zeit/8002126/Best-of-Ingrid-Thurnher/7982256', - 'playlist': [{ - 'md5': '68f543909aea49d621dfc7703a11cfaf', - 'info_dict': { - 'id': '7982259', - 'ext': 'mp4', - 'title': 'Best of Ingrid Thurnher', - 'upload_date': '20140527', - 'description': 'Viele Jahre war Ingrid Thurnher das "Gesicht" der ZIB 2. Vor ihrem Wechsel zur ZIB 2 im jahr 1995 moderierte sie unter anderem "Land und Leute", "Österreich-Bild" und "Niederösterreich heute".', - } - }], + 'info_dict': { + 'id': '7982259', + 'ext': 'mp4', + 'title': 'Best of Ingrid Thurnher', + 'upload_date': '20140527', + 'description': 'Viele Jahre war Ingrid Thurnher das "Gesicht" der ZIB 2. Vor ihrem Wechsel zur ZIB 2 im Jahr 1995 moderierte sie unter anderem "Land und Leute", "Österreich-Bild" und "Niederösterreich heute".', + }, + 'params': { + 'skip_download': True, # rtsp downloads + }, '_skip': 'Blocked outside of Austria / Germany', }] @@ -137,13 +137,16 @@ class ORFTVthekIE(InfoExtractor): class ORFOE1IE(InfoExtractor): IE_NAME = 'orf:oe1' IE_DESC = 'Radio Österreich 1' - _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole.*?#\?track_id=)(?P<id>[0-9]+)' + _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole\?.*?\btrack_id=)(?P<id>[0-9]+)' # Audios on ORF radio are only available for 7 days, so we can't add tests. 
- _TEST = { + _TESTS = [{ 'url': 'http://oe1.orf.at/konsole?show=on_demand#?track_id=394211', 'only_matching': True, - } + }, { + 'url': 'http://oe1.orf.at/konsole?show=ondemand&track_id=443608&load_day=/programm/konsole/tag/20160726', + 'only_matching': True, + }] def _real_extract(self, url): show_id = self._match_id(url) diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py index 81918ac..b490ef7 100644 --- a/youtube_dl/extractor/pbs.py +++ b/youtube_dl/extractor/pbs.py @@ -4,13 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_HTTPError from ..utils import ( ExtractorError, determine_ext, int_or_none, js_to_json, strip_jsonp, + strip_or_none, unified_strdate, US_RATINGS, ) @@ -201,7 +201,7 @@ class PBSIE(InfoExtractor): 'id': '2365006249', 'ext': 'mp4', 'title': 'Constitution USA with Peter Sagal - A More Perfect Union', - 'description': 'md5:36f341ae62e251b8f5bd2b754b95a071', + 'description': 'md5:31b664af3c65fd07fa460d306b837d00', 'duration': 3190, }, }, @@ -212,7 +212,7 @@ class PBSIE(InfoExtractor): 'id': '2365297690', 'ext': 'mp4', 'title': 'FRONTLINE - Losing Iraq', - 'description': 'md5:4d3eaa01f94e61b3e73704735f1196d9', + 'description': 'md5:5979a4d069b157f622d02bff62fbe654', 'duration': 5050, }, }, @@ -223,7 +223,7 @@ class PBSIE(InfoExtractor): 'id': '2201174722', 'ext': 'mp4', 'title': 'PBS NewsHour - Cyber Schools Gain Popularity, but Quality Questions Persist', - 'description': 'md5:95a19f568689d09a166dff9edada3301', + 'description': 'md5:86ab9a3d04458b876147b355788b8781', 'duration': 801, }, }, @@ -268,7 +268,7 @@ class PBSIE(InfoExtractor): 'display_id': 'player', 'ext': 'mp4', 'title': 'American Experience - Death and the Civil War, Chapter 1', - 'description': 'md5:1b80a74e0380ed2a4fb335026de1600d', + 'description': 'md5:67fa89a9402e2ee7d08f53b920674c18', 'duration': 682, 'thumbnail': 're:^https?://.*\.jpg$', }, @@ -294,13 +294,13 @@ class 
PBSIE(InfoExtractor): # "<iframe style='position: absolute;<br />\ntop: 0; left: 0;' ...", see # https://github.com/rg3/youtube-dl/issues/7059) 'url': 'http://www.pbs.org/food/features/a-chefs-life-season-3-episode-5-prickly-business/', - 'md5': '84ced42850d78f1d4650297356e95e6f', + 'md5': '59b0ef5009f9ac8a319cc5efebcd865e', 'info_dict': { 'id': '2365546844', 'display_id': 'a-chefs-life-season-3-episode-5-prickly-business', 'ext': 'mp4', 'title': "A Chef's Life - Season 3, Ep. 5: Prickly Business", - 'description': 'md5:54033c6baa1f9623607c6e2ed245888b', + 'description': 'md5:c0ff7475a4b70261c7e58f493c2792a5', 'duration': 1480, 'thumbnail': 're:^https?://.*\.jpg$', }, @@ -313,7 +313,7 @@ class PBSIE(InfoExtractor): 'display_id': 'the-atomic-artists', 'ext': 'mp4', 'title': 'FRONTLINE - The Atomic Artists', - 'description': 'md5:1a2481e86b32b2e12ec1905dd473e2c1', + 'description': 'md5:f677e4520cfacb4a5ce1471e31b57800', 'duration': 723, 'thumbnail': 're:^https?://.*\.jpg$', }, @@ -324,7 +324,7 @@ class PBSIE(InfoExtractor): { # Serves hd only via wigget/partnerplayer page 'url': 'http://www.pbs.org/video/2365641075/', - 'md5': 'acfd4c400b48149a44861cb16dd305cf', + 'md5': 'fdf907851eab57211dd589cf12006666', 'info_dict': { 'id': '2365641075', 'ext': 'mp4', @@ -353,11 +353,16 @@ class PBSIE(InfoExtractor): def _extract_webpage(self, url): mobj = re.match(self._VALID_URL, url) + description = None + presumptive_id = mobj.group('presumptive_id') display_id = presumptive_id if presumptive_id: webpage = self._download_webpage(url, display_id) + description = strip_or_none(self._og_search_description( + webpage, default=None) or self._html_search_meta( + 'description', webpage, default=None)) upload_date = unified_strdate(self._search_regex( r'<input type="hidden" id="air_date_[0-9]+" value="([^"]+)"', webpage, 'upload date', default=None)) @@ -370,7 +375,7 @@ class PBSIE(InfoExtractor): for p in MULTI_PART_REGEXES: tabbed_videos = re.findall(p, webpage) if tabbed_videos: - 
return tabbed_videos, presumptive_id, upload_date + return tabbed_videos, presumptive_id, upload_date, description MEDIA_ID_REGEXES = [ r"div\s*:\s*'videoembed'\s*,\s*mediaid\s*:\s*'(\d+)'", # frontline video embed @@ -382,7 +387,7 @@ class PBSIE(InfoExtractor): media_id = self._search_regex( MEDIA_ID_REGEXES, webpage, 'media ID', fatal=False, default=None) if media_id: - return media_id, presumptive_id, upload_date + return media_id, presumptive_id, upload_date, description # Fronline video embedded via flp video_id = self._search_regex( @@ -399,7 +404,7 @@ class PBSIE(InfoExtractor): 'http://www.pbs.org/wgbh/pages/frontline/.json/getdir/getdir%d.json' % prg_id, presumptive_id, 'Downloading getdir JSON', transform_source=strip_jsonp) - return getdir['mid'], presumptive_id, upload_date + return getdir['mid'], presumptive_id, upload_date, description for iframe in re.findall(r'(?s)<iframe(.+?)></iframe>', webpage): url = self._search_regex( @@ -423,10 +428,10 @@ class PBSIE(InfoExtractor): video_id = mobj.group('id') display_id = video_id - return video_id, display_id, None + return video_id, display_id, None, description def _real_extract(self, url): - video_id, display_id, upload_date = self._extract_webpage(url) + video_id, display_id, upload_date, description = self._extract_webpage(url) if isinstance(video_id, list): entries = [self.url_result( @@ -448,17 +453,6 @@ class PBSIE(InfoExtractor): redirects.append(redirect) redirect_urls.add(redirect_url) - try: - video_info = self._download_json( - 'http://player.pbs.org/videoInfo/%s?format=json&type=partner' % video_id, - display_id, 'Downloading video info JSON') - extract_redirect_urls(video_info) - info = video_info - except ExtractorError as e: - # videoInfo API may not work for some videos - if not isinstance(e.cause, compat_HTTPError) or e.cause.code != 404: - raise - # Player pages may also serve different qualities for page in ('widget/partnerplayer', 'portalplayer'): player = self._download_webpage( @@ 
-511,14 +505,23 @@ class PBSIE(InfoExtractor): formats)) if http_url: for m3u8_format in m3u8_formats: - bitrate = self._search_regex(r'(\d+k)', m3u8_format['url'], 'bitrate', default=None) - # extract only the formats that we know that they will be available as http format. - # https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications - if not bitrate or bitrate not in ('400k', '800k', '1200k', '2500k'): + bitrate = self._search_regex(r'(\d+)k', m3u8_format['url'], 'bitrate', default=None) + # Lower qualities (150k and 192k) are not available as HTTP formats (see [1]), + # we won't try extracting them. + # Since summer 2016 higher quality formats (4500k and 6500k) are also available + # albeit they are not documented in [2]. + # 1. https://github.com/rg3/youtube-dl/commit/cbc032c8b70a038a69259378c92b4ba97b42d491#commitcomment-17313656 + # 2. https://projects.pbs.org/confluence/display/coveapi/COVE+Video+Specifications + if not bitrate or int(bitrate) < 400: + continue + f_url = re.sub(r'\d+k|baseline', bitrate + 'k', http_url) + # This may produce invalid links sometimes (e.g. 
+ # http://www.pbs.org/wgbh/frontline/film/suicide-plan) + if not self._is_valid_url(f_url, display_id, 'http-%sk video' % bitrate): continue f = m3u8_format.copy() f.update({ - 'url': re.sub(r'\d+k|baseline', bitrate, http_url), + 'url': f_url, 'format_id': m3u8_format['format_id'].replace('hls', 'http'), 'protocol': 'http', }) @@ -557,11 +560,14 @@ class PBSIE(InfoExtractor): if alt_title: info['title'] = alt_title + ' - ' + re.sub(r'^' + alt_title + '[\s\-:]+', '', info['title']) + description = info.get('description') or info.get( + 'program', {}).get('description') or description + return { 'id': video_id, 'display_id': display_id, 'title': info['title'], - 'description': info.get('description') or info.get('program', {}).get('description'), + 'description': description, 'thumbnail': info.get('image_url'), 'duration': int_or_none(info.get('duration')), 'age_limit': age_limit, diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py index c23b314..75f5884 100644 --- a/youtube_dl/extractor/periscope.py +++ b/youtube_dl/extractor/periscope.py @@ -120,9 +120,12 @@ class PeriscopeUserIE(InfoExtractor): title = user.get('display_name') or user.get('username') description = user.get('description') + broadcast_ids = (data_store.get('UserBroadcastHistory', {}).get('broadcastIds') or + data_store.get('BroadcastCache', {}).get('broadcastIds', [])) + entries = [ self.url_result( - 'https://www.periscope.tv/%s/%s' % (user_id, broadcast['id'])) - for broadcast in data_store.get('UserBroadcastHistory', {}).get('broadcasts', [])] + 'https://www.periscope.tv/%s/%s' % (user_id, broadcast_id)) + for broadcast_id in broadcast_ids] return self.playlist_result(entries, user_id, title, description) diff --git a/youtube_dl/extractor/pladform.py b/youtube_dl/extractor/pladform.py index bc559d1..77e1211 100644 --- a/youtube_dl/extractor/pladform.py +++ b/youtube_dl/extractor/pladform.py @@ -49,7 +49,7 @@ class PladformIE(InfoExtractor): @staticmethod def 
_extract_url(webpage): mobj = re.search( - r'<iframe[^>]+src="(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)"', webpage) + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//out\.pladform\.ru/player\?.+?)\1', webpage) if mobj: return mobj.group('url') diff --git a/youtube_dl/extractor/playvid.py b/youtube_dl/extractor/playvid.py index 2eb4fd9..78d2192 100644 --- a/youtube_dl/extractor/playvid.py +++ b/youtube_dl/extractor/playvid.py @@ -15,7 +15,7 @@ from ..utils import ( class PlayvidIE(InfoExtractor): _VALID_URL = r'https?://www\.playvid\.com/watch(\?v=|/)(?P<id>.+?)(?:#|$)' - _TEST = { + _TESTS = [{ 'url': 'http://www.playvid.com/watch/RnmBNgtrrJu', 'md5': 'ffa2f6b2119af359f544388d8c01eb6c', 'info_dict': { @@ -24,8 +24,19 @@ class PlayvidIE(InfoExtractor): 'title': 'md5:9256d01c6317e3f703848b5906880dc8', 'duration': 82, 'age_limit': 18, - } - } + }, + 'skip': 'Video removed due to ToS', + }, { + 'url': 'http://www.playvid.com/watch/hwb0GpNkzgH', + 'md5': '39d49df503ad7b8f23a4432cbf046477', + 'info_dict': { + 'id': 'hwb0GpNkzgH', + 'ext': 'mp4', + 'title': 'Ellen Euro Cutie Blond Takes a Sexy Survey Get Facial in The Park', + 'age_limit': 18, + 'thumbnail': 're:^https?://.*\.jpg$', + }, + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/pokemon.py b/youtube_dl/extractor/pokemon.py new file mode 100644 index 0000000..2d87e7e --- /dev/null +++ b/youtube_dl/extractor/pokemon.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + extract_attributes, + int_or_none, +) + + +class PokemonIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?pokemon\.com/[a-z]{2}(?:.*?play=(?P<id>[a-z0-9]{32})|/[^/]+/\d+_\d+-(?P<display_id>[^/?#]+))' + _TESTS = [{ + 'url': 'http://www.pokemon.com/us/pokemon-episodes/19_01-from-a-to-z/?play=true', + 'md5': '9fb209ae3a569aac25de0f5afc4ee08f', + 'info_dict': { + 'id': 
'd0436c00c3ce4071ac6cee8130ac54a1', + 'ext': 'mp4', + 'title': 'From A to Z!', + 'description': 'Bonnie makes a new friend, Ash runs into an old friend, and a terrifying premonition begins to unfold!', + 'timestamp': 1460478136, + 'upload_date': '20160412', + }, + 'add_id': ['LimelightMedia'] + }, { + 'url': 'http://www.pokemon.com/uk/pokemon-episodes/?play=2e8b5c761f1d4a9286165d7748c1ece2', + 'only_matching': True, + }, { + 'url': 'http://www.pokemon.com/fr/episodes-pokemon/18_09-un-hiver-inattendu/', + 'only_matching': True, + }, { + 'url': 'http://www.pokemon.com/de/pokemon-folgen/01_20-bye-bye-smettbo/', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id, display_id = re.match(self._VALID_URL, url).groups() + webpage = self._download_webpage(url, video_id or display_id) + video_data = extract_attributes(self._search_regex( + r'(<[^>]+data-video-id="%s"[^>]*>)' % (video_id if video_id else '[a-z0-9]{32}'), + webpage, 'video data element')) + video_id = video_data['data-video-id'] + title = video_data['data-video-title'] + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': 'limelight:media:%s' % video_id, + 'title': title, + 'description': video_data.get('data-video-summary'), + 'thumbnail': video_data.get('data-video-poster'), + 'series': 'Pokémon', + 'season_number': int_or_none(video_data.get('data-video-season')), + 'episode': title, + 'episode_number': int_or_none(video_data.get('data-video-episode')), + 'ie_key': 'LimelightMedia', + } diff --git a/youtube_dl/extractor/polskieradio.py b/youtube_dl/extractor/polskieradio.py new file mode 100644 index 0000000..f559b89 --- /dev/null +++ b/youtube_dl/extractor/polskieradio.py @@ -0,0 +1,99 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..compat import ( + compat_str, + compat_urllib_parse_unquote, +) +from ..utils import ( + int_or_none, + strip_or_none, + unified_timestamp, +) + + +class 
PolskieRadioIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?polskieradio\.pl/\d+/\d+/Artykul/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943,Prof-Andrzej-Nowak-o-historii-nie-da-sie-myslec-beznamietnie', + 'info_dict': { + 'id': '1587943', + 'title': 'Prof. Andrzej Nowak: o historii nie da się myśleć beznamiętnie', + 'description': 'md5:12f954edbf3120c5e7075e17bf9fc5c5', + }, + 'playlist': [{ + 'md5': '2984ee6ce9046d91fc233bc1a864a09a', + 'info_dict': { + 'id': '1540576', + 'ext': 'mp3', + 'title': 'md5:d4623290d4ac983bf924061c75c23a0d', + 'timestamp': 1456594200, + 'upload_date': '20160227', + 'duration': 2364, + 'thumbnail': 're:^https?://static\.prsa\.pl/images/.*\.jpg$' + }, + }], + }, { + 'url': 'http://www.polskieradio.pl/265/5217/Artykul/1635803,Euro-2016-nie-ma-miejsca-na-blad-Polacy-graja-ze-Szwajcaria-o-cwiercfinal', + 'info_dict': { + 'id': '1635803', + 'title': 'Euro 2016: nie ma miejsca na błąd. Polacy grają ze Szwajcarią o ćwierćfinał', + 'description': 'md5:01cb7d0cad58664095d72b51a1ebada2', + }, + 'playlist_mincount': 12, + }, { + 'url': 'http://polskieradio.pl/9/305/Artykul/1632955,Bardzo-popularne-slowo-remis', + 'only_matching': True, + }, { + 'url': 'http://www.polskieradio.pl/7/5102/Artykul/1587943', + 'only_matching': True, + }, { + # with mp4 video + 'url': 'http://www.polskieradio.pl/9/299/Artykul/1634903,Brexit-Leszek-Miller-swiat-sie-nie-zawali-Europa-bedzie-trwac-dalej', + 'only_matching': True, + }] + + def _real_extract(self, url): + playlist_id = self._match_id(url) + + webpage = self._download_webpage(url, playlist_id) + + content = self._search_regex( + r'(?s)<div[^>]+class="audio atarticle"[^>]*>(.+?)<script>', + webpage, 'content') + + timestamp = unified_timestamp(self._html_search_regex( + r'(?s)<span[^>]+id="datetime2"[^>]*>(.+?)</span>', + webpage, 'timestamp', fatal=False)) + + thumbnail_url = self._og_search_thumbnail(webpage) + + entries = [] + + media_urls = set() + + for 
data_media in re.findall(r'<[^>]+data-media=({[^>]+})', content): + media = self._parse_json(data_media, playlist_id, fatal=False) + if not media.get('file') or not media.get('desc'): + continue + media_url = self._proto_relative_url(media['file'], 'http:') + if media_url in media_urls: + continue + media_urls.add(media_url) + entries.append({ + 'id': compat_str(media['id']), + 'url': media_url, + 'title': compat_urllib_parse_unquote(media['desc']), + 'duration': int_or_none(media.get('length')), + 'vcodec': 'none' if media.get('provider') == 'audio' else None, + 'timestamp': timestamp, + 'thumbnail': thumbnail_url + }) + + title = self._og_search_title(webpage).strip() + description = strip_or_none(self._og_search_description(webpage)) + + return self.playlist_result(entries, playlist_id, title, description) diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py index 6d57e1d..20976c1 100644 --- a/youtube_dl/extractor/pornhub.py +++ b/youtube_dl/extractor/pornhub.py @@ -25,7 +25,15 @@ from ..aes import ( class PornHubIE(InfoExtractor): - _VALID_URL = r'https?://(?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)(?P<id>[0-9a-z]+)' + IE_DESC = 'PornHub and Thumbzilla' + _VALID_URL = r'''(?x) + https?:// + (?: + (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)| + (?:www\.)?thumbzilla\.com/video/ + ) + (?P<id>[0-9a-z]+) + ''' _TESTS = [{ 'url': 'http://www.pornhub.com/view_video.php?viewkey=648719015', 'md5': '1e19b41231a02eba417839222ac9d58e', @@ -63,8 +71,24 @@ class PornHubIE(InfoExtractor): 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph557bbb6676d2d', 'only_matching': True, }, { + # removed at the request of cam4.com 'url': 'http://fr.pornhub.com/view_video.php?viewkey=ph55ca2f9760862', 'only_matching': True, + }, { + # removed at the request of the copyright owner + 'url': 'http://www.pornhub.com/view_video.php?viewkey=788152859', + 'only_matching': True, + }, { + # removed by uploader + 'url': 
'http://www.pornhub.com/view_video.php?viewkey=ph572716d15a111', + 'only_matching': True, + }, { + # private video + 'url': 'http://www.pornhub.com/view_video.php?viewkey=ph56fd731fce6b7', + 'only_matching': True, + }, { + 'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex', + 'only_matching': True, }] @classmethod @@ -87,8 +111,8 @@ class PornHubIE(InfoExtractor): webpage = self._download_webpage(req, video_id) error_msg = self._html_search_regex( - r'(?s)<div class="userMessageSection[^"]*".*?>(.*?)</div>', - webpage, 'error message', default=None) + r'(?s)<div[^>]+class=(["\'])(?:(?!\1).)*\b(?:removed|userMessageSection)\b(?:(?!\1).)*\1[^>]*>(?P<error>.+?)</div>', + webpage, 'error message', default=None, group='error') if error_msg: error_msg = re.sub(r'\s+', ' ', error_msg) raise ExtractorError( diff --git a/youtube_dl/extractor/pornotube.py b/youtube_dl/extractor/pornotube.py index 5398e70..63816c3 100644 --- a/youtube_dl/extractor/pornotube.py +++ b/youtube_dl/extractor/pornotube.py @@ -3,10 +3,7 @@ from __future__ import unicode_literals import json from .common import InfoExtractor -from ..utils import ( - int_or_none, - sanitized_Request, -) +from ..utils import int_or_none class PornotubeIE(InfoExtractor): @@ -31,59 +28,55 @@ class PornotubeIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - # Fetch origin token - js_config = self._download_webpage( - 'http://www.pornotube.com/assets/src/app/config.js', video_id, - note='Download JS config') - originAuthenticationSpaceKey = self._search_regex( - r"constant\('originAuthenticationSpaceKey',\s*'([^']+)'", - js_config, 'originAuthenticationSpaceKey') + token = self._download_json( + 'https://api.aebn.net/auth/v2/origins/authenticate', + video_id, note='Downloading token', + data=json.dumps({'credentials': 'Clip Application'}).encode('utf-8'), + headers={ + 'Content-Type': 'application/json', + 'Origin': 'http://www.pornotube.com', + })['tokenKey'] - # 
Fetch actual token - token_req_data = { - 'authenticationSpaceKey': originAuthenticationSpaceKey, - 'credentials': 'Clip Application', - } - token_req = sanitized_Request( - 'https://api.aebn.net/auth/v1/token/primal', - data=json.dumps(token_req_data).encode('utf-8')) - token_req.add_header('Content-Type', 'application/json') - token_req.add_header('Origin', 'http://www.pornotube.com') - token_answer = self._download_json( - token_req, video_id, note='Requesting primal token') - token = token_answer['tokenKey'] + video_url = self._download_json( + 'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id, + video_id, note='Downloading delivery information', + headers={'Authorization': token})['mediaUrl'] - # Get video URL - delivery_req = sanitized_Request( - 'https://api.aebn.net/delivery/v1/clips/%s/MP4' % video_id) - delivery_req.add_header('Authorization', token) - delivery_info = self._download_json( - delivery_req, video_id, note='Downloading delivery information') - video_url = delivery_info['mediaUrl'] + FIELDS = ( + 'title', 'description', 'startSecond', 'endSecond', 'publishDate', + 'studios{name}', 'categories{name}', 'movieId', 'primaryImageNumber' + ) - # Get additional info (title etc.) 
- info_req = sanitized_Request( - 'https://api.aebn.net/content/v1/clips/%s?expand=' - 'title,description,primaryImageNumber,startSecond,endSecond,' - 'movie.title,movie.MovieId,movie.boxCoverFront,movie.stars,' - 'movie.studios,stars.name,studios.name,categories.name,' - 'clipActive,movieActive,publishDate,orientations' % video_id) - info_req.add_header('Authorization', token) info = self._download_json( - info_req, video_id, note='Downloading metadata') + 'https://api.aebn.net/content/v2/clips/%s?fields=%s' + % (video_id, ','.join(FIELDS)), video_id, + note='Downloading metadata', + headers={'Authorization': token}) + + if isinstance(info, list): + info = info[0] + + title = info['title'] timestamp = int_or_none(info.get('publishDate'), scale=1000) uploader = info.get('studios', [{}])[0].get('name') - movie_id = info['movie']['movieId'] - thumbnail = 'http://pic.aebn.net/dis/t/%s/%s_%08d.jpg' % ( - movie_id, movie_id, info['primaryImageNumber']) - categories = [c['name'] for c in info.get('categories')] + movie_id = info.get('movieId') + primary_image_number = info.get('primaryImageNumber') + thumbnail = None + if movie_id and primary_image_number: + thumbnail = 'http://pic.aebn.net/dis/t/%s/%s_%08d.jpg' % ( + movie_id, movie_id, primary_image_number) + start = int_or_none(info.get('startSecond')) + end = int_or_none(info.get('endSecond')) + duration = end - start if start and end else None + categories = [c['name'] for c in info.get('categories', []) if c.get('name')] return { 'id': video_id, 'url': video_url, - 'title': info['title'], + 'title': title, 'description': info.get('description'), + 'duration': duration, 'timestamp': timestamp, 'uploader': uploader, 'thumbnail': thumbnail, diff --git a/youtube_dl/extractor/prosiebensat1.py b/youtube_dl/extractor/prosiebensat1.py index 07d49d4..c6eee3b 100644 --- a/youtube_dl/extractor/prosiebensat1.py +++ b/youtube_dl/extractor/prosiebensat1.py @@ -5,7 +5,7 @@ import re from hashlib import sha1 from .common import 
InfoExtractor -from ..compat import compat_urllib_parse_urlencode +from ..compat import compat_str from ..utils import ( ExtractorError, determine_ext, @@ -71,6 +71,7 @@ class ProSiebenSat1IE(InfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.sixx.de/stars-style/video/sexy-laufen-in-ugg-boots-clip', @@ -86,6 +87,7 @@ class ProSiebenSat1IE(InfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.sat1.de/film/der-ruecktritt/video/im-interview-kai-wiesinger-clip', @@ -101,6 +103,7 @@ class ProSiebenSat1IE(InfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.kabeleins.de/tv/rosins-restaurants/videos/jagd-auf-fertigkost-im-elsthal-teil-2-ganze-folge', @@ -116,6 +119,7 @@ class ProSiebenSat1IE(InfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.ran.de/fussball/bundesliga/video/schalke-toennies-moechte-raul-zurueck-ganze-folge', @@ -131,6 +135,7 @@ class ProSiebenSat1IE(InfoExtractor): # rtmp download 'skip_download': True, }, + 'skip': 'This video is unavailable', }, { 'url': 'http://www.the-voice-of-germany.de/video/31-andreas-kuemmert-rocket-man-clip', @@ -227,70 +232,42 @@ class ProSiebenSat1IE(InfoExtractor): ] def _extract_clip(self, url, webpage): - clip_id = self._html_search_regex(self._CLIPID_REGEXES, webpage, 'clip id') + clip_id = self._html_search_regex( + self._CLIPID_REGEXES, webpage, 'clip id') access_token = 'prosieben' client_name = 'kolibri-2.0.19-splec4' client_location = url - videos_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos?%s' % compat_urllib_parse_urlencode({ - 'access_token': access_token, - 'client_location': client_location, - 'client_name': client_name, - 'ids': clip_id, - }) - - video = self._download_json(videos_api_url, clip_id, 'Downloading videos JSON')[0] + 
video = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos', + clip_id, 'Downloading videos JSON', query={ + 'access_token': access_token, + 'client_location': client_location, + 'client_name': client_name, + 'ids': clip_id, + })[0] if video.get('is_protected') is True: raise ExtractorError('This video is DRM protected.', expected=True) duration = float_or_none(video.get('duration')) - source_ids = [source['id'] for source in video['sources']] - source_ids_str = ','.join(map(str, source_ids)) + source_ids = [compat_str(source['id']) for source in video['sources']] g = '01!8d8F_)r9]4s[qeuXfP%' + client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name]).encode('utf-8')).hexdigest() - client_id = g[:2] + sha1(''.join([clip_id, g, access_token, client_location, g, client_name]) - .encode('utf-8')).hexdigest() - - sources_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources?%s' % (clip_id, compat_urllib_parse_urlencode({ - 'access_token': access_token, - 'client_id': client_id, - 'client_location': client_location, - 'client_name': client_name, - })) - - sources = self._download_json(sources_api_url, clip_id, 'Downloading sources JSON') + sources = self._download_json( + 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources' % clip_id, + clip_id, 'Downloading sources JSON', query={ + 'access_token': access_token, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': client_name, + }) server_id = sources['server_id'] - client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id, - client_location, source_ids_str, g, client_name]) - .encode('utf-8')).hexdigest() - - url_api_url = 'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url?%s' % (clip_id, compat_urllib_parse_urlencode({ - 'access_token': access_token, - 'client_id': client_id, - 'client_location': client_location, - 'client_name': client_name, - 'server_id': server_id, - 'source_ids': 
source_ids_str, - })) - - urls = self._download_json(url_api_url, clip_id, 'Downloading urls JSON') - title = self._html_search_regex(self._TITLE_REGEXES, webpage, 'title') - description = self._html_search_regex(self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) - thumbnail = self._og_search_thumbnail(webpage) - - upload_date = unified_strdate(self._html_search_regex( - self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None)) - - formats = [] - - urls_sources = urls['sources'] - if isinstance(urls_sources, dict): - urls_sources = urls_sources.values() def fix_bitrate(bitrate): bitrate = int_or_none(bitrate) @@ -298,37 +275,73 @@ class ProSiebenSat1IE(InfoExtractor): return None return (bitrate // 1000) if bitrate % 1000 == 0 else bitrate - for source in urls_sources: - protocol = source['protocol'] - source_url = source['url'] - if protocol == 'rtmp' or protocol == 'rtmpe': - mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) - if not mobj: - continue - path = mobj.group('path') - mp4colon_index = path.rfind('mp4:') - app = path[:mp4colon_index] - play_path = path[mp4colon_index:] - formats.append({ - 'url': '%s/%s' % (mobj.group('url'), app), - 'app': app, - 'play_path': play_path, - 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', - 'page_url': 'http://www.prosieben.de', - 'vbr': fix_bitrate(source['bitrate']), - 'ext': 'mp4', - 'format_id': '%s_%s' % (source['cdn'], source['bitrate']), - }) - elif 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': - formats.extend(self._extract_f4m_formats(source_url, clip_id)) - else: - formats.append({ - 'url': source_url, - 'vbr': fix_bitrate(source['bitrate']), + formats = [] + for source_id in source_ids: + client_id = g[:2] + sha1(''.join([g, clip_id, access_token, server_id, client_location, source_id, g, client_name]).encode('utf-8')).hexdigest() + urls = self._download_json( + 
'http://vas.sim-technik.de/vas/live/v2/videos/%s/sources/url' % clip_id, + clip_id, 'Downloading urls JSON', fatal=False, query={ + 'access_token': access_token, + 'client_id': client_id, + 'client_location': client_location, + 'client_name': client_name, + 'server_id': server_id, + 'source_ids': source_id, }) - + if not urls: + continue + if urls.get('status_code') != 0: + raise ExtractorError('This video is unavailable', expected=True) + urls_sources = urls['sources'] + if isinstance(urls_sources, dict): + urls_sources = urls_sources.values() + for source in urls_sources: + source_url = source.get('url') + if not source_url: + continue + protocol = source.get('protocol') + mimetype = source.get('mimetype') + if mimetype == 'application/f4m+xml' or 'f4mgenerator' in source_url or determine_ext(source_url) == 'f4m': + formats.extend(self._extract_f4m_formats( + source_url, clip_id, f4m_id='hds', fatal=False)) + elif mimetype == 'application/x-mpegURL': + formats.extend(self._extract_m3u8_formats( + source_url, clip_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + else: + tbr = fix_bitrate(source['bitrate']) + if protocol in ('rtmp', 'rtmpe'): + mobj = re.search(r'^(?P<url>rtmpe?://[^/]+)/(?P<path>.+)$', source_url) + if not mobj: + continue + path = mobj.group('path') + mp4colon_index = path.rfind('mp4:') + app = path[:mp4colon_index] + play_path = path[mp4colon_index:] + formats.append({ + 'url': '%s/%s' % (mobj.group('url'), app), + 'app': app, + 'play_path': play_path, + 'player_url': 'http://livepassdl.conviva.com/hf/ver/2.79.0.17083/LivePassModuleMain.swf', + 'page_url': 'http://www.prosieben.de', + 'tbr': tbr, + 'ext': 'flv', + 'format_id': 'rtmp%s' % ('-%d' % tbr if tbr else ''), + }) + else: + formats.append({ + 'url': source_url, + 'tbr': tbr, + 'format_id': 'http%s' % ('-%d' % tbr if tbr else ''), + }) self._sort_formats(formats) + description = self._html_search_regex( + self._DESCRIPTION_REGEXES, webpage, 'description', fatal=False) + 
thumbnail = self._og_search_thumbnail(webpage) + upload_date = unified_strdate(self._html_search_regex( + self._UPLOAD_DATE_REGEXES, webpage, 'upload date', default=None)) + return { 'id': clip_id, 'title': title, diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py index 4f05bbd..8ec4026 100644 --- a/youtube_dl/extractor/radiocanada.py +++ b/youtube_dl/extractor/radiocanada.py @@ -12,6 +12,7 @@ from ..utils import ( unified_strdate, xpath_element, ExtractorError, + determine_protocol, ) @@ -22,13 +23,13 @@ class RadioCanadaIE(InfoExtractor): 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272', 'info_dict': { 'id': '7184272', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Le parcours du tireur capté sur vidéo', 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa', 'upload_date': '20141023', }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, }, } @@ -36,11 +37,14 @@ class RadioCanadaIE(InfoExtractor): def _real_extract(self, url): app_code, video_id = re.match(self._VALID_URL, url).groups() + device_types = ['ipad', 'android'] + if app_code != 'toutv': + device_types.append('flash') + formats = [] - # TODO: extract m3u8 and f4m formats - # m3u8 formats can be extracted using ipad device_type return 403 error code when ffmpeg try to download segements + # TODO: extract f4m formats # f4m formats can be extracted using flashhd device_type but they produce unplayable file - for device_type in ('flash',): + for device_type in device_types: v_data = self._download_xml( 'http://api.radio-canada.ca/validationMedia/v1/Validation.ashx', video_id, note='Downloading %s XML' % device_type, query={ @@ -52,7 +56,7 @@ class RadioCanadaIE(InfoExtractor): # paysJ391wsHjbOJwvCs26toz and bypasslock are used to bypass geo-restriction 'paysJ391wsHjbOJwvCs26toz': 'CA', 'bypasslock': 'NZt5K62gRqfc', - }) + }, fatal=False) v_url = xpath_text(v_data, 'url') if 
not v_url: continue @@ -64,7 +68,8 @@ class RadioCanadaIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( v_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) elif ext == 'f4m': - formats.extend(self._extract_f4m_formats(v_url, video_id, f4m_id='hds', fatal=False)) + formats.extend(self._extract_f4m_formats( + v_url, video_id, f4m_id='hds', fatal=False)) else: ext = determine_ext(v_url) bitrates = xpath_element(v_data, 'bitrates') @@ -72,15 +77,28 @@ class RadioCanadaIE(InfoExtractor): tbr = int_or_none(url_e.get('bitrate')) if not tbr: continue + f_url = re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url) + protocol = determine_protocol({'url': f_url}) formats.append({ - 'format_id': 'rtmp-%d' % tbr, - 'url': re.sub(r'\d+\.%s' % ext, '%d.%s' % (tbr, ext), v_url), - 'ext': 'flv', - 'protocol': 'rtmp', + 'format_id': '%s-%d' % (protocol, tbr), + 'url': f_url, + 'ext': 'flv' if protocol == 'rtmp' else ext, + 'protocol': protocol, 'width': int_or_none(url_e.get('width')), 'height': int_or_none(url_e.get('height')), 'tbr': tbr, }) + if protocol == 'rtsp': + base_url = self._search_regex( + r'rtsp://([^?]+)', f_url, 'base url', default=None) + if base_url: + base_url = 'http://' + base_url + formats.extend(self._extract_m3u8_formats( + base_url + '/playlist.m3u8', video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + base_url + '/manifest.f4m', video_id, + f4m_id='hds', fatal=False)) self._sort_formats(formats) metadata = self._download_xml( @@ -115,13 +133,13 @@ class RadioCanadaAudioVideoIE(InfoExtractor): 'url': 'http://ici.radio-canada.ca/audio-video/media-7527184/barack-obama-au-vietnam', 'info_dict': { 'id': '7527184', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Barack Obama au Vietnam', 'description': 'Les États-Unis lèvent l\'embargo sur la vente d\'armes qui datait de la guerre du Vietnam', 'upload_date': '20160523', }, 'params': { - # rtmp download + # m3u8 download 'skip_download': True, }, } 
diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py index e36ce1a..dc640b1 100644 --- a/youtube_dl/extractor/rai.py +++ b/youtube_dl/extractor/rai.py @@ -1,47 +1,141 @@ from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse, - compat_urlparse, -) +from ..compat import compat_urlparse from ..utils import ( - ExtractorError, determine_ext, + ExtractorError, + find_xpath_attr, + fix_xml_ampersands, + int_or_none, parse_duration, unified_strdate, - int_or_none, + update_url_query, xpath_text, ) -class RaiTVIE(InfoExtractor): - _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+media/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' +class RaiBaseIE(InfoExtractor): + def _extract_relinker_formats(self, relinker_url, video_id): + formats = [] + + for platform in ('mon', 'flash', 'native'): + relinker = self._download_xml( + relinker_url, video_id, + note='Downloading XML metadata for platform %s' % platform, + transform_source=fix_xml_ampersands, + query={'output': 45, 'pl': platform}, + headers=self.geo_verification_headers()) + + media_url = find_xpath_attr(relinker, './url', 'type', 'content').text + if media_url == 'http://download.rai.it/video_no_available.mp4': + self.raise_geo_restricted() + + ext = determine_ext(media_url) + if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'): + continue + + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + media_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + elif ext == 'f4m': + manifest_url = update_url_query( + media_url.replace('manifest#live_hds.f4m', 'manifest.f4m'), + {'hdcore': '3.7.0', 'plugin': 'aasp-3.7.0.39.44'}) + formats.extend(self._extract_f4m_formats( + manifest_url, video_id, f4m_id='hds', fatal=False)) + else: + bitrate = int_or_none(xpath_text(relinker, 'bitrate')) + formats.append({ + 
'url': media_url, + 'tbr': bitrate if bitrate > 0 else None, + 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', + }) + + return formats + + def _extract_from_content_id(self, content_id, base_url): + media = self._download_json( + 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id, + content_id, 'Downloading video JSON') + + thumbnails = [] + for image_type in ('image', 'image_medium', 'image_300'): + thumbnail_url = media.get(image_type) + if thumbnail_url: + thumbnails.append({ + 'url': compat_urlparse.urljoin(base_url, thumbnail_url), + }) + + formats = [] + media_type = media['type'] + if 'Audio' in media_type: + formats.append({ + 'format_id': media.get('formatoAudio'), + 'url': media['audioUrl'], + 'ext': media.get('formatoAudio'), + }) + elif 'Video' in media_type: + formats.extend(self._extract_relinker_formats(media['mediaUri'], content_id)) + self._sort_formats(formats) + else: + raise ExtractorError('not a media file') + + subtitles = {} + captions = media.get('subtitlesUrl') + if captions: + STL_EXT = '.stl' + SRT_EXT = '.srt' + if captions.endswith(STL_EXT): + captions = captions[:-len(STL_EXT)] + SRT_EXT + subtitles['it'] = [{ + 'ext': 'srt', + 'url': captions, + }] + + return { + 'id': content_id, + 'title': media['name'], + 'description': media.get('desc'), + 'thumbnails': thumbnails, + 'uploader': media.get('author'), + 'upload_date': unified_strdate(media.get('date')), + 'duration': parse_duration(media.get('length')), + 'formats': formats, + 'subtitles': subtitles, + } + + +class RaiTVIE(RaiBaseIE): + _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+(?:media|ondemand)/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' _TESTS = [ { 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html', - 'md5': '96382709b61dd64a6b88e0f791e6df4c', + 'md5': '8970abf8caf8aef4696e7b1f2adfc696', 'info_dict': { 
'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Report del 07/04/2014', 'description': 'md5:f27c544694cacb46a078db84ec35d2d9', 'upload_date': '20140407', 'duration': 6160, + 'thumbnail': 're:^https?://.*\.jpg$', } }, { + # no m3u8 stream 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html', - 'md5': 'd9751b78eac9710d62c2447b224dea39', + # HDS download, MD5 is unstable 'info_dict': { 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9', 'ext': 'flv', 'title': 'TG PRIMO TEMPO', 'upload_date': '20140612', 'duration': 1758, + 'thumbnail': 're:^https?://.*\.jpg$', }, + 'skip': 'Geo-restricted to Italy', }, { 'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html', @@ -67,127 +161,70 @@ class RaiTVIE(InfoExtractor): }, { 'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html', - 'md5': '496ab63e420574447f70d02578333437', + 'md5': 'e57493e1cb8bc7c564663f363b171847', 'info_dict': { 'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Il Candidato - Primo episodio: "Le Primarie"', 'description': 'md5:364b604f7db50594678f483353164fb8', 'upload_date': '20140923', 'duration': 386, + 'thumbnail': 're:^https?://.*\.jpg$', } }, ] def _real_extract(self, url): video_id = self._match_id(url) - media = self._download_json( - 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % video_id, - video_id, 'Downloading video JSON') - - thumbnails = [] - for image_type in ('image', 'image_medium', 'image_300'): - thumbnail_url = media.get(image_type) - if thumbnail_url: - thumbnails.append({ - 'url': thumbnail_url, - }) - - subtitles = [] - formats = [] - media_type = media['type'] - if 'Audio' in media_type: - formats.append({ - 'format_id': media.get('formatoAudio'), - 'url': 
media['audioUrl'], - 'ext': media.get('formatoAudio'), - }) - elif 'Video' in media_type: - def fix_xml(xml): - return xml.replace(' tag elementi', '').replace('>/', '</') - - relinker = self._download_xml( - media['mediaUri'] + '&output=43', - video_id, transform_source=fix_xml) - - has_subtitle = False - - for element in relinker.findall('element'): - media_url = xpath_text(element, 'url') - ext = determine_ext(media_url) - content_type = xpath_text(element, 'content-type') - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - media_url, video_id, 'mp4', 'm3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - media_url + '?hdcore=3.7.0&plugin=aasp-3.7.0.39.44', - video_id, f4m_id='hds', fatal=False)) - elif ext == 'stl': - has_subtitle = True - elif content_type.startswith('video/'): - bitrate = int_or_none(xpath_text(element, 'bitrate')) - formats.append({ - 'url': media_url, - 'tbr': bitrate if bitrate > 0 else None, - 'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http', - }) - elif content_type.startswith('image/'): - thumbnails.append({ - 'url': media_url, - }) - - self._sort_formats(formats) - if has_subtitle: - webpage = self._download_webpage(url, video_id) - subtitles = self._get_subtitles(video_id, webpage) - else: - raise ExtractorError('not a media file') + return self._extract_from_content_id(video_id, url) - return { - 'id': video_id, - 'title': media['name'], - 'description': media.get('desc'), - 'thumbnails': thumbnails, - 'uploader': media.get('author'), - 'upload_date': unified_strdate(media.get('date')), - 'duration': parse_duration(media.get('length')), - 'formats': formats, - 'subtitles': subtitles, - } - def _get_subtitles(self, video_id, webpage): - subtitles = {} - m = re.search(r'<meta name="closedcaption" content="(?P<captions>[^"]+)"', webpage) - if m: - captions = m.group('captions') - STL_EXT = '.stl' - SRT_EXT = '.srt' - if captions.endswith(STL_EXT): - 
captions = captions[:-len(STL_EXT)] + SRT_EXT - subtitles['it'] = [{ - 'ext': 'srt', - 'url': 'http://www.rai.tv%s' % compat_urllib_parse.quote(captions), - }] - return subtitles - - -class RaiIE(InfoExtractor): +class RaiIE(RaiBaseIE): _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html' _TESTS = [ { 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html', - 'md5': 'e0e7a8a131e249d1aa0ebf270d1d8db7', + 'md5': '2dd727e61114e1ee9c47f0da6914e178', 'info_dict': { 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Il pacco', 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a', 'upload_date': '20141221', }, - } + }, + { + # Direct relinker URL + 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews', + # HDS live stream, MD5 is unstable + 'info_dict': { + 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc', + 'ext': 'flv', + 'title': 'EuroNews', + }, + 'skip': 'Geo-restricted to Italy', + }, + { + # Embedded content item ID + 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined', + 'md5': '84c1135ce960e8822ae63cec34441d63', + 'info_dict': { + 'id': '0960e765-62c8-474a-ac4b-7eb3e2be39c8', + 'ext': 'mp4', + 'title': 'TG1 ore 20:00 del 02/07/2016', + 'upload_date': '20160702', + }, + }, + { + 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html', + # HDS live stream, MD5 is unstable + 'info_dict': { + 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9', + 'ext': 'flv', + 'title': 'La diretta di Rainews24', + }, + }, ] @classmethod @@ -201,7 +238,30 @@ class RaiIE(InfoExtractor): iframe_url = self._search_regex( [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"', r'drawMediaRaiTV\(["\'](.+?)["\']'], - webpage, 'iframe') - 
if not iframe_url.startswith('http'): - iframe_url = compat_urlparse.urljoin(url, iframe_url) - return self.url_result(iframe_url) + webpage, 'iframe', default=None) + if iframe_url: + if not iframe_url.startswith('http'): + iframe_url = compat_urlparse.urljoin(url, iframe_url) + return self.url_result(iframe_url) + + content_item_id = self._search_regex( + r'initEdizione\((?P<q1>[\'"])ContentItem-(?P<content_id>[^\'"]+)(?P=q1)', + webpage, 'content item ID', group='content_id', default=None) + if content_item_id: + return self._extract_from_content_id(content_item_id, url) + + relinker_url = compat_urlparse.urljoin(url, self._search_regex( + r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)', + webpage, 'relinker URL', group='url')) + formats = self._extract_relinker_formats(relinker_url, video_id) + self._sort_formats(formats) + + title = self._search_regex( + r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1', + webpage, 'title', group='title', default=None) or self._og_search_title(webpage) + + return { + 'id': video_id, + 'title': title, + 'formats': formats, + } diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py index 7932af6..471928e 100644 --- a/youtube_dl/extractor/rbmaradio.py +++ b/youtube_dl/extractor/rbmaradio.py @@ -1,55 +1,71 @@ -# encoding: utf-8 from __future__ import unicode_literals -import json import re from .common import InfoExtractor +from ..compat import compat_str from ..utils import ( - ExtractorError, + clean_html, + int_or_none, + unified_timestamp, + update_url_query, ) class RBMARadioIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<videoID>[^/]+)$' + _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<show_id>[^/]+)/episodes/(?P<id>[^/?#&]+)' _TEST = { - 'url': 'http://www.rbmaradio.com/shows/ford-lopatin-live-at-primavera-sound-2011', + 'url': 
'https://www.rbmaradio.com/shows/main-stage/episodes/ford-lopatin-live-at-primavera-sound-2011', 'md5': '6bc6f9bcb18994b4c983bc3bf4384d95', 'info_dict': { 'id': 'ford-lopatin-live-at-primavera-sound-2011', 'ext': 'mp3', - 'uploader_id': 'ford-lopatin', - 'location': 'Spain', - 'description': 'Joel Ford and Daniel ’Oneohtrix Point Never’ Lopatin fly their midified pop extravaganza to Spain. Live at Primavera Sound 2011.', - 'uploader': 'Ford & Lopatin', - 'title': 'Live at Primavera Sound 2011', + 'title': 'Main Stage - Ford & Lopatin', + 'description': 'md5:4f340fb48426423530af5a9d87bd7b91', + 'thumbnail': 're:^https?://.*\.jpg', + 'duration': 2452, + 'timestamp': 1307103164, + 'upload_date': '20110603', }, } def _real_extract(self, url): - m = re.match(self._VALID_URL, url) - video_id = m.group('videoID') + mobj = re.match(self._VALID_URL, url) + show_id = mobj.group('show_id') + episode_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage(url, episode_id) - json_data = self._search_regex(r'window\.gon.*?gon\.show=(.+?);$', - webpage, 'json data', flags=re.MULTILINE) + episode = self._parse_json( + self._search_regex( + r'__INITIAL_STATE__\s*=\s*({.+?})\s*</script>', + webpage, 'json data'), + episode_id)['episodes'][show_id][episode_id] - try: - data = json.loads(json_data) - except ValueError as e: - raise ExtractorError('Invalid JSON: ' + str(e)) + title = episode['title'] - video_url = data['akamai_url'] + '&cbr=256' + show_title = episode.get('showTitle') + if show_title: + title = '%s - %s' % (show_title, title) + + formats = [{ + 'url': update_url_query(episode['audioURL'], query={'cbr': abr}), + 'format_id': compat_str(abr), + 'abr': abr, + 'vcodec': 'none', + } for abr in (96, 128, 256)] + + description = clean_html(episode.get('longTeaser')) + thumbnail = self._proto_relative_url(episode.get('imageURL', {}).get('landscape')) + duration = int_or_none(episode.get('duration')) + timestamp = 
unified_timestamp(episode.get('publishedAt')) return { - 'id': video_id, - 'url': video_url, - 'title': data['title'], - 'description': data.get('teaser_text'), - 'location': data.get('country_of_origin'), - 'uploader': data.get('host', {}).get('name'), - 'uploader_id': data.get('host', {}).get('slug'), - 'thumbnail': data.get('image', {}).get('large_url_2x'), - 'duration': data.get('duration'), + 'id': episode_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'duration': duration, + 'timestamp': timestamp, + 'formats': formats, } diff --git a/youtube_dl/extractor/rds.py b/youtube_dl/extractor/rds.py index 796adfd..bf200ea 100644 --- a/youtube_dl/extractor/rds.py +++ b/youtube_dl/extractor/rds.py @@ -1,23 +1,23 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor from ..utils import ( parse_duration, parse_iso8601, + js_to_json, ) +from ..compat import compat_str class RDSIE(InfoExtractor): IE_DESC = 'RDS.ca' - _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<display_id>[^/]+)-(?P<id>\d+\.\d+)' + _VALID_URL = r'https?://(?:www\.)?rds\.ca/vid(?:[eé]|%C3%A9)os/(?:[^/]+/)*(?P<id>[^/]+)-\d+\.\d+' _TESTS = [{ 'url': 'http://www.rds.ca/videos/football/nfl/fowler-jr-prend-la-direction-de-jacksonville-3.1132799', 'info_dict': { - 'id': '3.1132799', + 'id': '604333', 'display_id': 'fowler-jr-prend-la-direction-de-jacksonville', 'ext': 'mp4', 'title': 'Fowler Jr. 
prend la direction de Jacksonville', @@ -33,22 +33,17 @@ class RDSIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') + display_id = self._match_id(url) webpage = self._download_webpage(url, display_id) - # TODO: extract f4m from 9c9media.com - video_url = self._search_regex( - r'<span[^>]+itemprop="contentURL"[^>]+content="([^"]+)"', - webpage, 'video url') - - title = self._og_search_title(webpage) or self._html_search_meta( + item = self._parse_json(self._search_regex(r'(?s)itemToPush\s*=\s*({.+?});', webpage, 'item'), display_id, js_to_json) + video_id = compat_str(item['id']) + title = item.get('title') or self._og_search_title(webpage) or self._html_search_meta( 'title', webpage, 'title', fatal=True) description = self._og_search_description(webpage) or self._html_search_meta( 'description', webpage, 'description') - thumbnail = self._og_search_thumbnail(webpage) or self._search_regex( + thumbnail = item.get('urlImageBig') or self._og_search_thumbnail(webpage) or self._search_regex( [r'<link[^>]+itemprop="thumbnailUrl"[^>]+href="([^"]+)"', r'<span[^>]+itemprop="thumbnailUrl"[^>]+content="([^"]+)"'], webpage, 'thumbnail', fatal=False) @@ -61,13 +56,15 @@ class RDSIE(InfoExtractor): age_limit = self._family_friendly_search(webpage) return { + '_type': 'url_transparent', 'id': video_id, 'display_id': display_id, - 'url': video_url, + 'url': '9c9media:rds_web:%s' % video_id, 'title': title, 'description': description, 'thumbnail': thumbnail, 'timestamp': timestamp, 'duration': duration, 'age_limit': age_limit, + 'ie_key': 'NineCNineMedia', } diff --git a/youtube_dl/extractor/roosterteeth.py b/youtube_dl/extractor/roosterteeth.py new file mode 100644 index 0000000..f5b2f56 --- /dev/null +++ b/youtube_dl/extractor/roosterteeth.py @@ -0,0 +1,148 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor 
+from ..utils import ( + ExtractorError, + int_or_none, + strip_or_none, + unescapeHTML, + urlencode_postdata, +) + + +class RoosterTeethIE(InfoExtractor): + _VALID_URL = r'https?://(?:.+?\.)?roosterteeth\.com/episode/(?P<id>[^/?#&]+)' + _LOGIN_URL = 'https://roosterteeth.com/login' + _NETRC_MACHINE = 'roosterteeth' + _TESTS = [{ + 'url': 'http://roosterteeth.com/episode/million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'md5': 'e2bd7764732d785ef797700a2489f212', + 'info_dict': { + 'id': '26576', + 'display_id': 'million-dollars-but-season-2-million-dollars-but-the-game-announcement', + 'ext': 'mp4', + 'title': 'Million Dollars, But...: Million Dollars, But... The Game Announcement', + 'description': 'md5:0cc3b21986d54ed815f5faeccd9a9ca5', + 'thumbnail': 're:^https?://.*\.png$', + 'series': 'Million Dollars, But...', + 'episode': 'Million Dollars, But... The Game Announcement', + 'comment_count': int, + }, + }, { + 'url': 'http://achievementhunter.roosterteeth.com/episode/off-topic-the-achievement-hunter-podcast-2016-i-didn-t-think-it-would-pass-31', + 'only_matching': True, + }, { + 'url': 'http://funhaus.roosterteeth.com/episode/funhaus-shorts-2016-austin-sucks-funhaus-shorts', + 'only_matching': True, + }, { + 'url': 'http://screwattack.roosterteeth.com/episode/death-battle-season-3-mewtwo-vs-shadow', + 'only_matching': True, + }, { + 'url': 'http://theknow.roosterteeth.com/episode/the-know-game-news-season-1-boring-steam-sales-are-better', + 'only_matching': True, + }, { + # only available for FIRST members + 'url': 'http://roosterteeth.com/episode/rt-docs-the-world-s-greatest-head-massage-the-world-s-greatest-head-massage-an-asmr-journey-part-one', + 'only_matching': True, + }] + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_page = self._download_webpage( + self._LOGIN_URL, None, + note='Downloading login page', + errnote='Unable to download login page') + + login_form = 
self._hidden_inputs(login_page) + + login_form.update({ + 'username': username, + 'password': password, + }) + + login_request = self._download_webpage( + self._LOGIN_URL, None, + note='Logging in as %s' % username, + data=urlencode_postdata(login_form), + headers={ + 'Referer': self._LOGIN_URL, + }) + + if not any(re.search(p, login_request) for p in ( + r'href=["\']https?://(?:www\.)?roosterteeth\.com/logout"', + r'>Sign Out<')): + error = self._html_search_regex( + r'(?s)<div[^>]+class=(["\']).*?\balert-danger\b.*?\1[^>]*>(?:\s*<button[^>]*>.*?</button>)?(?P<error>.+?)</div>', + login_request, 'alert', default=None, group='error') + if error: + raise ExtractorError('Unable to login: %s' % error, expected=True) + raise ExtractorError('Unable to log in') + + def _real_initialize(self): + self._login() + + def _real_extract(self, url): + display_id = self._match_id(url) + + webpage = self._download_webpage(url, display_id) + + episode = strip_or_none(unescapeHTML(self._search_regex( + (r'videoTitle\s*=\s*(["\'])(?P<title>(?:(?!\1).)+)\1', + r'<title>(?P<title>[^<]+)</title>'), webpage, 'title', + default=None, group='title'))) + + title = strip_or_none(self._og_search_title( + webpage, default=None)) or episode + + m3u8_url = self._search_regex( + r'file\s*:\s*(["\'])(?P<url>http.+?\.m3u8.*?)\1', + webpage, 'm3u8 url', default=None, group='url') + + if not m3u8_url: + if re.search(r'<div[^>]+class=["\']non-sponsor', webpage): + self.raise_login_required( + '%s is only available for FIRST members' % display_id) + + if re.search(r'<div[^>]+class=["\']golive-gate', webpage): + self.raise_login_required('%s is not available yet' % display_id) + + raise ExtractorError('Unable to extract m3u8 URL') + + formats = self._extract_m3u8_formats( + m3u8_url, display_id, ext='mp4', + entry_protocol='m3u8_native', m3u8_id='hls') + self._sort_formats(formats) + + description = strip_or_none(self._og_search_description(webpage)) + thumbnail = 
self._proto_relative_url(self._og_search_thumbnail(webpage)) + + series = self._search_regex( + (r'<h2>More ([^<]+)</h2>', r'<a[^>]+>See All ([^<]+) Videos<'), + webpage, 'series', fatal=False) + + comment_count = int_or_none(self._search_regex( + r'>Comments \((\d+)\)<', webpage, + 'comment count', fatal=False)) + + video_id = self._search_regex( + (r'containerId\s*=\s*["\']episode-(\d+)\1', + r'<div[^<]+id=["\']episode-(\d+)'), webpage, + 'video id', default=display_id) + + return { + 'id': video_id, + 'display_id': display_id, + 'title': title, + 'description': description, + 'thumbnail': thumbnail, + 'series': series, + 'episode': episode, + 'comment_count': comment_count, + 'formats': formats, + } diff --git a/youtube_dl/extractor/rozhlas.py b/youtube_dl/extractor/rozhlas.py new file mode 100644 index 0000000..f8eda8d --- /dev/null +++ b/youtube_dl/extractor/rozhlas.py @@ -0,0 +1,50 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + int_or_none, + remove_start, +) + + +class RozhlasIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?prehravac\.rozhlas\.cz/audio/(?P<id>[0-9]+)' + _TESTS = [{ + 'url': 'http://prehravac.rozhlas.cz/audio/3421320', + 'md5': '504c902dbc9e9a1fd50326eccf02a7e2', + 'info_dict': { + 'id': '3421320', + 'ext': 'mp3', + 'title': 'Echo Pavla Klusáka (30.06.2015 21:00)', + 'description': 'Osmdesátiny Terryho Rileyho jsou skvělou příležitostí proletět se elektronickými i akustickými díly zakladatatele minimalismu, který je aktivní už přes padesát let' + } + }, { + 'url': 'http://prehravac.rozhlas.cz/audio/3421320/embed', + 'skip_download': True, + }] + + def _real_extract(self, url): + audio_id = self._match_id(url) + + webpage = self._download_webpage( + 'http://prehravac.rozhlas.cz/audio/%s' % audio_id, audio_id) + + title = self._html_search_regex( + r'<h3>(.+?)</h3>\s*<p[^>]*>.*?</p>\s*<div[^>]+id=["\']player-track', + webpage, 'title', default=None) or 
remove_start( + self._og_search_title(webpage), 'Radio Wave - ') + description = self._html_search_regex( + r'<p[^>]+title=(["\'])(?P<url>(?:(?!\1).)+)\1[^>]*>.*?</p>\s*<div[^>]+id=["\']player-track', + webpage, 'description', fatal=False, group='url') + duration = int_or_none(self._search_regex( + r'data-duration=["\'](\d+)', webpage, 'duration', default=None)) + + return { + 'id': audio_id, + 'url': 'http://media.rozhlas.cz/_audio/%s.mp3' % audio_id, + 'title': title, + 'description': description, + 'duration': duration, + 'vcodec': 'none', + } diff --git a/youtube_dl/extractor/rtlnl.py b/youtube_dl/extractor/rtlnl.py index 4d612b5..f0250af 100644 --- a/youtube_dl/extractor/rtlnl.py +++ b/youtube_dl/extractor/rtlnl.py @@ -14,7 +14,7 @@ class RtlNlIE(InfoExtractor): _VALID_URL = r'''(?x) https?://(?:www\.)? (?: - rtlxl\.nl/\#!/[^/]+/| + rtlxl\.nl/[^\#]*\#!/[^/]+/| rtl\.nl/system/videoplayer/(?:[^/]+/)+(?:video_)?embed\.html\b.+?\buuid= ) (?P<id>[0-9a-f-]+)''' @@ -67,6 +67,9 @@ class RtlNlIE(InfoExtractor): }, { 'url': 'http://www.rtl.nl/system/videoplayer/derden/embed.html#!/uuid=bb0353b0-d6a4-1dad-90e9-18fe75b8d1f0', 'only_matching': True, + }, { + 'url': 'http://rtlxl.nl/?_ga=1.204735956.572365465.1466978370#!/rtl-nieuws-132237/3c487912-023b-49ac-903e-2c5d79f8410f', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/rtve.py b/youtube_dl/extractor/rtve.py index f11e358..34f9c4a 100644 --- a/youtube_dl/extractor/rtve.py +++ b/youtube_dl/extractor/rtve.py @@ -113,9 +113,9 @@ class RTVEALaCartaIE(InfoExtractor): png = self._download_webpage(png_request, video_id, 'Downloading url information') video_url = _decrypt_url(png) if not video_url.endswith('.f4m'): - video_url = video_url.replace( - 'resources/', 'auth/resources/' - ).replace('.net.rtve', '.multimedia.cdn.rtve') + if '?' 
not in video_url: + video_url = video_url.replace('resources/', 'auth/resources/') + video_url = video_url.replace('.net.rtve', '.multimedia.cdn.rtve') subtitles = None if info.get('sbtFile') is not None: @@ -222,3 +222,34 @@ class RTVELiveIE(InfoExtractor): 'formats': formats, 'is_live': True, } + + +class RTVETelevisionIE(InfoExtractor): + IE_NAME = 'rtve.es:television' + _VALID_URL = r'https?://www\.rtve\.es/television/[^/]+/[^/]+/(?P<id>\d+).shtml' + + _TEST = { + 'url': 'http://www.rtve.es/television/20160628/revolucion-del-movil/1364141.shtml', + 'info_dict': { + 'id': '3069778', + 'ext': 'mp4', + 'title': 'Documentos TV - La revolución del móvil', + 'duration': 3496.948, + }, + 'params': { + 'skip_download': True, + }, + } + + def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + + alacarta_url = self._search_regex( + r'data-location="alacarta_videos"[^<]+url":"(http://www\.rtve\.es/alacarta.+?)&', + webpage, 'alacarta url', default=None) + if alacarta_url is None: + raise ExtractorError( + 'The webpage doesn\'t contain any video', expected=True) + + return self.url_result(alacarta_url, ie=RTVEALaCartaIE.ie_key()) diff --git a/youtube_dl/extractor/rtvnh.py b/youtube_dl/extractor/rtvnh.py index 4896d09..f6454c6 100644 --- a/youtube_dl/extractor/rtvnh.py +++ b/youtube_dl/extractor/rtvnh.py @@ -9,7 +9,7 @@ class RTVNHIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?rtvnh\.nl/video/(?P<id>[0-9]+)' _TEST = { 'url': 'http://www.rtvnh.nl/video/131946', - 'md5': '6e1d0ab079e2a00b6161442d3ceacfc1', + 'md5': 'cdbec9f44550763c8afc96050fa747dc', 'info_dict': { 'id': '131946', 'ext': 'mp4', @@ -29,15 +29,29 @@ class RTVNHIE(InfoExtractor): raise ExtractorError( '%s returned error code %d' % (self.IE_NAME, status), expected=True) - formats = self._extract_smil_formats( - 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id, fatal=False) - - for item in meta['source']['fb']: - if item.get('type') == 'hls': 
- formats.extend(self._extract_m3u8_formats( - item['file'], video_id, ext='mp4', entry_protocol='m3u8_native')) - elif item.get('type') == '': - formats.append({'url': item['file']}) + formats = [] + rtmp_formats = self._extract_smil_formats( + 'http://www.rtvnh.nl/video/smil?m=' + video_id, video_id) + formats.extend(rtmp_formats) + + for rtmp_format in rtmp_formats: + rtmp_url = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) + rtsp_format = rtmp_format.copy() + del rtsp_format['play_path'] + del rtsp_format['ext'] + rtsp_format.update({ + 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), + 'url': rtmp_url.replace('rtmp://', 'rtsp://'), + 'protocol': 'rtsp', + }) + formats.append(rtsp_format) + http_base_url = rtmp_url.replace('rtmp://', 'http://') + formats.extend(self._extract_m3u8_formats( + http_base_url + '/playlist.m3u8', video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + http_base_url + '/manifest.f4m', + video_id, f4m_id='hds', fatal=False)) self._sort_formats(formats) return { diff --git a/youtube_dl/extractor/rudo.py b/youtube_dl/extractor/rudo.py new file mode 100644 index 0000000..38366b7 --- /dev/null +++ b/youtube_dl/extractor/rudo.py @@ -0,0 +1,53 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .jwplatform import JWPlatformBaseIE +from ..utils import ( + js_to_json, + get_element_by_class, + unified_strdate, +) + + +class RudoIE(JWPlatformBaseIE): + _VALID_URL = r'https?://rudo\.video/vod/(?P<id>[0-9a-zA-Z]+)' + + _TEST = { + 'url': 'http://rudo.video/vod/oTzw0MGnyG', + 'md5': '2a03a5b32dd90a04c83b6d391cf7b415', + 'info_dict': { + 'id': 'oTzw0MGnyG', + 'ext': 'mp4', + 'title': 'Comentario Tomás Mosciatti', + 'upload_date': '20160617', + }, + } + + @classmethod + def _extract_url(self, webpage): + mobj = re.search( + '<iframe[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//rudo\.video/vod/[0-9a-zA-Z]+)(?P=q1)', + webpage) + if mobj: + return 
mobj.group('url') + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id, encoding='iso-8859-1') + + jwplayer_data = self._parse_json(self._search_regex( + r'(?s)playerInstance\.setup\(({.+?})\)', webpage, 'jwplayer data'), video_id, + transform_source=lambda s: js_to_json(re.sub(r'encodeURI\([^)]+\)', '""', s))) + + info_dict = self._parse_jwplayer_data( + jwplayer_data, video_id, require_title=False, m3u8_id='hls') + + info_dict.update({ + 'title': self._og_search_title(webpage), + 'upload_date': unified_strdate(get_element_by_class('date', webpage)), + }) + + return info_dict diff --git a/youtube_dl/extractor/safari.py b/youtube_dl/extractor/safari.py index 6ba91f2..08ddbe3 100644 --- a/youtube_dl/extractor/safari.py +++ b/youtube_dl/extractor/safari.py @@ -75,7 +75,7 @@ class SafariBaseIE(InfoExtractor): class SafariIE(SafariBaseIE): IE_NAME = 'safari' IE_DESC = 'safaribooksonline.com online video' - _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>part\d+)\.html' + _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/library/view/[^/]+/(?P<course_id>[^/]+)/(?P<part>[^/?#&]+)\.html' _TESTS = [{ 'url': 'https://www.safaribooksonline.com/library/view/hadoop-fundamentals-livelessons/9780133392838/part00.html', @@ -92,6 +92,9 @@ class SafariIE(SafariBaseIE): # non-digits in course id 'url': 'https://www.safaribooksonline.com/library/view/create-a-nodejs/100000006A0210/part00.html', 'only_matching': True, + }, { + 'url': 'https://www.safaribooksonline.com/library/view/learning-path-red/9780134664057/RHCE_Introduction.html', + 'only_matching': True, }] def _real_extract(self, url): @@ -132,12 +135,15 @@ class SafariIE(SafariBaseIE): class SafariApiIE(SafariBaseIE): IE_NAME = 'safari:api' - _VALID_URL = r'https?://(?:www\.)?safaribooksonline\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>part\d+)\.html' + _VALID_URL = 
r'https?://(?:www\.)?safaribooksonline\.com/api/v1/book/(?P<course_id>[^/]+)/chapter(?:-content)?/(?P<part>[^/?#&]+)\.html' - _TEST = { + _TESTS = [{ 'url': 'https://www.safaribooksonline.com/api/v1/book/9780133392838/chapter/part00.html', 'only_matching': True, - } + }, { + 'url': 'https://www.safaribooksonline.com/api/v1/book/9780134664057/chapter/RHCE_Introduction.html', + 'only_matching': True, + }] def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) diff --git a/youtube_dl/extractor/sandia.py b/youtube_dl/extractor/sandia.py index 759898a..96e43af 100644 --- a/youtube_dl/extractor/sandia.py +++ b/youtube_dl/extractor/sandia.py @@ -1,18 +1,12 @@ # coding: utf-8 from __future__ import unicode_literals -import itertools import json -import re from .common import InfoExtractor -from ..compat import compat_urlparse from ..utils import ( int_or_none, - js_to_json, mimetype2ext, - sanitized_Request, - unified_strdate, ) @@ -27,7 +21,8 @@ class SandiaIE(InfoExtractor): 'ext': 'mp4', 'title': 'Xyce Software Training - Section 1', 'description': 're:(?s)SAND Number: SAND 2013-7800.{200,}', - 'upload_date': '20120904', + 'upload_date': '20120409', + 'timestamp': 1333983600, 'duration': 7794, } } @@ -35,81 +30,36 @@ class SandiaIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - req = sanitized_Request(url) - req.add_header('Cookie', 'MediasitePlayerCaps=ClientPlugins=4') - webpage = self._download_webpage(req, video_id) + presentation_data = self._download_json( + 'http://digitalops.sandia.gov/Mediasite/PlayerService/PlayerService.svc/json/GetPlayerOptions', + video_id, data=json.dumps({ + 'getPlayerOptionsRequest': { + 'ResourceId': video_id, + 'QueryString': '', + } + }), headers={ + 'Content-Type': 'application/json; charset=utf-8', + })['d']['Presentation'] - js_path = self._search_regex( - r'<script type="text/javascript" src="(/Mediasite/FileServer/Presentation/[^"]+)"', - webpage, 'JS code URL') - js_url = 
compat_urlparse.urljoin(url, js_path) - - js_code = self._download_webpage( - js_url, video_id, note='Downloading player') - - def extract_str(key, **args): - return self._search_regex( - r'Mediasite\.PlaybackManifest\.%s\s*=\s*(.+);\s*?\n' % re.escape(key), - js_code, key, **args) - - def extract_data(key, **args): - data_json = extract_str(key, **args) - if data_json is None: - return data_json - return self._parse_json( - data_json, video_id, transform_source=js_to_json) + title = presentation_data['Title'] formats = [] - for i in itertools.count(): - fd = extract_data('VideoUrls[%d]' % i, default=None) - if fd is None: - break - formats.append({ - 'format_id': '%s' % i, - 'format_note': fd['MimeType'].partition('/')[2], - 'ext': mimetype2ext(fd['MimeType']), - 'url': fd['Location'], - 'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None, - }) + for stream in presentation_data.get('Streams', []): + for fd in stream.get('VideoUrls', []): + formats.append({ + 'format_id': fd['MediaType'], + 'format_note': fd['MimeType'].partition('/')[2], + 'ext': mimetype2ext(fd['MimeType']), + 'url': fd['Location'], + 'protocol': 'f4m' if fd['MimeType'] == 'video/x-mp4-fragmented' else None, + }) self._sort_formats(formats) - slide_baseurl = compat_urlparse.urljoin( - url, extract_data('SlideBaseUrl')) - slide_template = slide_baseurl + re.sub( - r'\{0:D?([0-9+])\}', r'%0\1d', extract_data('SlideImageFileNameTemplate')) - slides = [] - last_slide_time = 0 - for i in itertools.count(1): - sd = extract_str('Slides[%d]' % i, default=None) - if sd is None: - break - timestamp = int_or_none(self._search_regex( - r'^Mediasite\.PlaybackManifest\.CreateSlide\("[^"]*"\s*,\s*([0-9]+),', - sd, 'slide %s timestamp' % i, fatal=False)) - slides.append({ - 'url': slide_template % i, - 'duration': timestamp - last_slide_time, - }) - last_slide_time = timestamp - formats.append({ - 'format_id': 'slides', - 'protocol': 'slideshow', - 'url': json.dumps(slides), - 'preference': 
-10000, # Downloader not yet written - }) - self._sort_formats(formats) - - title = extract_data('Title') - description = extract_data('Description', fatal=False) - duration = int_or_none(extract_data( - 'Duration', fatal=False), scale=1000) - upload_date = unified_strdate(extract_data('AirDate', fatal=False)) - return { 'id': video_id, 'title': title, - 'description': description, + 'description': presentation_data.get('Description'), 'formats': formats, - 'upload_date': upload_date, - 'duration': duration, + 'timestamp': int_or_none(presentation_data.get('UnixTime'), 1000), + 'duration': int_or_none(presentation_data.get('Duration'), 1000), } diff --git a/youtube_dl/extractor/sendtonews.py b/youtube_dl/extractor/sendtonews.py index 1c636f6..2dbe490 100644 --- a/youtube_dl/extractor/sendtonews.py +++ b/youtube_dl/extractor/sendtonews.py @@ -4,33 +4,43 @@ from __future__ import unicode_literals import re from .jwplatform import JWPlatformBaseIE -from ..compat import compat_parse_qs from ..utils import ( - ExtractorError, - parse_duration, + float_or_none, + parse_iso8601, + update_url_query, ) class SendtoNewsIE(JWPlatformBaseIE): - _VALID_URL = r'https?://embed\.sendtonews\.com/player/embed\.php\?(?P<query>[^#]+)' + _VALID_URL = r'https?://embed\.sendtonews\.com/player2/embedplayer\.php\?.*\bSC=(?P<id>[0-9A-Za-z-]+)' _TEST = { # From http://cleveland.cbslocal.com/2016/05/16/indians-score-season-high-15-runs-in-blowout-win-over-reds-rapid-reaction/ - 'url': 'http://embed.sendtonews.com/player/embed.php?SK=GxfCe0Zo7D&MK=175909&PK=5588&autoplay=on&sound=yes', + 'url': 'http://embed.sendtonews.com/player2/embedplayer.php?SC=GxfCe0Zo7D-175909-5588&type=single&autoplay=on&sound=YES', 'info_dict': { - 'id': 'GxfCe0Zo7D-175909-5588', - 'ext': 'mp4', - 'title': 'Recap: CLE 15, CIN 6', - 'description': '5/16/16: Indians\' bats explode for 15 runs in a win', - 'duration': 49, + 'id': 'GxfCe0Zo7D-175909-5588' }, + 'playlist_count': 9, + # test the first video only to prevent 
lengthy tests + 'playlist': [{ + 'info_dict': { + 'id': '198180', + 'ext': 'mp4', + 'title': 'Recap: CLE 5, LAA 4', + 'description': '8/14/16: Naquin, Almonte lead Indians in 5-4 win', + 'duration': 57.343, + 'thumbnail': 're:https?://.*\.jpg$', + 'upload_date': '20160815', + 'timestamp': 1471221961, + }, + }], 'params': { # m3u8 download 'skip_download': True, }, } - _URL_TEMPLATE = '//embed.sendtonews.com/player/embed.php?SK=%s&MK=%s&PK=%s' + _URL_TEMPLATE = '//embed.sendtonews.com/player2/embedplayer.php?SC=%s' @classmethod def _extract_url(cls, webpage): @@ -39,48 +49,41 @@ class SendtoNewsIE(JWPlatformBaseIE): .*\bSC=(?P<SC>[0-9a-zA-Z-]+).* \1>''', webpage) if mobj: - sk, mk, pk = mobj.group('SC').split('-') - return cls._URL_TEMPLATE % (sk, mk, pk) + sc = mobj.group('SC') + return cls._URL_TEMPLATE % sc def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - params = compat_parse_qs(mobj.group('query')) - - if 'SK' not in params or 'MK' not in params or 'PK' not in params: - raise ExtractorError('Invalid URL', expected=True) - - video_id = '-'.join([params['SK'][0], params['MK'][0], params['PK'][0]]) - - webpage = self._download_webpage(url, video_id) - - jwplayer_data_str = self._search_regex( - r'jwplayer\("[^"]+"\)\.setup\((.+?)\);', webpage, 'JWPlayer data') - js_vars = { - 'w': 1024, - 'h': 768, - 'modeVar': 'html5', - } - for name, val in js_vars.items(): - js_val = '%d' % val if isinstance(val, int) else '"%s"' % val - jwplayer_data_str = jwplayer_data_str.replace(':%s,' % name, ':%s,' % js_val) - - info_dict = self._parse_jwplayer_data( - self._parse_json(jwplayer_data_str, video_id), - video_id, require_title=False, rtmp_params={'no_resume': True}) - - title = self._html_search_regex( - r'<div[^>]+class="embedTitle">([^<]+)</div>', webpage, 'title') - description = self._html_search_regex( - r'<div[^>]+class="embedSubTitle">([^<]+)</div>', webpage, - 'description', fatal=False) - duration = parse_duration(self._html_search_regex( - 
r'<div[^>]+class="embedDetails">([0-9:]+)', webpage, - 'duration', fatal=False)) - - info_dict.update({ - 'title': title, - 'description': description, - 'duration': duration, - }) - - return info_dict + playlist_id = self._match_id(url) + + data_url = update_url_query( + url.replace('embedplayer.php', 'data_read.php'), + {'cmd': 'loadInitial'}) + playlist_data = self._download_json(data_url, playlist_id) + + entries = [] + for video in playlist_data['playlistData'][0]: + info_dict = self._parse_jwplayer_data( + video['jwconfiguration'], + require_title=False, rtmp_params={'no_resume': True}) + + thumbnails = [] + if video.get('thumbnailUrl'): + thumbnails.append({ + 'id': 'normal', + 'url': video['thumbnailUrl'], + }) + if video.get('smThumbnailUrl'): + thumbnails.append({ + 'id': 'small', + 'url': video['smThumbnailUrl'], + }) + info_dict.update({ + 'title': video['S_headLine'], + 'description': video.get('S_fullStory'), + 'thumbnails': thumbnails, + 'duration': float_or_none(video.get('SM_length')), + 'timestamp': parse_iso8601(video.get('S_sysDate'), delimiter=' '), + }) + entries.append(info_dict) + + return self.playlist_result(entries, playlist_id) diff --git a/youtube_dl/extractor/shahid.py b/youtube_dl/extractor/shahid.py index d95ea06..ca286ab 100644 --- a/youtube_dl/extractor/shahid.py +++ b/youtube_dl/extractor/shahid.py @@ -2,11 +2,11 @@ from __future__ import unicode_literals from .common import InfoExtractor -from ..compat import compat_urllib_parse_urlencode from ..utils import ( ExtractorError, int_or_none, parse_iso8601, + str_or_none, ) @@ -33,45 +33,27 @@ class ShahidIE(InfoExtractor): 'only_matching': True }] - def _handle_error(self, response): - if not isinstance(response, dict): - return - error = response.get('error') + def _call_api(self, path, video_id, note): + data = self._download_json( + 'http://api.shahid.net/api/v1_1/' + path, video_id, note, query={ + 'apiKey': 'sh@hid0nlin3', + 'hash': 
'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', + }).get('data', {}) + + error = data.get('error') if error: raise ExtractorError( '%s returned error: %s' % (self.IE_NAME, '\n'.join(error.values())), expected=True) - def _download_json(self, url, video_id, note='Downloading JSON metadata'): - response = super(ShahidIE, self)._download_json(url, video_id, note)['data'] - self._handle_error(response) - return response + return data def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - api_vars = { - 'id': video_id, - 'type': 'player', - 'url': 'http://api.shahid.net/api/v1_1', - 'playerType': 'episode', - } - - flashvars = self._search_regex( - r'var\s+flashvars\s*=\s*({[^}]+})', webpage, 'flashvars', default=None) - if flashvars: - for key in api_vars.keys(): - value = self._search_regex( - r'\b%s\s*:\s*(?P<q>["\'])(?P<value>.+?)(?P=q)' % key, - flashvars, 'type', default=None, group='value') - if value: - api_vars[key] = value - - player = self._download_json( - 'https://shahid.mbc.net/arContent/getPlayerContent-param-.id-%s.type-%s.html' - % (video_id, api_vars['type']), video_id, 'Downloading player JSON') + player = self._call_api( + 'Content/Episode/%s' % video_id, + video_id, 'Downloading player JSON') if player.get('drm'): raise ExtractorError('This video is DRM protected.', expected=True) @@ -79,22 +61,11 @@ class ShahidIE(InfoExtractor): formats = self._extract_m3u8_formats(player['url'], video_id, 'mp4') self._sort_formats(formats) - video = self._download_json( - '%s/%s/%s?%s' % ( - api_vars['url'], api_vars['playerType'], api_vars['id'], - compat_urllib_parse_urlencode({ - 'apiKey': 'sh@hid0nlin3', - 'hash': 'b2wMCTHpSmyxGqQjJFOycRmLSex+BpTK/ooxy6vHaqs=', - })), - video_id, 'Downloading video JSON') - - video = video[api_vars['playerType']] + video = self._call_api( + 'episode/%s' % video_id, video_id, + 'Downloading video JSON')['episode'] title = video['title'] - description = 
video.get('description') - thumbnail = video.get('thumbnailUrl') - duration = int_or_none(video.get('duration')) - timestamp = parse_iso8601(video.get('referenceDate')) categories = [ category['name'] for category in video.get('genres', []) if 'name' in category] @@ -102,10 +73,16 @@ class ShahidIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'duration': duration, - 'timestamp': timestamp, + 'description': video.get('description'), + 'thumbnail': video.get('thumbnailUrl'), + 'duration': int_or_none(video.get('duration')), + 'timestamp': parse_iso8601(video.get('referenceDate')), 'categories': categories, + 'series': video.get('showTitle') or video.get('showName'), + 'season': video.get('seasonTitle'), + 'season_number': int_or_none(video.get('seasonNumber')), + 'season_id': str_or_none(video.get('seasonId')), + 'episode_number': int_or_none(video.get('number')), + 'episode_id': video_id, 'formats': formats, } diff --git a/youtube_dl/extractor/shared.py b/youtube_dl/extractor/shared.py index e7e5f65..d592dfe 100644 --- a/youtube_dl/extractor/shared.py +++ b/youtube_dl/extractor/shared.py @@ -6,7 +6,6 @@ from .common import InfoExtractor from ..utils import ( ExtractorError, int_or_none, - sanitized_Request, urlencode_postdata, ) @@ -37,28 +36,33 @@ class SharedIE(InfoExtractor): def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + + webpage, urlh = self._download_webpage_handle(url, video_id) if '>File does not exist<' in webpage: raise ExtractorError( 'Video %s does not exist' % video_id, expected=True) download_form = self._hidden_inputs(webpage) - request = sanitized_Request( - url, urlencode_postdata(download_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') video_page = self._download_webpage( - request, video_id, 'Downloading video page') + urlh.geturl(), video_id, 'Downloading video page', + 
data=urlencode_postdata(download_form), + headers={ + 'Content-Type': 'application/x-www-form-urlencoded', + 'Referer': urlh.geturl(), + }) video_url = self._html_search_regex( - r'data-url="([^"]+)"', video_page, 'video URL') + r'data-url=(["\'])(?P<url>(?:(?!\1).)+)\1', + video_page, 'video URL', group='url') title = base64.b64decode(self._html_search_meta( 'full:title', webpage, 'title').encode('utf-8')).decode('utf-8') filesize = int_or_none(self._html_search_meta( 'full:size', webpage, 'file size', fatal=False)) thumbnail = self._html_search_regex( - r'data-poster="([^"]+)"', video_page, 'thumbnail', default=None) + r'data-poster=(["\'])(?P<url>(?:(?!\1).)+)\1', + video_page, 'thumbnail', default=None, group='url') return { 'id': video_id, diff --git a/youtube_dl/extractor/sixplay.py b/youtube_dl/extractor/sixplay.py new file mode 100644 index 0000000..d3aba58 --- /dev/null +++ b/youtube_dl/extractor/sixplay.py @@ -0,0 +1,64 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + qualities, + int_or_none, + mimetype2ext, + determine_ext, +) + + +class SixPlayIE(InfoExtractor): + _VALID_URL = r'(?:6play:|https?://(?:www\.)?6play\.fr/.+?-c_)(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.6play.fr/jamel-et-ses-amis-au-marrakech-du-rire-p_1316/jamel-et-ses-amis-au-marrakech-du-rire-2015-c_11495320', + 'md5': '42310bffe4ba3982db112b9cd3467328', + 'info_dict': { + 'id': '11495320', + 'ext': 'mp4', + 'title': 'Jamel et ses amis au Marrakech du rire 2015', + 'description': 'md5:ba2149d5c321d5201b78070ee839d872', + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + clip_data = self._download_json( + 'https://player.m6web.fr/v2/video/config/6play-auth/FR/%s.json' % video_id, + video_id) + video_data = clip_data['videoInfo'] + + quality_key = qualities(['lq', 'sd', 'hq', 'hd']) + formats = [] + for source in clip_data['sources']: + source_type, source_url = source.get('type'), 
source.get('src') + if not source_url or source_type == 'hls/primetime': + continue + ext = mimetype2ext(source_type) or determine_ext(source_url) + if ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + source_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + source_url.replace('.m3u8', '.f4m'), + video_id, f4m_id='hds', fatal=False)) + elif ext == 'mp4': + quality = source.get('quality') + formats.append({ + 'url': source_url, + 'format_id': quality, + 'quality': quality_key(quality), + 'ext': ext, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': video_data['title'].strip(), + 'description': video_data.get('description'), + 'duration': int_or_none(video_data.get('duration')), + 'series': video_data.get('titlePgm'), + 'formats': formats, + } diff --git a/youtube_dl/extractor/skynewsarabia.py b/youtube_dl/extractor/skynewsarabia.py index 05e1b02..fffc9aa 100644 --- a/youtube_dl/extractor/skynewsarabia.py +++ b/youtube_dl/extractor/skynewsarabia.py @@ -67,7 +67,7 @@ class SkyNewsArabiaIE(SkyNewsArabiaBaseIE): class SkyNewsArabiaArticleIE(SkyNewsArabiaBaseIE): - IE_NAME = 'skynewsarabia:video' + IE_NAME = 'skynewsarabia:article' _VALID_URL = r'https?://(?:www\.)?skynewsarabia\.com/web/article/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://www.skynewsarabia.com/web/article/794549/%D8%A7%D9%94%D8%AD%D8%AF%D8%A7%D8%AB-%D8%A7%D9%84%D8%B4%D8%B1%D9%82-%D8%A7%D9%84%D8%A7%D9%94%D9%88%D8%B3%D8%B7-%D8%AE%D8%B1%D9%8A%D8%B7%D8%A9-%D8%A7%D9%84%D8%A7%D9%94%D9%84%D8%B9%D8%A7%D8%A8-%D8%A7%D9%84%D8%B0%D9%83%D9%8A%D8%A9', diff --git a/youtube_dl/extractor/skysports.py b/youtube_dl/extractor/skysports.py new file mode 100644 index 0000000..9dc78c7 --- /dev/null +++ b/youtube_dl/extractor/skysports.py @@ -0,0 +1,33 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class SkySportsIE(InfoExtractor): + _VALID_URL = 
r'https?://(?:www\.)?skysports\.com/watch/video/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://www.skysports.com/watch/video/10328419/bale-its-our-time-to-shine', + 'md5': 'c44a1db29f27daf9a0003e010af82100', + 'info_dict': { + 'id': '10328419', + 'ext': 'flv', + 'title': 'Bale: Its our time to shine', + 'description': 'md5:9fd1de3614d525f5addda32ac3c482c9', + }, + 'add_ie': ['Ooyala'], + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + return { + '_type': 'url_transparent', + 'id': video_id, + 'url': 'ooyala:%s' % self._search_regex( + r'data-video-id="([^"]+)"', webpage, 'ooyala id'), + 'title': self._og_search_title(webpage), + 'description': self._og_search_description(webpage), + 'ie_key': 'Ooyala', + } diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py index 0b717a1..4967c1b 100644 --- a/youtube_dl/extractor/slideshare.py +++ b/youtube_dl/extractor/slideshare.py @@ -9,6 +9,7 @@ from ..compat import ( ) from ..utils import ( ExtractorError, + get_element_by_id, ) @@ -40,7 +41,7 @@ class SlideshareIE(InfoExtractor): bucket = info['jsplayer']['video_bucket'] ext = info['jsplayer']['video_extension'] video_url = compat_urlparse.urljoin(bucket, doc + '-SD.' 
+ ext) - description = self._html_search_regex( + description = get_element_by_id('slideshow-description-paragraph', webpage) or self._html_search_regex( r'(?s)<p[^>]+itemprop="description"[^>]*>(.+?)</p>', webpage, 'description', fatal=False) @@ -51,5 +52,5 @@ class SlideshareIE(InfoExtractor): 'ext': ext, 'url': video_url, 'thumbnail': info['slideshow']['pin_image_url'], - 'description': description, + 'description': description.strip() if description else None, } diff --git a/youtube_dl/extractor/smotri.py b/youtube_dl/extractor/smotri.py index 5c3fd0f..1143587 100644 --- a/youtube_dl/extractor/smotri.py +++ b/youtube_dl/extractor/smotri.py @@ -13,20 +13,21 @@ from ..utils import ( sanitized_Request, unified_strdate, urlencode_postdata, + xpath_text, ) class SmotriIE(InfoExtractor): IE_DESC = 'Smotri.com' IE_NAME = 'smotri' - _VALID_URL = r'^https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<id>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})' + _VALID_URL = r'https?://(?:www\.)?(?:smotri\.com/video/view/\?id=|pics\.smotri\.com/(?:player|scrubber_custom8)\.swf\?file=)(?P<id>v(?P<realvideoid>[0-9]+)[a-z0-9]{4})' _NETRC_MACHINE = 'smotri' _TESTS = [ # real video id 2610366 { 'url': 'http://smotri.com/video/view/?id=v261036632ab', - 'md5': '2a7b08249e6f5636557579c368040eb9', + 'md5': '02c0dfab2102984e9c5bb585cc7cc321', 'info_dict': { 'id': 'v261036632ab', 'ext': 'mp4', @@ -174,11 +175,11 @@ class SmotriIE(InfoExtractor): if video_password: video_form['pass'] = hashlib.md5(video_password.encode('utf-8')).hexdigest() - request = sanitized_Request( - 'http://smotri.com/video/view/url/bot/', urlencode_postdata(video_form)) - request.add_header('Content-Type', 'application/x-www-form-urlencoded') - - video = self._download_json(request, video_id, 'Downloading video JSON') + video = self._download_json( + 'http://smotri.com/video/view/url/bot/', + video_id, 'Downloading video JSON', + data=urlencode_postdata(video_form), 
+ headers={'Content-Type': 'application/x-www-form-urlencoded'}) video_url = video.get('_vidURL') or video.get('_vidURL_mp4') @@ -196,11 +197,11 @@ class SmotriIE(InfoExtractor): raise ExtractorError(msg, expected=True) title = video['title'] - thumbnail = video['_imgURL'] - upload_date = unified_strdate(video['added']) - uploader = video['userNick'] - uploader_id = video['userLogin'] - duration = int_or_none(video['duration']) + thumbnail = video.get('_imgURL') + upload_date = unified_strdate(video.get('added')) + uploader = video.get('userNick') + uploader_id = video.get('userLogin') + duration = int_or_none(video.get('duration')) # Video JSON does not provide enough meta data # We will extract some from the video web page instead @@ -209,7 +210,7 @@ class SmotriIE(InfoExtractor): # Warning if video is unavailable warning = self._html_search_regex( - r'<div class="videoUnModer">(.*?)</div>', webpage, + r'<div[^>]+class="videoUnModer"[^>]*>(.+?)</div>', webpage, 'warning message', default=None) if warning is not None: self._downloader.report_warning( @@ -217,20 +218,22 @@ class SmotriIE(InfoExtractor): (video_id, warning)) # Adult content - if re.search('EroConfirmText">', webpage) is not None: + if 'EroConfirmText">' in webpage: self.report_age_confirmation() confirm_string = self._html_search_regex( - r'<a href="/video/view/\?id=%s&confirm=([^"]+)" title="[^"]+">' % video_id, + r'<a[^>]+href="/video/view/\?id=%s&confirm=([^"]+)"' % video_id, webpage, 'confirm string') confirm_url = webpage_url + '&confirm=%s' % confirm_string - webpage = self._download_webpage(confirm_url, video_id, 'Downloading video page (age confirmed)') + webpage = self._download_webpage( + confirm_url, video_id, + 'Downloading video page (age confirmed)') adult_content = True else: adult_content = False view_count = self._html_search_regex( - 'Общее количество просмотров.*?<span class="Number">(\\d+)</span>', - webpage, 'view count', fatal=False, flags=re.MULTILINE | re.DOTALL) + 
r'(?s)Общее количество просмотров.*?<span class="Number">(\d+)</span>', + webpage, 'view count', fatal=False) return { 'id': video_id, @@ -249,37 +252,33 @@ class SmotriIE(InfoExtractor): class SmotriCommunityIE(InfoExtractor): IE_DESC = 'Smotri.com community videos' IE_NAME = 'smotri:community' - _VALID_URL = r'^https?://(?:www\.)?smotri\.com/community/video/(?P<communityid>[0-9A-Za-z_\'-]+)' + _VALID_URL = r'https?://(?:www\.)?smotri\.com/community/video/(?P<id>[0-9A-Za-z_\'-]+)' _TEST = { 'url': 'http://smotri.com/community/video/kommuna', 'info_dict': { 'id': 'kommuna', - 'title': 'КПРФ', }, 'playlist_mincount': 4, } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - community_id = mobj.group('communityid') + community_id = self._match_id(url) - url = 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id - rss = self._download_xml(url, community_id, 'Downloading community RSS') + rss = self._download_xml( + 'http://smotri.com/export/rss/video/by/community/-/%s/video.xml' % community_id, + community_id, 'Downloading community RSS') - entries = [self.url_result(video_url.text, 'Smotri') - for video_url in rss.findall('./channel/item/link')] + entries = [ + self.url_result(video_url.text, SmotriIE.ie_key()) + for video_url in rss.findall('./channel/item/link')] - description_text = rss.find('./channel/description').text - community_title = self._html_search_regex( - '^Видео сообщества "([^"]+)"$', description_text, 'community title') - - return self.playlist_result(entries, community_id, community_title) + return self.playlist_result(entries, community_id) class SmotriUserIE(InfoExtractor): IE_DESC = 'Smotri.com user videos' IE_NAME = 'smotri:user' - _VALID_URL = r'^https?://(?:www\.)?smotri\.com/user/(?P<userid>[0-9A-Za-z_\'-]+)' + _VALID_URL = r'https?://(?:www\.)?smotri\.com/user/(?P<id>[0-9A-Za-z_\'-]+)' _TESTS = [{ 'url': 'http://smotri.com/user/inspector', 'info_dict': { @@ -290,19 +289,19 @@ class 
SmotriUserIE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - user_id = mobj.group('userid') + user_id = self._match_id(url) - url = 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id - rss = self._download_xml(url, user_id, 'Downloading user RSS') + rss = self._download_xml( + 'http://smotri.com/export/rss/user/video/-/%s/video.xml' % user_id, + user_id, 'Downloading user RSS') entries = [self.url_result(video_url.text, 'Smotri') for video_url in rss.findall('./channel/item/link')] - description_text = rss.find('./channel/description').text - user_nickname = self._html_search_regex( - '^Видео режиссера (.*)$', description_text, - 'user nickname') + description_text = xpath_text(rss, './channel/description') or '' + user_nickname = self._search_regex( + '^Видео режиссера (.+)$', description_text, + 'user nickname', fatal=False) return self.playlist_result(entries, user_id, user_nickname) @@ -310,11 +309,11 @@ class SmotriUserIE(InfoExtractor): class SmotriBroadcastIE(InfoExtractor): IE_DESC = 'Smotri.com broadcasts' IE_NAME = 'smotri:broadcast' - _VALID_URL = r'^https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<broadcastid>[^/]+))/?.*' + _VALID_URL = r'https?://(?:www\.)?(?P<url>smotri\.com/live/(?P<id>[^/]+))/?.*' def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) - broadcast_id = mobj.group('broadcastid') + broadcast_id = mobj.group('id') broadcast_url = 'http://' + mobj.group('url') broadcast_page = self._download_webpage(broadcast_url, broadcast_id, 'Downloading broadcast page') @@ -328,7 +327,8 @@ class SmotriBroadcastIE(InfoExtractor): (username, password) = self._get_login_info() if username is None: - self.raise_login_required('Erotic broadcasts allowed only for registered users') + self.raise_login_required( + 'Erotic broadcasts allowed only for registered users') login_form = { 'login-hint53': '1', @@ -343,8 +343,9 @@ class SmotriBroadcastIE(InfoExtractor): broadcast_page = 
self._download_webpage( request, broadcast_id, 'Logging in and confirming age') - if re.search('>Неверный логин или пароль<', broadcast_page) is not None: - raise ExtractorError('Unable to log in: bad username or password', expected=True) + if '>Неверный логин или пароль<' in broadcast_page: + raise ExtractorError( + 'Unable to log in: bad username or password', expected=True) adult_content = True else: @@ -383,11 +384,11 @@ class SmotriBroadcastIE(InfoExtractor): broadcast_playpath = broadcast_json['_streamName'] broadcast_app = '%s/%s' % (mobj.group('app'), broadcast_json['_vidURL']) - broadcast_thumbnail = broadcast_json['_imgURL'] + broadcast_thumbnail = broadcast_json.get('_imgURL') broadcast_title = self._live_title(broadcast_json['title']) - broadcast_description = broadcast_json['description'] - broadcaster_nick = broadcast_json['nick'] - broadcaster_login = broadcast_json['login'] + broadcast_description = broadcast_json.get('description') + broadcaster_nick = broadcast_json.get('nick') + broadcaster_login = broadcast_json.get('login') rtmp_conn = 'S:%s' % uuid.uuid4().hex except KeyError: if protected_broadcast: diff --git a/youtube_dl/extractor/sohu.py b/youtube_dl/extractor/sohu.py index 49e5d09..48e2ba2 100644 --- a/youtube_dl/extractor/sohu.py +++ b/youtube_dl/extractor/sohu.py @@ -8,19 +8,16 @@ from ..compat import ( compat_str, compat_urllib_parse_urlencode, ) -from ..utils import ( - ExtractorError, - sanitized_Request, -) +from ..utils import ExtractorError class SohuIE(InfoExtractor): _VALID_URL = r'https?://(?P<mytv>my\.)?tv\.sohu\.com/.+?/(?(mytv)|n)(?P<id>\d+)\.shtml.*?' 
+ # Sohu videos give different MD5 sums on Travis CI and my machine _TESTS = [{ 'note': 'This video is available only in Mainland China', 'url': 'http://tv.sohu.com/20130724/n382479172.shtml#super', - 'md5': '29175c8cadd8b5cc4055001e85d6b372', 'info_dict': { 'id': '382479172', 'ext': 'mp4', @@ -29,7 +26,6 @@ class SohuIE(InfoExtractor): 'skip': 'On available in China', }, { 'url': 'http://tv.sohu.com/20150305/n409385080.shtml', - 'md5': '699060e75cf58858dd47fb9c03c42cfb', 'info_dict': { 'id': '409385080', 'ext': 'mp4', @@ -37,7 +33,6 @@ class SohuIE(InfoExtractor): } }, { 'url': 'http://my.tv.sohu.com/us/232799889/78693464.shtml', - 'md5': '9bf34be48f2f4dadcb226c74127e203c', 'info_dict': { 'id': '78693464', 'ext': 'mp4', @@ -51,7 +46,6 @@ class SohuIE(InfoExtractor): 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', }, 'playlist': [{ - 'md5': 'bdbfb8f39924725e6589c146bc1883ad', 'info_dict': { 'id': '78910339_part1', 'ext': 'mp4', @@ -59,7 +53,6 @@ class SohuIE(InfoExtractor): 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', } }, { - 'md5': '3e1f46aaeb95354fd10e7fca9fc1804e', 'info_dict': { 'id': '78910339_part2', 'ext': 'mp4', @@ -67,7 +60,6 @@ class SohuIE(InfoExtractor): 'title': '【神探苍实战秘籍】第13期 战争之影 赫卡里姆', } }, { - 'md5': '8407e634175fdac706766481b9443450', 'info_dict': { 'id': '78910339_part3', 'ext': 'mp4', @@ -96,15 +88,10 @@ class SohuIE(InfoExtractor): else: base_data_url = 'http://hot.vrs.sohu.com/vrs_flash.action?vid=' - req = sanitized_Request(base_data_url + vid_id) - - cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') - if cn_verification_proxy: - req.add_header('Ytdl-request-proxy', cn_verification_proxy) - return self._download_json( - req, video_id, - 'Downloading JSON data for %s' % vid_id) + base_data_url + vid_id, video_id, + 'Downloading JSON data for %s' % vid_id, + headers=self.geo_verification_headers()) mobj = re.match(self._VALID_URL, url) video_id = mobj.group('id') diff --git a/youtube_dl/extractor/sonyliv.py 
b/youtube_dl/extractor/sonyliv.py new file mode 100644 index 0000000..accd112 --- /dev/null +++ b/youtube_dl/extractor/sonyliv.py @@ -0,0 +1,34 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class SonyLIVIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?sonyliv\.com/details/[^/]+/(?P<id>\d+)' + _TESTS = [{ + 'url': "http://www.sonyliv.com/details/episodes/5024612095001/Ep.-1---Achaari-Cheese-Toast---Bachelor's-Delight", + 'info_dict': { + 'title': "Ep. 1 - Achaari Cheese Toast - Bachelor's Delight", + 'id': '5024612095001', + 'ext': 'mp4', + 'upload_date': '20160707', + 'description': 'md5:7f28509a148d5be9d0782b4d5106410d', + 'uploader_id': '4338955589001', + 'timestamp': 1467870968, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': ['BrightcoveNew'], + }, { + 'url': 'http://www.sonyliv.com/details/full%20movie/4951168986001/Sei-Raat-(Bangla)', + 'only_matching': True, + }] + + BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/4338955589001/default_default/index.html?videoId=%s' + + def _real_extract(self, url): + brightcove_id = self._match_id(url) + return self.url_result( + self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id) diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py index 194dabc..aeae931 100644 --- a/youtube_dl/extractor/soundcloud.py +++ b/youtube_dl/extractor/soundcloud.py @@ -119,6 +119,12 @@ class SoundcloudIE(InfoExtractor): _CLIENT_ID = '02gUJC0hH2ct1EGOcYXQIzRFU91c72Ea' _IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf' + @staticmethod + def _extract_urls(webpage): + return [m.group('url') for m in re.finditer( + r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:w\.)?soundcloud\.com/player.+?)\1', + webpage)] + def report_resolve(self, video_id): """Report information extraction.""" self.to_screen('%s: Resolving id' % video_id) diff --git a/youtube_dl/extractor/southpark.py 
b/youtube_dl/extractor/southpark.py index 87b6504..a147f7d 100644 --- a/youtube_dl/extractor/southpark.py +++ b/youtube_dl/extractor/southpark.py @@ -17,6 +17,8 @@ class SouthParkIE(MTVServicesInfoExtractor): 'ext': 'mp4', 'title': 'South Park|Bat Daded', 'description': 'Randy disqualifies South Park by getting into a fight with Bat Dad.', + 'timestamp': 1112760000, + 'upload_date': '20050406', }, }] @@ -28,6 +30,10 @@ class SouthParkEsIE(SouthParkIE): _TESTS = [{ 'url': 'http://southpark.cc.com/episodios-en-espanol/s01e01-cartman-consigue-una-sonda-anal#source=351c1323-0b96-402d-a8b9-40d01b2e9bde&position=1&sort=!airdate', + 'info_dict': { + 'title': 'Cartman Consigue Una Sonda Anal', + 'description': 'Cartman Consigue Una Sonda Anal', + }, 'playlist_count': 4, }] @@ -42,17 +48,27 @@ class SouthParkDeIE(SouthParkIE): 'info_dict': { 'id': '85487c96-b3b9-4e39-9127-ad88583d9bf2', 'ext': 'mp4', - 'title': 'The Government Won\'t Respect My Privacy', + 'title': 'South Park|The Government Won\'t Respect My Privacy', 'description': 'Cartman explains the benefits of "Shitter" to Stan, Kyle and Craig.', + 'timestamp': 1380160800, + 'upload_date': '20130926', }, }, { # non-ASCII characters in initial URL 'url': 'http://www.southpark.de/alle-episoden/s18e09-hashtag-aufwärmen', - 'playlist_count': 4, + 'info_dict': { + 'title': 'Hashtag „Aufwärmen“', + 'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.', + }, + 'playlist_count': 3, }, { # non-ASCII characters in redirect URL 'url': 'http://www.southpark.de/alle-episoden/s18e09', - 'playlist_count': 4, + 'info_dict': { + 'title': 'Hashtag „Aufwärmen“', + 'description': 'Kyle will mit seinem kleinen Bruder Ike Videospiele spielen. 
Als der nicht mehr mit ihm spielen will, hat Kyle Angst, dass er die Kids von heute nicht mehr versteht.', + }, + 'playlist_count': 3, }] @@ -63,7 +79,11 @@ class SouthParkNlIE(SouthParkIE): _TESTS = [{ 'url': 'http://www.southpark.nl/full-episodes/s18e06-freemium-isnt-free', - 'playlist_count': 4, + 'info_dict': { + 'title': 'Freemium Isn\'t Free', + 'description': 'Stan is addicted to the new Terrance and Phillip mobile game.', + }, + 'playlist_mincount': 3, }] @@ -74,5 +94,9 @@ class SouthParkDkIE(SouthParkIE): _TESTS = [{ 'url': 'http://www.southparkstudios.dk/full-episodes/s18e07-grounded-vindaloop', - 'playlist_count': 4, + 'info_dict': { + 'title': 'Grounded Vindaloop', + 'description': 'Butters is convinced he\'s living in a virtual reality.', + }, + 'playlist_mincount': 3, }] diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py index 39a7aaf..3c55280 100644 --- a/youtube_dl/extractor/spiegel.py +++ b/youtube_dl/extractor/spiegel.py @@ -4,8 +4,13 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_urlparse from .spiegeltv import SpiegeltvIE +from ..compat import compat_urlparse +from ..utils import ( + extract_attributes, + unified_strdate, + get_element_by_attribute, +) class SpiegelIE(InfoExtractor): @@ -19,6 +24,7 @@ class SpiegelIE(InfoExtractor): 'title': 'Vulkanausbruch in Ecuador: Der "Feuerschlund" ist wieder aktiv', 'description': 'md5:8029d8310232196eb235d27575a8b9f4', 'duration': 49, + 'upload_date': '20130311', }, }, { 'url': 'http://www.spiegel.de/video/schach-wm-videoanalyse-des-fuenften-spiels-video-1309159.html', @@ -29,6 +35,7 @@ class SpiegelIE(InfoExtractor): 'title': 'Schach-WM in der Videoanalyse: Carlsen nutzt die Fehlgriffe des Titelverteidigers', 'description': 'md5:c2322b65e58f385a820c10fa03b2d088', 'duration': 983, + 'upload_date': '20131115', }, }, { 'url': 
'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-embed.html', @@ -38,6 +45,7 @@ class SpiegelIE(InfoExtractor): 'ext': 'mp4', 'description': 'SPIEGEL ONLINE-Nutzer durften den deutschen Astronauten Alexander Gerst über sein Leben auf der ISS-Station befragen. Hier kommen seine Antworten auf die besten sechs Fragen.', 'title': 'Fragen an Astronaut Alexander Gerst: "Bekommen Sie die Tageszeiten mit?"', + 'upload_date': '20140904', } }, { 'url': 'http://www.spiegel.de/video/astronaut-alexander-gerst-von-der-iss-station-beantwortet-fragen-video-1519126-iframe.html', @@ -52,10 +60,10 @@ class SpiegelIE(InfoExtractor): if SpiegeltvIE.suitable(handle.geturl()): return self.url_result(handle.geturl(), 'Spiegeltv') - title = re.sub(r'\s+', ' ', self._html_search_regex( - r'(?s)<(?:h1|div) class="module-title"[^>]*>(.*?)</(?:h1|div)>', - webpage, 'title')) - description = self._html_search_meta('description', webpage, 'description') + video_data = extract_attributes(self._search_regex(r'(<div[^>]+id="spVideoElements"[^>]+>)', webpage, 'video element', default='')) + + title = video_data.get('data-video-title') or get_element_by_attribute('class', 'module-title', webpage) + description = video_data.get('data-video-teaser') or self._html_search_meta('description', webpage, 'description') base_url = self._search_regex( [r'server\s*:\s*(["\'])(?P<url>.+?)\1', r'var\s+server\s*=\s*"(?P<url>[^"]+)\"'], @@ -87,8 +95,9 @@ class SpiegelIE(InfoExtractor): return { 'id': video_id, 'title': title, - 'description': description, + 'description': description.strip() if description else None, 'duration': duration, + 'upload_date': unified_strdate(video_data.get('data-video-date')), 'formats': formats, } @@ -104,6 +113,7 @@ class SpiegelArticleIE(InfoExtractor): 'ext': 'mp4', 'title': 'Faszination Badminton: Nennt es bloß nicht Federball', 'description': 're:^Patrick Kämnitz gehört.{100,}', + 'upload_date': '20140825', }, }, { 'url': 
'http://www.spiegel.de/wissenschaft/weltall/astronaut-alexander-gerst-antwortet-spiegel-online-lesern-a-989876.html', diff --git a/youtube_dl/extractor/spike.py b/youtube_dl/extractor/spike.py index 182f286..218785e 100644 --- a/youtube_dl/extractor/spike.py +++ b/youtube_dl/extractor/spike.py @@ -4,26 +4,31 @@ from .mtv import MTVServicesInfoExtractor class SpikeIE(MTVServicesInfoExtractor): - _VALID_URL = r'''(?x)https?:// - (?:www\.spike\.com/(?:video-(?:clips|playlists)|(?:full-)?episodes)/.+| - m\.spike\.com/videos/video\.rbml\?id=(?P<id>[^&]+)) - ''' - _TEST = { + _VALID_URL = r'https?://(?:[^/]+\.)?spike\.com/[^/]+/[\da-z]{6}(?:[/?#&]|$)' + _TESTS = [{ 'url': 'http://www.spike.com/video-clips/lhtu8m/auction-hunters-can-allen-ride-a-hundred-year-old-motorcycle', 'md5': '1a9265f32b0c375793d6c4ce45255256', 'info_dict': { 'id': 'b9c8221a-4e50-479a-b86d-3333323e38ba', 'ext': 'mp4', - 'title': 'Auction Hunters|Can Allen Ride A Hundred Year-Old Motorcycle?', + 'title': 'Auction Hunters|December 27, 2013|4|414|Can Allen Ride A Hundred Year-Old Motorcycle?', 'description': 'md5:fbed7e82ed5fad493615b3094a9499cb', + 'timestamp': 1388120400, + 'upload_date': '20131227', }, - } + }, { + 'url': 'http://www.spike.com/video-clips/lhtu8m/', + 'only_matching': True, + }, { + 'url': 'http://www.spike.com/video-clips/lhtu8m', + 'only_matching': True, + }, { + 'url': 'http://bellator.spike.com/fight/atwr7k/bellator-158-michael-page-vs-evangelista-cyborg', + 'only_matching': True, + }, { + 'url': 'http://bellator.spike.com/video-clips/bw6k7n/bellator-158-foundations-michael-venom-page', + 'only_matching': True, + }] _FEED_URL = 'http://www.spike.com/feeds/mrss/' _MOBILE_TEMPLATE = 'http://m.spike.com/videos/video.rbml?id=%s' - - def _real_extract(self, url): - mobile_id = self._match_id(url) - if mobile_id: - url = 'http://www.spike.com/video-clips/%s' % mobile_id - return super(SpikeIE, self)._real_extract(url) diff --git a/youtube_dl/extractor/srmediathek.py 
b/youtube_dl/extractor/srmediathek.py index 74d0118..409d503 100644 --- a/youtube_dl/extractor/srmediathek.py +++ b/youtube_dl/extractor/srmediathek.py @@ -9,8 +9,9 @@ from ..utils import ( class SRMediathekIE(ARDMediathekIE): + IE_NAME = 'sr:mediathek' IE_DESC = 'Saarländischer Rundfunk' - _VALID_URL = r'https?://sr-mediathek\.sr-online\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' + _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://sr-mediathek.sr-online.de/index.php?seite=7&id=28455', @@ -34,7 +35,9 @@ class SRMediathekIE(ARDMediathekIE): # m3u8 download 'skip_download': True, }, - 'expected_warnings': ['Unable to download f4m manifest'] + }, { + 'url': 'http://sr-mediathek.de/index.php?seite=7&id=7480', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/stitcher.py b/youtube_dl/extractor/stitcher.py index d5c852f..0f8782d 100644 --- a/youtube_dl/extractor/stitcher.py +++ b/youtube_dl/extractor/stitcher.py @@ -56,7 +56,7 @@ class StitcherIE(InfoExtractor): episode = self._parse_json( js_to_json(self._search_regex( - r'(?s)var\s+stitcher\s*=\s*({.+?});\n', webpage, 'episode config')), + r'(?s)var\s+stitcher(?:Config)?\s*=\s*({.+?});\n', webpage, 'episode config')), display_id)['config']['episode'] title = unescapeHTML(episode['title']) diff --git a/youtube_dl/extractor/streamable.py b/youtube_dl/extractor/streamable.py new file mode 100644 index 0000000..1c61437 --- /dev/null +++ b/youtube_dl/extractor/streamable.py @@ -0,0 +1,98 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + ExtractorError, + float_or_none, + int_or_none, +) + + +class StreamableIE(InfoExtractor): + _VALID_URL = r'https?://streamable\.com/(?:e/)?(?P<id>\w+)' + _TESTS = [ + { + 'url': 'https://streamable.com/dnd1', + 'md5': '3e3bc5ca088b48c2d436529b64397fef', + 'info_dict': { + 'id': 'dnd1', + 'ext': 'mp4', + 'title': 
'Mikel Oiarzabal scores to make it 0-3 for La Real against Espanyol', + 'thumbnail': 're:https?://.*\.jpg$', + 'uploader': 'teabaker', + 'timestamp': 1454964157.35115, + 'upload_date': '20160208', + 'duration': 61.516, + 'view_count': int, + } + }, + # older video without bitrate, width/height, etc. info + { + 'url': 'https://streamable.com/moo', + 'md5': '2cf6923639b87fba3279ad0df3a64e73', + 'info_dict': { + 'id': 'moo', + 'ext': 'mp4', + 'title': '"Please don\'t eat me!"', + 'thumbnail': 're:https?://.*\.jpg$', + 'timestamp': 1426115495, + 'upload_date': '20150311', + 'duration': 12, + 'view_count': int, + } + }, + { + 'url': 'https://streamable.com/e/dnd1', + 'only_matching': True, + } + ] + + def _real_extract(self, url): + video_id = self._match_id(url) + + # Note: Using the ajax API, as the public Streamable API doesn't seem + # to return video info like the title properly sometimes, and doesn't + # include info like the video duration + video = self._download_json( + 'https://streamable.com/ajax/videos/%s' % video_id, video_id) + + # Format IDs: + # 0 The video is being uploaded + # 1 The video is being processed + # 2 The video has at least one file ready + # 3 The video is unavailable due to an error + status = video.get('status') + if status != 2: + raise ExtractorError( + 'This video is currently unavailable. 
It may still be uploading or processing.', + expected=True) + + title = video.get('reddit_title') or video['title'] + + formats = [] + for key, info in video['files'].items(): + if not info.get('url'): + continue + formats.append({ + 'format_id': key, + 'url': self._proto_relative_url(info['url']), + 'width': int_or_none(info.get('width')), + 'height': int_or_none(info.get('height')), + 'filesize': int_or_none(info.get('size')), + 'fps': int_or_none(info.get('framerate')), + 'vbr': float_or_none(info.get('bitrate'), 1000) + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video.get('description'), + 'thumbnail': self._proto_relative_url(video.get('thumbnail_url')), + 'uploader': video.get('owner', {}).get('user_name'), + 'timestamp': float_or_none(video.get('date_added')), + 'duration': float_or_none(video.get('duration')), + 'view_count': int_or_none(video.get('plays')), + 'formats': formats + } diff --git a/youtube_dl/extractor/sunporno.py b/youtube_dl/extractor/sunporno.py index e527aa9..ef9be79 100644 --- a/youtube_dl/extractor/sunporno.py +++ b/youtube_dl/extractor/sunporno.py @@ -12,25 +12,29 @@ from ..utils import ( class SunPornoIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?sunporno\.com/videos/(?P<id>\d+)' - _TEST = { + _VALID_URL = r'https?://(?:(?:www\.)?sunporno\.com/videos|embeds\.sunporno\.com/embed)/(?P<id>\d+)' + _TESTS = [{ 'url': 'http://www.sunporno.com/videos/807778/', - 'md5': '6457d3c165fd6de062b99ef6c2ff4c86', + 'md5': '507887e29033502f29dba69affeebfc9', 'info_dict': { 'id': '807778', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'md5:0a400058e8105d39e35c35e7c5184164', 'description': 'md5:a31241990e1bd3a64e72ae99afb325fb', 'thumbnail': 're:^https?://.*\.jpg$', 'duration': 302, 'age_limit': 18, } - } + }, { + 'url': 'http://embeds.sunporno.com/embed/807778', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, 
video_id) + webpage = self._download_webpage( + 'http://www.sunporno.com/videos/%s' % video_id, video_id) title = self._html_search_regex( r'<title>([^<]+)</title>', webpage, 'title') @@ -40,7 +44,8 @@ class SunPornoIE(InfoExtractor): r'poster="([^"]+)"', webpage, 'thumbnail', fatal=False) duration = parse_duration(self._search_regex( - r'itemprop="duration">\s*(\d+:\d+)\s*<', + (r'itemprop="duration"[^>]*>\s*(\d+:\d+)\s*<', + r'>Duration:\s*<span[^>]+>\s*(\d+:\d+)\s*<'), webpage, 'duration', fatal=False)) view_count = int_or_none(self._html_search_regex( @@ -48,7 +53,7 @@ class SunPornoIE(InfoExtractor): webpage, 'view count', fatal=False)) comment_count = int_or_none(self._html_search_regex( r'(\d+)</b> Comments?', - webpage, 'comment count', fatal=False)) + webpage, 'comment count', fatal=False, default=None)) formats = [] quality = qualities(['mp4', 'flv']) diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py index 67f56fa..1c04dfb 100644 --- a/youtube_dl/extractor/svt.py +++ b/youtube_dl/extractor/svt.py @@ -120,7 +120,7 @@ class SVTIE(SVTBaseIE): class SVTPlayIE(SVTBaseIE): IE_DESC = 'SVT Play and Öppet arkiv' - _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/video/(?P<id>[0-9]+)' + _VALID_URL = r'https?://(?:www\.)?(?:svtplay|oppetarkiv)\.se/(?:video|klipp)/(?P<id>[0-9]+)' _TESTS = [{ 'url': 'http://www.svtplay.se/video/5996901/flygplan-till-haile-selassie/flygplan-till-haile-selassie-2', 'md5': '2b6704fe4a28801e1a098bbf3c5ac611', @@ -141,6 +141,9 @@ class SVTPlayIE(SVTBaseIE): # geo restricted to Sweden 'url': 'http://www.oppetarkiv.se/video/5219710/trollflojten', 'only_matching': True, + }, { + 'url': 'http://www.svtplay.se/klipp/9023742/stopptid-om-bjorn-borg', + 'only_matching': True, }] def _real_extract(self, url): diff --git a/youtube_dl/extractor/syfy.py b/youtube_dl/extractor/syfy.py index 5ca079f..cc81f60 100644 --- a/youtube_dl/extractor/syfy.py +++ b/youtube_dl/extractor/syfy.py @@ -1,46 +1,58 @@ from __future__ 
import unicode_literals -import re +from .adobepass import AdobePassIE +from ..utils import ( + update_url_query, + smuggle_url, +) -from .common import InfoExtractor - - -class SyfyIE(InfoExtractor): - _VALID_URL = r'https?://www\.syfy\.com/(?:videos/.+?vid:(?P<id>[0-9]+)|(?!videos)(?P<video_name>[^/]+)(?:$|[?#]))' +class SyfyIE(AdobePassIE): + _VALID_URL = r'https?://www\.syfy\.com/(?:[^/]+/)?videos/(?P<id>[^/?#]+)' _TESTS = [{ - 'url': 'http://www.syfy.com/videos/Robot%20Combat%20League/Behind%20the%20Scenes/vid:2631458', + 'url': 'http://www.syfy.com/theinternetruinedmylife/videos/the-internet-ruined-my-life-season-1-trailer', 'info_dict': { - 'id': 'NmqMrGnXvmO1', - 'ext': 'flv', - 'title': 'George Lucas has Advice for his Daughter', - 'description': 'Listen to what insights George Lucas give his daughter Amanda.', + 'id': '2968097', + 'ext': 'mp4', + 'title': 'The Internet Ruined My Life: Season 1 Trailer', + 'description': 'One tweet, one post, one click, can destroy everything.', + 'uploader': 'NBCU-MPAT', + 'upload_date': '20170113', + 'timestamp': 1484345640, }, - 'add_ie': ['ThePlatform'], - }, { - 'url': 'http://www.syfy.com/wilwheaton', - 'md5': '94dfa54ee3ccb63295b276da08c415f6', - 'info_dict': { - 'id': '4yoffOOXC767', - 'ext': 'flv', - 'title': 'The Wil Wheaton Project - Premiering May 27th at 10/9c.', - 'description': 'The Wil Wheaton Project premieres May 27th at 10/9c. 
Don\'t miss it.', + 'params': { + # m3u8 download + 'skip_download': True, }, 'add_ie': ['ThePlatform'], - 'skip': 'Blocked outside the US', }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_name = mobj.group('video_name') - if video_name: - generic_webpage = self._download_webpage(url, video_name) - video_id = self._search_regex( - r'<iframe.*?class="video_iframe_page"\s+src="/_utils/video/thP_video_controller.php.*?_vid([0-9]+)">', - generic_webpage, 'video ID') - url = 'http://www.syfy.com/videos/%s/%s/vid:%s' % ( - video_name, video_name, video_id) - else: - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - return self.url_result(self._og_search_video_url(webpage)) + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + syfy_mpx = list(self._parse_json(self._search_regex( + r'jQuery\.extend\([^,]+,\s*({.+})\);', webpage, 'drupal settings'), + display_id)['syfy']['syfy_mpx'].values())[0] + video_id = syfy_mpx['mpxGUID'] + title = syfy_mpx['episodeTitle'] + query = { + 'mbr': 'true', + 'manifest': 'm3u', + } + if syfy_mpx.get('entitlement') == 'auth': + resource = self._get_mvpd_resource( + 'syfy', title, video_id, + syfy_mpx.get('mpxRating', 'TV-14')) + query['auth'] = self._extract_mvpd_auth( + url, video_id, 'syfy', resource) + + return { + '_type': 'url_transparent', + 'ie_key': 'ThePlatform', + 'url': smuggle_url(update_url_query( + self._proto_relative_url(syfy_mpx['releaseURL']), query), + {'force_smil_url': True}), + 'title': title, + 'id': video_id, + 'display_id': display_id, + } diff --git a/youtube_dl/extractor/tapely.py b/youtube_dl/extractor/tapely.py deleted file mode 100644 index ed560bd..0000000 --- a/youtube_dl/extractor/tapely.py +++ /dev/null @@ -1,109 +0,0 @@ -# coding: utf-8 -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - clean_html, - ExtractorError, - float_or_none, - 
parse_iso8601, - sanitized_Request, -) - - -class TapelyIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?(?:tape\.ly|tapely\.com)/(?P<id>[A-Za-z0-9\-_]+)(?:/(?P<songnr>\d+))?' - _API_URL = 'http://tape.ly/showtape?id={0:}' - _S3_SONG_URL = 'http://mytape.s3.amazonaws.com/{0:}' - _SOUNDCLOUD_SONG_URL = 'http://api.soundcloud.com{0:}' - _TESTS = [ - { - 'url': 'http://tape.ly/my-grief-as-told-by-water', - 'info_dict': { - 'id': 23952, - 'title': 'my grief as told by water', - 'thumbnail': 're:^https?://.*\.png$', - 'uploader_id': 16484, - 'timestamp': 1411848286, - 'description': 'For Robin and Ponkers, whom the tides of life have taken out to sea.', - }, - 'playlist_count': 13, - }, - { - 'url': 'http://tape.ly/my-grief-as-told-by-water/1', - 'md5': '79031f459fdec6530663b854cbc5715c', - 'info_dict': { - 'id': 258464, - 'title': 'Dreaming Awake (My Brightest Diamond)', - 'ext': 'm4a', - }, - }, - { - 'url': 'https://tapely.com/my-grief-as-told-by-water', - 'only_matching': True, - }, - ] - - def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - display_id = mobj.group('id') - - playlist_url = self._API_URL.format(display_id) - request = sanitized_Request(playlist_url) - request.add_header('X-Requested-With', 'XMLHttpRequest') - request.add_header('Accept', 'application/json') - request.add_header('Referer', url) - - playlist = self._download_json(request, display_id) - - tape = playlist['tape'] - - entries = [] - for s in tape['songs']: - song = s['song'] - entry = { - 'id': song['id'], - 'duration': float_or_none(song.get('songduration'), 1000), - 'title': song['title'], - } - if song['source'] == 'S3': - entry.update({ - 'url': self._S3_SONG_URL.format(song['filename']), - }) - entries.append(entry) - elif song['source'] == 'YT': - self.to_screen('YouTube video detected') - yt_id = song['filename'].replace('/youtube/', '') - entry.update(self.url_result(yt_id, 'Youtube', video_id=yt_id)) - entries.append(entry) - elif song['source'] == 'SC': 
- self.to_screen('SoundCloud song detected') - sc_url = self._SOUNDCLOUD_SONG_URL.format(song['filename']) - entry.update(self.url_result(sc_url, 'Soundcloud')) - entries.append(entry) - else: - self.report_warning('Unknown song source: %s' % song['source']) - - if mobj.group('songnr'): - songnr = int(mobj.group('songnr')) - 1 - try: - return entries[songnr] - except IndexError: - raise ExtractorError( - 'No song with index: %s' % mobj.group('songnr'), - expected=True) - - return { - '_type': 'playlist', - 'id': tape['id'], - 'display_id': display_id, - 'title': tape['name'], - 'entries': entries, - 'thumbnail': tape.get('image_url'), - 'description': clean_html(tape.get('subtext')), - 'like_count': tape.get('likescount'), - 'uploader_id': tape.get('user_id'), - 'timestamp': parse_iso8601(tape.get('published_at')), - } diff --git a/youtube_dl/extractor/telecinco.py b/youtube_dl/extractor/telecinco.py index 4b4b740..2ecfd04 100644 --- a/youtube_dl/extractor/telecinco.py +++ b/youtube_dl/extractor/telecinco.py @@ -1,50 +1,41 @@ # coding: utf-8 from __future__ import unicode_literals -import json +from .mitele import MiTeleBaseIE -from .common import InfoExtractor -from ..compat import ( - compat_urllib_parse_unquote, - compat_urllib_parse_urlencode, - compat_urlparse, -) -from ..utils import ( - get_element_by_attribute, - parse_duration, - strip_jsonp, -) - -class TelecincoIE(InfoExtractor): +class TelecincoIE(MiTeleBaseIE): IE_DESC = 'telecinco.es, cuatro.com and mediaset.es' _VALID_URL = r'https?://www\.(?:telecinco\.es|cuatro\.com|mediaset\.es)/(?:[^/]+/)+(?P<id>.+?)\.html' _TESTS = [{ 'url': 'http://www.telecinco.es/robinfood/temporada-01/t01xp14/Bacalao-cocochas-pil-pil_0_1876350223.html', - 'md5': '5cbef3ad5ef17bf0d21570332d140729', + 'md5': '8d7b2d5f699ee2709d992a63d5cd1712', 'info_dict': { - 'id': 'MDSVID20141015_0058', + 'id': 'JEA5ijCnF6p5W08A1rNKn7', 'ext': 'mp4', - 'title': 'Con Martín Berasategui, hacer un bacalao al ...', + 'title': 'Bacalao con 
kokotxas al pil-pil', + 'description': 'md5:1382dacd32dd4592d478cbdca458e5bb', 'duration': 662, }, }, { 'url': 'http://www.cuatro.com/deportes/futbol/barcelona/Leo_Messi-Champions-Roma_2_2052780128.html', - 'md5': '0a5b9f3cc8b074f50a0578f823a12694', + 'md5': '284393e5387b3b947b77c613ef04749a', 'info_dict': { - 'id': 'MDSVID20150916_0128', + 'id': 'jn24Od1zGLG4XUZcnUnZB6', 'ext': 'mp4', - 'title': '¿Quién es este ex futbolista con el que hablan ...', + 'title': '¿Quién es este ex futbolista con el que hablan Leo Messi y Luis Suárez?', + 'description': 'md5:a62ecb5f1934fc787107d7b9a2262805', 'duration': 79, }, }, { 'url': 'http://www.mediaset.es/12meses/campanas/doylacara/conlatratanohaytrato/Ayudame-dar-cara-trata-trato_2_1986630220.html', - 'md5': 'ad1bfaaba922dd4a295724b05b68f86a', + 'md5': '749afab6ea5a136a8806855166ae46a2', 'info_dict': { - 'id': 'MDSVID20150513_0220', + 'id': 'aywerkD2Sv1vGNqq9b85Q2', 'ext': 'mp4', 'title': '#DOYLACARA. Con la trata no hay trato', + 'description': 'md5:2771356ff7bfad9179c5f5cd954f1477', 'duration': 50, }, }, { @@ -56,40 +47,16 @@ class TelecincoIE(InfoExtractor): }] def _real_extract(self, url): - episode = self._match_id(url) - webpage = self._download_webpage(url, episode) - embed_data_json = self._search_regex( - r'(?s)MSV\.embedData\[.*?\]\s*=\s*({.*?});', webpage, 'embed data', - ).replace('\'', '"') - embed_data = json.loads(embed_data_json) - - domain = embed_data['mediaUrl'] - if not domain.startswith('http'): - # only happens in telecinco.es videos - domain = 'http://' + domain - info_url = compat_urlparse.urljoin( - domain, - compat_urllib_parse_unquote(embed_data['flashvars']['host']) - ) - info_el = self._download_xml(info_url, episode).find('./video/info') - - video_link = info_el.find('videoUrl/link').text - token_query = compat_urllib_parse_urlencode({'id': video_link}) - token_info = self._download_json( - embed_data['flashvars']['ov_tk'] + '?' 
+ token_query, - episode, - transform_source=strip_jsonp - ) - formats = self._extract_m3u8_formats( - token_info['tokenizedUrl'], episode, ext='mp4', entry_protocol='m3u8_native') - self._sort_formats(formats) - - return { - 'id': embed_data['videoId'], - 'display_id': episode, - 'title': info_el.find('title').text, - 'formats': formats, - 'description': get_element_by_attribute('class', 'text', webpage), - 'thumbnail': info_el.find('thumb').text, - 'duration': parse_duration(info_el.find('duration').text), - } + display_id = self._match_id(url) + webpage = self._download_webpage(url, display_id) + title = self._html_search_meta( + ['og:title', 'twitter:title'], webpage, 'title') + info = self._get_player_info(url, webpage) + info.update({ + 'display_id': display_id, + 'title': title, + 'description': self._html_search_meta( + ['og:description', 'twitter:description'], + webpage, 'title', fatal=False), + }) + return info diff --git a/youtube_dl/extractor/telegraaf.py b/youtube_dl/extractor/telegraaf.py index 9092e9b..58078c5 100644 --- a/youtube_dl/extractor/telegraaf.py +++ b/youtube_dl/extractor/telegraaf.py @@ -47,11 +47,10 @@ class TelegraafIE(InfoExtractor): ext = determine_ext(manifest_url) if ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - manifest_url, video_id, ext='mp4', m3u8_id='hls')) + manifest_url, video_id, ext='mp4', m3u8_id='hls', fatal=False)) elif ext == 'mpd': - # TODO: Current DASH formats are broken - $Time$ pattern in - # <SegmentTemplate> not implemented yet - continue + formats.extend(self._extract_mpd_formats( + manifest_url, video_id, mpd_id='dash', fatal=False)) else: self.report_warning('Unknown adaptive format %s' % ext) for location in locations.get('progressive', []): diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py index 07d222a..23067e8 100644 --- a/youtube_dl/extractor/theplatform.py +++ b/youtube_dl/extractor/theplatform.py @@ -9,6 +9,7 @@ import hashlib from .once import OnceIE 
+from .adobepass import AdobePassIE from ..compat import ( compat_parse_qs, compat_urllib_parse_urlparse, @@ -62,19 +63,20 @@ class ThePlatformBaseIE(OnceIE): return formats, subtitles - def get_metadata(self, path, video_id): + def _download_theplatform_metadata(self, path, video_id): info_url = 'http://link.theplatform.com/s/%s?format=preview' % path - info = self._download_json(info_url, video_id) + return self._download_json(info_url, video_id) + def _parse_theplatform_metadata(self, info): subtitles = {} captions = info.get('captions') if isinstance(captions, list): for caption in captions: lang, src, mime = caption.get('lang', 'en'), caption.get('src'), caption.get('type') - subtitles[lang] = [{ + subtitles.setdefault(lang, []).append({ 'ext': mimetype2ext(mime), 'url': src, - }] + }) return { 'title': info['title'], @@ -86,8 +88,12 @@ class ThePlatformBaseIE(OnceIE): 'uploader': info.get('billingCode'), } + def _extract_theplatform_metadata(self, path, video_id): + info = self._download_theplatform_metadata(path, video_id) + return self._parse_theplatform_metadata(info) -class ThePlatformIE(ThePlatformBaseIE): + +class ThePlatformIE(ThePlatformBaseIE, AdobePassIE): _VALID_URL = r'''(?x) (?:https?://(?:link|player)\.theplatform\.com/[sp]/(?P<provider_id>[^/]+)/ (?:(?:(?:[^/]+/)+select/)?(?P<media>media/(?:guid/\d+/)?)|(?P<config>(?:[^/\?]+/(?:swf|config)|onsite)/select/))? 
@@ -265,7 +271,7 @@ class ThePlatformIE(ThePlatformBaseIE): formats, subtitles = self._extract_theplatform_smil(smil_url, video_id) self._sort_formats(formats) - ret = self.get_metadata(path, video_id) + ret = self._extract_theplatform_metadata(path, video_id) combined_subtitles = self._merge_subtitles(ret.get('subtitles', {}), subtitles) ret.update({ 'id': video_id, @@ -339,7 +345,7 @@ class ThePlatformFeedIE(ThePlatformBaseIE): timestamp = int_or_none(entry.get('media$availableDate'), scale=1000) categories = [item['media$name'] for item in entry.get('media$categories', [])] - ret = self.get_metadata('%s/%s' % (provider_id, first_video_id), video_id) + ret = self._extract_theplatform_metadata('%s/%s' % (provider_id, first_video_id), video_id) subtitles = self._merge_subtitles(subtitles, ret['subtitles']) ret.update({ 'id': video_id, diff --git a/youtube_dl/extractor/threeqsdn.py b/youtube_dl/extractor/threeqsdn.py index c77a079..f26937d 100644 --- a/youtube_dl/extractor/threeqsdn.py +++ b/youtube_dl/extractor/threeqsdn.py @@ -24,16 +24,20 @@ class ThreeQSDNIE(InfoExtractor): 'title': '0280d6b9-1215-11e6-b427-0cc47a188158', 'is_live': False, }, - 'expected_warnings': ['Failed to download MPD manifest'], + 'expected_warnings': ['Failed to download MPD manifest', 'Failed to parse JSON'], }, { # live video stream 'url': 'https://playout.3qsdn.com/d755d94b-4ab9-11e3-9162-0025907ad44f?js=true', 'info_dict': { 'id': 'd755d94b-4ab9-11e3-9162-0025907ad44f', 'ext': 'mp4', - 'title': 'd755d94b-4ab9-11e3-9162-0025907ad44f', - 'is_live': False, + 'title': 're:^d755d94b-4ab9-11e3-9162-0025907ad44f [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$', + 'is_live': True, }, + 'params': { + 'skip_download': True, # m3u8 downloads + }, + 'expected_warnings': ['Failed to download MPD manifest'], }, { # live audio stream 'url': 'http://playout.3qsdn.com/9edf36e0-6bf2-11e2-a16a-9acf09e2db48', @@ -92,12 +96,11 @@ class ThreeQSDNIE(InfoExtractor): if not item_url or item_url in urls: return 
urls.add(item_url) - type_ = item.get('type') - ext = determine_ext(item_url, default_ext=None) - if type_ == 'application/dash+xml' or ext == 'mpd': + ext = mimetype2ext(item.get('type')) or determine_ext(item_url, default_ext=None) + if ext == 'mpd': formats.extend(self._extract_mpd_formats( item_url, video_id, mpd_id='mpd', fatal=False)) - elif type_ in ('application/vnd.apple.mpegURL', 'application/x-mpegurl') or ext == 'm3u8': + elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( item_url, video_id, 'mp4', entry_protocol='m3u8' if live else 'm3u8_native', @@ -111,11 +114,11 @@ class ThreeQSDNIE(InfoExtractor): formats.append({ 'url': item_url, 'format_id': item.get('quality'), - 'ext': 'mp4' if item_url.startswith('rtsp') else mimetype2ext(type_) or ext, + 'ext': 'mp4' if item_url.startswith('rtsp') else ext, 'vcodec': 'none' if stream_type == 'audio' else None, }) - for item_js in re.findall(r'({.*?\b(?:src|source)\s*:\s*["\'].+?})', js): + for item_js in re.findall(r'({[^{]*?\b(?:src|source)\s*:\s*["\'].+?})', js): f = self._parse_json( item_js, video_id, transform_source=js_to_json, fatal=False) if not f: diff --git a/youtube_dl/extractor/tmz.py b/youtube_dl/extractor/tmz.py index 7dbe68b..979856e 100644 --- a/youtube_dl/extractor/tmz.py +++ b/youtube_dl/extractor/tmz.py @@ -5,31 +5,27 @@ from .common import InfoExtractor class TMZIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P<id>[^/]+)/?' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?tmz\.com/videos/(?P<id>[^/?#]+)' + _TESTS = [{ 'url': 'http://www.tmz.com/videos/0_okj015ty/', - 'md5': '791204e3bf790b1426cb2db0706184c0', + 'md5': '4d22a51ef205b6c06395d8394f72d560', 'info_dict': { 'id': '0_okj015ty', - 'url': 'http://tmz.vo.llnwd.net/o28/2014-03/13/0_okj015ty_0_rt8ro3si_2.mp4', 'ext': 'mp4', 'title': 'Kim Kardashian\'s Boobs Unlock a Mystery!', 'description': 'Did Kim Kardasain try to one-up Khloe by one-upping Kylie??? 
Or is she just showing off her amazing boobs?', - 'thumbnail': r're:http://cdnbakmi\.kaltura\.com/.*thumbnail.*', + 'timestamp': 1394747163, + 'uploader_id': 'batchUser', + 'upload_date': '20140313', } - } + }, { + 'url': 'http://www.tmz.com/videos/0-cegprt2p/', + 'only_matching': True, + }] def _real_extract(self, url): - video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) - - return { - 'id': video_id, - 'url': self._html_search_meta('VideoURL', webpage, fatal=True), - 'title': self._og_search_title(webpage), - 'description': self._og_search_description(webpage), - 'thumbnail': self._html_search_meta('ThumbURL', webpage), - } + video_id = self._match_id(url).replace('-', '_') + return self.url_result('kaltura:591531:%s' % video_id, 'Kaltura', video_id) class TMZArticleIE(InfoExtractor): diff --git a/youtube_dl/extractor/tnaflix.py b/youtube_dl/extractor/tnaflix.py index 7817417..7ddf777 100644 --- a/youtube_dl/extractor/tnaflix.py +++ b/youtube_dl/extractor/tnaflix.py @@ -118,8 +118,12 @@ class TNAFlixNetworkBaseIE(InfoExtractor): xpath_text(cfg_xml, './startThumb', 'thumbnail'), 'http:') thumbnails = self._extract_thumbnails(cfg_xml) - title = self._html_search_regex( - self._TITLE_REGEX, webpage, 'title') if self._TITLE_REGEX else self._og_search_title(webpage) + title = None + if self._TITLE_REGEX: + title = self._html_search_regex( + self._TITLE_REGEX, webpage, 'title', default=None) + if not title: + title = self._og_search_title(webpage) age_limit = self._rta_search(webpage) or 18 @@ -189,9 +193,9 @@ class TNAFlixNetworkEmbedIE(TNAFlixNetworkBaseIE): class TNAFlixIE(TNAFlixNetworkBaseIE): _VALID_URL = r'https?://(?:www\.)?tnaflix\.com/[^/]+/(?P<display_id>[^/]+)/video(?P<id>\d+)' - _TITLE_REGEX = r'<title>(.+?) - TNAFlix Porn Videos</title>' - _DESCRIPTION_REGEX = r'<meta[^>]+name="description"[^>]+content="([^"]+)"' - _UPLOADER_REGEX = r'<i>\s*Verified Member\s*</i>\s*<h1>(.+?)</h1>' + _TITLE_REGEX = r'<title>(.+?) 
- (?:TNAFlix Porn Videos|TNAFlix\.com)</title>' + _DESCRIPTION_REGEX = r'(?s)>Description:</[^>]+>(.+?)<' + _UPLOADER_REGEX = r'<i>\s*Verified Member\s*</i>\s*<h\d+>(.+?)<' _CATEGORIES_REGEX = r'(?s)<span[^>]*>Categories:</span>(.+?)</div>' _TESTS = [{ diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py index 4797d13..54c2d0a 100644 --- a/youtube_dl/extractor/toutv.py +++ b/youtube_dl/extractor/toutv.py @@ -1,74 +1,41 @@ # coding: utf-8 from __future__ import unicode_literals -import re - from .common import InfoExtractor -from ..utils import ( - ExtractorError, - unified_strdate, -) +from ..utils import int_or_none class TouTvIE(InfoExtractor): IE_NAME = 'tou.tv' - _VALID_URL = r'https?://www\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+(?:/(?P<episode>S[0-9]+E[0-9]+)))' + _VALID_URL = r'https?://ici\.tou\.tv/(?P<id>[a-zA-Z0-9_-]+/S[0-9]+E[0-9]+)' _TEST = { - 'url': 'http://www.tou.tv/30-vies/S04E41', + 'url': 'http://ici.tou.tv/garfield-tout-court/S2015E17', 'info_dict': { - 'id': '30-vies_S04E41', + 'id': '122017', 'ext': 'mp4', - 'title': '30 vies Saison 4 / Épisode 41', - 'description': 'md5:da363002db82ccbe4dafeb9cab039b09', - 'age_limit': 8, - 'uploader': 'Groupe des Nouveaux Médias', - 'duration': 1296, - 'upload_date': '20131118', - 'thumbnail': 'http://static.tou.tv/medias/images/2013-11-18_19_00_00_30VIES_0341_01_L.jpeg', + 'title': 'Saison 2015 Épisode 17', + 'description': 'La photo de famille 2', + 'upload_date': '20100717', }, 'params': { - 'skip_download': True, # Requires rtmpdump + # m3u8 download + 'skip_download': True, }, - 'skip': 'Only available in Canada' } def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - webpage = self._download_webpage(url, video_id) - - mediaId = self._search_regex( - r'"idMedia":\s*"([^"]+)"', webpage, 'media ID') - - streams_url = 'http://release.theplatform.com/content.select?pid=' + mediaId - streams_doc = self._download_xml( - streams_url, video_id, 
note='Downloading stream list') - - video_url = next(n.text - for n in streams_doc.findall('.//choice/url') - if '//ad.doubleclick' not in n.text) - if video_url.endswith('/Unavailable.flv'): - raise ExtractorError( - 'Access to this video is blocked from outside of Canada', - expected=True) - - duration_str = self._html_search_meta( - 'video:duration', webpage, 'duration') - duration = int(duration_str) if duration_str else None - upload_date_str = self._html_search_meta( - 'video:release_date', webpage, 'upload date') - upload_date = unified_strdate(upload_date_str) if upload_date_str else None + path = self._match_id(url) + metadata = self._download_json('http://ici.tou.tv/presentation/%s' % path, path) + video_id = metadata['IdMedia'] + details = metadata['Details'] + title = details['OriginalTitle'] return { + '_type': 'url_transparent', + 'url': 'radiocanada:%s:%s' % (metadata.get('AppCode', 'toutv'), video_id), 'id': video_id, - 'title': self._og_search_title(webpage), - 'url': video_url, - 'description': self._og_search_description(webpage), - 'uploader': self._dc_search_uploader(webpage), - 'thumbnail': self._og_search_thumbnail(webpage), - 'age_limit': self._media_rating_search(webpage), - 'duration': duration, - 'upload_date': upload_date, - 'ext': 'mp4', + 'title': title, + 'thumbnail': details.get('ImageUrl'), + 'duration': int_or_none(details.get('LengthInSeconds')), } diff --git a/youtube_dl/extractor/tube8.py b/youtube_dl/extractor/tube8.py index 1d9271d..4053f6c 100644 --- a/youtube_dl/extractor/tube8.py +++ b/youtube_dl/extractor/tube8.py @@ -1,18 +1,13 @@ from __future__ import unicode_literals -import re - -from .common import InfoExtractor -from ..compat import compat_str from ..utils import ( int_or_none, - sanitized_Request, str_to_int, ) -from ..aes import aes_decrypt_text +from .keezmovies import KeezMoviesIE -class Tube8IE(InfoExtractor): +class Tube8IE(KeezMoviesIE): _VALID_URL = 
r'https?://(?:www\.)?tube8\.com/(?:[^/]+/)+(?P<display_id>[^/]+)/(?P<id>\d+)' _TESTS = [{ 'url': 'http://www.tube8.com/teen/kasia-music-video/229795/', @@ -33,47 +28,17 @@ class Tube8IE(InfoExtractor): }] def _real_extract(self, url): - mobj = re.match(self._VALID_URL, url) - video_id = mobj.group('id') - display_id = mobj.group('display_id') - - req = sanitized_Request(url) - req.add_header('Cookie', 'age_verified=1') - webpage = self._download_webpage(req, display_id) - - flashvars = self._parse_json( - self._search_regex( - r'flashvars\s*=\s*({.+?});\r?\n', webpage, 'flashvars'), - video_id) - - formats = [] - for key, video_url in flashvars.items(): - if not isinstance(video_url, compat_str) or not video_url.startswith('http'): - continue - height = self._search_regex( - r'quality_(\d+)[pP]', key, 'height', default=None) - if not height: - continue - if flashvars.get('encrypted') is True: - video_url = aes_decrypt_text( - video_url, flashvars['video_title'], 32).decode('utf-8') - formats.append({ - 'url': video_url, - 'format_id': '%sp' % height, - 'height': int(height), - }) - self._sort_formats(formats) + webpage, info = self._extract_info(url) - thumbnail = flashvars.get('image_url') + if not info['title']: + info['title'] = self._html_search_regex( + r'videoTitle\s*=\s*"([^"]+)', webpage, 'title') - title = self._html_search_regex( - r'videoTitle\s*=\s*"([^"]+)', webpage, 'title') description = self._html_search_regex( r'>Description:</strong>\s*(.+?)\s*<', webpage, 'description', fatal=False) uploader = self._html_search_regex( r'<span class="username">\s*(.+?)\s*<', webpage, 'uploader', fatal=False) - duration = int_or_none(flashvars.get('video_duration')) like_count = int_or_none(self._search_regex( r'rupVar\s*=\s*"(\d+)"', webpage, 'like count', fatal=False)) @@ -86,18 +51,13 @@ class Tube8IE(InfoExtractor): r'<span id="allCommentsCount">(\d+)</span>', webpage, 'comment count', fatal=False)) - return { - 'id': video_id, - 'display_id': display_id, - 
'title': title, + info.update({ 'description': description, - 'thumbnail': thumbnail, 'uploader': uploader, - 'duration': duration, 'view_count': view_count, 'like_count': like_count, 'dislike_count': dislike_count, 'comment_count': comment_count, - 'age_limit': 18, - 'formats': formats, - } + }) + + return info diff --git a/youtube_dl/extractor/tv2.py b/youtube_dl/extractor/tv2.py index 86bb791..f225ec6 100644 --- a/youtube_dl/extractor/tv2.py +++ b/youtube_dl/extractor/tv2.py @@ -8,6 +8,7 @@ from ..utils import ( determine_ext, int_or_none, float_or_none, + js_to_json, parse_iso8601, remove_end, ) @@ -54,10 +55,11 @@ class TV2IE(InfoExtractor): ext = determine_ext(video_url) if ext == 'f4m': formats.extend(self._extract_f4m_formats( - video_url, video_id, f4m_id=format_id)) + video_url, video_id, f4m_id=format_id, fatal=False)) elif ext == 'm3u8': formats.extend(self._extract_m3u8_formats( - video_url, video_id, 'mp4', m3u8_id=format_id)) + video_url, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id=format_id, fatal=False)) elif ext == 'ism' or video_url.endswith('.ism/Manifest'): pass else: @@ -105,7 +107,7 @@ class TV2ArticleIE(InfoExtractor): 'url': 'http://www.tv2.no/2015/05/16/nyheter/alesund/krim/pingvin/6930542', 'info_dict': { 'id': '6930542', - 'title': 'Russen hetses etter pingvintyveri – innrømmer å ha åpnet luken på buret', + 'title': 'Russen hetses etter pingvintyveri - innrømmer å ha åpnet luken på buret', 'description': 'md5:339573779d3eea3542ffe12006190954', }, 'playlist_count': 2, @@ -119,9 +121,23 @@ class TV2ArticleIE(InfoExtractor): webpage = self._download_webpage(url, playlist_id) + # Old embed pattern (looks unused nowadays) + assets = re.findall(r'data-assetid=["\'](\d+)', webpage) + + if not assets: + # New embed pattern + for v in re.findall('TV2ContentboxVideo\(({.+?})\)', webpage): + video = self._parse_json( + v, playlist_id, transform_source=js_to_json, fatal=False) + if not video: + continue + asset = video.get('assetId') + 
if asset: + assets.append(asset) + entries = [ - self.url_result('http://www.tv2.no/v/%s' % video_id, 'TV2') - for video_id in re.findall(r'data-assetid="(\d+)"', webpage)] + self.url_result('http://www.tv2.no/v/%s' % asset_id, 'TV2') + for asset_id in assets] title = remove_end(self._og_search_title(webpage), ' - TV2.no') description = remove_end(self._og_search_description(webpage), ' - TV2.no') diff --git a/youtube_dl/extractor/tvland.py b/youtube_dl/extractor/tvland.py index b73279d..cb76a2a 100644 --- a/youtube_dl/extractor/tvland.py +++ b/youtube_dl/extractor/tvland.py @@ -9,56 +9,23 @@ class TVLandIE(MTVServicesInfoExtractor): _VALID_URL = r'https?://(?:www\.)?tvland\.com/(?:video-clips|episodes)/(?P<id>[^/?#.]+)' _FEED_URL = 'http://www.tvland.com/feeds/mrss/' _TESTS = [{ + # Geo-restricted. Without a proxy metadata are still there. With a + # proxy it redirects to http://m.tvland.com/app/ 'url': 'http://www.tvland.com/episodes/hqhps2/everybody-loves-raymond-the-invasion-ep-048', - 'playlist': [ - { - 'md5': '227e9723b9669c05bf51098b10287aa7', - 'info_dict': { - 'id': 'bcbd3a83-3aca-4dca-809b-f78a87dcccdd', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 1 of 5', - } - }, - { - 'md5': '9fa2b764ec0e8194fb3ebb01a83df88b', - 'info_dict': { - 'id': 'f4279548-6e13-40dd-92e8-860d27289197', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 2 of 5', - } - }, - { - 'md5': 'fde4c3bccd7cc7e3576b338734153cec', - 'info_dict': { - 'id': '664e4a38-53ef-4115-9bc9-d0f789ec6334', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 3 of 5', - } - }, - { - 'md5': '247f6780cda6891f2e49b8ae2b10e017', - 'info_dict': { - 'id': '9146ecf5-b15a-4d78-879c-6679b77f4960', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 4 of 5', - } - }, - { - 'md5': 'fd269f33256e47bad5eb6c40de089ff6', - 'info_dict': { - 'id': 
'04334a2e-9a47-4214-a8c2-ae5792e2fab7', - 'ext': 'mp4', - 'title': 'Everybody Loves Raymond|Everybody Loves Raymond 048 HD, Part 5 of 5', - } - } - ], + 'info_dict': { + 'description': 'md5:80973e81b916a324e05c14a3fb506d29', + 'title': 'The Invasion', + }, + 'playlist': [], }, { 'url': 'http://www.tvland.com/video-clips/zea2ev/younger-younger--hilary-duff---little-lies', 'md5': 'e2c6389401cf485df26c79c247b08713', 'info_dict': { 'id': 'b8697515-4bbe-4e01-83d5-fa705ce5fa88', 'ext': 'mp4', - 'title': 'Younger|Younger: Hilary Duff - Little Lies', - 'description': 'md5:7d192f56ca8d958645c83f0de8ef0269' + 'title': 'Younger|December 28, 2015|2|NO-EPISODE#|Younger: Hilary Duff - Little Lies', + 'description': 'md5:7d192f56ca8d958645c83f0de8ef0269', + 'upload_date': '20151228', + 'timestamp': 1451289600, }, }] diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py index 5070082..2abfb78 100644 --- a/youtube_dl/extractor/tvp.py +++ b/youtube_dl/extractor/tvp.py @@ -24,6 +24,7 @@ class TVPIE(InfoExtractor): 'id': '194536', 'ext': 'mp4', 'title': 'Czas honoru, I seria – odc. 
13', + 'description': 'md5:76649d2014f65c99477be17f23a4dead', }, }, { 'url': 'http://www.tvp.pl/there-can-be-anything-so-i-shortened-it/17916176', @@ -32,6 +33,16 @@ class TVPIE(InfoExtractor): 'id': '17916176', 'ext': 'mp4', 'title': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', + 'description': 'TVP Gorzów pokaże filmy studentów z podroży dookoła świata', + }, + }, { + # page id is not the same as video id(#7799) + 'url': 'http://vod.tvp.pl/22704887/08122015-1500', + 'md5': 'cf6a4705dfd1489aef8deb168d6ba742', + 'info_dict': { + 'id': '22680786', + 'ext': 'mp4', + 'title': 'Wiadomości, 08.12.2015, 15:00', }, }, { 'url': 'http://vod.tvp.pl/seriale/obyczajowe/na-sygnale/sezon-2-27-/odc-39/17834272', @@ -54,6 +65,39 @@ class TVPIE(InfoExtractor): }] def _real_extract(self, url): + page_id = self._match_id(url) + webpage = self._download_webpage(url, page_id) + video_id = self._search_regex([ + r'<iframe[^>]+src="[^"]*?object_id=(\d+)', + "object_id\s*:\s*'(\d+)'"], webpage, 'video id') + return { + '_type': 'url_transparent', + 'url': 'tvp:' + video_id, + 'description': self._og_search_description(webpage, default=None), + 'thumbnail': self._og_search_thumbnail(webpage), + 'ie_key': 'TVPEmbed', + } + + +class TVPEmbedIE(InfoExtractor): + IE_NAME = 'tvp:embed' + IE_DESC = 'Telewizja Polska' + _VALID_URL = r'(?:tvp:|https?://[^/]+\.tvp\.(?:pl|info)/sess/tvplayer\.php\?.*?object_id=)(?P<id>\d+)' + + _TESTS = [{ + 'url': 'http://www.tvp.pl/sess/tvplayer.php?object_id=22670268', + 'md5': '8c9cd59d16edabf39331f93bf8a766c7', + 'info_dict': { + 'id': '22670268', + 'ext': 'mp4', + 'title': 'Panorama, 07.12.2015, 15:40', + }, + }, { + 'url': 'tvp:22670268', + 'only_matching': True, + }] + + def _real_extract(self, url): video_id = self._match_id(url) webpage = self._download_webpage( @@ -89,8 +133,8 @@ class TVPIE(InfoExtractor): r'(https?://.+?/video)(?:\.(?:ism|f4m|m3u8)|-\d+\.mp4)', video_url, 'video base url', default=None) if video_url_base: - # TODO: 
Current DASH formats are broken - $Time$ pattern in - # <SegmentTemplate> not implemented yet + # TODO: <Group> found instead of <AdaptationSet> in MPD manifest. + # It's not mentioned in MPEG-DASH standard. Figure that out. # formats.extend(self._extract_mpd_formats( # video_url_base + '.ism/video.mpd', # video_id, mpd_id='dash', fatal=False)) diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py index df70a6b..4186e82 100644 --- a/youtube_dl/extractor/tvplay.py +++ b/youtube_dl/extractor/tvplay.py @@ -4,47 +4,58 @@ from __future__ import unicode_literals import re from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_HTTPError, + compat_str, + compat_urlparse, +) from ..utils import ( + determine_ext, + ExtractorError, + int_or_none, parse_iso8601, qualities, + try_get, + update_url_query, ) class TVPlayIE(InfoExtractor): - IE_DESC = 'TV3Play and related services' - _VALID_URL = r'''(?x)https?://(?:www\.)? - (?:tvplay\.lv/parraides| - tv3play\.lt/programos| - play\.tv3\.lt/programos| - tv3play\.ee/sisu| - tv3play\.se/program| - tv6play\.se/program| - tv8play\.se/program| - tv10play\.se/program| - tv3play\.no/programmer| - viasat4play\.no/programmer| - tv6play\.no/programmer| - tv3play\.dk/programmer| - play\.novatv\.bg/programi - )/[^/]+/(?P<id>\d+) - ''' + IE_NAME = 'mtg' + IE_DESC = 'MTG services' + _VALID_URL = r'''(?x) + (?: + mtg:| + https?:// + (?:www\.)? + (?: + tvplay(?:\.skaties)?\.lv/parraides| + (?:tv3play|play\.tv3)\.lt/programos| + tv3play(?:\.tv3)?\.ee/sisu| + (?:tv(?:3|6|8|10)play|viafree)\.se/program| + (?:(?:tv3play|viasat4play|tv6play|viafree)\.no|(?:tv3play|viafree)\.dk)/programmer| + play\.novatv\.bg/programi + ) + /(?:[^/]+/)+ + ) + (?P<id>\d+) + ''' _TESTS = [ { 'url': 'http://www.tvplay.lv/parraides/vinas-melo-labak/418113?autostart=true', + 'md5': 'a1612fe0849455423ad8718fe049be21', 'info_dict': { 'id': '418113', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Kādi ir īri? 
- Viņas melo labāk', 'description': 'Baiba apsmej īrus, kādi tie ir un ko viņi dara.', + 'series': 'Viņas melo labāk', + 'season': '2.sezona', + 'season_number': 2, 'duration': 25, 'timestamp': 1406097056, 'upload_date': '20140723', }, - 'params': { - # rtmp download - 'skip_download': True, - }, }, { 'url': 'http://play.tv3.lt/programos/moterys-meluoja-geriau/409229?autostart=true', @@ -53,6 +64,10 @@ class TVPlayIE(InfoExtractor): 'ext': 'flv', 'title': 'Moterys meluoja geriau', 'description': 'md5:9aec0fc68e2cbc992d2a140bd41fa89e', + 'series': 'Moterys meluoja geriau', + 'episode_number': 47, + 'season': '1 sezonas', + 'season_number': 1, 'duration': 1330, 'timestamp': 1403769181, 'upload_date': '20140626', @@ -82,7 +97,7 @@ class TVPlayIE(InfoExtractor): 'url': 'http://www.tv3play.se/program/husraddarna/395385?autostart=true', 'info_dict': { 'id': '395385', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Husräddarna S02E07', 'description': 'md5:f210c6c89f42d4fc39faa551be813777', 'duration': 2574, @@ -90,7 +105,6 @@ class TVPlayIE(InfoExtractor): 'upload_date': '20140520', }, 'params': { - # rtmp download 'skip_download': True, }, }, @@ -98,7 +112,7 @@ class TVPlayIE(InfoExtractor): 'url': 'http://www.tv6play.se/program/den-sista-dokusapan/266636?autostart=true', 'info_dict': { 'id': '266636', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Den sista dokusåpan S01E08', 'description': 'md5:295be39c872520221b933830f660b110', 'duration': 1492, @@ -107,7 +121,6 @@ class TVPlayIE(InfoExtractor): 'age_limit': 18, }, 'params': { - # rtmp download 'skip_download': True, }, }, @@ -115,7 +128,7 @@ class TVPlayIE(InfoExtractor): 'url': 'http://www.tv8play.se/program/antikjakten/282756?autostart=true', 'info_dict': { 'id': '282756', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Antikjakten S01E10', 'description': 'md5:1b201169beabd97e20c5ad0ad67b13b8', 'duration': 2646, @@ -123,7 +136,6 @@ class TVPlayIE(InfoExtractor): 'upload_date': '20120925', }, 'params': { - # rtmp download 
'skip_download': True, }, }, @@ -131,7 +143,7 @@ class TVPlayIE(InfoExtractor): 'url': 'http://www.tv3play.no/programmer/anna-anka-soker-assistent/230898?autostart=true', 'info_dict': { 'id': '230898', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Anna Anka søker assistent - Ep. 8', 'description': 'md5:f80916bf5bbe1c5f760d127f8dd71474', 'duration': 2656, @@ -139,7 +151,6 @@ class TVPlayIE(InfoExtractor): 'upload_date': '20100628', }, 'params': { - # rtmp download 'skip_download': True, }, }, @@ -147,7 +158,7 @@ class TVPlayIE(InfoExtractor): 'url': 'http://www.viasat4play.no/programmer/budbringerne/21873?autostart=true', 'info_dict': { 'id': '21873', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Budbringerne program 10', 'description': 'md5:4db78dc4ec8a85bb04fd322a3ee5092d', 'duration': 1297, @@ -155,7 +166,6 @@ class TVPlayIE(InfoExtractor): 'upload_date': '20090929', }, 'params': { - # rtmp download 'skip_download': True, }, }, @@ -163,7 +173,7 @@ class TVPlayIE(InfoExtractor): 'url': 'http://www.tv6play.no/programmer/hotelinspektor-alex-polizzi/361883?autostart=true', 'info_dict': { 'id': '361883', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Hotelinspektør Alex Polizzi - Ep. 
10', 'description': 'md5:3ecf808db9ec96c862c8ecb3a7fdaf81', 'duration': 2594, @@ -171,7 +181,6 @@ class TVPlayIE(InfoExtractor): 'upload_date': '20140224', }, 'params': { - # rtmp download 'skip_download': True, }, }, @@ -191,59 +200,173 @@ class TVPlayIE(InfoExtractor): 'skip_download': True, }, }, + { + 'url': 'http://tvplay.skaties.lv/parraides/vinas-melo-labak/418113?autostart=true', + 'only_matching': True, + }, + { + # views is null + 'url': 'http://tvplay.skaties.lv/parraides/tv3-zinas/760183', + 'only_matching': True, + }, + { + 'url': 'http://tv3play.tv3.ee/sisu/kodu-keset-linna/238551?autostart=true', + 'only_matching': True, + }, + { + 'url': 'http://www.viafree.se/program/underhallning/i-like-radio-live/sasong-1/676869', + 'only_matching': True, + }, + { + 'url': 'mtg:418113', + 'only_matching': True, + } ] def _real_extract(self, url): video_id = self._match_id(url) video = self._download_json( - 'http://playapi.mtgx.tv/v1/videos/%s' % video_id, video_id, 'Downloading video JSON') + 'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON') - if video['is_geo_blocked']: - self.report_warning( - 'This content might not be available in your country due to copyright reasons') + title = video['title'] - streams = self._download_json( - 'http://playapi.mtgx.tv/v1/videos/stream/%s' % video_id, video_id, 'Downloading streams JSON') + try: + streams = self._download_json( + 'http://playapi.mtgx.tv/v3/videos/stream/%s' % video_id, + video_id, 'Downloading streams JSON') + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403: + msg = self._parse_json(e.cause.read().decode('utf-8'), video_id) + raise ExtractorError(msg['msg'], expected=True) + raise quality = qualities(['hls', 'medium', 'high']) formats = [] - for format_id, video_url in streams['streams'].items(): + for format_id, video_url in streams.get('streams', {}).items(): if not video_url or not isinstance(video_url, compat_str): 
continue - fmt = { - 'format_id': format_id, - 'preference': quality(format_id), - } - if video_url.startswith('rtmp'): - m = re.search(r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', video_url) - if not m: - continue - fmt.update({ - 'ext': 'flv', - 'url': m.group('url'), - 'app': m.group('app'), - 'play_path': m.group('playpath'), - }) - elif video_url.endswith('.f4m'): + ext = determine_ext(video_url) + if ext == 'f4m': formats.extend(self._extract_f4m_formats( - video_url + '?hdcore=3.5.0&plugin=aasp-3.5.0.151.81', video_id)) - continue + update_url_query(video_url, { + 'hdcore': '3.5.0', + 'plugin': 'aasp-3.5.0.151.81' + }), video_id, f4m_id='hds', fatal=False)) + elif ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + video_url, video_id, 'mp4', 'm3u8_native', + m3u8_id='hls', fatal=False)) else: - fmt.update({ - 'url': video_url, - }) - formats.append(fmt) + fmt = { + 'format_id': format_id, + 'quality': quality(format_id), + 'ext': ext, + } + if video_url.startswith('rtmp'): + m = re.search( + r'^(?P<url>rtmp://[^/]+/(?P<app>[^/]+))/(?P<playpath>.+)$', video_url) + if not m: + continue + fmt.update({ + 'ext': 'flv', + 'url': m.group('url'), + 'app': m.group('app'), + 'play_path': m.group('playpath'), + }) + else: + fmt.update({ + 'url': video_url, + }) + formats.append(fmt) + + if not formats and video.get('is_geo_blocked'): + self.raise_geo_restricted( + 'This content might not be available in your country due to copyright reasons') self._sort_formats(formats) + # TODO: webvtt in m3u8 + subtitles = {} + sami_path = video.get('sami_path') + if sami_path: + lang = self._search_regex( + r'_([a-z]{2})\.xml', sami_path, 'lang', + default=compat_urlparse.urlparse(url).netloc.rsplit('.', 1)[-1]) + subtitles[lang] = [{ + 'url': sami_path, + }] + + series = video.get('format_title') + episode_number = int_or_none(video.get('format_position', {}).get('episode')) + season = video.get('_embedded', {}).get('season', {}).get('title') + 
season_number = int_or_none(video.get('format_position', {}).get('season')) + return { 'id': video_id, - 'title': video['title'], - 'description': video['description'], - 'duration': video['duration'], - 'timestamp': parse_iso8601(video['created_at']), - 'view_count': video['views']['total'], - 'age_limit': video.get('age_limit', 0), + 'title': title, + 'description': video.get('description'), + 'series': series, + 'episode_number': episode_number, + 'season': season, + 'season_number': season_number, + 'duration': int_or_none(video.get('duration')), + 'timestamp': parse_iso8601(video.get('created_at')), + 'view_count': try_get(video, lambda x: x['views']['total'], int), + 'age_limit': int_or_none(video.get('age_limit', 0)), 'formats': formats, + 'subtitles': subtitles, } + + +class ViafreeIE(InfoExtractor): + _VALID_URL = r'''(?x) + https?:// + (?:www\.)? + viafree\. + (?: + (?:dk|no)/programmer| + se/program + ) + /(?:[^/]+/)+(?P<id>[^/?#&]+) + ''' + _TESTS = [{ + 'url': 'http://www.viafree.se/program/livsstil/husraddarna/sasong-2/avsnitt-2', + 'info_dict': { + 'id': '395375', + 'ext': 'mp4', + 'title': 'Husräddarna S02E02', + 'description': 'md5:4db5c933e37db629b5a2f75dfb34829e', + 'series': 'Husräddarna', + 'season': 'Säsong 2', + 'season_number': 2, + 'duration': 2576, + 'timestamp': 1400596321, + 'upload_date': '20140520', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [TVPlayIE.ie_key()], + }, { + 'url': 'http://www.viafree.no/programmer/underholdning/det-beste-vorspielet/sesong-2/episode-1', + 'only_matching': True, + }, { + 'url': 'http://www.viafree.dk/programmer/reality/paradise-hotel/saeson-7/episode-5', + 'only_matching': True, + }] + + @classmethod + def suitable(cls, url): + return False if TVPlayIE.suitable(url) else super(ViafreeIE, cls).suitable(url) + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + video_id = self._search_regex( + 
r'currentVideo["\']\s*:\s*.+?["\']id["\']\s*:\s*["\'](?P<id>\d{6,})', + webpage, 'video id') + + return self.url_result('mtg:%s' % video_id, TVPlayIE.ie_key()) diff --git a/youtube_dl/extractor/tweakers.py b/youtube_dl/extractor/tweakers.py index f3198fb..7a9386c 100644 --- a/youtube_dl/extractor/tweakers.py +++ b/youtube_dl/extractor/tweakers.py @@ -1,25 +1,62 @@ from __future__ import unicode_literals from .common import InfoExtractor +from ..utils import ( + int_or_none, + determine_ext, + mimetype2ext, +) class TweakersIE(InfoExtractor): _VALID_URL = r'https?://tweakers\.net/video/(?P<id>\d+)' _TEST = { 'url': 'https://tweakers.net/video/9926/new-nintendo-3ds-xl-op-alle-fronten-beter.html', - 'md5': '3147e4ddad366f97476a93863e4557c8', + 'md5': 'fe73e417c093a788e0160c4025f88b15', 'info_dict': { 'id': '9926', 'ext': 'mp4', 'title': 'New Nintendo 3DS XL - Op alle fronten beter', - 'description': 'md5:f97324cc71e86e11c853f0763820e3ba', + 'description': 'md5:3789b21fed9c0219e9bcaacd43fab280', 'thumbnail': 're:^https?://.*\.jpe?g$', 'duration': 386, + 'uploader_id': 's7JeEm', } } def _real_extract(self, url): - playlist_id = self._match_id(url) - entries = self._extract_xspf_playlist( - 'https://tweakers.net/video/s1playlist/%s/playlist.xspf' % playlist_id, playlist_id) - return self.playlist_result(entries, playlist_id) + video_id = self._match_id(url) + video_data = self._download_json( + 'https://tweakers.net/video/s1playlist/%s/1920/1080/playlist.json' % video_id, + video_id)['items'][0] + + title = video_data['title'] + + formats = [] + for location in video_data.get('locations', {}).get('progressive', []): + format_id = location.get('label') + width = int_or_none(location.get('width')) + height = int_or_none(location.get('height')) + for source in location.get('sources', []): + source_url = source.get('src') + if not source_url: + continue + ext = mimetype2ext(source.get('type')) or determine_ext(source_url) + formats.append({ + 'format_id': format_id, + 'url': 
source_url, + 'width': width, + 'height': height, + 'ext': ext, + }) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'description': video_data.get('description'), + 'thumbnail': video_data.get('poster'), + 'duration': int_or_none(video_data.get('duration')), + 'uploader_id': video_data.get('account'), + 'formats': formats, + } diff --git a/youtube_dl/extractor/twentyfourvideo.py b/youtube_dl/extractor/twentyfourvideo.py index 4025edf..af92b71 100644 --- a/youtube_dl/extractor/twentyfourvideo.py +++ b/youtube_dl/extractor/twentyfourvideo.py @@ -12,32 +12,32 @@ from ..utils import ( class TwentyFourVideoIE(InfoExtractor): IE_NAME = '24video' - _VALID_URL = r'https?://(?:www\.)?24video\.net/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P<id>\d+)' + _VALID_URL = r'https?://(?:www\.)?24video\.(?:net|me|xxx)/(?:video/(?:view|xml)/|player/new24_play\.swf\?id=)(?P<id>\d+)' - _TESTS = [ - { - 'url': 'http://www.24video.net/video/view/1044982', - 'md5': 'e09fc0901d9eaeedac872f154931deeb', - 'info_dict': { - 'id': '1044982', - 'ext': 'mp4', - 'title': 'Эротика каменного века', - 'description': 'Как смотрели порно в каменном веке.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'SUPERTELO', - 'duration': 31, - 'timestamp': 1275937857, - 'upload_date': '20100607', - 'age_limit': 18, - 'like_count': int, - 'dislike_count': int, - }, + _TESTS = [{ + 'url': 'http://www.24video.net/video/view/1044982', + 'md5': 'e09fc0901d9eaeedac872f154931deeb', + 'info_dict': { + 'id': '1044982', + 'ext': 'mp4', + 'title': 'Эротика каменного века', + 'description': 'Как смотрели порно в каменном веке.', + 'thumbnail': 're:^https?://.*\.jpg$', + 'uploader': 'SUPERTELO', + 'duration': 31, + 'timestamp': 1275937857, + 'upload_date': '20100607', + 'age_limit': 18, + 'like_count': int, + 'dislike_count': int, }, - { - 'url': 'http://www.24video.net/player/new24_play.swf?id=1044982', - 'only_matching': True, - } - ] + }, { + 'url': 
'http://www.24video.net/player/new24_play.swf?id=1044982', + 'only_matching': True, + }, { + 'url': 'http://www.24video.me/video/view/1044982', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -64,7 +64,7 @@ class TwentyFourVideoIE(InfoExtractor): r'<span class="video-views">(\d+) просмотр', webpage, 'view count', fatal=False)) comment_count = int_or_none(self._html_search_regex( - r'<div class="comments-title" id="comments-count">(\d+) комментари', + r'<a[^>]+href="#tab-comments"[^>]*>(\d+) комментари', webpage, 'comment count', fatal=False)) # Sets some cookies diff --git a/youtube_dl/extractor/twitch.py b/youtube_dl/extractor/twitch.py index 2091977..890f551 100644 --- a/youtube_dl/extractor/twitch.py +++ b/youtube_dl/extractor/twitch.py @@ -29,7 +29,7 @@ class TwitchBaseIE(InfoExtractor): _VALID_URL_BASE = r'https?://(?:www\.)?twitch\.tv' _API_BASE = 'https://api.twitch.tv' - _USHER_BASE = 'http://usher.twitch.tv' + _USHER_BASE = 'https://usher.ttvnw.net' _LOGIN_URL = 'http://www.twitch.tv/login' _NETRC_MACHINE = 'twitch' @@ -461,7 +461,7 @@ class TwitchClipsIE(InfoExtractor): IE_NAME = 'twitch:clips' _VALID_URL = r'https?://clips\.twitch\.tv/(?:[^/]+/)*(?P<id>[^/?#&]+)' - _TEST = { + _TESTS = [{ 'url': 'https://clips.twitch.tv/ea/AggressiveCobraPoooound', 'md5': '761769e1eafce0ffebfb4089cb3847cd', 'info_dict': { @@ -473,7 +473,11 @@ class TwitchClipsIE(InfoExtractor): 'uploader': 'stereotype_', 'uploader_id': 'stereotype_', }, - } + }, { + # multiple formats + 'url': 'https://clips.twitch.tv/rflegendary/UninterestedBeeDAESuppy', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) @@ -485,15 +489,27 @@ class TwitchClipsIE(InfoExtractor): r'(?s)clipInfo\s*=\s*({.+?});', webpage, 'clip info'), video_id, transform_source=js_to_json) - video_url = clip['clip_video_url'] - title = clip['channel_title'] + title = clip.get('channel_title') or self._og_search_title(webpage) + + formats 
= [{ + 'url': option['source'], + 'format_id': option.get('quality'), + 'height': int_or_none(option.get('quality')), + } for option in clip.get('quality_options', []) if option.get('source')] + + if not formats: + formats = [{ + 'url': clip['clip_video_url'], + }] + + self._sort_formats(formats) return { 'id': video_id, - 'url': video_url, 'title': title, 'thumbnail': self._og_search_thumbnail(webpage), 'creator': clip.get('broadcaster_display_name') or clip.get('broadcaster_login'), 'uploader': clip.get('curator_login'), 'uploader_id': clip.get('curator_display_name'), + 'formats': formats, } diff --git a/youtube_dl/extractor/uol.py b/youtube_dl/extractor/uol.py new file mode 100644 index 0000000..c27c643 --- /dev/null +++ b/youtube_dl/extractor/uol.py @@ -0,0 +1,128 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import ( + clean_html, + int_or_none, + parse_duration, + update_url_query, + str_or_none, +) + + +class UOLIE(InfoExtractor): + IE_NAME = 'uol.com.br' + _VALID_URL = r'https?://(?:.+?\.)?uol\.com\.br/.*?(?:(?:mediaId|v)=|view/(?:[a-z0-9]+/)?|video(?:=|/(?:\d{4}/\d{2}/\d{2}/)?))(?P<id>\d+|[\w-]+-[A-Z0-9]+)' + _TESTS = [{ + 'url': 'http://player.mais.uol.com.br/player_video_v3.swf?mediaId=15951931', + 'md5': '25291da27dc45e0afb5718a8603d3816', + 'info_dict': { + 'id': '15951931', + 'ext': 'mp4', + 'title': 'Miss simpatia é encontrada morta', + 'description': 'md5:3f8c11a0c0556d66daf7e5b45ef823b2', + } + }, { + 'url': 'http://tvuol.uol.com.br/video/incendio-destroi-uma-das-maiores-casas-noturnas-de-londres-04024E9A3268D4C95326', + 'md5': 'e41a2fb7b7398a3a46b6af37b15c00c9', + 'info_dict': { + 'id': '15954259', + 'ext': 'mp4', + 'title': 'Incêndio destrói uma das maiores casas noturnas de Londres', + 'description': 'Em Londres, um incêndio destruiu uma das maiores boates da cidade. 
Não há informações sobre vítimas.', + } + }, { + 'url': 'http://mais.uol.com.br/static/uolplayer/index.html?mediaId=15951931', + 'only_matching': True, + }, { + 'url': 'http://mais.uol.com.br/view/15954259', + 'only_matching': True, + }, { + 'url': 'http://noticias.band.uol.com.br/brasilurgente/video/2016/08/05/15951931/miss-simpatia-e-encontrada-morta.html', + 'only_matching': True, + }, { + 'url': 'http://videos.band.uol.com.br/programa.asp?e=noticias&pr=brasil-urgente&v=15951931&t=Policia-desmonte-base-do-PCC-na-Cracolandia', + 'only_matching': True, + }, { + 'url': 'http://mais.uol.com.br/view/cphaa0gl2x8r/incendio-destroi-uma-das-maiores-casas-noturnas-de-londres-04024E9A3268D4C95326', + 'only_matching': True, + }, { + 'url': 'http://noticias.uol.com.br//videos/assistir.htm?video=rafaela-silva-inspira-criancas-no-judo-04024D983968D4C95326', + 'only_matching': True, + }, { + 'url': 'http://mais.uol.com.br/view/e0qbgxid79uv/15275470', + 'only_matching': True, + }] + + _FORMATS = { + '2': { + 'width': 640, + 'height': 360, + }, + '5': { + 'width': 1080, + 'height': 720, + }, + '6': { + 'width': 426, + 'height': 240, + }, + '7': { + 'width': 1920, + 'height': 1080, + }, + '8': { + 'width': 192, + 'height': 144, + }, + '9': { + 'width': 568, + 'height': 320, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + if not video_id.isdigit(): + embed_page = self._download_webpage('https://jsuol.com.br/c/tv/uol/embed/?params=[embed,%s]' % video_id, video_id) + video_id = self._search_regex(r'mediaId=(\d+)', embed_page, 'media id') + video_data = self._download_json( + 'http://mais.uol.com.br/apiuol/v3/player/getMedia/%s.json' % video_id, + video_id)['item'] + title = video_data['title'] + + query = { + 'ver': video_data.get('numRevision', 2), + 'r': 'http://mais.uol.com.br', + } + formats = [] + for f in video_data.get('formats', []): + f_url = f.get('url') or f.get('secureUrl') + if not f_url: + continue + format_id = str_or_none(f.get('id')) + 
fmt = { + 'format_id': format_id, + 'url': update_url_query(f_url, query), + } + fmt.update(self._FORMATS.get(format_id, {})) + formats.append(fmt) + self._sort_formats(formats) + + tags = [] + for tag in video_data.get('tags', []): + tag_description = tag.get('description') + if not tag_description: + continue + tags.append(tag_description) + + return { + 'id': video_id, + 'title': title, + 'description': clean_html(video_data.get('desMedia')), + 'thumbnail': video_data.get('thumbnail'), + 'duration': int_or_none(video_data.get('durationSeconds')) or parse_duration(video_data.get('duration')), + 'tags': tags, + 'formats': formats, + } diff --git a/youtube_dl/extractor/uplynk.py b/youtube_dl/extractor/uplynk.py new file mode 100644 index 0000000..ae529f6 --- /dev/null +++ b/youtube_dl/extractor/uplynk.py @@ -0,0 +1,70 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from .common import InfoExtractor +from ..utils import ( + float_or_none, + ExtractorError, +) + + +class UplynkIE(InfoExtractor): + IE_NAME = 'uplynk' + _VALID_URL = r'https?://.*?\.uplynk\.com/(?P<path>ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)|(?P<id>[0-9a-f]{32}))\.(?:m3u8|json)(?:.*?\bpbs=(?P<session_id>[^&]+))?' 
+ _TEST = { + 'url': 'http://content.uplynk.com/e89eaf2ce9054aa89d92ddb2d817a52e.m3u8', + 'info_dict': { + 'id': 'e89eaf2ce9054aa89d92ddb2d817a52e', + 'ext': 'mp4', + 'title': '030816-kgo-530pm-solar-eclipse-vid_web.mp4', + 'uploader_id': '4413701bf5a1488db55b767f8ae9d4fa', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + } + + def _extract_uplynk_info(self, uplynk_content_url): + path, external_id, video_id, session_id = re.match(UplynkIE._VALID_URL, uplynk_content_url).groups() + display_id = video_id or external_id + formats = self._extract_m3u8_formats('http://content.uplynk.com/%s.m3u8' % path, display_id, 'mp4') + if session_id: + for f in formats: + f['extra_param_to_segment_url'] = { + 'pbs': session_id, + } + self._sort_formats(formats) + asset = self._download_json('http://content.uplynk.com/player/assetinfo/%s.json' % path, display_id) + if asset.get('error') == 1: + raise ExtractorError('% said: %s' % (self.IE_NAME, asset['msg']), expected=True) + + return { + 'id': asset['asset'], + 'title': asset['desc'], + 'thumbnail': asset.get('default_poster_url'), + 'duration': float_or_none(asset.get('duration')), + 'uploader_id': asset.get('owner'), + 'formats': formats, + } + + def _real_extract(self, url): + return self._extract_uplynk_info(url) + + +class UplynkPreplayIE(UplynkIE): + IE_NAME = 'uplynk:preplay' + _VALID_URL = r'https?://.*?\.uplynk\.com/preplay2?/(?P<path>ext/[0-9a-f]{32}/(?P<external_id>[^/?&]+)|(?P<id>[0-9a-f]{32}))\.json' + _TEST = None + + def _real_extract(self, url): + path, external_id, video_id = re.match(self._VALID_URL, url).groups() + display_id = video_id or external_id + preplay = self._download_json(url, display_id) + content_url = 'http://content.uplynk.com/%s.m3u8' % path + session_id = preplay.get('sid') + if session_id: + content_url += '?pbs=' + session_id + return self._extract_uplynk_info(content_url) diff --git a/youtube_dl/extractor/urplay.py b/youtube_dl/extractor/urplay.py new file mode 100644 
index 0000000..ce3bf6b --- /dev/null +++ b/youtube_dl/extractor/urplay.py @@ -0,0 +1,67 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor + + +class URPlayIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?urplay\.se/program/(?P<id>[0-9]+)' + _TEST = { + 'url': 'http://urplay.se/program/190031-tripp-trapp-trad-sovkudde', + 'md5': '15ca67b63fd8fb320ac2bcd854bad7b6', + 'info_dict': { + 'id': '190031', + 'ext': 'mp4', + 'title': 'Tripp, Trapp, Träd : Sovkudde', + 'description': 'md5:b86bffdae04a7e9379d1d7e5947df1d1', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + urplayer_data = self._parse_json(self._search_regex( + r'urPlayer\.init\(({.+?})\);', webpage, 'urplayer data'), video_id) + host = self._download_json('http://streaming-loadbalancer.ur.se/loadbalancer.json', video_id)['redirect'] + + formats = [] + for quality_attr, quality, preference in (('', 'sd', 0), ('_hd', 'hd', 1)): + file_rtmp = urplayer_data.get('file_rtmp' + quality_attr) + if file_rtmp: + formats.append({ + 'url': 'rtmp://%s/urplay/mp4:%s' % (host, file_rtmp), + 'format_id': quality + '-rtmp', + 'ext': 'flv', + 'preference': preference, + }) + file_http = urplayer_data.get('file_http' + quality_attr) or urplayer_data.get('file_http_sub' + quality_attr) + if file_http: + file_http_base_url = 'http://%s/%s' % (host, file_http) + formats.extend(self._extract_f4m_formats( + file_http_base_url + 'manifest.f4m', video_id, + preference, '%s-hds' % quality, fatal=False)) + formats.extend(self._extract_m3u8_formats( + file_http_base_url + 'playlist.m3u8', video_id, 'mp4', + 'm3u8_native', preference, '%s-hls' % quality, fatal=False)) + self._sort_formats(formats) + + subtitles = {} + for subtitle in urplayer_data.get('subtitles', []): + subtitle_url = subtitle.get('file') + kind = subtitle.get('kind') + if subtitle_url or kind and kind != 'captions': + continue + 
subtitles.setdefault(subtitle.get('label', 'Svenska'), []).append({ + 'url': subtitle_url, + }) + + return { + 'id': video_id, + 'title': urplayer_data['title'], + 'description': self._og_search_description(webpage), + 'thumbnail': urplayer_data.get('image'), + 'series': urplayer_data.get('series_title'), + 'subtitles': subtitles, + 'formats': formats, + } diff --git a/youtube_dl/extractor/vbox7.py b/youtube_dl/extractor/vbox7.py index dff1bb7..e179885 100644 --- a/youtube_dl/extractor/vbox7.py +++ b/youtube_dl/extractor/vbox7.py @@ -1,18 +1,23 @@ # encoding: utf-8 from __future__ import unicode_literals +import re + from .common import InfoExtractor -from ..compat import compat_urlparse -from ..utils import ( - ExtractorError, - sanitized_Request, - urlencode_postdata, -) +from ..utils import urlencode_postdata class Vbox7IE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?vbox7\.com/play:(?P<id>[^/]+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?vbox7\.com/(?:play:|emb/external\.php\?.*?\bvid=)(?P<id>[\da-fA-F]+)' + _TESTS = [{ + 'url': 'http://vbox7.com/play:0946fff23c', + 'md5': 'a60f9ab3a3a2f013ef9a967d5f7be5bf', + 'info_dict': { + 'id': '0946fff23c', + 'ext': 'mp4', + 'title': 'Борисов: Притеснен съм за бъдещето на България', + }, + }, { 'url': 'http://vbox7.com/play:249bb972c2', 'md5': '99f65c0c9ef9b682b97313e052734c3f', 'info_dict': { @@ -20,43 +25,50 @@ class Vbox7IE(InfoExtractor): 'ext': 'mp4', 'title': 'Смях! 
Чудо - чист за секунди - Скрита камера', }, - } + 'skip': 'georestricted', + }, { + 'url': 'http://vbox7.com/emb/external.php?vid=a240d20f9c&autoplay=1', + 'only_matching': True, + }] + + @staticmethod + def _extract_url(webpage): + mobj = re.search( + '<iframe[^>]+src=(?P<q>["\'])(?P<url>(?:https?:)?//vbox7\.com/emb/external\.php.+?)(?P=q)', + webpage) + if mobj: + return mobj.group('url') def _real_extract(self, url): video_id = self._match_id(url) - # need to get the page 3 times for the correct jsSecretToken cookie - # which is necessary for the correct title - def get_session_id(): - redirect_page = self._download_webpage(url, video_id) - session_id_url = self._search_regex( - r'var\s*url\s*=\s*\'([^\']+)\';', redirect_page, - 'session id url') - self._download_webpage( - compat_urlparse.urljoin(url, session_id_url), video_id, - 'Getting session id') - - get_session_id() - get_session_id() - - webpage = self._download_webpage(url, video_id, - 'Downloading redirect page') - - title = self._html_search_regex(r'<title>(.*)</title>', - webpage, 'title').split('/')[0].strip() - - info_url = 'http://vbox7.com/play/magare.do' - data = urlencode_postdata({'as3': '1', 'vid': video_id}) - info_request = sanitized_Request(info_url, data) - info_request.add_header('Content-Type', 'application/x-www-form-urlencoded') - info_response = self._download_webpage(info_request, video_id, 'Downloading info webpage') - if info_response is None: - raise ExtractorError('Unable to extract the media url') - (final_url, thumbnail_url) = map(lambda x: x.split('=')[1], info_response.split('&')) + webpage = self._download_webpage( + 'http://vbox7.com/play:%s' % video_id, video_id) + + title = self._html_search_regex( + r'<title>(.+?)</title>', webpage, 'title').split('/')[0].strip() + + video_url = self._search_regex( + r'src\s*:\s*(["\'])(?P<url>.+?.mp4.*?)\1', + webpage, 'video url', default=None, group='url') + + thumbnail_url = self._og_search_thumbnail(webpage) + + if not video_url: + 
info_response = self._download_webpage( + 'http://vbox7.com/play/magare.do', video_id, + 'Downloading info webpage', + data=urlencode_postdata({'as3': '1', 'vid': video_id}), + headers={'Content-Type': 'application/x-www-form-urlencoded'}) + final_url, thumbnail_url = map( + lambda x: x.split('=')[1], info_response.split('&')) + + if '/na.mp4' in video_url: + self.raise_geo_restricted() return { 'id': video_id, - 'url': final_url, + 'url': self._proto_relative_url(video_url, 'http:'), 'title': title, 'thumbnail': thumbnail_url, } diff --git a/youtube_dl/extractor/vgtv.py b/youtube_dl/extractor/vgtv.py index b11cd25..1857563 100644 --- a/youtube_dl/extractor/vgtv.py +++ b/youtube_dl/extractor/vgtv.py @@ -8,6 +8,7 @@ from .xstream import XstreamIE from ..utils import ( ExtractorError, float_or_none, + try_get, ) @@ -129,6 +130,11 @@ class VGTVIE(XstreamIE): 'url': 'http://ap.vgtv.no/webtv#!/video/111084/de-nye-bysyklene-lettere-bedre-gir-stoerre-hjul-og-feste-til-mobil', 'only_matching': True, }, + { + # geoblocked + 'url': 'http://www.vgtv.no/#!/video/127205/inside-the-mind-of-favela-funk', + 'only_matching': True, + }, ] def _real_extract(self, url): @@ -196,6 +202,12 @@ class VGTVIE(XstreamIE): info['formats'].extend(formats) + if not info['formats']: + properties = try_get( + data, lambda x: x['streamConfiguration']['properties'], list) + if properties and 'geoblocked' in properties: + raise self.raise_geo_restricted() + self._sort_formats(info['formats']) info.update({ diff --git a/youtube_dl/extractor/viceland.py b/youtube_dl/extractor/viceland.py new file mode 100644 index 0000000..8742b60 --- /dev/null +++ b/youtube_dl/extractor/viceland.py @@ -0,0 +1,107 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import time +import hashlib +import json + +from .adobepass import AdobePassIE +from ..compat import compat_HTTPError +from ..utils import ( + int_or_none, + parse_age_limit, + str_or_none, + parse_duration, + ExtractorError, + 
extract_attributes, +) + + +class VicelandIE(AdobePassIE): + _VALID_URL = r'https?://(?:www\.)?viceland\.com/[^/]+/video/[^/]+/(?P<id>[a-f0-9]+)' + _TEST = { + 'url': 'https://www.viceland.com/en_us/video/cyberwar-trailer/57608447973ee7705f6fbd4e', + 'info_dict': { + 'id': '57608447973ee7705f6fbd4e', + 'ext': 'mp4', + 'title': 'CYBERWAR (Trailer)', + 'description': 'Tapping into the geopolitics of hacking and surveillance, Ben Makuch travels the world to meet with hackers, government officials, and dissidents to investigate the ecosystem of cyberwarfare.', + 'age_limit': 14, + 'timestamp': 1466008539, + 'upload_date': '20160615', + 'uploader_id': '11', + 'uploader': 'Viceland', + }, + 'params': { + # m3u8 download + 'skip_download': True, + }, + 'add_ie': ['UplynkPreplay'], + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + watch_hub_data = extract_attributes(self._search_regex( + r'(?s)(<watch-hub\s*.+?</watch-hub>)', webpage, 'watch hub')) + video_id = watch_hub_data['vms-id'] + title = watch_hub_data['video-title'] + + query = {} + if watch_hub_data.get('video-locked') == '1': + resource = self._get_mvpd_resource( + 'VICELAND', title, video_id, + watch_hub_data.get('video-rating')) + query['tvetoken'] = self._extract_mvpd_auth(url, video_id, 'VICELAND', resource) + + # signature generation algorithm is reverse engineered from signatureGenerator in + # webpack:///../shared/~/vice-player/dist/js/vice-player.js in + # https://www.viceland.com/assets/common/js/web.vendor.bundle.js + exp = int(time.time()) + 14400 + query.update({ + 'exp': exp, + 'sign': hashlib.sha512(('%s:GET:%d' % (video_id, exp)).encode()).hexdigest(), + }) + + try: + preplay = self._download_json('https://www.viceland.com/en_us/preplay/%s' % video_id, video_id, query=query) + except ExtractorError as e: + if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400: + error = json.loads(e.cause.read().decode()) + raise 
ExtractorError('%s said: %s' % (self.IE_NAME, error['details']), expected=True) + raise + + video_data = preplay['video'] + base = video_data['base'] + uplynk_preplay_url = preplay['preplayURL'] + episode = video_data.get('episode', {}) + channel = video_data.get('channel', {}) + + subtitles = {} + cc_url = preplay.get('ccURL') + if cc_url: + subtitles['en'] = [{ + 'url': cc_url, + }] + + return { + '_type': 'url_transparent', + 'url': uplynk_preplay_url, + 'id': video_id, + 'title': title, + 'description': base.get('body'), + 'thumbnail': watch_hub_data.get('cover-image') or watch_hub_data.get('thumbnail'), + 'duration': parse_duration(video_data.get('video_duration') or watch_hub_data.get('video-duration')), + 'timestamp': int_or_none(video_data.get('created_at')), + 'age_limit': parse_age_limit(video_data.get('video_rating')), + 'series': video_data.get('show_title') or watch_hub_data.get('show-title'), + 'episode_number': int_or_none(episode.get('episode_number') or watch_hub_data.get('episode')), + 'episode_id': str_or_none(episode.get('id') or video_data.get('episode_id')), + 'season_number': int_or_none(watch_hub_data.get('season')), + 'season_id': str_or_none(episode.get('season_id')), + 'uploader': channel.get('base', {}).get('title') or watch_hub_data.get('channel-title'), + 'uploader_id': str_or_none(channel.get('id')), + 'subtitles': subtitles, + 'ie_key': 'UplynkPreplay', + } diff --git a/youtube_dl/extractor/vidbit.py b/youtube_dl/extractor/vidbit.py new file mode 100644 index 0000000..e7ac5a8 --- /dev/null +++ b/youtube_dl/extractor/vidbit.py @@ -0,0 +1,84 @@ +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..compat import compat_urlparse +from ..utils import ( + int_or_none, + js_to_json, + remove_end, + unified_strdate, +) + + +class VidbitIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vidbit\.co/(?:watch|embed)\?.*?\bv=(?P<id>[\da-zA-Z]+)' + _TESTS = [{ + 'url': 'http://www.vidbit.co/watch?v=jkL2yDOEq2', 
+ 'md5': '1a34b7f14defe3b8fafca9796892924d', + 'info_dict': { + 'id': 'jkL2yDOEq2', + 'ext': 'mp4', + 'title': 'Intro to VidBit', + 'description': 'md5:5e0d6142eec00b766cbf114bfd3d16b7', + 'thumbnail': 're:https?://.*\.jpg$', + 'upload_date': '20160618', + 'view_count': int, + 'comment_count': int, + } + }, { + 'url': 'http://www.vidbit.co/embed?v=jkL2yDOEq2&auto=0&water=0', + 'only_matching': True, + }] + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage( + compat_urlparse.urljoin(url, '/watch?v=%s' % video_id), video_id) + + video_url, title = [None] * 2 + + config = self._parse_json(self._search_regex( + r'(?s)\.setup\(({.+?})\);', webpage, 'setup', default='{}'), + video_id, transform_source=js_to_json) + if config: + if config.get('file'): + video_url = compat_urlparse.urljoin(url, config['file']) + title = config.get('title') + + if not video_url: + video_url = compat_urlparse.urljoin(url, self._search_regex( + r'file\s*:\s*(["\'])(?P<url>(?:(?!\1).)+)\1', + webpage, 'video URL', group='url')) + + if not title: + title = remove_end( + self._html_search_regex( + (r'<h1>(.+?)</h1>', r'<title>(.+?)</title>'), + webpage, 'title', default=None) or self._og_search_title(webpage), + ' - VidBit') + + description = self._html_search_meta( + ('description', 'og:description', 'twitter:description'), + webpage, 'description') + + upload_date = unified_strdate(self._html_search_meta( + 'datePublished', webpage, 'upload date')) + + view_count = int_or_none(self._search_regex( + r'<strong>(\d+)</strong> views', + webpage, 'view count', fatal=False)) + comment_count = int_or_none(self._search_regex( + r'id=["\']cmt_num["\'][^>]*>\((\d+)\)', + webpage, 'comment count', fatal=False)) + + return { + 'id': video_id, + 'url': video_url, + 'title': title, + 'description': description, + 'thumbnail': self._og_search_thumbnail(webpage), + 'upload_date': upload_date, + 'view_count': view_count, + 'comment_count': comment_count, + } 
diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py index 3c78fb3..d49cc6c 100644 --- a/youtube_dl/extractor/vidzi.py +++ b/youtube_dl/extractor/vidzi.py @@ -9,8 +9,8 @@ from ..utils import ( class VidziIE(JWPlatformBaseIE): - _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?P<id>\w+)' - _TEST = { + _VALID_URL = r'https?://(?:www\.)?vidzi\.tv/(?:embed-)?(?P<id>[0-9a-zA-Z]+)' + _TESTS = [{ 'url': 'http://vidzi.tv/cghql9yq6emu.html', 'md5': '4f16c71ca0c8c8635ab6932b5f3f1660', 'info_dict': { @@ -22,12 +22,16 @@ class VidziIE(JWPlatformBaseIE): # m3u8 download 'skip_download': True, }, - } + }, { + 'url': 'http://vidzi.tv/embed-4z2yb0rzphe9-600x338.html', + 'skip_download': True, + }] def _real_extract(self, url): video_id = self._match_id(url) - webpage = self._download_webpage(url, video_id) + webpage = self._download_webpage( + 'http://vidzi.tv/%s' % video_id, video_id) title = self._html_search_regex( r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title') diff --git a/youtube_dl/extractor/viki.py b/youtube_dl/extractor/viki.py index efa15e0..4351ac4 100644 --- a/youtube_dl/extractor/viki.py +++ b/youtube_dl/extractor/viki.py @@ -130,7 +130,7 @@ class VikiIE(VikiBaseIE): }, { # clip 'url': 'http://www.viki.com/videos/1067139v-the-avengers-age-of-ultron-press-conference', - 'md5': 'feea2b1d7b3957f70886e6dfd8b8be84', + 'md5': '86c0b5dbd4d83a6611a79987cc7a1989', 'info_dict': { 'id': '1067139v', 'ext': 'mp4', @@ -156,15 +156,11 @@ class VikiIE(VikiBaseIE): 'like_count': int, 'age_limit': 13, }, - 'params': { - # m3u8 download - 'skip_download': True, - }, 'skip': 'Blocked in the US', }, { # episode 'url': 'http://www.viki.com/videos/44699v-boys-over-flowers-episode-1', - 'md5': '1f54697dabc8f13f31bf06bb2e4de6db', + 'md5': '5fa476a902e902783ac7a4d615cdbc7a', 'info_dict': { 'id': '44699v', 'ext': 'mp4', @@ -200,7 +196,7 @@ class VikiIE(VikiBaseIE): }, { # non-English description 'url': 'http://www.viki.com/videos/158036v-love-in-magic', - 'md5': 
'013dc282714e22acf9447cad14ff1208', + 'md5': '1713ae35df5a521b31f6dc40730e7c9c', 'info_dict': { 'id': '158036v', 'ext': 'mp4', @@ -281,9 +277,16 @@ class VikiIE(VikiBaseIE): r'^(\d+)[pP]$', format_id, 'height', default=None)) for protocol, format_dict in stream_dict.items(): if format_id == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_dict['url'], video_id, 'mp4', 'm3u8_native', - m3u8_id='m3u8-%s' % protocol, fatal=False)) + m3u8_formats = self._extract_m3u8_formats( + format_dict['url'], video_id, 'mp4', + entry_protocol='m3u8_native', preference=-1, + m3u8_id='m3u8-%s' % protocol, fatal=False) + # Despite CODECS metadata in m3u8 all video-only formats + # are actually video+audio + for f in m3u8_formats: + if f.get('acodec') == 'none' and f.get('vcodec') != 'none': + f['acodec'] = None + formats.extend(m3u8_formats) else: formats.append({ 'url': format_dict['url'], diff --git a/youtube_dl/extractor/vimeo.py b/youtube_dl/extractor/vimeo.py index d9c9852..7e854f3 100644 --- a/youtube_dl/extractor/vimeo.py +++ b/youtube_dl/extractor/vimeo.py @@ -364,6 +364,11 @@ class VimeoIE(VimeoBaseInfoExtractor): r'<embed[^>]+?src="((?:https?:)?//(?:www\.)?vimeo\.com/moogaloop\.swf.+?)"', webpage) if mobj: return mobj.group(1) + # Look more for non-standard embedded Vimeo player + mobj = re.search( + r'<video[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//(?:www\.)?vimeo\.com/[0-9]+)(?P=q1)', webpage) + if mobj: + return mobj.group('url') def _verify_player_video_password(self, url, video_id): password = self._downloader.params.get('videopassword') diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py index 5b80184..0183f05 100644 --- a/youtube_dl/extractor/vine.py +++ b/youtube_dl/extractor/vine.py @@ -90,10 +90,12 @@ class VineIE(InfoExtractor): data = self._parse_json( self._search_regex( - r'window\.POST_DATA\s*=\s*{\s*%s\s*:\s*({.+?})\s*};\s*</script>' % video_id, + r'window\.POST_DATA\s*=\s*({.+?});\s*</script>', webpage, 'vine data'), 
video_id) + data = data[list(data.keys())[0]] + formats = [{ 'format_id': '%(format)s-%(rate)s' % f, 'vcodec': f.get('format'), diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py index cfc5ffd..3ee66e2 100644 --- a/youtube_dl/extractor/vk.py +++ b/youtube_dl/extractor/vk.py @@ -6,11 +6,18 @@ import json import sys from .common import InfoExtractor -from ..compat import compat_str +from ..compat import ( + compat_str, + compat_urlparse, +) from ..utils import ( + clean_html, ExtractorError, + get_element_by_class, int_or_none, orderedSet, + parse_duration, + remove_start, str_to_int, unescapeHTML, unified_strdate, @@ -20,26 +27,72 @@ from .vimeo import VimeoIE from .pladform import PladformIE -class VKIE(InfoExtractor): +class VKBaseIE(InfoExtractor): + _NETRC_MACHINE = 'vk' + + def _login(self): + (username, password) = self._get_login_info() + if username is None: + return + + login_page, url_handle = self._download_webpage_handle( + 'https://vk.com', None, 'Downloading login page') + + login_form = self._hidden_inputs(login_page) + + login_form.update({ + 'email': username.encode('cp1251'), + 'pass': password.encode('cp1251'), + }) + + # https://new.vk.com/ serves two same remixlhk cookies in Set-Cookie header + # and expects the first one to be set rather than second (see + # https://github.com/rg3/youtube-dl/issues/9841#issuecomment-227871201). + # As of RFC6265 the newer one cookie should be set into cookie store + # what actually happens. + # We will workaround this VK issue by resetting the remixlhk cookie to + # the first one manually. 
+ cookies = url_handle.headers.get('Set-Cookie') + if cookies: + if sys.version_info[0] >= 3: + cookies = cookies.encode('iso-8859-1') + cookies = cookies.decode('utf-8') + remixlhk = re.search(r'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies) + if remixlhk: + value, domain = remixlhk.groups() + self._set_cookie(domain, 'remixlhk', value) + + login_page = self._download_webpage( + 'https://login.vk.com/?act=login', None, + note='Logging in as %s' % username, + data=urlencode_postdata(login_form)) + + if re.search(r'onLoginFailed', login_page): + raise ExtractorError( + 'Unable to login, incorrect username and/or password', expected=True) + + def _real_initialize(self): + self._login() + + +class VKIE(VKBaseIE): IE_NAME = 'vk' IE_DESC = 'VK' _VALID_URL = r'''(?x) https?:// (?: (?: - (?:m\.)?vk\.com/video_| + (?:(?:m|new)\.)?vk\.com/video_| (?:www\.)?daxab.com/ ) ext\.php\?(?P<embed_query>.*?\boid=(?P<oid>-?\d+).*?\bid=(?P<id>\d+).*)| (?: - (?:m\.)?vk\.com/(?:.+?\?.*?z=)?video| + (?:(?:m|new)\.)?vk\.com/(?:.+?\?.*?z=)?video| (?:www\.)?daxab.com/embed/ ) (?P<videoid>-?\d+_\d+)(?:.*\blist=(?P<list_id>[\da-f]+))? 
) ''' - _NETRC_MACHINE = 'vk' - _TESTS = [ { 'url': 'http://vk.com/videos-77521?z=video-77521_162222515%2Fclub77521', @@ -182,52 +235,13 @@ class VKIE(InfoExtractor): # pladform embed 'url': 'https://vk.com/video-76116461_171554880', 'only_matching': True, + }, + { + 'url': 'http://new.vk.com/video205387401_165548505', + 'only_matching': True, } ] - def _login(self): - (username, password) = self._get_login_info() - if username is None: - return - - login_page, url_handle = self._download_webpage_handle( - 'https://vk.com', None, 'Downloading login page') - - login_form = self._hidden_inputs(login_page) - - login_form.update({ - 'email': username.encode('cp1251'), - 'pass': password.encode('cp1251'), - }) - - # https://new.vk.com/ serves two same remixlhk cookies in Set-Cookie header - # and expects the first one to be set rather than second (see - # https://github.com/rg3/youtube-dl/issues/9841#issuecomment-227871201). - # As of RFC6265 the newer one cookie should be set into cookie store - # what actually happens. - # We will workaround this VK issue by resetting the remixlhk cookie to - # the first one manually. 
- cookies = url_handle.headers.get('Set-Cookie') - if sys.version_info[0] >= 3: - cookies = cookies.encode('iso-8859-1') - cookies = cookies.decode('utf-8') - remixlhk = re.search(r'remixlhk=(.+?);.*?\bdomain=(.+?)(?:[,;]|$)', cookies) - if remixlhk: - value, domain = remixlhk.groups() - self._set_cookie(domain, 'remixlhk', value) - - login_page = self._download_webpage( - 'https://login.vk.com/?act=login', None, - note='Logging in as %s' % username, - data=urlencode_postdata(login_form)) - - if re.search(r'onLoginFailed', login_page): - raise ExtractorError( - 'Unable to login, incorrect username and/or password', expected=True) - - def _real_initialize(self): - self._login() - def _real_extract(self, url): mobj = re.match(self._VALID_URL, url) video_id = mobj.group('videoid') @@ -351,10 +365,10 @@ class VKIE(InfoExtractor): } -class VKUserVideosIE(InfoExtractor): +class VKUserVideosIE(VKBaseIE): IE_NAME = 'vk:uservideos' IE_DESC = "VK - User's Videos" - _VALID_URL = r'https?://vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)' + _VALID_URL = r'https?://(?:(?:m|new)\.)?vk\.com/videos(?P<id>-?[0-9]+)(?!\?.*\bz=video)(?:[/?#&]|$)' _TEMPLATE_URL = 'https://vk.com/videos' _TESTS = [{ 'url': 'http://vk.com/videos205387401', @@ -369,6 +383,12 @@ class VKUserVideosIE(InfoExtractor): }, { 'url': 'http://vk.com/videos-97664626?section=all', 'only_matching': True, + }, { + 'url': 'http://m.vk.com/videos205387401', + 'only_matching': True, + }, { + 'url': 'http://new.vk.com/videos205387401', + 'only_matching': True, }] def _real_extract(self, url): @@ -386,3 +406,121 @@ class VKUserVideosIE(InfoExtractor): webpage, 'title', default=page_id)) return self.playlist_result(entries, page_id, title) + + +class VKWallPostIE(VKBaseIE): + IE_NAME = 'vk:wallpost' + _VALID_URL = r'https?://(?:(?:(?:(?:m|new)\.)?vk\.com/(?:[^?]+\?.*\bw=)?wall(?P<id>-?\d+_\d+)))' + _TESTS = [{ + # public page URL, audio playlist + 'url': 'https://vk.com/bs.official?w=wall-23538238_35', + 
'info_dict': { + 'id': '23538238_35', + 'title': 'Black Shadow - Wall post 23538238_35', + 'description': 'md5:3f84b9c4f9ef499731cf1ced9998cc0c', + }, + 'playlist': [{ + 'md5': '5ba93864ec5b85f7ce19a9af4af080f6', + 'info_dict': { + 'id': '135220665_111806521', + 'ext': 'mp3', + 'title': 'Black Shadow - Слепое Верование', + 'duration': 370, + 'uploader': 'Black Shadow', + 'artist': 'Black Shadow', + 'track': 'Слепое Верование', + }, + }, { + 'md5': '4cc7e804579122b17ea95af7834c9233', + 'info_dict': { + 'id': '135220665_111802303', + 'ext': 'mp3', + 'title': 'Black Shadow - Война - Негасимое Бездны Пламя!', + 'duration': 423, + 'uploader': 'Black Shadow', + 'artist': 'Black Shadow', + 'track': 'Война - Негасимое Бездны Пламя!', + }, + 'params': { + 'skip_download': True, + }, + }], + 'skip': 'Requires vk account credentials', + }, { + # single YouTube embed, no leading - + 'url': 'https://vk.com/wall85155021_6319', + 'info_dict': { + 'id': '85155021_6319', + 'title': 'Sergey Gorbunov - Wall post 85155021_6319', + }, + 'playlist_count': 1, + 'skip': 'Requires vk account credentials', + }, { + # wall page URL + 'url': 'https://vk.com/wall-23538238_35', + 'only_matching': True, + }, { + # mobile wall page URL + 'url': 'https://m.vk.com/wall-23538238_35', + 'only_matching': True, + }] + + def _real_extract(self, url): + post_id = self._match_id(url) + + wall_url = 'https://vk.com/wall%s' % post_id + + post_id = remove_start(post_id, '-') + + webpage = self._download_webpage(wall_url, post_id) + + error = self._html_search_regex( + r'>Error</div>\s*<div[^>]+class=["\']body["\'][^>]*>([^<]+)', + webpage, 'error', default=None) + if error: + raise ExtractorError('VK said: %s' % error, expected=True) + + description = clean_html(get_element_by_class('wall_post_text', webpage)) + uploader = clean_html(get_element_by_class( + 'fw_post_author', webpage)) or self._og_search_description(webpage) + thumbnail = self._og_search_thumbnail(webpage) + + entries = [] + + for audio in 
re.finditer(r'''(?sx) + <input[^>]+ + id=(?P<q1>["\'])audio_info(?P<id>\d+_\d+).*?(?P=q1)[^>]+ + value=(?P<q2>["\'])(?P<url>http.+?)(?P=q2) + .+? + </table>''', webpage): + audio_html = audio.group(0) + audio_id = audio.group('id') + duration = parse_duration(get_element_by_class('duration', audio_html)) + track = self._html_search_regex( + r'<span[^>]+id=["\']title%s[^>]*>([^<]+)' % audio_id, + audio_html, 'title', default=None) + artist = self._html_search_regex( + r'>([^<]+)</a></b>\s*&ndash', audio_html, + 'artist', default=None) + entries.append({ + 'id': audio_id, + 'url': audio.group('url'), + 'title': '%s - %s' % (artist, track) if artist and track else audio_id, + 'thumbnail': thumbnail, + 'duration': duration, + 'uploader': uploader, + 'artist': artist, + 'track': track, + }) + + for video in re.finditer( + r'<a[^>]+href=(["\'])(?P<url>/video(?:-?[\d_]+).*?)\1', webpage): + entries.append(self.url_result( + compat_urlparse.urljoin(url, video.group('url')), VKIE.ie_key())) + + title = 'Wall post %s' % post_id + + return self.playlist_result( + orderedSet(entries), post_id, + '%s - %s' % (uploader, title) if uploader else title, + description) diff --git a/youtube_dl/extractor/vodplatform.py b/youtube_dl/extractor/vodplatform.py new file mode 100644 index 0000000..b49542b --- /dev/null +++ b/youtube_dl/extractor/vodplatform.py @@ -0,0 +1,58 @@ +# coding: utf-8 +from __future__ import unicode_literals + +from .common import InfoExtractor +from ..utils import unescapeHTML + + +class VODPlatformIE(InfoExtractor): + _VALID_URL = r'https?://(?:www\.)?vod-platform\.net/embed/(?P<id>[^/?#]+)' + _TEST = { + # from http://www.lbcgroup.tv/watch/chapter/29143/52844/%D8%A7%D9%84%D9%86%D8%B5%D8%B1%D8%A9-%D9%81%D9%8A-%D8%B6%D9%8A%D8%A7%D9%81%D8%A9-%D8%A7%D9%84%D9%80-cnn/ar + 'url': 'http://vod-platform.net/embed/RufMcytHDolTH1MuKHY9Fw', + 'md5': '1db2b7249ce383d6be96499006e951fc', + 'info_dict': { + 'id': 'RufMcytHDolTH1MuKHY9Fw', + 'ext': 'mp4', + 'title': 'LBCi News_ 
النصرة في ضيافة الـ "سي.أن.أن"', + } + } + + def _real_extract(self, url): + video_id = self._match_id(url) + webpage = self._download_webpage(url, video_id) + + title = unescapeHTML(self._og_search_title(webpage)) + hidden_inputs = self._hidden_inputs(webpage) + + base_url = self._search_regex( + '(.*/)(?:playlist.m3u8|manifest.mpd)', + hidden_inputs.get('HiddenmyhHlsLink') or hidden_inputs['HiddenmyDashLink'], + 'base url') + formats = self._extract_m3u8_formats( + base_url + 'playlist.m3u8', video_id, 'mp4', + 'm3u8_native', m3u8_id='hls', fatal=False) + formats.extend(self._extract_mpd_formats( + base_url + 'manifest.mpd', video_id, + mpd_id='dash', fatal=False)) + rtmp_formats = self._extract_smil_formats( + base_url + 'jwplayer.smil', video_id, fatal=False) + for rtmp_format in rtmp_formats: + rtsp_format = rtmp_format.copy() + rtsp_format['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) + del rtsp_format['play_path'] + del rtsp_format['ext'] + rtsp_format.update({ + 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'), + 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), + 'protocol': 'rtsp', + }) + formats.extend([rtmp_format, rtsp_format]) + self._sort_formats(formats) + + return { + 'id': video_id, + 'title': title, + 'thumbnail': hidden_inputs.get('HiddenThumbnail') or self._og_search_thumbnail(webpage), + 'formats': formats, + } diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py index 8e35f24..bec7ab3 100644 --- a/youtube_dl/extractor/vrt.py +++ b/youtube_dl/extractor/vrt.py @@ -25,7 +25,8 @@ class VRTIE(InfoExtractor): 'timestamp': 1414271750.949, 'upload_date': '20141025', 'duration': 929, - } + }, + 'skip': 'HTTP Error 404: Not Found', }, # sporza.be { @@ -39,7 +40,8 @@ class VRTIE(InfoExtractor): 'timestamp': 1413835980.560, 'upload_date': '20141020', 'duration': 3238, - } + }, + 'skip': 'HTTP Error 404: Not Found', }, # cobra.be { @@ -53,16 +55,39 @@ class VRTIE(InfoExtractor): 'timestamp': 
1413967500.494, 'upload_date': '20141022', 'duration': 661, - } + }, + 'skip': 'HTTP Error 404: Not Found', }, { # YouTube video 'url': 'http://deredactie.be/cm/vrtnieuws/videozone/nieuws/cultuurenmedia/1.2622957', - 'only_matching': True, + 'md5': 'b8b93da1df1cea6c8556255a796b7d61', + 'info_dict': { + 'id': 'Wji-BZ0oCwg', + 'ext': 'mp4', + 'title': 'ROGUE ONE: A STAR WARS STORY Official Teaser Trailer', + 'description': 'md5:8e468944dce15567a786a67f74262583', + 'uploader': 'Star Wars', + 'uploader_id': 'starwars', + 'upload_date': '20160407', + }, + 'add_ie': ['Youtube'], }, { 'url': 'http://cobra.canvas.be/cm/cobra/videozone/rubriek/film-videozone/1.2377055', - 'only_matching': True, + 'md5': '', + 'info_dict': { + 'id': '2377055', + 'ext': 'mp4', + 'title': 'Cafe Derby', + 'description': 'Lenny Van Wesemael debuteert met de langspeelfilm Café Derby. Een waar gebeurd maar ook verzonnen verhaal.', + 'upload_date': '20150626', + 'timestamp': 1435305240.769, + }, + 'params': { + # m3u8 download + 'skip_download': True, + } } ] @@ -98,6 +123,32 @@ class VRTIE(InfoExtractor): formats.extend(self._extract_m3u8_formats( src, video_id, 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + formats.extend(self._extract_f4m_formats( + src.replace('playlist.m3u8', 'manifest.f4m'), + video_id, f4m_id='hds', fatal=False)) + if 'data-video-geoblocking="true"' not in webpage: + rtmp_formats = self._extract_smil_formats( + src.replace('playlist.m3u8', 'jwplayer.smil'), + video_id, fatal=False) + formats.extend(rtmp_formats) + for rtmp_format in rtmp_formats: + rtmp_format_c = rtmp_format.copy() + rtmp_format_c['url'] = '%s/%s' % (rtmp_format['url'], rtmp_format['play_path']) + del rtmp_format_c['play_path'] + del rtmp_format_c['ext'] + http_format = rtmp_format_c.copy() + http_format.update({ + 'url': rtmp_format_c['url'].replace('rtmp://', 'http://').replace('vod.', 'download.').replace('/_definst_/', '/').replace('mp4:', ''), + 'format_id': 
rtmp_format['format_id'].replace('rtmp', 'http'), + 'protocol': 'http', + }) + rtsp_format = rtmp_format_c.copy() + rtsp_format.update({ + 'url': rtsp_format['url'].replace('rtmp://', 'rtsp://'), + 'format_id': rtmp_format['format_id'].replace('rtmp', 'rtsp'), + 'protocol': 'rtsp', + }) + formats.extend([http_format, rtsp_format]) else: formats.extend(self._extract_f4m_formats( '%s/manifest.f4m' % src, video_id, f4m_id='hds', fatal=False)) diff --git a/youtube_dl/extractor/vuclip.py b/youtube_dl/extractor/vuclip.py index eaa888f..b73da5c 100644 --- a/youtube_dl/extractor/vuclip.py +++ b/youtube_dl/extractor/vuclip.py @@ -9,7 +9,7 @@ from ..compat import ( from ..utils import ( ExtractorError, parse_duration, - qualities, + remove_end, ) @@ -22,7 +22,7 @@ class VuClipIE(InfoExtractor): 'id': '922692425', 'ext': '3gp', 'title': 'The Toy Soldiers - Hollywood Movie Trailer', - 'duration': 180, + 'duration': 177, } } @@ -46,34 +46,21 @@ class VuClipIE(InfoExtractor): '%s said: %s' % (self.IE_NAME, error_msg), expected=True) # These clowns alternate between two page types - links_code = self._search_regex( - r'''(?xs) - (?: - <img\s+src="[^"]*/play.gif".*?>| - <!--\ player\ end\ -->\s*</div><!--\ thumb\ end--> - ) - (.*?) 
- (?: - <a\s+href="fblike|<div\s+class="social"> - ) - ''', webpage, 'links') - title = self._html_search_regex( - r'<title>(.*?)-\s*Vuclip</title>', webpage, 'title').strip() + video_url = self._search_regex( + r'<a[^>]+href="([^"]+)"[^>]*><img[^>]+src="[^"]*/play\.gif', + webpage, 'video URL', default=None) + if video_url: + formats = [{ + 'url': video_url, + }] + else: + formats = self._parse_html5_media_entries(url, webpage)[0]['formats'] - quality_order = qualities(['Reg', 'Hi']) - formats = [] - for url, q in re.findall( - r'<a\s+href="(?P<url>[^"]+)".*?>(?:<button[^>]*>)?(?P<q>[^<]+)(?:</button>)?</a>', links_code): - format_id = compat_urllib_parse_urlparse(url).scheme + '-' + q - formats.append({ - 'format_id': format_id, - 'url': url, - 'quality': quality_order(q), - }) - self._sort_formats(formats) + title = remove_end(self._html_search_regex( + r'<title>(.*?)-\s*Vuclip</title>', webpage, 'title').strip(), ' - Video') - duration = parse_duration(self._search_regex( - r'\(([0-9:]+)\)</span>', webpage, 'duration', fatal=False)) + duration = parse_duration(self._html_search_regex( + r'[(>]([0-9]+:[0-9]+)(?:<span|\))', webpage, 'duration', fatal=False)) return { 'id': video_id, diff --git a/youtube_dl/extractor/wat.py b/youtube_dl/extractor/wat.py index de7d6b5..9f1b8b4 100644 --- a/youtube_dl/extractor/wat.py +++ b/youtube_dl/extractor/wat.py @@ -9,7 +9,7 @@ from ..utils import ( ExtractorError, unified_strdate, HEADRequest, - float_or_none, + int_or_none, ) @@ -31,48 +31,58 @@ class WatIE(InfoExtractor): }, { 'url': 'http://www.wat.tv/video/gregory-lemarchal-voix-ange-6z1v7_6ygkj_.html', - 'md5': 'fbc84e4378165278e743956d9c1bf16b', + 'md5': '34bdfa5ca9fd3c7eb88601b635b0424c', 'info_dict': { 'id': '11713075', 'ext': 'mp4', 'title': 'Grégory Lemarchal, une voix d\'ange depuis 10 ans (1/3)', - 'description': 'md5:b7a849cf16a2b733d9cd10c52906dee3', 'upload_date': '20140816', - 'duration': 2910, }, - 'skip': "Ce contenu n'est pas disponible pour l'instant.", + 
'expected_warnings': ["Ce contenu n'est pas disponible pour l'instant."], }, ] + _FORMATS = ( + (200, 416, 234), + (400, 480, 270), + (600, 640, 360), + (1200, 640, 360), + (1800, 960, 540), + (2500, 1280, 720), + ) + def _real_extract(self, url): video_id = self._match_id(url) video_id = video_id if video_id.isdigit() and len(video_id) > 6 else compat_str(int(video_id, 36)) # 'contentv4' is used in the website, but it also returns the related # videos, we don't need them - video_info = self._download_json( - 'http://www.wat.tv/interface/contentv3/' + video_id, video_id)['media'] + video_data = self._download_json( + 'http://www.wat.tv/interface/contentv4s/' + video_id, video_id) + video_info = video_data['media'] error_desc = video_info.get('error_desc') if error_desc: - raise ExtractorError( - '%s returned error: %s' % (self.IE_NAME, error_desc), expected=True) + self.report_warning( + '%s returned error: %s' % (self.IE_NAME, error_desc)) chapters = video_info['chapters'] - first_chapter = chapters[0] + if chapters: + first_chapter = chapters[0] - def video_id_for_chapter(chapter): - return chapter['tc_start'].split('-')[0] + def video_id_for_chapter(chapter): + return chapter['tc_start'].split('-')[0] - if video_id_for_chapter(first_chapter) != video_id: - self.to_screen('Multipart video detected') - entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters] - return self.playlist_result(entries, video_id, video_info['title']) - # Otherwise we can continue and extract just one part, we have to use - # the video id for getting the video url + if video_id_for_chapter(first_chapter) != video_id: + self.to_screen('Multipart video detected') + entries = [self.url_result('wat:%s' % video_id_for_chapter(chapter)) for chapter in chapters] + return self.playlist_result(entries, video_id, video_info['title']) + # Otherwise we can continue and extract just one part, we have to use + # the video id for getting the video url + else: + 
first_chapter = video_info - date_diffusion = first_chapter.get('date_diffusion') - upload_date = unified_strdate(date_diffusion) if date_diffusion else None + title = first_chapter['title'] def extract_url(path_template, url_type): req_url = 'http://www.wat.tv/get/%s' % (path_template % video_id) @@ -84,45 +94,61 @@ class WatIE(InfoExtractor): expected=True) return red_url - m3u8_url = extract_url('ipad/%s.m3u8', 'm3u8') - http_url = extract_url('android5/%s.mp4', 'http') - formats = [] - m3u8_formats = self._extract_m3u8_formats( - m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') - formats.extend(m3u8_formats) - formats.extend(self._extract_f4m_formats( - m3u8_url.replace('ios.', 'web.').replace('.m3u8', '.f4m'), - video_id, f4m_id='hds', fatal=False)) - for m3u8_format in m3u8_formats: - mobj = re.search( - r'audio.*?%3D(\d+)(?:-video.*?%3D(\d+))?', m3u8_format['url']) - if not mobj: - continue - abr, vbr = mobj.groups() - abr, vbr = float_or_none(abr, 1000), float_or_none(vbr, 1000) - m3u8_format.update({ - 'vbr': vbr, - 'abr': abr, - }) - if not vbr or not abr: - continue - f = m3u8_format.copy() - f.update({ - 'url': re.sub(r'%s-\d+00-\d+' % video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url), - 'format_id': f['format_id'].replace('hls', 'http'), - 'protocol': 'http', - }) - formats.append(f) - self._sort_formats(formats) + try: + http_url = extract_url('android5/%s.mp4', 'http') + m3u8_url = extract_url('ipad/%s.m3u8', 'm3u8') + m3u8_formats = self._extract_m3u8_formats( + m3u8_url, video_id, 'mp4', 'm3u8_native', m3u8_id='hls') + formats.extend(m3u8_formats) + formats.extend(self._extract_f4m_formats( + m3u8_url.replace('ios.', 'web.').replace('.m3u8', '.f4m'), + video_id, f4m_id='hds', fatal=False)) + for m3u8_format in m3u8_formats: + vbr, abr = m3u8_format.get('vbr'), m3u8_format.get('abr') + if not vbr or not abr: + continue + format_id = m3u8_format['format_id'].replace('hls', 'http') + fmt_url = re.sub(r'%s-\d+00-\d+' 
% video_id, '%s-%d00-%d' % (video_id, round(vbr / 100), round(abr)), http_url) + if self._is_valid_url(fmt_url, video_id, format_id): + f = m3u8_format.copy() + f.update({ + 'url': fmt_url, + 'format_id': format_id, + 'protocol': 'http', + }) + formats.append(f) + self._sort_formats(formats) + except ExtractorError: + abr = 64 + for vbr, width, height in self._FORMATS: + tbr = vbr + abr + format_id = 'http-%s' % tbr + fmt_url = 'http://dnl.adv.tf1.fr/2/USP-0x0/%s/%s/%s/ssm/%s-%s-64k.mp4' % (video_id[-4:-2], video_id[-2:], video_id, video_id, vbr) + if self._is_valid_url(fmt_url, video_id, format_id): + formats.append({ + 'format_id': format_id, + 'url': fmt_url, + 'vbr': vbr, + 'abr': abr, + 'width': width, + 'height': height, + }) + + date_diffusion = first_chapter.get('date_diffusion') or video_data.get('configv4', {}).get('estatS4') + upload_date = unified_strdate(date_diffusion) if date_diffusion else None + duration = None + files = video_info['files'] + if files: + duration = int_or_none(files[0].get('duration')) return { 'id': video_id, - 'title': first_chapter['title'], - 'thumbnail': first_chapter['preview'], - 'description': first_chapter['description'], - 'view_count': video_info['views'], + 'title': title, + 'thumbnail': first_chapter.get('preview'), + 'description': first_chapter.get('description'), + 'view_count': int_or_none(video_info.get('views')), 'upload_date': upload_date, - 'duration': video_info['files'][0]['duration'], + 'duration': duration, 'formats': formats, } diff --git a/youtube_dl/extractor/xiami.py b/youtube_dl/extractor/xiami.py index a6dfc4a..86abef2 100644 --- a/youtube_dl/extractor/xiami.py +++ b/youtube_dl/extractor/xiami.py @@ -13,6 +13,7 @@ class XiamiBaseIE(InfoExtractor): webpage = super(XiamiBaseIE, self)._download_webpage(*args, **kwargs) if '>Xiami is currently not available in your country.<' in webpage: self.raise_geo_restricted('Xiami is currently not available in your country') + return webpage def _extract_track(self, 
track, track_id=None): title = track['title'] diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py index 4075b8a..83bc1fe 100644 --- a/youtube_dl/extractor/xtube.py +++ b/youtube_dl/extractor/xtube.py @@ -4,17 +4,23 @@ import itertools import re from .common import InfoExtractor -from ..compat import compat_urllib_parse_unquote from ..utils import ( int_or_none, orderedSet, + parse_duration, sanitized_Request, str_to_int, ) class XTubeIE(InfoExtractor): - _VALID_URL = r'(?:xtube:|https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-))(?P<id>[^/?&#]+)' + _VALID_URL = r'''(?x) + (?: + xtube:| + https?://(?:www\.)?xtube\.com/(?:watch\.php\?.*\bv=|video-watch/(?P<display_id>[^/]+)-) + ) + (?P<id>[^/?&#]+) + ''' _TESTS = [{ # old URL schema @@ -27,6 +33,8 @@ class XTubeIE(InfoExtractor): 'description': 'contains:an ET kind of thing', 'uploader': 'greenshowers', 'duration': 450, + 'view_count': int, + 'comment_count': int, 'age_limit': 18, } }, { @@ -51,21 +59,30 @@ class XTubeIE(InfoExtractor): req.add_header('Cookie', 'age_verified=1; cookiesAccepted=1') webpage = self._download_webpage(req, display_id) - flashvars = self._parse_json( - self._search_regex( - r'xt\.playerOps\s*=\s*({.+?});', webpage, 'player ops'), - video_id)['flashvars'] - - title = flashvars.get('title') or self._search_regex( - r'<h1>([^<]+)</h1>', webpage, 'title') - video_url = compat_urllib_parse_unquote(flashvars['video_url']) - duration = int_or_none(flashvars.get('video_duration')) - - uploader = self._search_regex( - r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"', - webpage, 'uploader', fatal=False) + sources = self._parse_json(self._search_regex( + r'sources\s*:\s*({.+?}),', webpage, 'sources'), video_id) + + formats = [] + for format_id, format_url in sources.items(): + formats.append({ + 'url': format_url, + 'format_id': format_id, + 'height': int_or_none(format_id), + }) + self._sort_formats(formats) + + title = 
self._search_regex( + (r'<h1>(?P<title>[^<]+)</h1>', r'videoTitle\s*:\s*(["\'])(?P<title>.+?)\1'), + webpage, 'title', group='title') description = self._search_regex( r'</h1>\s*<p>([^<]+)', webpage, 'description', fatal=False) + uploader = self._search_regex( + (r'<input[^>]+name="contentOwnerId"[^>]+value="([^"]+)"', + r'<span[^>]+class="nickname"[^>]*>([^<]+)'), + webpage, 'uploader', fatal=False) + duration = parse_duration(self._search_regex( + r'<dt>Runtime:</dt>\s*<dd>([^<]+)</dd>', + webpage, 'duration', fatal=False)) view_count = str_to_int(self._search_regex( r'<dt>Views:</dt>\s*<dd>([\d,\.]+)</dd>', webpage, 'view count', fatal=False)) @@ -76,7 +93,6 @@ class XTubeIE(InfoExtractor): return { 'id': video_id, 'display_id': display_id, - 'url': video_url, 'title': title, 'description': description, 'uploader': uploader, @@ -84,6 +100,7 @@ class XTubeIE(InfoExtractor): 'view_count': view_count, 'comment_count': comment_count, 'age_limit': 18, + 'formats': formats, } diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py index 0be8932..a66daee 100644 --- a/youtube_dl/extractor/xuite.py +++ b/youtube_dl/extractor/xuite.py @@ -68,6 +68,20 @@ class XuiteIE(InfoExtractor): }, 'skip': 'Video removed', }, { + # Video with encoded media id + # from http://forgetfulbc.blogspot.com/2016/06/date.html + 'url': 'http://vlog.xuite.net/embed/cE1xbENoLTI3NDQ3MzM2LmZsdg==?ar=0&as=0', + 'info_dict': { + 'id': 'cE1xbENoLTI3NDQ3MzM2LmZsdg==', + 'ext': 'mp4', + 'title': '男女平權只是口號?專家解釋約會時男生是否該幫女生付錢 (中字)', + 'description': 'md5:f0abdcb69df300f522a5442ef3146f2a', + 'timestamp': 1466160960, + 'upload_date': '20160617', + 'uploader': 'B.C. 
& Lowy', + 'uploader_id': '232279340', + }, + }, { 'url': 'http://vlog.xuite.net/play/S1dDUjdyLTMyOTc3NjcuZmx2/%E5%AD%AB%E7%87%95%E5%A7%BF-%E7%9C%BC%E6%B7%9A%E6%88%90%E8%A9%A9', 'only_matching': True, }] @@ -80,10 +94,9 @@ class XuiteIE(InfoExtractor): def base64_encode_utf8(data): return base64.b64encode(data.encode('utf-8')).decode('utf-8') - def _extract_flv_config(self, media_id): - base64_media_id = self.base64_encode_utf8(media_id) + def _extract_flv_config(self, encoded_media_id): flv_config = self._download_xml( - 'http://vlog.xuite.net/flash/player?media=%s' % base64_media_id, + 'http://vlog.xuite.net/flash/player?media=%s' % encoded_media_id, 'flv config') prop_dict = {} for prop in flv_config.findall('./property'): @@ -108,9 +121,14 @@ class XuiteIE(InfoExtractor): '%s returned error: %s' % (self.IE_NAME, error_msg), expected=True) - video_id = self._html_search_regex( - r'data-mediaid="(\d+)"', webpage, 'media id') - flv_config = self._extract_flv_config(video_id) + encoded_media_id = self._search_regex( + r'attributes\.name\s*=\s*"([^"]+)"', webpage, + 'encoded media id', default=None) + if encoded_media_id is None: + video_id = self._html_search_regex( + r'data-mediaid="(\d+)"', webpage, 'media id') + encoded_media_id = self.base64_encode_utf8(video_id) + flv_config = self._extract_flv_config(encoded_media_id) FORMATS = { 'audio': 'mp3', diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py index 1dfe031..30825da 100644 --- a/youtube_dl/extractor/xvideos.py +++ b/youtube_dl/extractor/xvideos.py @@ -15,10 +15,10 @@ class XVideosIE(InfoExtractor): _VALID_URL = r'https?://(?:www\.)?xvideos\.com/video(?P<id>[0-9]+)(?:.*)' _TEST = { 'url': 'http://www.xvideos.com/video4588838/biker_takes_his_girl', - 'md5': '4b46ae6ea5e6e9086e714d883313c0c9', + 'md5': '14cea69fcb84db54293b1e971466c2e1', 'info_dict': { 'id': '4588838', - 'ext': 'flv', + 'ext': 'mp4', 'title': 'Biker Takes his Girl', 'age_limit': 18, } @@ -42,24 +42,24 @@ class 
XVideosIE(InfoExtractor): video_url = compat_urllib_parse_unquote(self._search_regex( r'flv_url=(.+?)&', webpage, 'video URL', default='')) if video_url: - formats.append({'url': video_url}) + formats.append({ + 'url': video_url, + 'format_id': 'flv', + }) - player_args = self._search_regex( - r'(?s)new\s+HTML5Player\((.+?)\)', webpage, ' html5 player', default=None) - if player_args: - for arg in player_args.split(','): - format_url = self._search_regex( - r'(["\'])(?P<url>https?://.+?)\1', arg, 'url', - default=None, group='url') - if not format_url: - continue - ext = determine_ext(format_url) - if ext == 'mp4': - formats.append({'url': format_url}) - elif ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - format_url, video_id, 'mp4', - entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + for kind, _, format_url in re.findall( + r'setVideo([^(]+)\((["\'])(http.+?)\2\)', webpage): + format_id = kind.lower() + if format_id == 'hls': + formats.extend(self._extract_m3u8_formats( + format_url, video_id, 'mp4', + entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)) + elif format_id in ('urllow', 'urlhigh'): + formats.append({ + 'url': format_url, + 'format_id': '%s-%s' % (determine_ext(format_url, 'mp4'), format_id[3:]), + 'quality': -2 if format_id.endswith('low') else None, + }) self._sort_formats(formats) diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py index 927a964..b0679df 100644 --- a/youtube_dl/extractor/yahoo.py +++ b/youtube_dl/extractor/yahoo.py @@ -19,6 +19,7 @@ from ..utils import ( mimetype2ext, ) +from .brightcove import BrightcoveNewIE from .nbc import NBCSportsVPlayerIE @@ -227,7 +228,12 @@ class YahooIE(InfoExtractor): # Look for NBCSports iframes nbc_sports_url = NBCSportsVPlayerIE._extract_url(webpage) if nbc_sports_url: - return self.url_result(nbc_sports_url, 'NBCSportsVPlayer') + return self.url_result(nbc_sports_url, NBCSportsVPlayerIE.ie_key()) + + # Look for Brightcove New Studio embeds + 
bc_url = BrightcoveNewIE._extract_url(webpage) + if bc_url: + return self.url_result(bc_url, BrightcoveNewIE.ie_key()) # Query result is often embedded in webpage as JSON. Sometimes explicit requests # to video API results in a failure with geo restriction reason therefore using diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py index b37d0ea..fd6268b 100644 --- a/youtube_dl/extractor/yandexmusic.py +++ b/youtube_dl/extractor/yandexmusic.py @@ -75,6 +75,12 @@ class YandexMusicTrackIE(YandexMusicBaseIE): % storage_dir, track_id, 'Downloading track location JSON') + # Each string is now wrapped in a list, this is probably only temporarily thus + # supporting both scenarios (see https://github.com/rg3/youtube-dl/issues/10193) + for k, v in data.items(): + if v and isinstance(v, list): + data[k] = v[0] + key = hashlib.md5(('XGRlBW9FXlekgbPrRHuSiA' + data['path'][1:] + data['s']).encode('utf-8')).hexdigest() storage = storage_dir.split('.') diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py index 4150b28..31e2f92 100644 --- a/youtube_dl/extractor/youjizz.py +++ b/youtube_dl/extractor/youjizz.py @@ -9,8 +9,8 @@ from ..utils import ( class YouJizzIE(InfoExtractor): - _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/[^/#?]+-(?P<id>[0-9]+)\.html(?:$|[?#])' - _TEST = { + _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]+)?-(?P<id>[0-9]+)\.html(?:$|[?#])' + _TESTS = [{ 'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html', 'md5': '07e15fa469ba384c7693fd246905547c', 'info_dict': { @@ -19,7 +19,10 @@ class YouJizzIE(InfoExtractor): 'title': 'Zeichentrick 1', 'age_limit': 18, } - } + }, { + 'url': 'http://www.youjizz.com/videos/-2189178.html', + 'only_matching': True, + }] def _real_extract(self, url): video_id = self._match_id(url) diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py index 147608e..e37f237 100644 --- a/youtube_dl/extractor/youku.py +++ 
b/youtube_dl/extractor/youku.py @@ -16,7 +16,6 @@ from ..compat import ( from ..utils import ( ExtractorError, get_element_by_attribute, - sanitized_Request, ) @@ -218,14 +217,10 @@ class YoukuIE(InfoExtractor): headers = { 'Referer': req_url, } + headers.update(self.geo_verification_headers()) self._set_cookie('youku.com', 'xreferrer', 'http://www.youku.com') - req = sanitized_Request(req_url, headers=headers) - cn_verification_proxy = self._downloader.params.get('cn_verification_proxy') - if cn_verification_proxy: - req.add_header('Ytdl-request-proxy', cn_verification_proxy) - - raw_data = self._download_json(req, video_id, note=note) + raw_data = self._download_json(req_url, video_id, note=note, headers=headers) return raw_data['data'] diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py index c8d54f2..268080b 100644 --- a/youtube_dl/extractor/youtube.py +++ b/youtube_dl/extractor/youtube.py @@ -53,6 +53,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): """Provide base functions for Youtube extractors""" _LOGIN_URL = 'https://accounts.google.com/ServiceLogin' _TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge' + _PASSWORD_CHALLENGE_URL = 'https://accounts.google.com/signin/challenge/sl/password' _NETRC_MACHINE = 'youtube' # If True it will raise an error if no login info is provided _LOGIN_REQUIRED = False @@ -116,12 +117,10 @@ class YoutubeBaseInfoExtractor(InfoExtractor): 'hl': 'en_US', } - login_data = urlencode_postdata(login_form_strs) - - req = sanitized_Request(self._LOGIN_URL, login_data) login_results = self._download_webpage( - req, None, - note='Logging in', errnote='unable to log in', fatal=False) + self._PASSWORD_CHALLENGE_URL, None, + note='Logging in', errnote='unable to log in', fatal=False, + data=urlencode_postdata(login_form_strs)) if login_results is False: return False @@ -137,7 +136,7 @@ class YoutubeBaseInfoExtractor(InfoExtractor): # Two-Factor # TODO add SMS and phone call support - these require 
making a request and then prompting the user - if re.search(r'(?i)<form[^>]* id="challenge"', login_results) is not None: + if re.search(r'(?i)<form[^>]+id="challenge"', login_results) is not None: tfa_code = self._get_tfa_info('2-step verification code') if not tfa_code: @@ -165,17 +164,17 @@ class YoutubeBaseInfoExtractor(InfoExtractor): if tfa_results is False: return False - if re.search(r'(?i)<form[^>]* id="challenge"', tfa_results) is not None: + if re.search(r'(?i)<form[^>]+id="challenge"', tfa_results) is not None: self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.') return False - if re.search(r'(?i)<form[^>]* id="gaia_loginform"', tfa_results) is not None: + if re.search(r'(?i)<form[^>]+id="gaia_loginform"', tfa_results) is not None: self._downloader.report_warning('unable to log in - did the page structure change?') return False if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None: self._downloader.report_warning('Your Google account has a security notice. 
Please log in on your web browser, resolve the notice, and try again.') return False - if re.search(r'(?i)<form[^>]* id="gaia_loginform"', login_results) is not None: + if re.search(r'(?i)<form[^>]+id="gaia_loginform"', login_results) is not None: self._downloader.report_warning('unable to log in: bad username or password') return False return True @@ -858,6 +857,11 @@ class YoutubeIE(YoutubeBaseInfoExtractor): { 'url': 'https://www.youtube.com/watch?feature=player_embedded&amp;v=V36LpHqtcDY', 'only_matching': True, + }, + { + # YouTube Red paid video (https://github.com/rg3/youtube-dl/issues/10059) + 'url': 'https://www.youtube.com/watch?v=i1Ko8UG-Tdo', + 'only_matching': True, } ] @@ -1730,6 +1734,39 @@ class YoutubeIE(YoutubeBaseInfoExtractor): } +class YoutubeSharedVideoIE(InfoExtractor): + _VALID_URL = r'(?:https?:)?//(?:www\.)?youtube\.com/shared\?.*\bci=(?P<id>[0-9A-Za-z_-]{11})' + IE_NAME = 'youtube:shared' + + _TEST = { + 'url': 'https://www.youtube.com/shared?ci=1nEzmT-M4fU', + 'info_dict': { + 'id': 'uPDB5I9wfp8', + 'ext': 'webm', + 'title': 'Pocoyo: 90 minutos de episódios completos Português para crianças - PARTE 3', + 'description': 'md5:d9e4d9346a2dfff4c7dc4c8cec0f546d', + 'upload_date': '20160219', + 'uploader': 'Pocoyo - Português (BR)', + 'uploader_id': 'PocoyoBrazil', + }, + 'add_ie': ['Youtube'], + 'params': { + # There are already too many Youtube downloads + 'skip_download': True, + }, + } + + def _real_extract(self, url): + video_id = self._match_id(url) + + webpage = self._download_webpage(url, video_id) + + real_video_id = self._html_search_meta( + 'videoId', webpage, 'YouTube video id', fatal=True) + + return self.url_result(real_video_id, YoutubeIE.ie_key()) + + class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor): IE_DESC = 'YouTube.com playlists' _VALID_URL = r"""(?x)(?: @@ -1945,10 +1982,13 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): return (False if YoutubePlaylistsIE.suitable(url) or YoutubeLiveIE.suitable(url) 
else super(YoutubeChannelIE, cls).suitable(url)) + def _build_template_url(self, url, channel_id): + return self._TEMPLATE_URL % channel_id + def _real_extract(self, url): channel_id = self._match_id(url) - url = self._TEMPLATE_URL % channel_id + url = self._build_template_url(url, channel_id) # Channel by page listing is restricted to 35 pages of 30 items, i.e. 1050 videos total (see #5778) # Workaround by extracting as a playlist if managed to obtain channel playlist URL @@ -1962,9 +2002,13 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): channel_playlist_id = self._html_search_meta( 'channelId', channel_page, 'channel id', default=None) if not channel_playlist_id: - channel_playlist_id = self._search_regex( - r'data-(?:channel-external-|yt)id="([^"]+)"', - channel_page, 'channel id', default=None) + channel_url = self._html_search_meta( + ('al:ios:url', 'twitter:app:url:iphone', 'twitter:app:url:ipad'), + channel_page, 'channel url', default=None) + if channel_url: + channel_playlist_id = self._search_regex( + r'vnd\.youtube://user/([0-9A-Za-z_-]+)', + channel_url, 'channel id', default=None) if channel_playlist_id and channel_playlist_id.startswith('UC'): playlist_id = 'UU' + channel_playlist_id[2:] return self.url_result( @@ -1987,20 +2031,39 @@ class YoutubeChannelIE(YoutubePlaylistBaseInfoExtractor): for video_id, video_title in self.extract_videos_from_page(channel_page)] return self.playlist_result(entries, channel_id) + try: + next(self._entries(channel_page, channel_id)) + except StopIteration: + alert_message = self._html_search_regex( + r'(?s)<div[^>]+class=(["\']).*?\byt-alert-message\b.*?\1[^>]*>(?P<alert>[^<]+)</div>', + channel_page, 'alert', default=None, group='alert') + if alert_message: + raise ExtractorError('Youtube said: %s' % alert_message, expected=True) + return self.playlist_result(self._entries(channel_page, channel_id), channel_id) class YoutubeUserIE(YoutubeChannelIE): IE_DESC = 'YouTube.com user videos (URL or "ytuser" 
keyword)' - _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:user/|c/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' - _TEMPLATE_URL = 'https://www.youtube.com/user/%s/videos' + _VALID_URL = r'(?:(?:https?://(?:\w+\.)?youtube\.com/(?:(?P<user>user|c)/)?(?!(?:attribution_link|watch|results)(?:$|[^a-z_A-Z0-9-])))|ytuser:)(?!feed/)(?P<id>[A-Za-z0-9_-]+)' + _TEMPLATE_URL = 'https://www.youtube.com/%s/%s/videos' IE_NAME = 'youtube:user' _TESTS = [{ 'url': 'https://www.youtube.com/user/TheLinuxFoundation', 'playlist_mincount': 320, 'info_dict': { - 'title': 'TheLinuxFoundation', + 'id': 'UUfX55Sx5hEFjoC3cNs6mCUQ', + 'title': 'Uploads from The Linux Foundation', + } + }, { + # Only available via https://www.youtube.com/c/12minuteathlete/videos + # but not https://www.youtube.com/user/12minuteathlete/videos + 'url': 'https://www.youtube.com/c/12minuteathlete/videos', + 'playlist_mincount': 249, + 'info_dict': { + 'id': 'UUVjM-zV6_opMDx7WYxnjZiQ', + 'title': 'Uploads from 12 Minute Athlete', } }, { 'url': 'ytuser:phihag', @@ -2008,6 +2071,13 @@ class YoutubeUserIE(YoutubeChannelIE): }, { 'url': 'https://www.youtube.com/c/gametrailers', 'only_matching': True, + }, { + 'url': 'https://www.youtube.com/gametrailers', + 'only_matching': True, + }, { + # This channel is not available. 
+ 'url': 'https://www.youtube.com/user/kananishinoSMEJ/videos', + 'only_matching': True, }] @classmethod @@ -2020,6 +2090,10 @@ class YoutubeUserIE(YoutubeChannelIE): else: return super(YoutubeUserIE, cls).suitable(url) + def _build_template_url(self, url, channel_id): + mobj = re.match(self._VALID_URL, url) + return self._TEMPLATE_URL % (mobj.group('user') or 'user', mobj.group('id')) + class YoutubeLiveIE(YoutubeBaseInfoExtractor): IE_DESC = 'YouTube.com live streams' diff --git a/youtube_dl/extractor/zippcast.py b/youtube_dl/extractor/zippcast.py deleted file mode 100644 index de81937..0000000 --- a/youtube_dl/extractor/zippcast.py +++ /dev/null @@ -1,94 +0,0 @@ -from __future__ import unicode_literals - -import re - -from .common import InfoExtractor -from ..utils import ( - determine_ext, - str_to_int, -) - - -class ZippCastIE(InfoExtractor): - _VALID_URL = r'https?://(?:www\.)?zippcast\.com/(?:video/|videoview\.php\?.*\bvplay=)(?P<id>[0-9a-zA-Z]+)' - _TESTS = [{ - # m3u8, hq direct link - 'url': 'http://www.zippcast.com/video/c9cfd5c7e44dbc29c81', - 'md5': '5ea0263b5606866c4d6cda0fc5e8c6b6', - 'info_dict': { - 'id': 'c9cfd5c7e44dbc29c81', - 'ext': 'mp4', - 'title': '[Vinesauce] Vinny - Digital Space Traveler', - 'description': 'Muted on youtube, but now uploaded in it\'s original form.', - 'thumbnail': 're:^https?://.*\.jpg$', - 'uploader': 'vinesauce', - 'view_count': int, - 'categories': ['Entertainment'], - 'tags': list, - }, - }, { - # f4m, lq ipod direct link - 'url': 'http://www.zippcast.com/video/b79c0a233e9c6581775', - 'only_matching': True, - }, { - 'url': 'http://www.zippcast.com/videoview.php?vplay=c9cfd5c7e44dbc29c81&auto=no', - 'only_matching': True, - }] - - def _real_extract(self, url): - video_id = self._match_id(url) - - webpage = self._download_webpage( - 'http://www.zippcast.com/video/%s' % video_id, video_id) - - formats = [] - video_url = self._search_regex( - r'<source[^>]+src=(["\'])(?P<url>.+?)\1', webpage, - 'video url', default=None, 
group='url') - if video_url: - formats.append({ - 'url': video_url, - 'format_id': 'http', - 'preference': 0, # direct link is almost always of worse quality - }) - src_url = self._search_regex( - r'src\s*:\s*(?:escape\()?(["\'])(?P<url>http://.+?)\1', - webpage, 'src', default=None, group='url') - ext = determine_ext(src_url) - if ext == 'm3u8': - formats.extend(self._extract_m3u8_formats( - src_url, video_id, 'mp4', entry_protocol='m3u8_native', - m3u8_id='hls', fatal=False)) - elif ext == 'f4m': - formats.extend(self._extract_f4m_formats( - src_url, video_id, f4m_id='hds', fatal=False)) - self._sort_formats(formats) - - title = self._og_search_title(webpage) - description = self._og_search_description(webpage) or self._html_search_meta( - 'description', webpage) - uploader = self._search_regex( - r'<a[^>]+href="https?://[^/]+/profile/[^>]+>([^<]+)</a>', - webpage, 'uploader', fatal=False) - thumbnail = self._og_search_thumbnail(webpage) - view_count = str_to_int(self._search_regex( - r'>([\d,.]+) views!', webpage, 'view count', fatal=False)) - - categories = re.findall( - r'<a[^>]+href="https?://[^/]+/categories/[^"]+">([^<]+),?<', - webpage) - tags = re.findall( - r'<a[^>]+href="https?://[^/]+/search/tags/[^"]+">([^<]+),?<', - webpage) - - return { - 'id': video_id, - 'title': title, - 'description': description, - 'thumbnail': thumbnail, - 'uploader': uploader, - 'view_count': view_count, - 'categories': categories, - 'tags': tags, - 'formats': formats, - } diff --git a/youtube_dl/options.py b/youtube_dl/options.py index 99ce413..d32a9e3 100644 --- a/youtube_dl/options.py +++ b/youtube_dl/options.py @@ -2,6 +2,7 @@ from __future__ import unicode_literals import os.path import optparse +import re import sys from .downloader.external import list_external_downloaders @@ -26,9 +27,11 @@ def parseOpts(overrideArguments=None): except IOError: return default # silently skip if file is not present try: - res = [] - for l in optionf: - res += compat_shlex_split(l, 
comments=True) + # FIXME: https://github.com/rg3/youtube-dl/commit/dfe5fa49aed02cf36ba9f743b11b0903554b5e56 + contents = optionf.read() + if sys.version_info < (3,): + contents = contents.decode(preferredencoding()) + res = compat_shlex_split(contents, comments=True) finally: optionf.close() return res @@ -91,8 +94,18 @@ def parseOpts(overrideArguments=None): setattr(parser.values, option.dest, value.split(',')) def _hide_login_info(opts): - opts = list(opts) - for private_opt in ['-p', '--password', '-u', '--username', '--video-password']: + PRIVATE_OPTS = ['-p', '--password', '-u', '--username', '--video-password'] + eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$') + + def _scrub_eq(o): + m = eqre.match(o) + if m: + return m.group('key') + '=PRIVATE' + else: + return o + + opts = list(map(_scrub_eq, opts)) + for private_opt in PRIVATE_OPTS: try: i = opts.index(private_opt) opts[i + 1] = 'PRIVATE' @@ -212,10 +225,15 @@ def parseOpts(overrideArguments=None): help='Make all connections via IPv6 (experimental)', ) network.add_option( + '--geo-verification-proxy', + dest='geo_verification_proxy', default=None, metavar='URL', + help='Use this proxy to verify the IP address for some geo-restricted sites. ' + 'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading. (experimental)' + ) + network.add_option( '--cn-verification-proxy', dest='cn_verification_proxy', default=None, metavar='URL', - help='Use this proxy to verify the IP address for some Chinese sites. ' - 'The default proxy specified by --proxy (or none, if the options is not present) is used for the actual downloading. (experimental)' + help=optparse.SUPPRESS_HELP, ) selection = optparse.OptionGroup(parser, 'Video Selection') @@ -481,9 +499,20 @@ def parseOpts(overrideArguments=None): dest='bidi_workaround', action='store_true', help='Work around terminals that lack bidirectional text support. 
Requires bidiv or fribidi executable in PATH') workarounds.add_option( - '--sleep-interval', metavar='SECONDS', + '--sleep-interval', '--min-sleep-interval', metavar='SECONDS', dest='sleep_interval', type=float, - help='Number of seconds to sleep before each download.') + help=( + 'Number of seconds to sleep before each download when used alone ' + 'or a lower bound of a range for randomized sleep before each download ' + '(minimum possible number of seconds to sleep) when used along with ' + '--max-sleep-interval.')) + workarounds.add_option( + '--max-sleep-interval', metavar='SECONDS', + dest='max_sleep_interval', type=float, + help=( + 'Upper bound of a range for randomized sleep before each download ' + '(maximum possible number of seconds to sleep). Must only be used ' + 'along with --min-sleep-interval.')) verbosity = optparse.OptionGroup(parser, 'Verbosity / Simulation Options') verbosity.add_option( @@ -809,11 +838,11 @@ def parseOpts(overrideArguments=None): system_conf = [] user_conf = [] else: - system_conf = compat_conf(_readOptions('/etc/youtube-dl.conf')) + system_conf = _readOptions('/etc/youtube-dl.conf') if '--ignore-config' in system_conf: user_conf = [] else: - user_conf = compat_conf(_readUserConf()) + user_conf = _readUserConf() argv = system_conf + user_conf + command_line_conf opts, args = parser.parse_args(argv) diff --git a/youtube_dl/postprocessor/metadatafromtitle.py b/youtube_dl/postprocessor/metadatafromtitle.py index 42377fa..920573d 100644 --- a/youtube_dl/postprocessor/metadatafromtitle.py +++ b/youtube_dl/postprocessor/metadatafromtitle.py @@ -3,11 +3,6 @@ from __future__ import unicode_literals import re from .common import PostProcessor -from ..utils import PostProcessingError - - -class MetadataFromTitlePPError(PostProcessingError): - pass class MetadataFromTitlePP(PostProcessor): @@ -38,7 +33,8 @@ class MetadataFromTitlePP(PostProcessor): title = info['title'] match = re.match(self._titleregex, title) if match is None: - raise 
MetadataFromTitlePPError('Could not interpret title of video as "%s"' % self._titleformat) + self._downloader.to_screen('[fromtitle] Could not interpret title of video as "%s"' % self._titleformat) + return [], info for attribute, value in match.groupdict().items(): value = match.group(attribute) info[attribute] = value diff --git a/youtube_dl/socks.py b/youtube_dl/socks.py index fd49d74..1048072 100644 --- a/youtube_dl/socks.py +++ b/youtube_dl/socks.py @@ -76,7 +76,7 @@ class Socks4Error(ProxyError): CODES = { 91: 'request rejected or failed', - 92: 'request rejected becasue SOCKS server cannot connect to identd on the client', + 92: 'request rejected because SOCKS server cannot connect to identd on the client', 93: 'request rejected because the client program and identd report different user-ids' } diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py index 562031f..b3b687a 100644 --- a/youtube_dl/utils.py +++ b/youtube_dl/utils.py @@ -47,6 +47,7 @@ from .compat import ( compat_socket_create_connection, compat_str, compat_struct_pack, + compat_struct_unpack, compat_urllib_error, compat_urllib_parse, compat_urllib_parse_urlencode, @@ -110,6 +111,50 @@ ACCENT_CHARS = dict(zip('ÂÃÄÀÁÅÆÇÈÉÊËÌÍÎÏÐÑÒÓÔÕÖŐ،٠itertools.chain('AAAAAA', ['AE'], 'CEEEEIIIIDNOOOOOOO', ['OE'], 'UUUUUYP', ['ss'], 'aaaaaa', ['ae'], 'ceeeeiiiionooooooo', ['oe'], 'uuuuuypy'))) +DATE_FORMATS = ( + '%d %B %Y', + '%d %b %Y', + '%B %d %Y', + '%b %d %Y', + '%b %dst %Y %I:%M', + '%b %dnd %Y %I:%M', + '%b %dth %Y %I:%M', + '%Y %m %d', + '%Y-%m-%d', + '%Y/%m/%d', + '%Y/%m/%d %H:%M', + '%Y/%m/%d %H:%M:%S', + '%Y-%m-%d %H:%M:%S', + '%Y-%m-%d %H:%M:%S.%f', + '%d.%m.%Y %H:%M', + '%d.%m.%Y %H.%M', + '%Y-%m-%dT%H:%M:%SZ', + '%Y-%m-%dT%H:%M:%S.%fZ', + '%Y-%m-%dT%H:%M:%S.%f0Z', + '%Y-%m-%dT%H:%M:%S', + '%Y-%m-%dT%H:%M:%S.%f', + '%Y-%m-%dT%H:%M', +) + +DATE_FORMATS_DAY_FIRST = list(DATE_FORMATS) +DATE_FORMATS_DAY_FIRST.extend([ + '%d-%m-%Y', + '%d.%m.%Y', + '%d.%m.%y', + '%d/%m/%Y', + '%d/%m/%y', + '%d/%m/%Y 
%H:%M:%S', +]) + +DATE_FORMATS_MONTH_FIRST = list(DATE_FORMATS) +DATE_FORMATS_MONTH_FIRST.extend([ + '%m-%d-%Y', + '%m.%d.%Y', + '%m/%d/%Y', + '%m/%d/%y', + '%m/%d/%Y %H:%M:%S', +]) + def preferredencoding(): """Get preferred encoding. @@ -267,9 +312,17 @@ def get_element_by_id(id, html): return get_element_by_attribute('id', id, html) -def get_element_by_attribute(attribute, value, html): +def get_element_by_class(class_name, html): + return get_element_by_attribute( + 'class', r'[^\'"]*\b%s\b[^\'"]*' % re.escape(class_name), + html, escape_value=False) + + +def get_element_by_attribute(attribute, value, html, escape_value=True): """Return the content of the tag with the specified attribute in the passed HTML document""" + value = re.escape(value) if escape_value else value + m = re.search(r'''(?xs) <([a-zA-Z0-9:._-]+) (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*? @@ -278,7 +331,7 @@ def get_element_by_attribute(attribute, value, html): \s*> (?P<content>.*?) </\1> - ''' % (re.escape(attribute), re.escape(value)), html) + ''' % (re.escape(attribute), value), html) if not m: return None @@ -975,6 +1028,24 @@ class YoutubeDLCookieProcessor(compat_urllib_request.HTTPCookieProcessor): https_response = http_response +def extract_timezone(date_str): + m = re.search( + r'^.{8,}?(?P<tz>Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', + date_str) + if not m: + timezone = datetime.timedelta() + else: + date_str = date_str[:-len(m.group('tz'))] + if not m.group('sign'): + timezone = datetime.timedelta() + else: + sign = 1 if m.group('sign') == '+' else -1 + timezone = datetime.timedelta( + hours=sign * int(m.group('hours')), + minutes=sign * int(m.group('minutes'))) + return timezone, date_str + + def parse_iso8601(date_str, delimiter='T', timezone=None): """ Return a UNIX timestamp from the given date """ @@ -984,20 +1055,8 @@ def parse_iso8601(date_str, delimiter='T', timezone=None): date_str = re.sub(r'\.[0-9]+', '', date_str) if 
timezone is None: - m = re.search( - r'(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)', - date_str) - if not m: - timezone = datetime.timedelta() - else: - date_str = date_str[:-len(m.group(0))] - if not m.group('sign'): - timezone = datetime.timedelta() - else: - sign = 1 if m.group('sign') == '+' else -1 - timezone = datetime.timedelta( - hours=sign * int(m.group('hours')), - minutes=sign * int(m.group('minutes'))) + timezone, date_str = extract_timezone(date_str) + try: date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter) dt = datetime.datetime.strptime(date_str, date_format) - timezone @@ -1006,6 +1065,10 @@ def parse_iso8601(date_str, delimiter='T', timezone=None): pass +def date_formats(day_first=True): + return DATE_FORMATS_DAY_FIRST if day_first else DATE_FORMATS_MONTH_FIRST + + def unified_strdate(date_str, day_first=True): """Return a string with the date in the format YYYYMMDD""" @@ -1014,53 +1077,11 @@ def unified_strdate(date_str, day_first=True): upload_date = None # Replace commas date_str = date_str.replace(',', ' ') - # %z (UTC offset) is only supported in python>=3.2 - if not re.match(r'^[0-9]{1,2}-[0-9]{1,2}-[0-9]{4}$', date_str): - date_str = re.sub(r' ?(\+|-)[0-9]{2}:?[0-9]{2}$', '', date_str) # Remove AM/PM + timezone date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) + _, date_str = extract_timezone(date_str) - format_expressions = [ - '%d %B %Y', - '%d %b %Y', - '%B %d %Y', - '%b %d %Y', - '%b %dst %Y %I:%M', - '%b %dnd %Y %I:%M', - '%b %dth %Y %I:%M', - '%Y %m %d', - '%Y-%m-%d', - '%Y/%m/%d', - '%Y/%m/%d %H:%M:%S', - '%Y-%m-%d %H:%M:%S', - '%Y-%m-%d %H:%M:%S.%f', - '%d.%m.%Y %H:%M', - '%d.%m.%Y %H.%M', - '%Y-%m-%dT%H:%M:%SZ', - '%Y-%m-%dT%H:%M:%S.%fZ', - '%Y-%m-%dT%H:%M:%S.%f0Z', - '%Y-%m-%dT%H:%M:%S', - '%Y-%m-%dT%H:%M:%S.%f', - '%Y-%m-%dT%H:%M', - ] - if day_first: - format_expressions.extend([ - '%d-%m-%Y', - '%d.%m.%Y', - '%d.%m.%y', - '%d/%m/%Y', - '%d/%m/%y', - '%d/%m/%Y %H:%M:%S', - ]) - else: - 
format_expressions.extend([ - '%m-%d-%Y', - '%m.%d.%Y', - '%m/%d/%Y', - '%m/%d/%y', - '%m/%d/%Y %H:%M:%S', - ]) - for expression in format_expressions: + for expression in date_formats(day_first): try: upload_date = datetime.datetime.strptime(date_str, expression).strftime('%Y%m%d') except ValueError: @@ -1076,6 +1097,29 @@ def unified_strdate(date_str, day_first=True): return compat_str(upload_date) +def unified_timestamp(date_str, day_first=True): + if date_str is None: + return None + + date_str = date_str.replace(',', ' ') + + pm_delta = 12 if re.search(r'(?i)PM', date_str) else 0 + timezone, date_str = extract_timezone(date_str) + + # Remove AM/PM + timezone + date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str) + + for expression in date_formats(day_first): + try: + dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta) + return calendar.timegm(dt.timetuple()) + except ValueError: + pass + timetuple = email.utils.parsedate_tz(date_str) + if timetuple: + return calendar.timegm(timetuple) + pm_delta * 3600 + + def determine_ext(url, default_ext='unknown_video'): if url is None: return default_ext @@ -1410,6 +1454,8 @@ def shell_quote(args): def smuggle_url(url, data): """ Pass additional data in a URL for internal use. 
""" + url, idata = unsmuggle_url(url, {}) + data.update(idata) sdata = compat_urllib_parse_urlencode( {'__youtubedl_smuggle': json.dumps(data)}) return url + '#' + sdata @@ -1591,6 +1637,11 @@ class HEADRequest(compat_urllib_request.Request): return 'HEAD' +class PUTRequest(compat_urllib_request.Request): + def get_method(self): + return 'PUT' + + def int_or_none(v, scale=1, default=None, get_attr=None, invscale=1): if get_attr: if v is not None: @@ -1626,6 +1677,10 @@ def float_or_none(v, scale=1, invscale=1, default=None): return default +def strip_or_none(v): + return None if v is None else v.strip() + + def parse_duration(s): if not isinstance(s, compat_basestring): return None @@ -1882,7 +1937,13 @@ def update_Request(req, url=None, data=None, headers={}, query={}): req_headers.update(headers) req_data = data or req.data req_url = update_url_query(url or req.get_full_url(), query) - req_type = HEADRequest if req.get_method() == 'HEAD' else compat_urllib_request.Request + req_get_method = req.get_method() + if req_get_method == 'HEAD': + req_type = HEADRequest + elif req_get_method == 'PUT': + req_type = PUTRequest + else: + req_type = compat_urllib_request.Request new_req = req_type( req_url, data=req_data, headers=req_headers, origin_req_host=req.origin_req_host, unverifiable=req.unverifiable) @@ -1924,11 +1985,27 @@ US_RATINGS = { } +TV_PARENTAL_GUIDELINES = { + 'TV-Y': 0, + 'TV-Y7': 7, + 'TV-G': 0, + 'TV-PG': 0, + 'TV-14': 14, + 'TV-MA': 17, +} + + def parse_age_limit(s): - if s is None: + if type(s) == int: + return s if 0 <= s <= 21 else None + if not isinstance(s, compat_basestring): return None m = re.match(r'^(?P<age>\d{1,2})\+?$', s) - return int(m.group('age')) if m else US_RATINGS.get(s) + if m: + return int(m.group('age')) + if s in US_RATINGS: + return US_RATINGS[s] + return TV_PARENTAL_GUIDELINES.get(s) def strip_jsonp(code): @@ -2046,6 +2123,7 @@ def mimetype2ext(mt): return ext _, _, res = mt.rpartition('/') + res = res.lower() return { '3gpp': 
'3gp', @@ -2057,9 +2135,53 @@ def mimetype2ext(mt): 'x-flv': 'flv', 'x-mp4-fragmented': 'mp4', 'x-ms-wmv': 'wmv', + 'mpegurl': 'm3u8', + 'x-mpegurl': 'm3u8', + 'vnd.apple.mpegurl': 'm3u8', + 'dash+xml': 'mpd', + 'f4m': 'f4m', + 'f4m+xml': 'f4m', + 'hds+xml': 'f4m', + 'vnd.ms-sstr+xml': 'ism', }.get(res, res) +def parse_codecs(codecs_str): + # http://tools.ietf.org/html/rfc6381 + if not codecs_str: + return {} + splited_codecs = list(filter(None, map( + lambda str: str.strip(), codecs_str.strip().strip(',').split(',')))) + vcodec, acodec = None, None + for full_codec in splited_codecs: + codec = full_codec.split('.')[0] + if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'): + if not vcodec: + vcodec = full_codec + elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac'): + if not acodec: + acodec = full_codec + else: + write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr) + if not vcodec and not acodec: + if len(splited_codecs) == 2: + return { + 'vcodec': vcodec, + 'acodec': acodec, + } + elif len(splited_codecs) == 1: + return { + 'vcodec': 'none', + 'acodec': vcodec, + } + else: + return { + 'vcodec': vcodec or 'none', + 'acodec': acodec or 'none', + } + return {} + + def urlhandle_detect_ext(url_handle): getheader = url_handle.headers.get @@ -2288,6 +2410,8 @@ def dfxp2srt(dfxp_data): def cli_option(params, command_option, param): param = params.get(param) + if param: + param = compat_str(param) return [command_option, param] if param is not None else [] @@ -2861,3 +2985,114 @@ def parse_m3u8_attributes(attrib): val = val[1:-1] info[key] = val return info + + +def urshift(val, n): + return val >> n if val >= 0 else (val + 0x100000000) >> n + + +# Based on png2str() written by @gdkchan and improved by @yokrysty +# Originally posted at https://github.com/rg3/youtube-dl/issues/9706 +def decode_png(png_data): + # Reference: https://www.w3.org/TR/PNG/ + header = png_data[8:] + + if png_data[:8] != 
b'\x89PNG\x0d\x0a\x1a\x0a' or header[4:8] != b'IHDR': + raise IOError('Not a valid PNG file.') + + int_map = {1: '>B', 2: '>H', 4: '>I'} + unpack_integer = lambda x: compat_struct_unpack(int_map[len(x)], x)[0] + + chunks = [] + + while header: + length = unpack_integer(header[:4]) + header = header[4:] + + chunk_type = header[:4] + header = header[4:] + + chunk_data = header[:length] + header = header[length:] + + header = header[4:] # Skip CRC + + chunks.append({ + 'type': chunk_type, + 'length': length, + 'data': chunk_data + }) + + ihdr = chunks[0]['data'] + + width = unpack_integer(ihdr[:4]) + height = unpack_integer(ihdr[4:8]) + + idat = b'' + + for chunk in chunks: + if chunk['type'] == b'IDAT': + idat += chunk['data'] + + if not idat: + raise IOError('Unable to read PNG data.') + + decompressed_data = bytearray(zlib.decompress(idat)) + + stride = width * 3 + pixels = [] + + def _get_pixel(idx): + x = idx % stride + y = idx // stride + return pixels[y][x] + + for y in range(height): + basePos = y * (1 + stride) + filter_type = decompressed_data[basePos] + + current_row = [] + + pixels.append(current_row) + + for x in range(stride): + color = decompressed_data[1 + basePos + x] + basex = y * stride + x + left = 0 + up = 0 + + if x > 2: + left = _get_pixel(basex - 3) + if y > 0: + up = _get_pixel(basex - stride) + + if filter_type == 1: # Sub + color = (color + left) & 0xff + elif filter_type == 2: # Up + color = (color + up) & 0xff + elif filter_type == 3: # Average + color = (color + ((left + up) >> 1)) & 0xff + elif filter_type == 4: # Paeth + a = left + b = up + c = 0 + + if x > 2 and y > 0: + c = _get_pixel(basex - stride - 3) + + p = a + b - c + + pa = abs(p - a) + pb = abs(p - b) + pc = abs(p - c) + + if pa <= pb and pa <= pc: + color = (color + a) & 0xff + elif pb <= pc: + color = (color + b) & 0xff + else: + color = (color + c) & 0xff + + current_row.append(color) + + return width, height, pixels diff --git a/youtube_dl/version.py 
b/youtube_dl/version.py index 2b7a4c9..cf59501 100644 --- a/youtube_dl/version.py +++ b/youtube_dl/version.py @@ -1,3 +1,3 @@ from __future__ import unicode_literals -__version__ = '2016.06.25' +__version__ = '2016.08.17' |