about | summary | refs | log | tree | commit | diff | stats
path: root/youtube_dl/extractor/generic.py
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor/generic.py')
-rw-r--r-- youtube_dl/extractor/generic.py | 242
1 files changed, 182 insertions, 60 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index b83c183..9b0cd00 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -22,6 +22,8 @@ from ..utils import (
HEADRequest,
is_html,
js_to_json,
+ KNOWN_EXTENSIONS,
+ mimetype2ext,
orderedSet,
sanitized_Request,
smuggle_url,
@@ -57,10 +59,7 @@ from .tnaflix import TNAFlixNetworkEmbedIE
from .drtuber import DrTuberIE
from .redtube import RedTubeIE
from .vimeo import VimeoIE
-from .dailymotion import (
- DailymotionIE,
- DailymotionCloudIE,
-)
+from .dailymotion import DailymotionIE
from .dailymail import DailyMailIE
from .onionstudios import OnionStudiosIE
from .viewlift import ViewLiftEmbedIE
@@ -99,6 +98,9 @@ from .mediaset import MediasetIE
from .joj import JojIE
from .megaphone import MegaphoneIE
from .vzaar import VzaarIE
+from .channel9 import Channel9IE
+from .vshare import VShareIE
+from .mediasite import MediasiteIE
class GenericIE(InfoExtractor):
@@ -1088,23 +1090,24 @@ class GenericIE(InfoExtractor):
'ext': 'mp4',
'upload_date': '20150212',
'uploader': 'The National Archives UK',
- 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6',
+ 'description': 'md5:8078af856dca76edc42910b61273dbbf',
'uploader_id': 'NationalArchives08',
'title': 'Webinar: Using Discovery, The National Archives’ online catalogue',
},
},
# jwplayer rtmp
{
- 'url': 'http://www.suffolk.edu/sjc/',
+ 'url': 'http://www.suffolk.edu/sjc/live.php',
'info_dict': {
- 'id': 'sjclive',
+ 'id': 'live',
'ext': 'flv',
'title': 'Massachusetts Supreme Judicial Court Oral Arguments',
'uploader': 'www.suffolk.edu',
},
'params': {
'skip_download': True,
- }
+ },
+ 'skip': 'Only has video a few mornings per month, see http://www.suffolk.edu/sjc/',
},
# Complex jwplayer
{
@@ -1113,6 +1116,7 @@ class GenericIE(InfoExtractor):
'id': 'videos',
'ext': 'mp4',
'title': 'king machine trailer 1',
+ 'description': 'Browse King Machine videos & audio for sweet media. Your eyes will thank you.',
'thumbnail': r're:^https?://.*\.jpg$',
},
},
@@ -1130,13 +1134,55 @@ class GenericIE(InfoExtractor):
'skip_download': True,
}
},
+ {
+ # JWPlatform iframe
+ 'url': 'https://www.mediaite.com/tv/dem-senator-claims-gary-cohn-faked-a-bad-connection-during-trump-call-to-get-him-off-the-phone/',
+ 'md5': 'ca00a040364b5b439230e7ebfd02c4e9',
+ 'info_dict': {
+ 'id': 'O0c5JcKT',
+ 'ext': 'mp4',
+ 'upload_date': '20171122',
+ 'timestamp': 1511366290,
+ 'title': 'Dem Senator Claims Gary Cohn Faked a Bad Connection During Trump Call to Get Him Off the Phone',
+ },
+ 'add_ie': [JWPlatformIE.ie_key()],
+ },
+ {
+ # Video.js embed, multiple formats
+ 'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html',
+ 'info_dict': {
+ 'id': 'yygqldloqIk',
+ 'ext': 'mp4',
+ 'title': 'SolidWorks. Урок 6 Настройка чертежа',
+ 'description': 'md5:baf95267792646afdbf030e4d06b2ab3',
+ 'upload_date': '20130314',
+ 'uploader': 'PROстое3D',
+ 'uploader_id': 'PROstoe3D',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
+ # Video.js embed, single format
+ 'url': 'https://www.vooplayer.com/v3/watch/watch.php?v=NzgwNTg=',
+ 'info_dict': {
+ 'id': 'watch',
+ 'ext': 'mp4',
+ 'title': 'Step 1 - Good Foundation',
+ 'description': 'md5:d1e7ff33a29fc3eb1673d6c270d344f4',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
# rtl.nl embed
{
'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen',
'playlist_mincount': 5,
'info_dict': {
'id': 'aanslagen-kopenhagen',
- 'title': 'Aanslagen Kopenhagen | RTL Nieuws',
+ 'title': 'Aanslagen Kopenhagen',
}
},
# Zapiks embed
@@ -1268,6 +1314,7 @@ class GenericIE(InfoExtractor):
'params': {
'skip_download': True,
},
+ 'skip': 'This video is unavailable.',
},
# Pladform embed
{
@@ -1281,6 +1328,7 @@ class GenericIE(InfoExtractor):
'duration': 694,
'age_limit': 0,
},
+ 'skip': 'HTTP Error 404: Not Found',
},
# Playwire embed
{
@@ -1301,6 +1349,14 @@ class GenericIE(InfoExtractor):
'id': '518726732',
'ext': 'mp4',
'title': 'Facebook Creates "On This Day" | Crunch Report',
+ 'description': 'Amazon updates Fire TV line, Tesla\'s Model X spotted in the wild',
+ 'timestamp': 1427237531,
+ 'uploader': 'Crunch Report',
+ 'upload_date': '20150324',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
},
},
# SVT embed
@@ -1352,16 +1408,20 @@ class GenericIE(InfoExtractor):
'upload_date': '20140107',
'timestamp': 1389118457,
},
+ 'skip': 'Invalid Page URL',
},
# NBC News embed
{
'url': 'http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html',
'md5': '1aa589c675898ae6d37a17913cf68d66',
'info_dict': {
- 'id': '701714499682',
+ 'id': 'x_dtl_oa_LettermanliftPR_160608',
'ext': 'mp4',
- 'title': 'PREVIEW: On Assignment: David Letterman',
+ 'title': 'David Letterman: A Preview',
'description': 'A preview of Tom Brokaw\'s interview with David Letterman as part of the On Assignment series powered by Dateline. Airs Sunday June 12 at 7/6c.',
+ 'upload_date': '20160609',
+ 'timestamp': 1465431544,
+ 'uploader': 'NBCU-NEWS',
},
},
# UDN embed
@@ -1378,6 +1438,7 @@ class GenericIE(InfoExtractor):
# m3u8 download
'skip_download': True,
},
+ 'expected_warnings': ['Failed to parse JSON Expecting value'],
},
# Ooyala embed
{
@@ -1385,7 +1446,7 @@ class GenericIE(InfoExtractor):
'info_dict': {
'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs',
'ext': 'mp4',
- 'description': 'VIDEO: INDEX/MATCH versus VLOOKUP.',
+ 'description': 'Index/Match versus VLOOKUP.',
'title': 'This is what separates the Excel masters from the wannabes',
'duration': 191.933,
},
@@ -1409,22 +1470,6 @@ class GenericIE(InfoExtractor):
'timestamp': 1432570283,
},
},
- # Dailymotion Cloud video
- {
- 'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910',
- 'md5': 'dcaf23ad0c67a256f4278bce6e0bae38',
- 'info_dict': {
- 'id': 'x2uy8t3',
- 'ext': 'mp4',
- 'title': 'Sauvons les abeilles ! - Le débat',
- 'description': 'md5:d9082128b1c5277987825d684939ca26',
- 'thumbnail': r're:^https?://.*\.jpe?g$',
- 'timestamp': 1434970506,
- 'upload_date': '20150622',
- 'uploader': 'Public Sénat',
- 'uploader_id': 'xa9gza',
- }
- },
# OnionStudios embed
{
'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537',
@@ -1581,22 +1626,6 @@ class GenericIE(InfoExtractor):
},
'add_ie': ['BrightcoveLegacy'],
},
- # Nexx embed
- {
- 'url': 'https://www.funk.net/serien/5940e15073f6120001657956/items/593efbb173f6120001657503',
- 'info_dict': {
- 'id': '247746',
- 'ext': 'mp4',
- 'title': "Yesterday's Jam (OV)",
- 'description': 'md5:09bc0984723fed34e2581624a84e05f0',
- 'timestamp': 1492594816,
- 'upload_date': '20170419',
- },
- 'params': {
- 'format': 'bestvideo',
- 'skip_download': True,
- },
- },
# Facebook <iframe> embed
{
'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html',
@@ -1879,6 +1908,37 @@ class GenericIE(InfoExtractor):
'title': 'Building A Business Online: Principal Chairs Q & A',
},
},
+ {
+ # multiple HTML5 videos on one page
+ 'url': 'https://www.paragon-software.com/home/rk-free/keyscenarios.html',
+ 'info_dict': {
+ 'id': 'keyscenarios',
+ 'title': 'Rescue Kit 14 Free Edition - Getting started',
+ },
+ 'playlist_count': 4,
+ },
+ {
+ # vshare embed
+ 'url': 'https://youtube-dl-demo.neocities.org/vshare.html',
+ 'md5': '17b39f55b5497ae8b59f5fbce8e35886',
+ 'info_dict': {
+ 'id': '0f64ce6',
+ 'title': 'vl14062007715967',
+ 'ext': 'mp4',
+ }
+ },
+ {
+ 'url': 'http://www.heidelberg-laureate-forum.org/blog/video/lecture-friday-september-23-2016-sir-c-antony-r-hoare/',
+ 'md5': 'aecd089f55b1cb5a59032cb049d3a356',
+ 'info_dict': {
+ 'id': '90227f51a80c4d8f86c345a7fa62bd9a1d',
+ 'ext': 'mp4',
+ 'title': 'Lecture: Friday, September 23, 2016 - Sir Tony Hoare',
+ 'description': 'md5:5a51db84a62def7b7054df2ade403c6c',
+ 'timestamp': 1474354800,
+ 'upload_date': '20160920',
+ }
+ }
# {
# # TODO: find another test
# # http://schema.org/VideoObject
@@ -2128,7 +2188,7 @@ class GenericIE(InfoExtractor):
return self.playlist_result(self._parse_xspf(doc, video_id), video_id)
elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
info_dict['formats'] = self._parse_mpd_formats(
- doc, video_id,
+ doc,
mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0],
mpd_url=url)
self._sort_formats(info_dict['formats'])
@@ -2166,7 +2226,7 @@ class GenericIE(InfoExtractor):
# And then there are the jokers who advertise that they use RTA,
# but actually don't.
AGE_LIMIT_MARKERS = [
- r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>',
+ r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>',
]
if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS):
age_limit = 18
@@ -2228,7 +2288,7 @@ class GenericIE(InfoExtractor):
# Look for embedded rtl.nl player
matches = re.findall(
- r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"',
+ r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"',
webpage)
if matches:
return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl')
@@ -2627,7 +2687,7 @@ class GenericIE(InfoExtractor):
# Look for UDN embeds
mobj = re.search(
- r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage)
+ r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage)
if mobj is not None:
return self.url_result(
compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed')
@@ -2637,11 +2697,6 @@ class GenericIE(InfoExtractor):
if senate_isvp_url:
return self.url_result(senate_isvp_url, 'SenateISVP')
- # Look for Dailymotion Cloud videos
- dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage)
- if dmcloud_url:
- return self.url_result(dmcloud_url, 'DailymotionCloud')
-
# Look for OnionStudios embeds
onionstudios_url = OnionStudiosIE._extract_url(webpage)
if onionstudios_url:
@@ -2653,9 +2708,9 @@ class GenericIE(InfoExtractor):
return self.url_result(viewlift_url)
# Look for JWPlatform embeds
- jwplatform_url = JWPlatformIE._extract_url(webpage)
- if jwplatform_url:
- return self.url_result(jwplatform_url, 'JWPlatform')
+ jwplatform_urls = JWPlatformIE._extract_urls(webpage)
+ if jwplatform_urls:
+ return self.playlist_from_matches(jwplatform_urls, video_id, video_title, ie=JWPlatformIE.ie_key())
# Look for Digiteka embeds
digiteka_url = DigitekaIE._extract_url(webpage)
@@ -2831,6 +2886,26 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key())
+ channel9_urls = Channel9IE._extract_urls(webpage)
+ if channel9_urls:
+ return self.playlist_from_matches(
+ channel9_urls, video_id, video_title, ie=Channel9IE.ie_key())
+
+ vshare_urls = VShareIE._extract_urls(webpage)
+ if vshare_urls:
+ return self.playlist_from_matches(
+ vshare_urls, video_id, video_title, ie=VShareIE.ie_key())
+
+ # Look for Mediasite embeds
+ mediasite_urls = MediasiteIE._extract_urls(webpage)
+ if mediasite_urls:
+ entries = [
+ self.url_result(smuggle_url(
+ compat_urlparse.urljoin(url, mediasite_url),
+ {'UrlReferrer': url}), ie=MediasiteIE.ie_key())
+ for mediasite_url in mediasite_urls]
+ return self.playlist_result(entries, video_id, video_title)
+
def merge_dicts(dict1, dict2):
merged = {}
for k, v in dict1.items():
@@ -2849,13 +2924,20 @@ class GenericIE(InfoExtractor):
# Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
if entries:
- for entry in entries:
- entry.update({
+ if len(entries) == 1:
+ entries[0].update({
'id': video_id,
'title': video_title,
})
+ else:
+ for num, entry in enumerate(entries, start=1):
+ entry.update({
+ 'id': '%s-%s' % (video_id, num),
+ 'title': '%s (%d)' % (video_title, num),
+ })
+ for entry in entries:
self._sort_formats(entry['formats'])
- return self.playlist_result(entries)
+ return self.playlist_result(entries, video_id, video_title)
jwplayer_data = self._find_jwplayer_data(
webpage, video_id, transform_source=js_to_json)
@@ -2864,6 +2946,46 @@ class GenericIE(InfoExtractor):
jwplayer_data, video_id, require_title=False, base_url=url)
return merge_dicts(info, info_dict)
+ # Video.js embed
+ mobj = re.search(
+ r'(?s)\bvideojs\s*\(.+?\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;',
+ webpage)
+ if mobj is not None:
+ sources = self._parse_json(
+ mobj.group(1), video_id, transform_source=js_to_json,
+ fatal=False) or []
+ if not isinstance(sources, list):
+ sources = [sources]
+ formats = []
+ for source in sources:
+ src = source.get('src')
+ if not src or not isinstance(src, compat_str):
+ continue
+ src = compat_urlparse.urljoin(url, src)
+ src_type = source.get('type')
+ if isinstance(src_type, compat_str):
+ src_type = src_type.lower()
+ ext = determine_ext(src).lower()
+ if src_type == 'video/youtube':
+ return self.url_result(src, YoutubeIE.ie_key())
+ if src_type == 'application/dash+xml' or ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ src, video_id, mpd_id='dash', fatal=False))
+ elif src_type == 'application/x-mpegurl' or ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ src, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ else:
+ formats.append({
+ 'url': src,
+ 'ext': (mimetype2ext(src_type) or  # prefer the ext derived from the source's `type`
+ (ext if ext in KNOWN_EXTENSIONS else 'mp4')),  # explicit grouping: otherwise the URL ext if known, else 'mp4' (ungrouped, `A or B if C else D` parses as `(A or B) if C else D`)
+ })
+ if formats:
+ self._sort_formats(formats)
+ info_dict['formats'] = formats
+ return info_dict
+
# Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld(
webpage, video_id, default={}, expected_type='VideoObject')
@@ -2957,7 +3079,7 @@ class GenericIE(InfoExtractor):
# be supported by youtube-dl thus this is checked the very last (see
# https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser)
embed_url = self._html_search_meta('twitter:player', webpage, default=None)
- if embed_url:
+ if embed_url and embed_url != url:
return self.url_result(embed_url)
if not found: