diff options
Diffstat (limited to 'youtube_dl/extractor/generic.py')
-rw-r--r-- | youtube_dl/extractor/generic.py | 242 |
1 files changed, 182 insertions, 60 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index b83c183..9b0cd00 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -22,6 +22,8 @@ from ..utils import ( HEADRequest, is_html, js_to_json, + KNOWN_EXTENSIONS, + mimetype2ext, orderedSet, sanitized_Request, smuggle_url, @@ -57,10 +59,7 @@ from .tnaflix import TNAFlixNetworkEmbedIE from .drtuber import DrTuberIE from .redtube import RedTubeIE from .vimeo import VimeoIE -from .dailymotion import ( - DailymotionIE, - DailymotionCloudIE, -) +from .dailymotion import DailymotionIE from .dailymail import DailyMailIE from .onionstudios import OnionStudiosIE from .viewlift import ViewLiftEmbedIE @@ -99,6 +98,9 @@ from .mediaset import MediasetIE from .joj import JojIE from .megaphone import MegaphoneIE from .vzaar import VzaarIE +from .channel9 import Channel9IE +from .vshare import VShareIE +from .mediasite import MediasiteIE class GenericIE(InfoExtractor): @@ -1088,23 +1090,24 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'upload_date': '20150212', 'uploader': 'The National Archives UK', - 'description': 'md5:a236581cd2449dd2df4f93412f3f01c6', + 'description': 'md5:8078af856dca76edc42910b61273dbbf', 'uploader_id': 'NationalArchives08', 'title': 'Webinar: Using Discovery, The National Archives’ online catalogue', }, }, # jwplayer rtmp { - 'url': 'http://www.suffolk.edu/sjc/', + 'url': 'http://www.suffolk.edu/sjc/live.php', 'info_dict': { - 'id': 'sjclive', + 'id': 'live', 'ext': 'flv', 'title': 'Massachusetts Supreme Judicial Court Oral Arguments', 'uploader': 'www.suffolk.edu', }, 'params': { 'skip_download': True, - } + }, + 'skip': 'Only has video a few mornings per month, see http://www.suffolk.edu/sjc/', }, # Complex jwplayer { @@ -1113,6 +1116,7 @@ class GenericIE(InfoExtractor): 'id': 'videos', 'ext': 'mp4', 'title': 'king machine trailer 1', + 'description': 'Browse King Machine videos & audio for sweet media. Your eyes will thank you.', 'thumbnail': r're:^https?://.*\.jpg$', }, }, @@ -1130,13 +1134,55 @@ class GenericIE(InfoExtractor): 'skip_download': True, } }, + { + # JWPlatform iframe + 'url': 'https://www.mediaite.com/tv/dem-senator-claims-gary-cohn-faked-a-bad-connection-during-trump-call-to-get-him-off-the-phone/', + 'md5': 'ca00a040364b5b439230e7ebfd02c4e9', + 'info_dict': { + 'id': 'O0c5JcKT', + 'ext': 'mp4', + 'upload_date': '20171122', + 'timestamp': 1511366290, + 'title': 'Dem Senator Claims Gary Cohn Faked a Bad Connection During Trump Call to Get Him Off the Phone', + }, + 'add_ie': [JWPlatformIE.ie_key()], + }, + { + # Video.js embed, multiple formats + 'url': 'http://ortcam.com/solidworks-урок-6-настройка-чертежа_33f9b7351.html', + 'info_dict': { + 'id': 'yygqldloqIk', + 'ext': 'mp4', + 'title': 'SolidWorks. Урок 6 Настройка чертежа', + 'description': 'md5:baf95267792646afdbf030e4d06b2ab3', + 'upload_date': '20130314', + 'uploader': 'PROстое3D', + 'uploader_id': 'PROstoe3D', + }, + 'params': { + 'skip_download': True, + }, + }, + { + # Video.js embed, single format + 'url': 'https://www.vooplayer.com/v3/watch/watch.php?v=NzgwNTg=', + 'info_dict': { + 'id': 'watch', + 'ext': 'mp4', + 'title': 'Step 1 - Good Foundation', + 'description': 'md5:d1e7ff33a29fc3eb1673d6c270d344f4', + }, + 'params': { + 'skip_download': True, + }, + }, # rtl.nl embed { 'url': 'http://www.rtlnieuws.nl/nieuws/buitenland/aanslagen-kopenhagen', 'playlist_mincount': 5, 'info_dict': { 'id': 'aanslagen-kopenhagen', - 'title': 'Aanslagen Kopenhagen | RTL Nieuws', + 'title': 'Aanslagen Kopenhagen', } }, # Zapiks embed @@ -1268,6 +1314,7 @@ class GenericIE(InfoExtractor): 'params': { 'skip_download': True, }, + 'skip': 'This video is unavailable.', }, # Pladform embed { @@ -1281,6 +1328,7 @@ class GenericIE(InfoExtractor): 'duration': 694, 'age_limit': 0, }, + 'skip': 'HTTP Error 404: Not Found', }, # Playwire embed { @@ -1301,6 +1349,14 @@ class GenericIE(InfoExtractor): 'id': '518726732', 'ext': 'mp4', 'title': 'Facebook Creates "On This Day" | Crunch Report', + 'description': 'Amazon updates Fire TV line, Tesla\'s Model X spotted in the wild', + 'timestamp': 1427237531, + 'uploader': 'Crunch Report', + 'upload_date': '20150324', + }, + 'params': { + # m3u8 download + 'skip_download': True, }, }, # SVT embed @@ -1352,16 +1408,20 @@ class GenericIE(InfoExtractor): 'upload_date': '20140107', 'timestamp': 1389118457, }, + 'skip': 'Invalid Page URL', }, # NBC News embed { 'url': 'http://www.vulture.com/2016/06/letterman-couldnt-care-less-about-late-night.html', 'md5': '1aa589c675898ae6d37a17913cf68d66', 'info_dict': { - 'id': '701714499682', + 'id': 'x_dtl_oa_LettermanliftPR_160608', 'ext': 'mp4', - 'title': 'PREVIEW: On Assignment: David Letterman', + 'title': 'David Letterman: A Preview', 'description': 'A preview of Tom Brokaw\'s interview with David Letterman as part of the On Assignment series powered by Dateline. Airs Sunday June 12 at 7/6c.', + 'upload_date': '20160609', + 'timestamp': 1465431544, + 'uploader': 'NBCU-NEWS', }, }, # UDN embed @@ -1378,6 +1438,7 @@ class GenericIE(InfoExtractor): # m3u8 download 'skip_download': True, }, + 'expected_warnings': ['Failed to parse JSON Expecting value'], }, # Ooyala embed { @@ -1385,7 +1446,7 @@ class GenericIE(InfoExtractor): 'info_dict': { 'id': '50YnY4czr4ms1vJ7yz3xzq0excz_pUMs', 'ext': 'mp4', - 'description': 'VIDEO: INDEX/MATCH versus VLOOKUP.', + 'description': 'Index/Match versus VLOOKUP.', 'title': 'This is what separates the Excel masters from the wannabes', 'duration': 191.933, }, @@ -1409,22 +1470,6 @@ class GenericIE(InfoExtractor): 'timestamp': 1432570283, }, }, - # Dailymotion Cloud video - { - 'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910', - 'md5': 'dcaf23ad0c67a256f4278bce6e0bae38', - 'info_dict': { - 'id': 'x2uy8t3', - 'ext': 'mp4', - 'title': 'Sauvons les abeilles ! - Le débat', - 'description': 'md5:d9082128b1c5277987825d684939ca26', - 'thumbnail': r're:^https?://.*\.jpe?g$', - 'timestamp': 1434970506, - 'upload_date': '20150622', - 'uploader': 'Public Sénat', - 'uploader_id': 'xa9gza', - } - }, # OnionStudios embed { 'url': 'http://www.clickhole.com/video/dont-understand-bitcoin-man-will-mumble-explanatio-2537', @@ -1581,22 +1626,6 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['BrightcoveLegacy'], }, - # Nexx embed - { - 'url': 'https://www.funk.net/serien/5940e15073f6120001657956/items/593efbb173f6120001657503', - 'info_dict': { - 'id': '247746', - 'ext': 'mp4', - 'title': "Yesterday's Jam (OV)", - 'description': 'md5:09bc0984723fed34e2581624a84e05f0', - 'timestamp': 1492594816, - 'upload_date': '20170419', - }, - 'params': { - 'format': 'bestvideo', - 'skip_download': True, - }, - }, # Facebook <iframe> embed { 'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html', @@ -1879,6 +1908,37 @@ class GenericIE(InfoExtractor): 'title': 'Building A Business Online: Principal Chairs Q & A', }, }, + { + # multiple HTML5 videos on one page + 'url': 'https://www.paragon-software.com/home/rk-free/keyscenarios.html', + 'info_dict': { + 'id': 'keyscenarios', + 'title': 'Rescue Kit 14 Free Edition - Getting started', + }, + 'playlist_count': 4, + }, + { + # vshare embed + 'url': 'https://youtube-dl-demo.neocities.org/vshare.html', + 'md5': '17b39f55b5497ae8b59f5fbce8e35886', + 'info_dict': { + 'id': '0f64ce6', + 'title': 'vl14062007715967', + 'ext': 'mp4', + } + }, + { + 'url': 'http://www.heidelberg-laureate-forum.org/blog/video/lecture-friday-september-23-2016-sir-c-antony-r-hoare/', + 'md5': 'aecd089f55b1cb5a59032cb049d3a356', + 'info_dict': { + 'id': '90227f51a80c4d8f86c345a7fa62bd9a1d', + 'ext': 'mp4', + 'title': 'Lecture: Friday, September 23, 2016 - Sir Tony Hoare', + 'description': 'md5:5a51db84a62def7b7054df2ade403c6c', + 'timestamp': 1474354800, + 'upload_date': '20160920', + } + } # { # # TODO: find another test # # http://schema.org/VideoObject @@ -2128,7 +2188,7 @@ class GenericIE(InfoExtractor): return self.playlist_result(self._parse_xspf(doc, video_id), video_id) elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag): info_dict['formats'] = self._parse_mpd_formats( - doc, video_id, + doc, mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0], mpd_url=url) self._sort_formats(info_dict['formats']) @@ -2166,7 +2226,7 @@ class GenericIE(InfoExtractor): # And then there are the jokers who advertise that they use RTA, # but actually don't. AGE_LIMIT_MARKERS = [ - r'Proudly Labeled <a href="http://www.rtalabel.org/" title="Restricted to Adults">RTA</a>', + r'Proudly Labeled <a href="http://www\.rtalabel\.org/" title="Restricted to Adults">RTA</a>', ] if any(re.search(marker, webpage) for marker in AGE_LIMIT_MARKERS): age_limit = 18 @@ -2228,7 +2288,7 @@ class GenericIE(InfoExtractor): # Look for embedded rtl.nl player matches = re.findall( - r'<iframe[^>]+?src="((?:https?:)?//(?:www\.)?rtl\.nl/system/videoplayer/[^"]+(?:video_)?embed[^"]+)"', + r'<iframe[^>]+?src="((?:https?:)?//(?:(?:www|static)\.)?rtl\.nl/(?:system/videoplayer/[^"]+(?:video_)?)?embed[^"]+)"', webpage) if matches: return self.playlist_from_matches(matches, video_id, video_title, ie='RtlNl') @@ -2627,7 +2687,7 @@ class GenericIE(InfoExtractor): # Look for UDN embeds mobj = re.search( - r'<iframe[^>]+src="(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) + r'<iframe[^>]+src="(?:https?:)?(?P<url>%s)"' % UDNEmbedIE._PROTOCOL_RELATIVE_VALID_URL, webpage) if mobj is not None: return self.url_result( compat_urlparse.urljoin(url, mobj.group('url')), 'UDNEmbed') @@ -2637,11 +2697,6 @@ class GenericIE(InfoExtractor): if senate_isvp_url: return self.url_result(senate_isvp_url, 'SenateISVP') - # Look for Dailymotion Cloud videos - dmcloud_url = DailymotionCloudIE._extract_dmcloud_url(webpage) - if dmcloud_url: - return self.url_result(dmcloud_url, 'DailymotionCloud') - # Look for OnionStudios embeds onionstudios_url = OnionStudiosIE._extract_url(webpage) if onionstudios_url: @@ -2653,9 +2708,9 @@ class GenericIE(InfoExtractor): return self.url_result(viewlift_url) # Look for JWPlatform embeds - jwplatform_url = JWPlatformIE._extract_url(webpage) - if jwplatform_url: - return self.url_result(jwplatform_url, 'JWPlatform') + jwplatform_urls = JWPlatformIE._extract_urls(webpage) + if jwplatform_urls: + return self.playlist_from_matches(jwplatform_urls, video_id, video_title, ie=JWPlatformIE.ie_key()) # Look for Digiteka embeds digiteka_url = DigitekaIE._extract_url(webpage) @@ -2831,6 +2886,26 @@ class GenericIE(InfoExtractor): return self.playlist_from_matches( vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key()) + channel9_urls = Channel9IE._extract_urls(webpage) + if channel9_urls: + return self.playlist_from_matches( + channel9_urls, video_id, video_title, ie=Channel9IE.ie_key()) + + vshare_urls = VShareIE._extract_urls(webpage) + if vshare_urls: + return self.playlist_from_matches( + vshare_urls, video_id, video_title, ie=VShareIE.ie_key()) + + # Look for Mediasite embeds + mediasite_urls = MediasiteIE._extract_urls(webpage) + if mediasite_urls: + entries = [ + self.url_result(smuggle_url( + compat_urlparse.urljoin(url, mediasite_url), + {'UrlReferrer': url}), ie=MediasiteIE.ie_key()) + for mediasite_url in mediasite_urls] + return self.playlist_result(entries, video_id, video_title) + def merge_dicts(dict1, dict2): merged = {} for k, v in dict1.items(): @@ -2849,13 +2924,20 @@ class GenericIE(InfoExtractor): # Look for HTML5 media entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls') if entries: - for entry in entries: - entry.update({ + if len(entries) == 1: + entries[0].update({ 'id': video_id, 'title': video_title, }) + else: + for num, entry in enumerate(entries, start=1): + entry.update({ + 'id': '%s-%s' % (video_id, num), + 'title': '%s (%d)' % (video_title, num), + }) + for entry in entries: self._sort_formats(entry['formats']) - return self.playlist_result(entries) + return self.playlist_result(entries, video_id, video_title) jwplayer_data = self._find_jwplayer_data( webpage, video_id, transform_source=js_to_json) @@ -2864,6 +2946,46 @@ class GenericIE(InfoExtractor): jwplayer_data, video_id, require_title=False, base_url=url) return merge_dicts(info, info_dict) + # Video.js embed + mobj = re.search( + r'(?s)\bvideojs\s*\(.+?\.src\s*\(\s*((?:\[.+?\]|{.+?}))\s*\)\s*;', + webpage) + if mobj is not None: + sources = self._parse_json( + mobj.group(1), video_id, transform_source=js_to_json, + fatal=False) or [] + if not isinstance(sources, list): + sources = [sources] + formats = [] + for source in sources: + src = source.get('src') + if not src or not isinstance(src, compat_str): + continue + src = compat_urlparse.urljoin(url, src) + src_type = source.get('type') + if isinstance(src_type, compat_str): + src_type = src_type.lower() + ext = determine_ext(src).lower() + if src_type == 'video/youtube': + return self.url_result(src, YoutubeIE.ie_key()) + if src_type == 'application/dash+xml' or ext == 'mpd': + formats.extend(self._extract_mpd_formats( + src, video_id, mpd_id='dash', fatal=False)) + elif src_type == 'application/x-mpegurl' or ext == 'm3u8': + formats.extend(self._extract_m3u8_formats( + src, video_id, 'mp4', entry_protocol='m3u8_native', + m3u8_id='hls', fatal=False)) + else: + formats.append({ + 'url': src, + 'ext': (mimetype2ext(src_type) or + ext if ext in KNOWN_EXTENSIONS else 'mp4'), + }) + if formats: + self._sort_formats(formats) + info_dict['formats'] = formats + return info_dict + # Looking for http://schema.org/VideoObject json_ld = self._search_json_ld( webpage, video_id, default={}, expected_type='VideoObject') @@ -2957,7 +3079,7 @@ class GenericIE(InfoExtractor): # be supported by youtube-dl thus this is checked the very last (see # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) embed_url = self._html_search_meta('twitter:player', webpage, default=None) - if embed_url: + if embed_url and embed_url != url: return self.url_result(embed_url) if not found: |