diff options
Diffstat (limited to 'youtube_dl/extractor/generic.py')
-rw-r--r-- | youtube_dl/extractor/generic.py | 282 |
1 files changed, 252 insertions, 30 deletions
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py index 4aa2406..197ab95 100644 --- a/youtube_dl/extractor/generic.py +++ b/youtube_dl/extractor/generic.py @@ -49,7 +49,10 @@ from .pornhub import PornHubIE from .xhamster import XHamsterEmbedIE from .tnaflix import TNAFlixNetworkEmbedIE from .vimeo import VimeoIE -from .dailymotion import DailymotionCloudIE +from .dailymotion import ( + DailymotionIE, + DailymotionCloudIE, +) from .onionstudios import OnionStudiosIE from .viewlift import ViewLiftEmbedIE from .screenwavemedia import ScreenwaveMediaIE @@ -59,11 +62,17 @@ from .videomore import VideomoreIE from .googledrive import GoogleDriveIE from .jwplatform import JWPlatformIE from .digiteka import DigitekaIE +from .arkena import ArkenaIE from .instagram import InstagramIE from .liveleak import LiveLeakIE from .threeqsdn import ThreeQSDNIE from .theplatform import ThePlatformIE from .vessel import VesselIE +from .kaltura import KalturaIE +from .eagleplatform import EaglePlatformIE +from .facebook import FacebookIE +from .soundcloud import SoundcloudIE +from .vbox7 import Vbox7IE class GenericIE(InfoExtractor): @@ -467,7 +476,7 @@ class GenericIE(InfoExtractor): 'url': 'http://www.vestifinance.ru/articles/25753', 'info_dict': { 'id': '25753', - 'title': 'Вести Экономика ― Прямые трансляции с Форума-выставки "Госзаказ-2013"', + 'title': 'Прямые трансляции с Форума-выставки "Госзаказ-2013"', }, 'playlist': [{ 'info_dict': { @@ -634,6 +643,8 @@ class GenericIE(InfoExtractor): 'ext': 'mp4', 'title': 'Key and Peele|October 10, 2012|2|203|Liam Neesons - Uncensored', 'description': 'Two valets share their love for movie star Liam Neesons.', + 'timestamp': 1349922600, + 'upload_date': '20121011', }, }, # YouTube embed via <data-embed-url=""> @@ -775,6 +786,15 @@ class GenericIE(InfoExtractor): 'upload_date': '20141029', } }, + # Soundcloud multiple embeds + { + 'url': 'http://www.guitarplayer.com/lessons/1014/legato-workout-one-hour-to-more-fluid-performance---tab/52809', + 'info_dict': { + 'id': '52809', + 'title': 'Guitar Essentials: Legato Workout—One-Hour to Fluid Performance | TAB + AUDIO', + }, + 'playlist_mincount': 7, + }, # Livestream embed { 'url': 'http://www.esa.int/Our_Activities/Space_Science/Rosetta/Philae_comet_touch-down_webcast', @@ -850,6 +870,7 @@ class GenericIE(InfoExtractor): 'description': 'md5:601cb790edd05908957dae8aaa866465', 'upload_date': '20150220', }, + 'skip': 'All The Daily Show URLs now redirect to http://www.cc.com/shows/', }, # jwplayer YouTube { @@ -920,6 +941,24 @@ class GenericIE(InfoExtractor): }, 'add_ie': ['Kaltura'], }, + { + # Kaltura embedded via quoted entry_id + 'url': 'https://www.oreilly.com/ideas/my-cloud-makes-pretty-pictures', + 'info_dict': { + 'id': '0_utuok90b', + 'ext': 'mp4', + 'title': '06_matthew_brender_raj_dutt', + 'timestamp': 1466638791, + 'upload_date': '20160622', + }, + 'add_ie': ['Kaltura'], + 'expected_warnings': [ + 'Could not send HEAD request' + ], + 'params': { + 'skip_download': True, + } + }, # Eagle.Platform embed (generic URL) { 'url': 'http://lenta.ru/news/2015/03/06/navalny/', @@ -1091,12 +1130,17 @@ class GenericIE(InfoExtractor): # Dailymotion Cloud video { 'url': 'http://replay.publicsenat.fr/vod/le-debat/florent-kolandjian,dominique-cena,axel-decourtye,laurence-abeille,bruno-parmentier/175910', - 'md5': '49444254273501a64675a7e68c502681', + 'md5': 'dcaf23ad0c67a256f4278bce6e0bae38', 'info_dict': { - 'id': '5585de919473990de4bee11b', + 'id': 'x2uy8t3', 'ext': 'mp4', - 'title': 'Le débat', + 'title': 'Sauvons les abeilles ! - Le débat', + 'description': 'md5:d9082128b1c5277987825d684939ca26', 'thumbnail': 're:^https?://.*\.jpe?g$', + 'timestamp': 1434970506, + 'upload_date': '20150622', + 'uploader': 'Public Sénat', + 'uploader_id': 'xa9gza', } }, # OnionStudios embed @@ -1220,6 +1264,145 @@ class GenericIE(InfoExtractor): 'uploader': 'www.hudl.com', }, }, + # twitter:player:stream embed + { + 'url': 'http://www.rtl.be/info/video/589263.aspx?CategoryID=288', + 'info_dict': { + 'id': 'master', + 'ext': 'mp4', + 'title': 'Une nouvelle espèce de dinosaure découverte en Argentine', + 'uploader': 'www.rtl.be', + }, + 'params': { + # m3u8 downloads + 'skip_download': True, + }, + }, + # twitter:player embed + { + 'url': 'http://www.theatlantic.com/video/index/484130/what-do-black-holes-sound-like/', + 'md5': 'a3e0df96369831de324f0778e126653c', + 'info_dict': { + 'id': '4909620399001', + 'ext': 'mp4', + 'title': 'What Do Black Holes Sound Like?', + 'description': 'what do black holes sound like', + 'upload_date': '20160524', + 'uploader_id': '29913724001', + 'timestamp': 1464107587, + 'uploader': 'TheAtlantic', + }, + 'add_ie': ['BrightcoveLegacy'], + }, + # Facebook <iframe> embed + { + 'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html', + 'md5': 'fbcde74f534176ecb015849146dd3aee', + 'info_dict': { + 'id': '599637780109885', + 'ext': 'mp4', + 'title': 'Facebook video #599637780109885', + }, + }, + # Facebook API embed + { + 'url': 'http://www.lothype.com/blue-stars-2016-preview-standstill-full-show/', + 'md5': 'a47372ee61b39a7b90287094d447d94e', + 'info_dict': { + 'id': '10153467542406923', + 'ext': 'mp4', + 'title': 'Facebook video #10153467542406923', + }, + }, + # Wordpress "YouTube Video Importer" plugin + { + 'url': 'http://www.lothype.com/blue-devils-drumline-stanford-lot-2016/', + 'md5': 'd16797741b560b485194eddda8121b48', + 'info_dict': { + 'id': 'HNTXWDXV9Is', + 'ext': 'mp4', + 'title': 'Blue Devils Drumline Stanford lot 2016', + 'upload_date': '20160627', + 'uploader_id': 'GENOCIDE8GENERAL10', + 'uploader': 'cylus cyrus', + }, + }, + { + # video stored on custom kaltura server + 'url': 'http://www.expansion.com/multimedia/videos.html?media=EQcM30NHIPv', + 'md5': '537617d06e64dfed891fa1593c4b30cc', + 'info_dict': { + 'id': '0_1iotm5bh', + 'ext': 'mp4', + 'title': 'Elecciones británicas: 5 lecciones para Rajoy', + 'description': 'md5:435a89d68b9760b92ce67ed227055f16', + 'uploader_id': 'videos.expansion@el-mundo.net', + 'upload_date': '20150429', + 'timestamp': 1430303472, + }, + 'add_ie': ['Kaltura'], + }, + { + # Non-standard Vimeo embed + 'url': 'https://openclassrooms.com/courses/understanding-the-web', + 'md5': '64d86f1c7d369afd9a78b38cbb88d80a', + 'info_dict': { + 'id': '148867247', + 'ext': 'mp4', + 'title': 'Understanding the web - Teaser', + 'description': 'This is "Understanding the web - Teaser" by openclassrooms on Vimeo, the home for high quality videos and the people who love them.', + 'upload_date': '20151214', + 'uploader': 'OpenClassrooms', + 'uploader_id': 'openclassrooms', + }, + 'add_ie': ['Vimeo'], + }, + { + 'url': 'https://support.arkena.com/display/PLAY/Ways+to+embed+your+video', + 'md5': 'b96f2f71b359a8ecd05ce4e1daa72365', + 'info_dict': { + 'id': 'b41dda37-d8e7-4d3f-b1b5-9a9db578bdfe', + 'ext': 'mp4', + 'title': 'Big Buck Bunny', + 'description': 'Royalty free test video', + 'timestamp': 1432816365, + 'upload_date': '20150528', + 'is_live': False, + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [ArkenaIE.ie_key()], + }, + { + 'url': 'http://nova.bg/news/view/2016/08/16/156543/%D0%BD%D0%B0-%D0%BA%D0%BE%D1%81%D1%8A%D0%BC-%D0%BE%D1%82-%D0%B2%D0%B7%D1%80%D0%B8%D0%B2-%D0%BE%D1%82%D1%86%D0%B5%D0%BF%D0%B8%D1%85%D0%B0-%D1%86%D1%8F%D0%BB-%D0%BA%D0%B2%D0%B0%D1%80%D1%82%D0%B0%D0%BB-%D0%B7%D0%B0%D1%80%D0%B0%D0%B4%D0%B8-%D0%B8%D0%B7%D1%82%D0%B8%D1%87%D0%B0%D0%BD%D0%B5-%D0%BD%D0%B0-%D0%B3%D0%B0%D0%B7-%D0%B2-%D0%BF%D0%BB%D0%BE%D0%B2%D0%B4%D0%B8%D0%B2/', + 'info_dict': { + 'id': '1c7141f46c', + 'ext': 'mp4', + 'title': 'НА КОСЪМ ОТ ВЗРИВ: Изтичане на газ на бензиностанция в Пловдив', + }, + 'params': { + 'skip_download': True, + }, + 'add_ie': [Vbox7IE.ie_key()], + }, + # { + # # TODO: find another test + # # http://schema.org/VideoObject + # 'url': 'https://flipagram.com/f/nyvTSJMKId', + # 'md5': '888dcf08b7ea671381f00fab74692755', + # 'info_dict': { + # 'id': 'nyvTSJMKId', + # 'ext': 'mp4', + # 'title': 'Flipagram by sjuria101 featuring Midnight Memories by One Direction', + # 'description': '#love for cats.', + # 'timestamp': 1461244995, + # 'upload_date': '20160421', + # }, + # 'params': { + # 'force_generic_extractor': True, + # }, + # } ] def report_following_redirect(self, new_url): @@ -1576,12 +1759,16 @@ class GenericIE(InfoExtractor): if matches: return _playlist_from_matches(matches, lambda m: unescapeHTML(m)) - # Look for embedded Dailymotion player - matches = re.findall( - r'<(?:(?:embed|iframe)[^>]+?src=|input[^>]+id=[\'"]dmcloudUrlEmissionSelect[\'"][^>]+value=)(["\'])(?P<url>(?:https?:)?//(?:www\.)?dailymotion\.com/(?:embed|swf)/video/.+?)\1', webpage) + # Look for Wordpress "YouTube Video Importer" plugin + matches = re.findall(r'''(?x)<div[^>]+ + class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+ + data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage) if matches: - return _playlist_from_matches( - matches, lambda m: unescapeHTML(m[1])) + return _playlist_from_matches(matches, lambda m: m[-1]) + + matches = DailymotionIE._extract_urls(webpage) + if matches: + return _playlist_from_matches(matches) # Look for embedded Dailymotion playlist player (#3822) m = re.search( @@ -1718,10 +1905,9 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url')) # Look for embedded Facebook player - mobj = re.search( - r'<iframe[^>]+?src=(["\'])(?P<url>https://www\.facebook\.com/video/embed.+?)\1', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'Facebook') + facebook_url = FacebookIE._extract_url(webpage) + if facebook_url is not None: + return self.url_result(facebook_url, 'Facebook') # Look for embedded VK player mobj = re.search(r'<iframe[^>]+?src=(["\'])(?P<url>https?://vk\.com/video_ext\.php.+?)\1', webpage) @@ -1836,12 +2022,9 @@ class GenericIE(InfoExtractor): return self.url_result(myvi_url) # Look for embedded soundcloud player - mobj = re.search( - r'<iframe\s+(?:[a-zA-Z0-9_-]+="[^"]+"\s+)*src="(?P<url>https?://(?:w\.)?soundcloud\.com/player[^"]+)"', - webpage) - if mobj is not None: - url = unescapeHTML(mobj.group('url')) - return self.url_result(url) + soundcloud_urls = SoundcloudIE._extract_urls(webpage) + if soundcloud_urls: + return _playlist_from_matches(soundcloud_urls, getter=unescapeHTML, ie=SoundcloudIE.ie_key()) # Look for embedded mtvservices player mtvservices_url = MTVServicesEmbeddedIE._extract_url(webpage) @@ -1903,18 +2086,14 @@ class GenericIE(InfoExtractor): return self.url_result(mobj.group('url'), 'Zapiks') # Look for Kaltura embeds - mobj = (re.search(r"(?s)kWidget\.(?:thumb)?[Ee]mbed\(\{.*?(?P<q1>['\"])wid(?P=q1)\s*:\s*(?P<q2>['\"])_?(?P<partner_id>[^'\"]+)(?P=q2),.*?(?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*(?P<q4>['\"])(?P<id>[^'\"]+)(?P=q4),", webpage) or - re.search(r'(?s)(?P<q1>["\'])(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com/.*?(?:p|partner_id)/(?P<partner_id>\d+).*?(?P=q1).*?entry_?[Ii]d\s*:\s*(?P<q2>["\'])(?P<id>.+?)(?P=q2)', webpage)) - if mobj is not None: - return self.url_result(smuggle_url( - 'kaltura:%(partner_id)s:%(id)s' % mobj.groupdict(), - {'source_url': url}), 'Kaltura') + kaltura_url = KalturaIE._extract_url(webpage) + if kaltura_url: + return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key()) # Look for Eagle.Platform embeds - mobj = re.search( - r'<iframe[^>]+src="(?P<url>https?://.+?\.media\.eagleplatform\.com/index/player\?.+?)"', webpage) - if mobj is not None: - return self.url_result(mobj.group('url'), 'EaglePlatform') + eagleplatform_url = EaglePlatformIE._extract_url(webpage) + if eagleplatform_url: + return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key()) # Look for ClipYou (uses Eagle.Platform) embeds mobj = re.search( @@ -2008,6 +2187,11 @@ class GenericIE(InfoExtractor): if digiteka_url: return self.url_result(self._proto_relative_url(digiteka_url), DigitekaIE.ie_key()) + # Look for Arkena embeds + arkena_url = ArkenaIE._extract_url(webpage) + if arkena_url: + return self.url_result(arkena_url, ArkenaIE.ie_key()) + # Look for Limelight embeds mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage) if mobj: @@ -2036,6 +2220,14 @@ class GenericIE(InfoExtractor): return self.url_result( self._proto_relative_url(unescapeHTML(mobj.group(1))), 'Vine') + # Look for VODPlatform embeds + mobj = re.search( + r'<iframe[^>]+src=[\'"]((?:https?:)?//(?:www\.)?vod-platform\.net/embed/[^/?#]+)', + webpage) + if mobj is not None: + return self.url_result( + self._proto_relative_url(unescapeHTML(mobj.group(1))), 'VODPlatform') + # Look for Instagram embeds instagram_embed_url = InstagramIE._extract_embed_url(webpage) if instagram_embed_url is not None: @@ -2060,6 +2252,24 @@ class GenericIE(InfoExtractor): 'uploader': video_uploader, } + # Look for VBOX7 embeds + vbox7_url = Vbox7IE._extract_url(webpage) + if vbox7_url: + return self.url_result(vbox7_url, Vbox7IE.ie_key()) + + # Looking for http://schema.org/VideoObject + json_ld = self._search_json_ld( + webpage, video_id, default={}, expected_type='VideoObject') + if json_ld.get('url'): + info_dict.update({ + 'title': video_title or info_dict['title'], + 'description': video_description, + 'thumbnail': video_thumbnail, + 'age_limit': age_limit + }) + info_dict.update(json_ld) + return info_dict + def check_video(vurl): if YoutubeIE.suitable(vurl): return True @@ -2103,6 +2313,9 @@ class GenericIE(InfoExtractor): r"cinerama\.embedPlayer\(\s*\'[^']+\',\s*'([^']+)'", webpage) if not found: # Try to find twitter cards info + # twitter:player:stream should be checked before twitter:player since + # it is expected to contain a raw stream (see + # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) found = filter_video(re.findall( r'<meta (?:property|name)="twitter:player:stream" (?:content|value)="(.+?)"', webpage)) if not found: @@ -2136,6 +2349,15 @@ class GenericIE(InfoExtractor): '_type': 'url', 'url': new_url, } + + if not found: + # twitter:player is a https URL to iframe player that may or may not + # be supported by youtube-dl thus this is checked the very last (see + # https://dev.twitter.com/cards/types/player#On_twitter.com_via_desktop_browser) + embed_url = self._html_search_meta('twitter:player', webpage, default=None) + if embed_url: + return self.url_result(embed_url) + if not found: raise UnsupportedError(url) |