aboutsummaryrefslogtreecommitdiffstats
path: root/youtube_dl/extractor
diff options
context:
space:
mode:
Diffstat (limited to 'youtube_dl/extractor')
-rw-r--r--youtube_dl/extractor/adn.py136
-rw-r--r--youtube_dl/extractor/adobepass.py69
-rw-r--r--youtube_dl/extractor/adultswim.py281
-rw-r--r--youtube_dl/extractor/aenetworks.py40
-rw-r--r--youtube_dl/extractor/afreecatv.py142
-rw-r--r--youtube_dl/extractor/airmozilla.py28
-rw-r--r--youtube_dl/extractor/aljazeera.py9
-rw-r--r--youtube_dl/extractor/allocine.py44
-rw-r--r--youtube_dl/extractor/amp.py26
-rw-r--r--youtube_dl/extractor/anvato.py66
-rw-r--r--youtube_dl/extractor/appleconnect.py4
-rw-r--r--youtube_dl/extractor/appletrailers.py5
-rw-r--r--youtube_dl/extractor/archiveorg.py4
-rw-r--r--youtube_dl/extractor/arte.py5
-rw-r--r--youtube_dl/extractor/atresplayer.py2
-rw-r--r--youtube_dl/extractor/audioboom.py2
-rw-r--r--youtube_dl/extractor/azubu.py140
-rw-r--r--youtube_dl/extractor/bandcamp.py12
-rw-r--r--youtube_dl/extractor/bbc.py10
-rw-r--r--youtube_dl/extractor/beeg.py2
-rw-r--r--youtube_dl/extractor/bilibili.py5
-rw-r--r--youtube_dl/extractor/bleacherreport.py10
-rw-r--r--youtube_dl/extractor/br.py2
-rw-r--r--youtube_dl/extractor/brightcove.py111
-rw-r--r--youtube_dl/extractor/canalc2.py5
-rw-r--r--youtube_dl/extractor/canalplus.py43
-rw-r--r--youtube_dl/extractor/canvas.py1
-rw-r--r--youtube_dl/extractor/cbc.py6
-rw-r--r--youtube_dl/extractor/cbslocal.py4
-rwxr-xr-xyoutube_dl/extractor/cda.py52
-rw-r--r--youtube_dl/extractor/ceskatelevize.py106
-rw-r--r--youtube_dl/extractor/chaturbate.py16
-rw-r--r--youtube_dl/extractor/clipfish.py2
-rw-r--r--youtube_dl/extractor/collegerama.py3
-rw-r--r--youtube_dl/extractor/common.py286
-rw-r--r--youtube_dl/extractor/condenast.py81
-rw-r--r--youtube_dl/extractor/coub.py5
-rw-r--r--youtube_dl/extractor/crackle.py7
-rw-r--r--youtube_dl/extractor/crunchyroll.py12
-rw-r--r--youtube_dl/extractor/cspan.py15
-rw-r--r--youtube_dl/extractor/curiositystream.py55
-rw-r--r--youtube_dl/extractor/cwtv.py7
-rw-r--r--youtube_dl/extractor/dailymail.py12
-rw-r--r--youtube_dl/extractor/dailymotion.py126
-rw-r--r--youtube_dl/extractor/democracynow.py3
-rw-r--r--youtube_dl/extractor/discoveryvr.py59
-rw-r--r--youtube_dl/extractor/dotsub.py2
-rw-r--r--youtube_dl/extractor/douyutv.py86
-rw-r--r--youtube_dl/extractor/drtv.py34
-rw-r--r--youtube_dl/extractor/extractors.py57
-rw-r--r--youtube_dl/extractor/foxsports.py9
-rw-r--r--youtube_dl/extractor/francetv.py218
-rw-r--r--youtube_dl/extractor/funimation.py273
-rw-r--r--youtube_dl/extractor/funnyordie.py3
-rw-r--r--youtube_dl/extractor/gamespot.py3
-rw-r--r--youtube_dl/extractor/gdcvault.py15
-rw-r--r--youtube_dl/extractor/generic.py262
-rw-r--r--youtube_dl/extractor/go.py49
-rw-r--r--youtube_dl/extractor/go90.py126
-rw-r--r--youtube_dl/extractor/hbo.py16
-rw-r--r--youtube_dl/extractor/imdb.py5
-rw-r--r--youtube_dl/extractor/infoq.py4
-rw-r--r--youtube_dl/extractor/instagram.py8
-rw-r--r--youtube_dl/extractor/iqiyi.py26
-rw-r--r--youtube_dl/extractor/itv.py28
-rw-r--r--youtube_dl/extractor/kaltura.py27
-rw-r--r--youtube_dl/extractor/laola1tv.py97
-rw-r--r--youtube_dl/extractor/leeco.py111
-rw-r--r--youtube_dl/extractor/lego.py2
-rw-r--r--youtube_dl/extractor/limelight.py53
-rw-r--r--youtube_dl/extractor/liveleak.py83
-rw-r--r--youtube_dl/extractor/mediaset.py118
-rw-r--r--youtube_dl/extractor/medici.py70
-rw-r--r--youtube_dl/extractor/mixcloud.py38
-rw-r--r--youtube_dl/extractor/myspace.py100
-rw-r--r--youtube_dl/extractor/nbc.py98
-rw-r--r--youtube_dl/extractor/nonktube.py33
-rw-r--r--youtube_dl/extractor/noovo.py97
-rw-r--r--youtube_dl/extractor/nowness.py2
-rw-r--r--youtube_dl/extractor/npo.py11
-rw-r--r--youtube_dl/extractor/nrk.py25
-rw-r--r--youtube_dl/extractor/nuevo.py5
-rw-r--r--youtube_dl/extractor/odnoklassniki.py32
-rw-r--r--youtube_dl/extractor/openload.py73
-rw-r--r--youtube_dl/extractor/orf.py109
-rw-r--r--youtube_dl/extractor/packtpub.py171
-rw-r--r--youtube_dl/extractor/pbs.py31
-rw-r--r--youtube_dl/extractor/periscope.py7
-rw-r--r--youtube_dl/extractor/porn91.py32
-rw-r--r--youtube_dl/extractor/pornhub.py5
-rw-r--r--youtube_dl/extractor/r7.py3
-rw-r--r--youtube_dl/extractor/rai.py509
-rw-r--r--youtube_dl/extractor/rbmaradio.py6
-rw-r--r--youtube_dl/extractor/rmcdecouverte.py26
-rw-r--r--youtube_dl/extractor/rtl2.py110
-rw-r--r--youtube_dl/extractor/rudo.py2
-rw-r--r--youtube_dl/extractor/streamable.py6
-rw-r--r--youtube_dl/extractor/streamango.py64
-rw-r--r--youtube_dl/extractor/ted.py2
-rw-r--r--youtube_dl/extractor/theplatform.py21
-rw-r--r--youtube_dl/extractor/thescene.py36
-rw-r--r--youtube_dl/extractor/thesun.py32
-rw-r--r--youtube_dl/extractor/turner.py9
-rw-r--r--youtube_dl/extractor/tv2hu.py62
-rw-r--r--youtube_dl/extractor/tv5mondeplus.py79
-rw-r--r--youtube_dl/extractor/tvp.py3
-rw-r--r--youtube_dl/extractor/tvplay.py6
-rw-r--r--youtube_dl/extractor/tvplayer.py35
-rw-r--r--youtube_dl/extractor/udemy.py79
-rw-r--r--youtube_dl/extractor/upskill.py176
-rw-r--r--youtube_dl/extractor/vevo.py17
-rw-r--r--youtube_dl/extractor/vice.py154
-rw-r--r--youtube_dl/extractor/viceland.py11
-rw-r--r--youtube_dl/extractor/videopress.py9
-rw-r--r--youtube_dl/extractor/vidio.py7
-rw-r--r--youtube_dl/extractor/vidzi.py11
-rw-r--r--youtube_dl/extractor/vier.py117
-rw-r--r--youtube_dl/extractor/viewlift.py2
-rw-r--r--youtube_dl/extractor/viewster.py3
-rw-r--r--youtube_dl/extractor/vlive.py4
-rw-r--r--youtube_dl/extractor/vrt.py1
-rw-r--r--youtube_dl/extractor/vrv.py212
-rw-r--r--youtube_dl/extractor/vshare.py38
-rw-r--r--youtube_dl/extractor/washingtonpost.py6
-rw-r--r--youtube_dl/extractor/wistia.py22
-rw-r--r--youtube_dl/extractor/worldstarhiphop.py46
-rw-r--r--youtube_dl/extractor/wsj.py52
-rw-r--r--youtube_dl/extractor/xfileshare.py86
-rw-r--r--youtube_dl/extractor/xtube.py23
-rw-r--r--youtube_dl/extractor/xvideos.py11
-rw-r--r--youtube_dl/extractor/yahoo.py2
-rw-r--r--youtube_dl/extractor/yandexmusic.py3
-rw-r--r--youtube_dl/extractor/youku.py8
-rw-r--r--youtube_dl/extractor/youtube.py363
-rw-r--r--youtube_dl/extractor/zaq1.py101
135 files changed, 5095 insertions, 2195 deletions
diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py
new file mode 100644
index 0000000..66caf6a
--- /dev/null
+++ b/youtube_dl/extractor/adn.py
@@ -0,0 +1,136 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import json
+import os
+
+from .common import InfoExtractor
+from ..aes import aes_cbc_decrypt
+from ..compat import compat_ord
+from ..utils import (
+ bytes_to_intlist,
+ ExtractorError,
+ float_or_none,
+ intlist_to_bytes,
+ srt_subtitles_timecode,
+ strip_or_none,
+)
+
+
+class ADNIE(InfoExtractor):
+ IE_DESC = 'Anime Digital Network'
+ _VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites',
+ 'md5': 'e497370d847fd79d9d4c74be55575c7a',
+ 'info_dict': {
+ 'id': '7778',
+ 'ext': 'mp4',
+ 'title': 'Blue Exorcist - Kyôto Saga - Épisode 1',
+ 'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5',
+ }
+ }
+
+ def _get_subtitles(self, sub_path, video_id):
+ if not sub_path:
+ return None
+
+ enc_subtitles = self._download_webpage(
+ 'http://animedigitalnetwork.fr/' + sub_path,
+ video_id, fatal=False)
+ if not enc_subtitles:
+ return None
+
+ # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js
+ dec_subtitles = intlist_to_bytes(aes_cbc_decrypt(
+ bytes_to_intlist(base64.b64decode(enc_subtitles[24:])),
+ bytes_to_intlist(b'\nd\xaf\xd2J\xd0\xfc\xe1\xfc\xdf\xb61\xe8\xe1\xf0\xcc'),
+ bytes_to_intlist(base64.b64decode(enc_subtitles[:24]))
+ ))
+ subtitles_json = self._parse_json(
+ dec_subtitles[:-compat_ord(dec_subtitles[-1])],
+ None, fatal=False)
+ if not subtitles_json:
+ return None
+
+ subtitles = {}
+ for sub_lang, sub in subtitles_json.items():
+ srt = ''
+ for num, current in enumerate(sub):
+ start, end, text = (
+ float_or_none(current.get('startTime')),
+ float_or_none(current.get('endTime')),
+ current.get('text'))
+ if start is None or end is None or text is None:
+ continue
+ srt += os.linesep.join(
+ (
+ '%d' % num,
+ '%s --> %s' % (
+ srt_subtitles_timecode(start),
+ srt_subtitles_timecode(end)),
+ text,
+ os.linesep,
+ ))
+
+ if sub_lang == 'vostf':
+ sub_lang = 'fr'
+ subtitles.setdefault(sub_lang, []).extend([{
+ 'ext': 'json',
+ 'data': json.dumps(sub),
+ }, {
+ 'ext': 'srt',
+ 'data': srt,
+ }])
+ return subtitles
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ player_config = self._parse_json(self._search_regex(
+ r'playerConfig\s*=\s*({.+});', webpage, 'player config'), video_id)
+
+ video_info = {}
+ video_info_str = self._search_regex(
+ r'videoInfo\s*=\s*({.+});', webpage,
+ 'video info', fatal=False)
+ if video_info_str:
+ video_info = self._parse_json(
+ video_info_str, video_id, fatal=False) or {}
+
+ options = player_config.get('options') or {}
+ metas = options.get('metas') or {}
+ title = metas.get('title') or video_info['title']
+ links = player_config.get('links') or {}
+
+ formats = []
+ for format_id, qualities in links.items():
+ for load_balancer_url in qualities.values():
+ load_balancer_data = self._download_json(
+ load_balancer_url, video_id, fatal=False) or {}
+ m3u8_url = load_balancer_data.get('location')
+ if not m3u8_url:
+ continue
+ m3u8_formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id=format_id, fatal=False)
+ if format_id == 'vf':
+ for f in m3u8_formats:
+ f['language'] = 'fr'
+ formats.extend(m3u8_formats)
+ error = options.get('error')
+ if not formats and error:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': strip_or_none(metas.get('summary') or video_info.get('resume')),
+ 'thumbnail': video_info.get('image'),
+ 'formats': formats,
+ 'subtitles': self.extract_subtitles(player_config.get('subtitles'), video_id),
+ 'episode': metas.get('subtitle') or video_info.get('videoTitle'),
+ 'series': video_info.get('playlistTitle'),
+ }
diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py
index 1b2d364..7da96c6 100644
--- a/youtube_dl/extractor/adobepass.py
+++ b/youtube_dl/extractor/adobepass.py
@@ -41,6 +41,11 @@ MSO_INFO = {
'username_field': 'IDToken1',
'password_field': 'IDToken2',
},
+ 'Verizon': {
+ 'name': 'Verizon FiOS',
+ 'username_field': 'IDToken1',
+ 'password_field': 'IDToken2',
+ },
'thr030': {
'name': '3 Rivers Communications'
},
@@ -1303,6 +1308,12 @@ class AdobePassIE(InfoExtractor):
_USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0'
_MVPD_CACHE = 'ap-mvpd'
+ def _download_webpage_handle(self, *args, **kwargs):
+ headers = kwargs.get('headers', {})
+ headers.update(self.geo_verification_headers())
+ kwargs['headers'] = headers
+ return super(AdobePassIE, self)._download_webpage_handle(*args, **kwargs)
+
@staticmethod
def _get_mvpd_resource(provider_id, title, guid, rating):
channel = etree.Element('channel')
@@ -1384,40 +1395,72 @@ class AdobePassIE(InfoExtractor):
# Comcast page flow varies by video site and whether you
# are on Comcast's network.
provider_redirect_page, urlh = provider_redirect_page_res
- # Check for Comcast auto login
if 'automatically signing you in' in provider_redirect_page:
oauth_redirect_url = self._html_search_regex(
r'window\.location\s*=\s*[\'"]([^\'"]+)',
provider_redirect_page, 'oauth redirect')
- # Just need to process the request. No useful data comes back
self._download_webpage(
oauth_redirect_url, video_id, 'Confirming auto login')
else:
if '<form name="signin"' in provider_redirect_page:
- # already have the form, just fill it
provider_login_page_res = provider_redirect_page_res
elif 'http-equiv="refresh"' in provider_redirect_page:
- # redirects to the login page
oauth_redirect_url = self._html_search_regex(
r'content="0;\s*url=([^\'"]+)',
provider_redirect_page, 'meta refresh redirect')
provider_login_page_res = self._download_webpage_handle(
- oauth_redirect_url,
- video_id, 'Downloading Provider Login Page')
+ oauth_redirect_url, video_id,
+ 'Downloading Provider Login Page')
else:
provider_login_page_res = post_form(
- provider_redirect_page_res, 'Downloading Provider Login Page')
+ provider_redirect_page_res,
+ 'Downloading Provider Login Page')
- mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', {
- mso_info.get('username_field', 'username'): username,
- mso_info.get('password_field', 'password'): password,
- })
+ mvpd_confirm_page_res = post_form(
+ provider_login_page_res, 'Logging in', {
+ mso_info['username_field']: username,
+ mso_info['password_field']: password,
+ })
mvpd_confirm_page, urlh = mvpd_confirm_page_res
if '<button class="submit" value="Resume">Resume</button>' in mvpd_confirm_page:
post_form(mvpd_confirm_page_res, 'Confirming Login')
-
+ elif mso_id == 'Verizon':
+ # In general, if you're connecting from a Verizon-assigned IP,
+ # you will not actually pass your credentials.
+ provider_redirect_page, urlh = provider_redirect_page_res
+ if 'Please wait ...' in provider_redirect_page:
+ saml_redirect_url = self._html_search_regex(
+ r'self\.parent\.location=(["\'])(?P<url>.+?)\1',
+ provider_redirect_page,
+ 'SAML Redirect URL', group='url')
+ saml_login_page = self._download_webpage(
+ saml_redirect_url, video_id,
+ 'Downloading SAML Login Page')
+ else:
+ saml_login_page_res = post_form(
+ provider_redirect_page_res, 'Logging in', {
+ mso_info['username_field']: username,
+ mso_info['password_field']: password,
+ })
+ saml_login_page, urlh = saml_login_page_res
+ if 'Please try again.' in saml_login_page:
+ raise ExtractorError(
+ 'We\'re sorry, but either the User ID or Password entered is not correct.')
+ saml_login_url = self._search_regex(
+ r'xmlHttp\.open\("POST"\s*,\s*(["\'])(?P<url>.+?)\1',
+ saml_login_page, 'SAML Login URL', group='url')
+ saml_response_json = self._download_json(
+ saml_login_url, video_id, 'Downloading SAML Response',
+ headers={'Content-Type': 'text/xml'})
+ self._download_webpage(
+ saml_response_json['targetValue'], video_id,
+ 'Confirming Login', data=urlencode_postdata({
+ 'SAMLResponse': saml_response_json['SAMLResponse'],
+ 'RelayState': saml_response_json['RelayState']
+ }), headers={
+ 'Content-Type': 'application/x-www-form-urlencoded'
+ })
else:
- # Normal, non-Comcast flow
provider_login_page_res = post_form(
provider_redirect_page_res, 'Downloading Provider Login Page')
mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', {
diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py
index 989505c..acc4ce3 100644
--- a/youtube_dl/extractor/adultswim.py
+++ b/youtube_dl/extractor/adultswim.py
@@ -5,91 +5,52 @@ import re
from .turner import TurnerBaseIE
from ..utils import (
- ExtractorError,
int_or_none,
+ strip_or_none,
)
class AdultSwimIE(TurnerBaseIE):
- _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<is_playlist>playlists/)?(?P<show_path>[^/]+)/(?P<episode_path>[^/?#]+)/?'
+ _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<show_path>[^/?#]+)(?:/(?P<episode_path>[^/?#]+))?'
_TESTS = [{
'url': 'http://adultswim.com/videos/rick-and-morty/pilot',
- 'playlist': [
- {
- 'md5': '247572debc75c7652f253c8daa51a14d',
- 'info_dict': {
- 'id': 'rQxZvXQ4ROaSOqq-or2Mow-0',
- 'ext': 'flv',
- 'title': 'Rick and Morty - Pilot Part 1',
- 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
- },
- },
- {
- 'md5': '77b0e037a4b20ec6b98671c4c379f48d',
- 'info_dict': {
- 'id': 'rQxZvXQ4ROaSOqq-or2Mow-3',
- 'ext': 'flv',
- 'title': 'Rick and Morty - Pilot Part 4',
- 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
- },
- },
- ],
'info_dict': {
'id': 'rQxZvXQ4ROaSOqq-or2Mow',
+ 'ext': 'mp4',
'title': 'Rick and Morty - Pilot',
- 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
+ 'description': 'Rick moves in with his daughter\'s family and establishes himself as a bad influence on his grandson, Morty.',
+ 'timestamp': 1493267400,
+ 'upload_date': '20170427',
},
- 'skip': 'This video is only available for registered users',
- }, {
- 'url': 'http://www.adultswim.com/videos/playlists/american-parenting/putting-francine-out-of-business/',
- 'playlist': [
- {
- 'md5': '2eb5c06d0f9a1539da3718d897f13ec5',
- 'info_dict': {
- 'id': '-t8CamQlQ2aYZ49ItZCFog-0',
- 'ext': 'flv',
- 'title': 'American Dad - Putting Francine Out of Business',
- 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].'
- },
- }
- ],
- 'info_dict': {
- 'id': '-t8CamQlQ2aYZ49ItZCFog',
- 'title': 'American Dad - Putting Francine Out of Business',
- 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].'
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
},
+ 'expected_warnings': ['Unable to download f4m manifest'],
}, {
'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/',
- 'playlist': [
- {
- 'md5': '3e346a2ab0087d687a05e1e7f3b3e529',
- 'info_dict': {
- 'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0',
- 'ext': 'mp4',
- 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',
- 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n',
- },
- }
- ],
'info_dict': {
'id': 'sY3cMUR_TbuE4YmdjzbIcQ',
+ 'ext': 'mp4',
'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',
- 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n',
+ 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.',
+ 'upload_date': '20080124',
+ 'timestamp': 1201150800,
},
'params': {
# m3u8 download
'skip_download': True,
- }
+ },
}, {
- # heroMetadata.trailer
'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/',
'info_dict': {
'id': 'I0LQFQkaSUaFp8PnAWHhoQ',
'ext': 'mp4',
'title': 'Decker - Inside Decker: A New Hero',
- 'description': 'md5:c916df071d425d62d70c86d4399d3ee0',
- 'duration': 249.008,
+ 'description': 'The guys recap the conclusion of the season. They announce a new hero, take a peek into the Victorville Film Archive and welcome back the talented James Dean.',
+ 'timestamp': 1469480460,
+ 'upload_date': '20160725',
},
'params': {
# m3u8 download
@@ -97,136 +58,102 @@ class AdultSwimIE(TurnerBaseIE):
},
'expected_warnings': ['Unable to download f4m manifest'],
}, {
- 'url': 'http://www.adultswim.com/videos/toonami/friday-october-14th-2016/',
+ 'url': 'http://www.adultswim.com/videos/attack-on-titan',
+ 'info_dict': {
+ 'id': 'b7A69dzfRzuaXIECdxW8XQ',
+ 'title': 'Attack on Titan',
+ 'description': 'md5:6c8e003ea0777b47013e894767f5e114',
+ },
+ 'playlist_mincount': 12,
+ }, {
+ 'url': 'http://www.adultswim.com/videos/streams/williams-stream',
'info_dict': {
- 'id': 'eYiLsKVgQ6qTC6agD67Sig',
- 'title': 'Toonami - Friday, October 14th, 2016',
- 'description': 'md5:99892c96ffc85e159a428de85c30acde',
+ 'id': 'd8DEBj7QRfetLsRgFnGEyg',
+ 'ext': 'mp4',
+ 'title': r're:^Williams Stream \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+ 'description': 'original programming',
},
- 'playlist': [{
- 'md5': '',
- 'info_dict': {
- 'id': 'eYiLsKVgQ6qTC6agD67Sig',
- 'ext': 'mp4',
- 'title': 'Toonami - Friday, October 14th, 2016',
- 'description': 'md5:99892c96ffc85e159a428de85c30acde',
- },
- }],
'params': {
# m3u8 download
'skip_download': True,
},
- 'expected_warnings': ['Unable to download f4m manifest'],
}]
- @staticmethod
- def find_video_info(collection, slug):
- for video in collection.get('videos'):
- if video.get('slug') == slug:
- return video
-
- @staticmethod
- def find_collection_by_linkURL(collections, linkURL):
- for collection in collections:
- if collection.get('linkURL') == linkURL:
- return collection
-
- @staticmethod
- def find_collection_containing_video(collections, slug):
- for collection in collections:
- for video in collection.get('videos'):
- if video.get('slug') == slug:
- return collection, video
- return None, None
-
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- show_path = mobj.group('show_path')
- episode_path = mobj.group('episode_path')
- is_playlist = True if mobj.group('is_playlist') else False
-
- webpage = self._download_webpage(url, episode_path)
-
- # Extract the value of `bootstrappedData` from the Javascript in the page.
- bootstrapped_data = self._parse_json(self._search_regex(
- r'var bootstrappedData = ({.*});', webpage, 'bootstraped data'), episode_path)
-
- # Downloading videos from a /videos/playlist/ URL needs to be handled differently.
- # NOTE: We are only downloading one video (the current one) not the playlist
- if is_playlist:
- collections = bootstrapped_data['playlists']['collections']
- collection = self.find_collection_by_linkURL(collections, show_path)
- video_info = self.find_video_info(collection, episode_path)
-
- show_title = video_info['showTitle']
- segment_ids = [video_info['videoPlaybackID']]
+ show_path, episode_path = re.match(self._VALID_URL, url).groups()
+ display_id = episode_path or show_path
+ webpage = self._download_webpage(url, display_id)
+ initial_data = self._parse_json(self._search_regex(
+ r'AS_INITIAL_DATA(?:__)?\s*=\s*({.+?});',
+ webpage, 'initial data'), display_id)
+
+ is_stream = show_path == 'streams'
+ if is_stream:
+ if not episode_path:
+ episode_path = 'live-stream'
+
+ video_data = next(stream for stream_path, stream in initial_data['streams'].items() if stream_path == episode_path)
+ video_id = video_data.get('stream')
+
+ if not video_id:
+ entries = []
+ for episode in video_data.get('archiveEpisodes', []):
+ episode_url = episode.get('url')
+ if not episode_url:
+ continue
+ entries.append(self.url_result(
+ episode_url, 'AdultSwim', episode.get('id')))
+ return self.playlist_result(
+ entries, video_data.get('id'), video_data.get('title'),
+ strip_or_none(video_data.get('description')))
else:
- collections = bootstrapped_data['show']['collections']
- collection, video_info = self.find_collection_containing_video(collections, episode_path)
- # Video wasn't found in the collections, let's try `slugged_video`.
- if video_info is None:
- if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path:
- video_info = bootstrapped_data['slugged_video']
- if not video_info:
- video_info = bootstrapped_data.get(
- 'heroMetadata', {}).get('trailer', {}).get('video')
- if not video_info:
- video_info = bootstrapped_data.get('onlineOriginals', [None])[0]
- if not video_info:
- raise ExtractorError('Unable to find video info')
-
- show = bootstrapped_data['show']
- show_title = show['title']
- stream = video_info.get('stream')
- if stream and stream.get('videoPlaybackID'):
- segment_ids = [stream['videoPlaybackID']]
- elif video_info.get('clips'):
- segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']]
- elif video_info.get('videoPlaybackID'):
- segment_ids = [video_info['videoPlaybackID']]
- elif video_info.get('id'):
- segment_ids = [video_info['id']]
- else:
- if video_info.get('auth') is True:
- raise ExtractorError(
- 'This video is only available via cable service provider subscription that'
- ' is not currently supported. You may want to use --cookies.', expected=True)
- else:
- raise ExtractorError('Unable to find stream or clips')
-
- episode_id = video_info['id']
- episode_title = video_info['title']
- episode_description = video_info.get('description')
- episode_duration = int_or_none(video_info.get('duration'))
- view_count = int_or_none(video_info.get('views'))
+ show_data = initial_data['show']
+
+ if not episode_path:
+ entries = []
+ for video in show_data.get('videos', []):
+ slug = video.get('slug')
+ if not slug:
+ continue
+ entries.append(self.url_result(
+ 'http://adultswim.com/videos/%s/%s' % (show_path, slug),
+ 'AdultSwim', video.get('id')))
+ return self.playlist_result(
+ entries, show_data.get('id'), show_data.get('title'),
+ strip_or_none(show_data.get('metadata', {}).get('description')))
+
+ video_data = show_data['sluggedVideo']
+ video_id = video_data['id']
+
+ info = self._extract_cvp_info(
+ 'http://www.adultswim.com/videos/api/v0/assets?platform=desktop&id=' + video_id,
+ video_id, {
+ 'secure': {
+ 'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big',
+ 'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do',
+ },
+ }, {
+ 'url': url,
+ 'site_name': 'AdultSwim',
+ 'auth_required': video_data.get('auth'),
+ })
- entries = []
- for part_num, segment_id in enumerate(segment_ids):
- segement_info = self._extract_cvp_info(
- 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id,
- segment_id, {
- 'secure': {
- 'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big',
- 'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do',
- },
- })
- segment_title = '%s - %s' % (show_title, episode_title)
- if len(segment_ids) > 1:
- segment_title += ' Part %d' % (part_num + 1)
- segement_info.update({
- 'id': segment_id,
- 'title': segment_title,
- 'description': episode_description,
+ info.update({
+ 'id': video_id,
+ 'display_id': display_id,
+ 'description': info.get('description') or strip_or_none(video_data.get('description')),
+ })
+ if not is_stream:
+ info.update({
+ 'duration': info.get('duration') or int_or_none(video_data.get('duration')),
+ 'timestamp': info.get('timestamp') or int_or_none(video_data.get('launch_date')),
+ 'season_number': info.get('season_number') or int_or_none(video_data.get('season_number')),
+ 'episode': info['title'],
+ 'episode_number': info.get('episode_number') or int_or_none(video_data.get('episode_number')),
})
- entries.append(segement_info)
- return {
- '_type': 'playlist',
- 'id': episode_id,
- 'display_id': episode_path,
- 'entries': entries,
- 'title': '%s - %s' % (show_title, episode_title),
- 'description': episode_description,
- 'duration': episode_duration,
- 'view_count': view_count,
- }
+ info['series'] = video_data.get('collection_title') or info.get('series')
+ if info['series'] and info['series'] != info['title']:
+ info['title'] = '%s - %s' % (info['series'], info['title'])
+
+ return info
diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py
index dd96a47..2dcdba9 100644
--- a/youtube_dl/extractor/aenetworks.py
+++ b/youtube_dl/extractor/aenetworks.py
@@ -23,7 +23,19 @@ class AENetworksBaseIE(ThePlatformIE):
class AENetworksIE(AENetworksBaseIE):
IE_NAME = 'aenetworks'
IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network'
- _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|aetv|mylifetime|lifetimemovieclub)\.com|fyi\.tv)/(?:shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})|movies/(?P<movie_display_id>[^/]+)(?:/full-movie)?)'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:www\.)?
+ (?P<domain>
+ (?:history|aetv|mylifetime|lifetimemovieclub)\.com|
+ fyi\.tv
+ )/
+ (?:
+ shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})|
+ movies/(?P<movie_display_id>[^/]+)(?:/full-movie)?|
+ specials/(?P<special_display_id>[^/]+)/full-special
+ )
+ '''
_TESTS = [{
'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1',
'md5': 'a97a65f7e823ae10e9244bc5433d5fe6',
@@ -65,6 +77,9 @@ class AENetworksIE(AENetworksBaseIE):
}, {
'url': 'https://www.lifetimemovieclub.com/movies/a-killer-among-us',
'only_matching': True
+ }, {
+ 'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special',
+ 'only_matching': True
}]
_DOMAIN_TO_REQUESTOR_ID = {
'history.com': 'HISTORY',
@@ -75,8 +90,8 @@ class AENetworksIE(AENetworksBaseIE):
}
def _real_extract(self, url):
- domain, show_path, movie_display_id = re.match(self._VALID_URL, url).groups()
- display_id = show_path or movie_display_id
+ domain, show_path, movie_display_id, special_display_id = re.match(self._VALID_URL, url).groups()
+ display_id = show_path or movie_display_id or special_display_id
webpage = self._download_webpage(url, display_id)
if show_path:
url_parts = show_path.split('/')
@@ -86,10 +101,14 @@ class AENetworksIE(AENetworksBaseIE):
for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage):
entries.append(self.url_result(
compat_urlparse.urljoin(url, season_url_path), 'AENetworks'))
- return self.playlist_result(
- entries, self._html_search_meta('aetn:SeriesId', webpage),
- self._html_search_meta('aetn:SeriesTitle', webpage))
- elif url_parts_len == 2:
+ if entries:
+ return self.playlist_result(
+ entries, self._html_search_meta('aetn:SeriesId', webpage),
+ self._html_search_meta('aetn:SeriesTitle', webpage))
+ else:
+ # single season
+ url_parts_len = 2
+ if url_parts_len == 2:
entries = []
for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage):
episode_attributes = extract_attributes(episode_item)
@@ -97,7 +116,7 @@ class AENetworksIE(AENetworksBaseIE):
url, episode_attributes['data-canonical'])
entries.append(self.url_result(
episode_url, 'AENetworks',
- episode_attributes['data-videoid']))
+ episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id')))
return self.playlist_result(
entries, self._html_search_meta('aetn:SeasonId', webpage))
@@ -107,7 +126,10 @@ class AENetworksIE(AENetworksBaseIE):
}
video_id = self._html_search_meta('aetn:VideoID', webpage)
media_url = self._search_regex(
- r"media_url\s*=\s*'([^']+)'", webpage, 'video url')
+ [r"media_url\s*=\s*'(?P<url>[^']+)'",
+ r'data-media-url=(?P<url>(?:https?:)?//[^\s>]+)',
+ r'data-media-url=(["\'])(?P<url>(?:(?!\1).)+?)\1'],
+ webpage, 'video url', group='url')
theplatform_metadata = self._download_theplatform_metadata(self._search_regex(
r'https?://link.theplatform.com/s/([^?]+)', media_url, 'theplatform_path'), video_id)
info = self._parse_theplatform_metadata(theplatform_metadata)
diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py
index b774d6d..c8cb91d 100644
--- a/youtube_dl/extractor/afreecatv.py
+++ b/youtube_dl/extractor/afreecatv.py
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..compat import compat_xpath
from ..utils import (
+ determine_ext,
ExtractorError,
int_or_none,
xpath_text,
@@ -72,13 +73,70 @@ class AfreecaTVIE(InfoExtractor):
'url': 'http://vod.afreecatv.com/PLAYER/STATION/18650793',
'info_dict': {
'id': '18650793',
- 'ext': 'flv',
+ 'ext': 'mp4',
+ 'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!',
+ 'thumbnail': r're:^https?://.*\.jpg$',
'uploader': '윈아디',
'uploader_id': 'badkids',
- 'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!',
+ 'duration': 107,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://vod.afreecatv.com/PLAYER/STATION/10481652',
+ 'info_dict': {
+ 'id': '10481652',
+ 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
+ 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
+ 'uploader': 'dailyapril',
+ 'uploader_id': 'dailyapril',
+ 'duration': 6492,
+ },
+ 'playlist_count': 2,
+ 'playlist': [{
+ 'md5': 'd8b7c174568da61d774ef0203159bf97',
+ 'info_dict': {
+ 'id': '20160502_c4c62b9d_174361386_1',
+ 'ext': 'mp4',
+ 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 1)",
+ 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
+ 'uploader': 'dailyapril',
+ 'uploader_id': 'dailyapril',
+ 'upload_date': '20160502',
+ 'duration': 3601,
+ },
+ }, {
+ 'md5': '58f2ce7f6044e34439ab2d50612ab02b',
+ 'info_dict': {
+ 'id': '20160502_39e739bb_174361386_2',
+ 'ext': 'mp4',
+ 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 2)",
+ 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
+ 'uploader': 'dailyapril',
+ 'uploader_id': 'dailyapril',
+ 'upload_date': '20160502',
+ 'duration': 2891,
+ },
+ }],
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # non standard key
+ 'url': 'http://vod.afreecatv.com/PLAYER/STATION/20515605',
+ 'info_dict': {
+ 'id': '20170411_BE689A0E_190960999_1_2_h',
+ 'ext': 'mp4',
+ 'title': '혼자사는여자집',
+ 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
+ 'uploader': '♥이슬이',
+ 'uploader_id': 'dasl8121',
+ 'upload_date': '20170411',
+ 'duration': 213,
},
'params': {
- 'skip_download': True, # requires rtmpdump
+ 'skip_download': True,
},
}, {
'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652',
@@ -94,7 +152,7 @@ class AfreecaTVIE(InfoExtractor):
m = re.match(r'^(?P<upload_date>\d{8})_\w+_(?P<part>\d+)$', key)
if m:
video_key['upload_date'] = m.group('upload_date')
- video_key['part'] = m.group('part')
+ video_key['part'] = int(m.group('part'))
return video_key
def _real_extract(self, url):
@@ -109,23 +167,64 @@ class AfreecaTVIE(InfoExtractor):
raise ExtractorError('Specified AfreecaTV video does not exist',
expected=True)
- video_url_raw = video_element.text
-
- app, playpath = video_url_raw.split('mp4:')
+ video_url = video_element.text.strip()
title = xpath_text(video_xml, './track/title', 'title', fatal=True)
+
uploader = xpath_text(video_xml, './track/nickname', 'uploader')
uploader_id = xpath_text(video_xml, './track/bj_id', 'uploader id')
- duration = int_or_none(xpath_text(video_xml, './track/duration',
- 'duration'))
+ duration = int_or_none(xpath_text(
+ video_xml, './track/duration', 'duration'))
thumbnail = xpath_text(video_xml, './track/titleImage', 'thumbnail')
- return {
+ common_entry = {
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'thumbnail': thumbnail,
+ }
+
+ info = common_entry.copy()
+ info.update({
+ 'id': video_id,
+ 'title': title,
+ 'duration': duration,
+ })
+
+ if not video_url:
+ entries = []
+ file_elements = video_element.findall(compat_xpath('./file'))
+ one = len(file_elements) == 1
+ for file_num, file_element in enumerate(file_elements, start=1):
+ file_url = file_element.text
+ if not file_url:
+ continue
+ key = file_element.get('key', '')
+ upload_date = self._search_regex(
+ r'^(\d{8})_', key, 'upload date', default=None)
+ file_duration = int_or_none(file_element.get('duration'))
+ format_id = key if key else '%s_%s' % (video_id, file_num)
+ formats = self._extract_m3u8_formats(
+ file_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls',
+ note='Downloading part %d m3u8 information' % file_num)
+ file_info = common_entry.copy()
+ file_info.update({
+ 'id': format_id,
+ 'title': title if one else '%s (part %d)' % (title, file_num),
+ 'upload_date': upload_date,
+ 'duration': file_duration,
+ 'formats': formats,
+ })
+ entries.append(file_info)
+ entries_info = info.copy()
+ entries_info.update({
+ '_type': 'multi_video',
+ 'entries': entries,
+ })
+ return entries_info
+
+ info = {
'id': video_id,
- 'url': app,
- 'ext': 'flv',
- 'play_path': 'mp4:' + playpath,
- 'rtmp_live': True, # downloading won't end without this
'title': title,
'uploader': uploader,
'uploader_id': uploader_id,
@@ -133,6 +232,21 @@ class AfreecaTVIE(InfoExtractor):
'thumbnail': thumbnail,
}
+ if determine_ext(video_url) == 'm3u8':
+ info['formats'] = self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls')
+ else:
+ app, playpath = video_url.split('mp4:')
+ info.update({
+ 'url': app,
+ 'ext': 'flv',
+ 'play_path': 'mp4:' + playpath,
+ 'rtmp_live': True, # downloading won't end without this
+ })
+
+ return info
+
class AfreecaTVGlobalIE(AfreecaTVIE):
IE_NAME = 'afreecatv:global'
diff --git a/youtube_dl/extractor/airmozilla.py b/youtube_dl/extractor/airmozilla.py
index 0e06918..9e38136 100644
--- a/youtube_dl/extractor/airmozilla.py
+++ b/youtube_dl/extractor/airmozilla.py
@@ -15,12 +15,12 @@ class AirMozillaIE(InfoExtractor):
_VALID_URL = r'https?://air\.mozilla\.org/(?P<id>[0-9a-z-]+)/?'
_TEST = {
'url': 'https://air.mozilla.org/privacy-lab-a-meetup-for-privacy-minded-people-in-san-francisco/',
- 'md5': '2e3e7486ba5d180e829d453875b9b8bf',
+ 'md5': '8d02f53ee39cf006009180e21df1f3ba',
'info_dict': {
'id': '6x4q2w',
'ext': 'mp4',
'title': 'Privacy Lab - a meetup for privacy minded people in San Francisco',
- 'thumbnail': r're:https?://vid\.ly/(?P<id>[0-9a-z-]+)/poster',
+ 'thumbnail': r're:https?://.*/poster\.jpg',
'description': 'Brings together privacy professionals and others interested in privacy at for-profits, non-profits, and NGOs in an effort to contribute to the state of the ecosystem...',
'timestamp': 1422487800,
'upload_date': '20150128',
@@ -34,21 +34,13 @@ class AirMozillaIE(InfoExtractor):
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- video_id = self._html_search_regex(r'//vid.ly/(.*?)/embed', webpage, 'id')
+ video_id = self._html_search_regex(r'//vid\.ly/(.*?)/embed', webpage, 'id')
embed_script = self._download_webpage('https://vid.ly/{0}/embed'.format(video_id), video_id)
- jwconfig = self._search_regex(r'\svar jwconfig = (\{.*?\});\s', embed_script, 'metadata')
- metadata = self._parse_json(jwconfig, video_id)
-
- formats = [{
- 'url': source['file'],
- 'ext': source['type'],
- 'format_id': self._search_regex(r'&format=(.*)$', source['file'], 'video format'),
- 'format': source['label'],
- 'height': int(source['label'].rstrip('p')),
- } for source in metadata['playlist'][0]['sources']]
- self._sort_formats(formats)
+ jwconfig = self._parse_json(self._search_regex(
+ r'initCallback\((.*)\);', embed_script, 'metadata'), video_id)['config']
+ info_dict = self._parse_jwplayer_data(jwconfig, video_id)
view_count = int_or_none(self._html_search_regex(
r'Views since archived: ([0-9]+)',
webpage, 'view count', fatal=False))
@@ -58,17 +50,17 @@ class AirMozillaIE(InfoExtractor):
r'Duration:\s*(\d+\s*hours?\s*\d+\s*minutes?)',
webpage, 'duration', fatal=False))
- return {
+ info_dict.update({
'id': video_id,
'title': self._og_search_title(webpage),
- 'formats': formats,
'url': self._og_search_url(webpage),
'display_id': display_id,
- 'thumbnail': metadata['playlist'][0].get('image'),
'description': self._og_search_description(webpage),
'timestamp': timestamp,
'location': self._html_search_regex(r'Location: (.*)', webpage, 'location', default=None),
'duration': duration,
'view_count': view_count,
'categories': re.findall(r'<a href=".*?" class="channel">(.*?)</a>', webpage),
- }
+ })
+
+ return info_dict
diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py
index 388e578..c68be31 100644
--- a/youtube_dl/extractor/aljazeera.py
+++ b/youtube_dl/extractor/aljazeera.py
@@ -4,9 +4,9 @@ from .common import InfoExtractor
class AlJazeeraIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/programmes/.*?/(?P<id>[^/]+)\.html'
+ _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P<id>[^/]+)\.html'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html',
'info_dict': {
'id': '3792260579001',
@@ -19,7 +19,10 @@ class AlJazeeraIE(InfoExtractor):
},
'add_ie': ['BrightcoveNew'],
'skip': 'Not accessible from Travis CI server',
- }
+ }, {
+ 'url': 'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html',
+ 'only_matching': True,
+ }]
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s'
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py
index 90f11d3..cd533ac 100644
--- a/youtube_dl/extractor/allocine.py
+++ b/youtube_dl/extractor/allocine.py
@@ -2,9 +2,13 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
- remove_end,
+ int_or_none,
qualities,
+ remove_end,
+ try_get,
+ unified_timestamp,
url_basename,
)
@@ -22,6 +26,10 @@ class AllocineIE(InfoExtractor):
'title': 'Astérix - Le Domaine des Dieux Teaser VF',
'description': 'md5:4a754271d9c6f16c72629a8a993ee884',
'thumbnail': r're:http://.*\.jpg',
+ 'duration': 39,
+ 'timestamp': 1404273600,
+ 'upload_date': '20140702',
+ 'view_count': int,
},
}, {
'url': 'http://www.allocine.fr/video/player_gen_cmedia=19540403&cfilm=222257.html',
@@ -33,6 +41,10 @@ class AllocineIE(InfoExtractor):
'title': 'Planes 2 Bande-annonce VF',
'description': 'Regardez la bande annonce du film Planes 2 (Planes 2 Bande-annonce VF). Planes 2, un film de Roberts Gannaway',
'thumbnail': r're:http://.*\.jpg',
+ 'duration': 69,
+ 'timestamp': 1385659800,
+ 'upload_date': '20131128',
+ 'view_count': int,
},
}, {
'url': 'http://www.allocine.fr/video/player_gen_cmedia=19544709&cfilm=181290.html',
@@ -44,6 +56,10 @@ class AllocineIE(InfoExtractor):
'title': 'Dragons 2 - Bande annonce finale VF',
'description': 'md5:6cdd2d7c2687d4c6aafe80a35e17267a',
'thumbnail': r're:http://.*\.jpg',
+ 'duration': 144,
+ 'timestamp': 1397589900,
+ 'upload_date': '20140415',
+ 'view_count': int,
},
}, {
'url': 'http://www.allocine.fr/video/video-19550147/',
@@ -69,34 +85,37 @@ class AllocineIE(InfoExtractor):
r'data-model="([^"]+)"', webpage, 'data model', default=None)
if model:
model_data = self._parse_json(model, display_id)
-
- for video_url in model_data['sources'].values():
+ video = model_data['videos'][0]
+ title = video['title']
+ for video_url in video['sources'].values():
video_id, format_id = url_basename(video_url).split('_')[:2]
formats.append({
'format_id': format_id,
'quality': quality(format_id),
'url': video_url,
})
-
- title = model_data['title']
+ duration = int_or_none(video.get('duration'))
+ view_count = int_or_none(video.get('view_count'))
+ timestamp = unified_timestamp(try_get(
+ video, lambda x: x['added_at']['date'], compat_str))
else:
video_id = display_id
media_data = self._download_json(
'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id)
+ title = remove_end(
+ self._html_search_regex(
+ r'(?s)<title>(.+?)</title>', webpage, 'title').strip(),
+ ' - AlloCiné')
for key, value in media_data['video'].items():
if not key.endswith('Path'):
continue
-
format_id = key[:-len('Path')]
formats.append({
'format_id': format_id,
'quality': quality(format_id),
'url': value,
})
-
- title = remove_end(self._html_search_regex(
- r'(?s)<title>(.+?)</title>', webpage, 'title'
- ).strip(), ' - AlloCiné')
+ duration, view_count, timestamp = [None] * 3
self._sort_formats(formats)
@@ -104,7 +123,10 @@ class AllocineIE(InfoExtractor):
'id': video_id,
'display_id': display_id,
'title': title,
+ 'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'view_count': view_count,
'formats': formats,
- 'description': self._og_search_description(webpage),
}
diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py
index e8e4012..fde1a8f 100644
--- a/youtube_dl/extractor/amp.py
+++ b/youtube_dl/extractor/amp.py
@@ -7,15 +7,19 @@ from ..utils import (
parse_iso8601,
mimetype2ext,
determine_ext,
+ ExtractorError,
)
class AMPIE(InfoExtractor):
# parse Akamai Adaptive Media Player feed
def _extract_feed_info(self, url):
- item = self._download_json(
+ feed = self._download_json(
url, None, 'Downloading Akamai AMP feed',
- 'Unable to download Akamai AMP feed')['channel']['item']
+ 'Unable to download Akamai AMP feed')
+ item = feed.get('channel', {}).get('item')
+ if not item:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, feed['error']))
video_id = item['guid']
@@ -30,9 +34,12 @@ class AMPIE(InfoExtractor):
if isinstance(media_thumbnail, dict):
media_thumbnail = [media_thumbnail]
for thumbnail_data in media_thumbnail:
- thumbnail = thumbnail_data['@attributes']
+ thumbnail = thumbnail_data.get('@attributes', {})
+ thumbnail_url = thumbnail.get('url')
+ if not thumbnail_url:
+ continue
thumbnails.append({
- 'url': self._proto_relative_url(thumbnail['url'], 'http:'),
+ 'url': self._proto_relative_url(thumbnail_url, 'http:'),
'width': int_or_none(thumbnail.get('width')),
'height': int_or_none(thumbnail.get('height')),
})
@@ -43,9 +50,14 @@ class AMPIE(InfoExtractor):
if isinstance(media_subtitle, dict):
media_subtitle = [media_subtitle]
for subtitle_data in media_subtitle:
- subtitle = subtitle_data['@attributes']
- lang = subtitle.get('lang') or 'en'
- subtitles[lang] = [{'url': subtitle['href']}]
+ subtitle = subtitle_data.get('@attributes', {})
+ subtitle_href = subtitle.get('href')
+ if not subtitle_href:
+ continue
+ subtitles.setdefault(subtitle.get('lang') or 'en', []).append({
+ 'url': subtitle_href,
+ 'ext': mimetype2ext(subtitle.get('type')) or determine_ext(subtitle_href),
+ })
formats = []
media_content = get_media_node('content')
diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py
index 623f44d..8023da7 100644
--- a/youtube_dl/extractor/anvato.py
+++ b/youtube_dl/extractor/anvato.py
@@ -5,6 +5,7 @@ import base64
import hashlib
import json
import random
+import re
import time
from .common import InfoExtractor
@@ -16,6 +17,7 @@ from ..utils import (
intlist_to_bytes,
int_or_none,
strip_jsonp,
+ unescapeHTML,
)
@@ -26,6 +28,8 @@ def md5_text(s):
class AnvatoIE(InfoExtractor):
+ _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)'
+
# Copied from anvplayer.min.js
_ANVACK_TABLE = {
'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
@@ -114,6 +118,22 @@ class AnvatoIE(InfoExtractor):
'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ'
}
+ _MCP_TO_ACCESS_KEY_TABLE = {
+ 'qa': 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922',
+ 'lin': 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749',
+ 'univison': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
+ 'uni': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
+ 'dev': 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a',
+ 'sps': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
+ 'spsstg': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
+ 'anv': 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3',
+ 'gray': 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900',
+ 'hearst': 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99',
+ 'cbs': 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe',
+ 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582'
+ }
+
+ _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
_AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'
def __init__(self, *args, **kwargs):
@@ -178,12 +198,7 @@ class AnvatoIE(InfoExtractor):
}
if ext == 'm3u8' or media_format in ('m3u8', 'm3u8-variant'):
- # Not using _extract_m3u8_formats here as individual media
- # playlists are also included in published_urls.
- if tbr is None:
- formats.append(self._m3u8_meta_format(video_url, ext='mp4', m3u8_id='hls'))
- continue
- else:
+ if tbr is not None:
a_format.update({
'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])),
'ext': 'mp4',
@@ -222,9 +237,42 @@ class AnvatoIE(InfoExtractor):
'subtitles': subtitles,
}
+ @staticmethod
+ def _extract_urls(ie, webpage, video_id):
+ entries = []
+ for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage):
+ anvplayer_data = ie._parse_json(
+ mobj.group('anvp'), video_id, transform_source=unescapeHTML,
+ fatal=False)
+ if not anvplayer_data:
+ continue
+ video = anvplayer_data.get('video')
+ if not isinstance(video, compat_str) or not video.isdigit():
+ continue
+ access_key = anvplayer_data.get('accessKey')
+ if not access_key:
+ mcp = anvplayer_data.get('mcp')
+ if mcp:
+ access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get(
+ mcp.lower())
+ if not access_key:
+ continue
+ entries.append(ie.url_result(
+ 'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(),
+ video_id=video))
+ return entries
+
def _extract_anvato_videos(self, webpage, video_id):
- anvplayer_data = self._parse_json(self._html_search_regex(
- r'<script[^>]+data-anvp=\'([^\']+)\'', webpage,
- 'Anvato player data'), video_id)
+ anvplayer_data = self._parse_json(
+ self._html_search_regex(
+ self._ANVP_RE, webpage, 'Anvato player data', group='anvp'),
+ video_id)
return self._get_anvato_videos(
anvplayer_data['accessKey'], anvplayer_data['video'])
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ access_key, video_id = mobj.group('access_key_or_mcp', 'id')
+ if access_key not in self._ANVACK_TABLE:
+ access_key = self._MCP_TO_ACCESS_KEY_TABLE[access_key]
+ return self._get_anvato_videos(access_key, video_id)
diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py
index ea7a703..a84b8b1 100644
--- a/youtube_dl/extractor/appleconnect.py
+++ b/youtube_dl/extractor/appleconnect.py
@@ -12,13 +12,13 @@ class AppleConnectIE(InfoExtractor):
_VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)'
_TEST = {
'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
- 'md5': '10d0f2799111df4cb1c924520ca78f98',
+ 'md5': 'e7c38568a01ea45402570e6029206723',
'info_dict': {
'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
'ext': 'm4v',
'title': 'Energy',
'uploader': 'Drake',
- 'thumbnail': 'http://is5.mzstatic.com/image/thumb/Video5/v4/78/61/c5/7861c5fa-ad6d-294b-1464-cf7605b911d6/source/1920x1080sr.jpg',
+ 'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20150710',
'timestamp': 1436545535,
},
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index a6801f3..b45b431 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -70,7 +70,8 @@ class AppleTrailersIE(InfoExtractor):
}, {
'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/',
'info_dict': {
- 'id': 'blackthorn',
+ 'id': '4489',
+ 'title': 'Blackthorn',
},
'playlist_mincount': 2,
'expected_warnings': ['Unable to download JSON metadata'],
@@ -261,7 +262,7 @@ class AppleTrailersSectionIE(InfoExtractor):
'title': 'Most Popular',
'id': 'mostpopular',
},
- 'playlist_mincount': 80,
+ 'playlist_mincount': 30,
}, {
'url': 'http://trailers.apple.com/#section=moviestudios',
'info_dict': {
diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py
index e21045b..3c7d725 100644
--- a/youtube_dl/extractor/archiveorg.py
+++ b/youtube_dl/extractor/archiveorg.py
@@ -24,12 +24,12 @@ class ArchiveOrgIE(InfoExtractor):
}
}, {
'url': 'https://archive.org/details/Cops1922',
- 'md5': 'bc73c8ab3838b5a8fc6c6651fa7b58ba',
+ 'md5': '0869000b4ce265e8ca62738b336b268a',
'info_dict': {
'id': 'Cops1922',
'ext': 'mp4',
'title': 'Buster Keaton\'s "Cops" (1922)',
- 'description': 'md5:b4544662605877edd99df22f9620d858',
+ 'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6',
}
}, {
'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 69a23e8..56baef2 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -180,7 +180,7 @@ class ArteTVBaseIE(InfoExtractor):
class ArteTVPlus7IE(ArteTVBaseIE):
IE_NAME = 'arte.tv:+7'
- _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/[^/]+/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/(?:[^/]+/)?(?P<lang>fr|de|en|es)/(?:videos/)?(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D',
@@ -188,6 +188,9 @@ class ArteTVPlus7IE(ArteTVBaseIE):
}, {
'url': 'http://sites.arte.tv/karambolage/de/video/karambolage-22',
'only_matching': True,
+ }, {
+ 'url': 'http://www.arte.tv/de/videos/048696-000-A/der-kluge-bauch-unser-zweites-gehirn',
+ 'only_matching': True,
}]
@classmethod
diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py
index 99af6dc..01fa308 100644
--- a/youtube_dl/extractor/atresplayer.py
+++ b/youtube_dl/extractor/atresplayer.py
@@ -36,7 +36,7 @@ class AtresPlayerIE(InfoExtractor):
},
{
'url': 'http://www.atresplayer.com/television/especial/videoencuentros/temporada-1/capitulo-112-david-bustamante_2014121600375.html',
- 'md5': '0d0e918533bbd4b263f2de4d197d4aac',
+ 'md5': '6e52cbb513c405e403dbacb7aacf8747',
'info_dict': {
'id': 'capitulo-112-david-bustamante',
'ext': 'flv',
diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py
index 8fc5f65..e48bb89 100644
--- a/youtube_dl/extractor/audioboom.py
+++ b/youtube_dl/extractor/audioboom.py
@@ -16,7 +16,7 @@ class AudioBoomIE(InfoExtractor):
'title': '3/09/2016 Czaban Hour 3',
'description': 'Guest: Nate Davis - NFL free agency, Guest: Stan Gans',
'duration': 2245.72,
- 'uploader': 'Steve Czaban',
+ 'uploader': 'SB Nation A.M.',
'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio',
}
}, {
diff --git a/youtube_dl/extractor/azubu.py b/youtube_dl/extractor/azubu.py
deleted file mode 100644
index 3ba2f00..0000000
--- a/youtube_dl/extractor/azubu.py
+++ /dev/null
@@ -1,140 +0,0 @@
-from __future__ import unicode_literals
-
-import json
-
-from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- float_or_none,
- sanitized_Request,
-)
-
-
-class AzubuIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?azubu\.(?:tv|uol.com.br)/[^/]+#!/play/(?P<id>\d+)'
- _TESTS = [
- {
- 'url': 'http://www.azubu.tv/GSL#!/play/15575/2014-hot6-cup-last-big-match-ro8-day-1',
- 'md5': 'a88b42fcf844f29ad6035054bd9ecaf4',
- 'info_dict': {
- 'id': '15575',
- 'ext': 'mp4',
- 'title': '2014 HOT6 CUP LAST BIG MATCH Ro8 Day 1',
- 'description': 'md5:d06bdea27b8cc4388a90ad35b5c66c01',
- 'thumbnail': r're:^https?://.*\.jpe?g',
- 'timestamp': 1417523507.334,
- 'upload_date': '20141202',
- 'duration': 9988.7,
- 'uploader': 'GSL',
- 'uploader_id': 414310,
- 'view_count': int,
- },
- },
- {
- 'url': 'http://www.azubu.tv/FnaticTV#!/play/9344/-fnatic-at-worlds-2014:-toyz---%22i-love-rekkles,-he-has-amazing-mechanics%22-',
- 'md5': 'b72a871fe1d9f70bd7673769cdb3b925',
- 'info_dict': {
- 'id': '9344',
- 'ext': 'mp4',
- 'title': 'Fnatic at Worlds 2014: Toyz - "I love Rekkles, he has amazing mechanics"',
- 'description': 'md5:4a649737b5f6c8b5c5be543e88dc62af',
- 'thumbnail': r're:^https?://.*\.jpe?g',
- 'timestamp': 1410530893.320,
- 'upload_date': '20140912',
- 'duration': 172.385,
- 'uploader': 'FnaticTV',
- 'uploader_id': 272749,
- 'view_count': int,
- },
- 'skip': 'Channel offline',
- },
- ]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- data = self._download_json(
- 'http://www.azubu.tv/api/video/%s' % video_id, video_id)['data']
-
- title = data['title'].strip()
- description = data.get('description')
- thumbnail = data.get('thumbnail')
- view_count = data.get('view_count')
- user = data.get('user', {})
- uploader = user.get('username')
- uploader_id = user.get('id')
-
- stream_params = json.loads(data['stream_params'])
-
- timestamp = float_or_none(stream_params.get('creationDate'), 1000)
- duration = float_or_none(stream_params.get('length'), 1000)
-
- renditions = stream_params.get('renditions') or []
- video = stream_params.get('FLVFullLength') or stream_params.get('videoFullLength')
- if video:
- renditions.append(video)
-
- if not renditions and not user.get('channel', {}).get('is_live', True):
- raise ExtractorError('%s said: channel is offline.' % self.IE_NAME, expected=True)
-
- formats = [{
- 'url': fmt['url'],
- 'width': fmt['frameWidth'],
- 'height': fmt['frameHeight'],
- 'vbr': float_or_none(fmt['encodingRate'], 1000),
- 'filesize': fmt['size'],
- 'vcodec': fmt['videoCodec'],
- 'container': fmt['videoContainer'],
- } for fmt in renditions if fmt['url']]
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'timestamp': timestamp,
- 'duration': duration,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
- 'view_count': view_count,
- 'formats': formats,
- }
-
-
-class AzubuLiveIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?azubu\.(?:tv|uol.com.br)/(?P<id>[^/]+)$'
-
- _TESTS = [{
- 'url': 'http://www.azubu.tv/MarsTVMDLen',
- 'only_matching': True,
- }, {
- 'url': 'http://azubu.uol.com.br/adolfz',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- user = self._match_id(url)
-
- info = self._download_json(
- 'http://api.azubu.tv/public/modules/last-video/{0}/info'.format(user),
- user)['data']
- if info['type'] != 'STREAM':
- raise ExtractorError('{0} is not streaming live'.format(user), expected=True)
-
- req = sanitized_Request(
- 'https://edge-elb.api.brightcove.com/playback/v1/accounts/3361910549001/videos/ref:' + info['reference_id'])
- req.add_header('Accept', 'application/json;pk=BCpkADawqM1gvI0oGWg8dxQHlgT8HkdE2LnAlWAZkOlznO39bSZX726u4JqnDsK3MDXcO01JxXK2tZtJbgQChxgaFzEVdHRjaDoxaOu8hHOO8NYhwdxw9BzvgkvLUlpbDNUuDoc4E4wxDToV')
- bc_info = self._download_json(req, user)
- m3u8_url = next(source['src'] for source in bc_info['sources'] if source['container'] == 'M2TS')
- formats = self._extract_m3u8_formats(m3u8_url, user, ext='mp4')
- self._sort_formats(formats)
-
- return {
- 'id': info['id'],
- 'title': self._live_title(info['title']),
- 'uploader_id': user,
- 'formats': formats,
- 'is_live': True,
- 'thumbnail': bc_info['poster'],
- }
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index 056e063..489d0ba 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -34,12 +34,12 @@ class BandcampIE(InfoExtractor):
'_skip': 'There is a limit of 200 free downloads / month for the test song'
}, {
'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
- 'md5': '73d0b3171568232574e45652f8720b5c',
+ 'md5': '0369ace6b939f0927e62c67a1a8d9fa7',
'info_dict': {
'id': '2650410135',
- 'ext': 'mp3',
- 'title': 'Lanius (Battle)',
- 'uploader': 'Ben Prunty Music',
+ 'ext': 'aiff',
+ 'title': 'Ben Prunty - Lanius (Battle)',
+ 'uploader': 'Ben Prunty',
},
}]
@@ -47,6 +47,7 @@ class BandcampIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
title = mobj.group('title')
webpage = self._download_webpage(url, title)
+ thumbnail = self._html_search_meta('og:image', webpage, default=None)
m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
if not m_download:
m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
@@ -75,6 +76,7 @@ class BandcampIE(InfoExtractor):
return {
'id': track_id,
'title': data['title'],
+ 'thumbnail': thumbnail,
'formats': formats,
'duration': float_or_none(data.get('duration')),
}
@@ -143,7 +145,7 @@ class BandcampIE(InfoExtractor):
return {
'id': video_id,
'title': title,
- 'thumbnail': info.get('thumb_url'),
+ 'thumbnail': info.get('thumb_url') or thumbnail,
'uploader': info.get('artist'),
'artist': artist,
'track': track,
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index 8a2ed0a..dd65b8d 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -361,7 +361,7 @@ class BBCCoUkIE(InfoExtractor):
fmt.update({
'width': width,
'height': height,
- 'vbr': bitrate,
+ 'tbr': bitrate,
'vcodec': encoding,
})
else:
@@ -370,7 +370,7 @@ class BBCCoUkIE(InfoExtractor):
'acodec': encoding,
'vcodec': 'none',
})
- if protocol == 'http':
+ if protocol in ('http', 'https'):
# Direct link
fmt.update({
'url': href,
@@ -389,6 +389,8 @@ class BBCCoUkIE(InfoExtractor):
'rtmp_live': False,
'ext': 'flv',
})
+ else:
+ continue
formats.append(fmt)
elif kind == 'captions':
subtitles = self.extract_subtitles(media, programme_id)
@@ -407,7 +409,7 @@ class BBCCoUkIE(InfoExtractor):
description = smp_config['summary']
for item in smp_config['items']:
kind = item['kind']
- if kind != 'programme' and kind != 'radioProgramme':
+ if kind not in ('programme', 'radioProgramme'):
continue
programme_id = item.get('vpid')
duration = int_or_none(item.get('duration'))
@@ -448,7 +450,7 @@ class BBCCoUkIE(InfoExtractor):
for item in self._extract_items(playlist):
kind = item.get('kind')
- if kind != 'programme' and kind != 'radioProgramme':
+ if kind not in ('programme', 'radioProgramme'):
continue
title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py
index b0b7914..d5c5822 100644
--- a/youtube_dl/extractor/beeg.py
+++ b/youtube_dl/extractor/beeg.py
@@ -16,7 +16,7 @@ class BeegIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?beeg\.com/(?P<id>\d+)'
_TEST = {
'url': 'http://beeg.com/5416503',
- 'md5': '46c384def73b33dbc581262e5ee67cef',
+ 'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820',
'info_dict': {
'id': '5416503',
'ext': 'mp4',
diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py
index 80dd838..1e3f255 100644
--- a/youtube_dl/extractor/bilibili.py
+++ b/youtube_dl/extractor/bilibili.py
@@ -122,6 +122,11 @@ class BiliBiliIE(InfoExtractor):
'preference': -2 if 'hd.mp4' in backup_url else -3,
})
+ for a_format in formats:
+ a_format.setdefault('http_headers', {}).update({
+ 'Referer': url,
+ })
+
self._sort_formats(formats)
entries.append({
diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py
index 7a8e1f6..e829974 100644
--- a/youtube_dl/extractor/bleacherreport.py
+++ b/youtube_dl/extractor/bleacherreport.py
@@ -35,7 +35,7 @@ class BleacherReportIE(InfoExtractor):
'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo',
'timestamp': 1446839961,
'uploader': 'Sean Fay',
- 'description': 'md5:825e94e0f3521df52fa83b2ed198fa20',
+ 'description': 'md5:b1601e2314c4d8eec23b6eafe086a757',
'uploader_id': 6466954,
'upload_date': '20151011',
},
@@ -90,17 +90,13 @@ class BleacherReportCMSIE(AMPIE):
_VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36})'
_TESTS = [{
'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
- 'md5': '8c2c12e3af7805152675446c905d159b',
+ 'md5': '2e4b0a997f9228ffa31fada5c53d1ed1',
'info_dict': {
'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Cena vs. Rollins Would Expose the Heavyweight Division',
'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e',
},
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py
index ff0aa11..2c32b6a 100644
--- a/youtube_dl/extractor/br.py
+++ b/youtube_dl/extractor/br.py
@@ -77,7 +77,7 @@ class BRIE(InfoExtractor):
'description': 'md5:bb659990e9e59905c3d41e369db1fbe3',
'duration': 893,
'uploader': 'Eva Maria Steimle',
- 'upload_date': '20140117',
+ 'upload_date': '20170208',
}
},
]
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index 46ef8e6..0ed59bc 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -5,6 +5,7 @@ import re
import json
from .common import InfoExtractor
+from .adobepass import AdobePassIE
from ..compat import (
compat_etree_fromstring,
compat_parse_qs,
@@ -17,6 +18,7 @@ from ..compat import (
from ..utils import (
determine_ext,
ExtractorError,
+ extract_attributes,
find_xpath_attr,
fix_xml_ampersands,
float_or_none,
@@ -109,6 +111,7 @@ class BrightcoveLegacyIE(InfoExtractor):
'upload_date': '20140827',
'uploader_id': '710858724001',
},
+ 'skip': 'Video gone',
},
{
# playlist with 'videoList'
@@ -129,6 +132,12 @@ class BrightcoveLegacyIE(InfoExtractor):
},
'playlist_mincount': 10,
},
+ {
+ # playerID inferred from bcpid
+ # from http://www.un.org/chinese/News/story.asp?NewsID=27724
+ 'url': 'https://link.brightcove.com/services/player/bcpid1722935254001/?bctid=5360463607001&autoStart=false&secureConnections=true&width=650&height=350',
+ 'only_matching': True, # Tested in GenericIE
+ }
]
FLV_VCODECS = {
1: 'SORENSON',
@@ -264,9 +273,13 @@ class BrightcoveLegacyIE(InfoExtractor):
if matches:
return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
- return list(filter(None, [
- cls._build_brighcove_url_from_js(custom_bc)
- for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)]))
+ matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)
+ if matches:
+ return list(filter(None, [
+ cls._build_brighcove_url_from_js(custom_bc)
+ for custom_bc in matches]))
+ return [src for _, src in re.findall(
+ r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)]
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
@@ -283,6 +296,10 @@ class BrightcoveLegacyIE(InfoExtractor):
if videoPlayer:
# We set the original url as the default 'Referer' header
referer = smuggled_data.get('Referer', url)
+ if 'playerID' not in query:
+ mobj = re.search(r'/bcpid(\d+)', url)
+ if mobj is not None:
+ query['playerID'] = [mobj.group(1)]
return self._get_video_info(
videoPlayer[0], query, referer=referer)
elif 'playerKey' in query:
@@ -432,7 +449,7 @@ class BrightcoveLegacyIE(InfoExtractor):
return info
-class BrightcoveNewIE(InfoExtractor):
+class BrightcoveNewIE(AdobePassIE):
IE_NAME = 'brightcove:new'
_VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+|ref:[^&]+)'
_TESTS = [{
@@ -482,17 +499,18 @@ class BrightcoveNewIE(InfoExtractor):
}]
@staticmethod
- def _extract_url(webpage):
- urls = BrightcoveNewIE._extract_urls(webpage)
+ def _extract_url(ie, webpage):
+ urls = BrightcoveNewIE._extract_urls(ie, webpage)
return urls[0] if urls else None
@staticmethod
- def _extract_urls(webpage):
+ def _extract_urls(ie, webpage):
# Reference:
# 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe
- # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript
- # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html
- # 4. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player
+ # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag
+ # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript
+ # 4. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/in-page-embed-player-implementation.html
+ # 5. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player
entries = []
@@ -501,22 +519,48 @@ class BrightcoveNewIE(InfoExtractor):
r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage):
entries.append(url if url.startswith('http') else 'http:' + url)
- # Look for embed_in_page embeds [2]
- for video_id, account_id, player_id, embed in re.findall(
- # According to examples from [3] it's unclear whether video id
- # may be optional and what to do when it is
- # According to [4] data-video-id may be prefixed with ref:
- r'''(?sx)
- <video[^>]+
- data-video-id=["\'](\d+|ref:[^"\']+)["\'][^>]*>.*?
- </video>.*?
- <script[^>]+
- src=["\'](?:https?:)?//players\.brightcove\.net/
- (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js
+ # Look for <video> tags [2] and embed_in_page embeds [3]
+ # [2] looks like:
+ for video, script_tag, account_id, player_id, embed in re.findall(
+ r'''(?isx)
+ (<video\s+[^>]*\bdata-video-id\s*=\s*['"]?[^>]+>)
+ (?:.*?
+ (<script[^>]+
+ src=["\'](?:https?:)?//players\.brightcove\.net/
+ (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js
+ )
+ )?
''', webpage):
- entries.append(
- 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
- % (account_id, player_id, embed, video_id))
+ attrs = extract_attributes(video)
+
+ # According to examples from [4] it's unclear whether video id
+ # may be optional and what to do when it is
+ video_id = attrs.get('data-video-id')
+ if not video_id:
+ continue
+
+ account_id = account_id or attrs.get('data-account')
+ if not account_id:
+ continue
+
+ player_id = player_id or attrs.get('data-player') or 'default'
+ embed = embed or attrs.get('data-embed') or 'default'
+
+ bc_url = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (
+ account_id, player_id, embed, video_id)
+
+ # Some brightcove videos may be embedded with video tag only and
+ # without script tag or any mentioning of brightcove at all. Such
+ # embeds are considered ambiguous since they are matched based only
+ # on data-video-id and data-account attributes and in the wild may
+ # not be brightcove embeds at all. Let's check reconstructed
+ # brightcove URLs in case of such embeds and only process valid
+ # ones. By this we ensure there is indeed a brightcove embed.
+ if not script_tag and not ie._is_valid_url(
+ bc_url, video_id, 'possible brightcove video'):
+ continue
+
+ entries.append(bc_url)
return entries
@@ -559,6 +603,20 @@ class BrightcoveNewIE(InfoExtractor):
raise ExtractorError(message, expected=True)
raise
+ errors = json_data.get('errors')
+ if errors and errors[0].get('error_subcode') == 'TVE_AUTH':
+ custom_fields = json_data['custom_fields']
+ tve_token = self._extract_mvpd_auth(
+ smuggled_data['source_url'], video_id,
+ custom_fields['bcadobepassrequestorid'],
+ custom_fields['bcadobepassresourceid'])
+ json_data = self._download_json(
+ api_url, video_id, headers={
+ 'Accept': 'application/json;pk=%s' % policy_key
+ }, query={
+ 'tveToken': tve_token,
+ })
+
title = json_data['name'].strip()
formats = []
@@ -624,7 +682,6 @@ class BrightcoveNewIE(InfoExtractor):
})
formats.append(f)
- errors = json_data.get('errors')
if not formats and errors:
error = errors[0]
raise ExtractorError(
@@ -641,7 +698,7 @@ class BrightcoveNewIE(InfoExtractor):
is_live = False
duration = float_or_none(json_data.get('duration'), 1000)
- if duration and duration < 0:
+ if duration is not None and duration <= 0:
is_live = True
return {
diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py
index f1f128c..acd87e3 100644
--- a/youtube_dl/extractor/canalc2.py
+++ b/youtube_dl/extractor/canalc2.py
@@ -16,13 +16,10 @@ class Canalc2IE(InfoExtractor):
'md5': '060158428b650f896c542dfbb3d6487f',
'info_dict': {
'id': '12163',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Terrasses du Numérique',
'duration': 122,
},
- 'params': {
- 'skip_download': True, # Requires rtmpdump
- }
}, {
'url': 'http://archives-canalc2.u-strasbg.fr/video.asp?idVideo=11427&voir=oui',
'only_matching': True,
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py
index 4b9fa2d..d8bf073 100644
--- a/youtube_dl/extractor/canalplus.py
+++ b/youtube_dl/extractor/canalplus.py
@@ -7,8 +7,8 @@ from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlparse
from ..utils import (
dict_get,
- ExtractorError,
- HEADRequest,
+ # ExtractorError,
+ # HEADRequest,
int_or_none,
qualities,
remove_end,
@@ -45,6 +45,9 @@ class CanalplusIE(InfoExtractor):
'itele': 'itele',
}
+ # Only works for direct mp4 URLs
+ _GEO_COUNTRIES = ['FR']
+
_TESTS = [{
'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1192814',
'info_dict': {
@@ -56,6 +59,7 @@ class CanalplusIE(InfoExtractor):
'upload_date': '20160702',
},
}, {
+ # geo restricted, bypassed
'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190',
'info_dict': {
'id': '1108190',
@@ -65,19 +69,20 @@ class CanalplusIE(InfoExtractor):
'description': 'md5:4cea7a37153be42c1ba2c1d3064376ff',
'upload_date': '20140724',
},
- 'skip': 'Only works from France',
+ 'expected_warnings': ['HTTP Error 403: Forbidden'],
}, {
- 'url': 'http://www.c8.fr/c8-divertissement/ms-touche-pas-a-mon-poste/pid6318-videos-integrales.html',
- 'md5': '4b47b12b4ee43002626b97fad8fb1de5',
+ # geo restricted, bypassed
+ 'url': 'http://www.c8.fr/c8-divertissement/ms-touche-pas-a-mon-poste/pid6318-videos-integrales.html?vid=1443684',
+ 'md5': 'bb6f9f343296ab7ebd88c97b660ecf8d',
'info_dict': {
- 'id': '1420213',
+ 'id': '1443684',
'display_id': 'pid6318-videos-integrales',
'ext': 'mp4',
- 'title': 'TPMP ! Même le matin - Les 35H de Baba - 14/10/2016',
- 'description': 'md5:f96736c1b0ffaa96fd5b9e60ad871799',
- 'upload_date': '20161014',
+ 'title': 'Guess my iep ! - TPMP - 07/04/2017',
+ 'description': 'md5:6f005933f6e06760a9236d9b3b5f17fa',
+ 'upload_date': '20170407',
},
- 'skip': 'Only works from France',
+ 'expected_warnings': ['HTTP Error 403: Forbidden'],
}, {
'url': 'http://www.itele.fr/chroniques/invite-michael-darmon/rachida-dati-nicolas-sarkozy-est-le-plus-en-phase-avec-les-inquietudes-des-francais-171510',
'info_dict': {
@@ -134,15 +139,15 @@ class CanalplusIE(InfoExtractor):
preference = qualities(['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD'])
- fmt_url = next(iter(media.get('VIDEOS')))
- if '/geo' in fmt_url.lower():
- response = self._request_webpage(
- HEADRequest(fmt_url), video_id,
- 'Checking if the video is georestricted')
- if '/blocage' in response.geturl():
- raise ExtractorError(
- 'The video is not available in your country',
- expected=True)
+ # _, fmt_url = next(iter(media['VIDEOS'].items()))
+ # if '/geo' in fmt_url.lower():
+ # response = self._request_webpage(
+ # HEADRequest(fmt_url), video_id,
+ # 'Checking if the video is georestricted')
+ # if '/blocage' in response.geturl():
+ # raise ExtractorError(
+ # 'The video is not available in your country',
+ # expected=True)
formats = []
for format_id, format_url in media['VIDEOS'].items():
diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py
index 544c665..aada029 100644
--- a/youtube_dl/extractor/canvas.py
+++ b/youtube_dl/extractor/canvas.py
@@ -7,6 +7,7 @@ from ..utils import float_or_none
class CanvasIE(InfoExtractor):
+ IE_DESC = 'canvas.be and een.be'
_VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week',
diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py
index cf678e7..87ad14e 100644
--- a/youtube_dl/extractor/cbc.py
+++ b/youtube_dl/extractor/cbc.py
@@ -96,6 +96,7 @@ class CBCIE(InfoExtractor):
'info_dict': {
'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks',
'id': 'dog-indoor-exercise-winter-1.3928238',
+ 'description': 'md5:c18552e41726ee95bd75210d1ca9194c',
},
'playlist_mincount': 6,
}]
@@ -165,12 +166,11 @@ class CBCPlayerIE(InfoExtractor):
'uploader': 'CBCC-NEW',
},
}, {
- # available only when we add `formats=MPEG4,FLV,MP3` to theplatform url
'url': 'http://www.cbc.ca/player/play/2164402062',
- 'md5': '17a61eb813539abea40618d6323a7f82',
+ 'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6',
'info_dict': {
'id': '2164402062',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Cancer survivor four times over',
'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.',
'timestamp': 1320410746,
diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py
index 8d5f11d..7d78e3a 100644
--- a/youtube_dl/extractor/cbslocal.py
+++ b/youtube_dl/extractor/cbslocal.py
@@ -60,8 +60,8 @@ class CBSLocalIE(AnvatoIE):
'title': 'A Very Blue Anniversary',
'description': 'CBS2’s Cindy Hsu has more.',
'thumbnail': 're:^https?://.*',
- 'timestamp': 1479962220,
- 'upload_date': '20161124',
+ 'timestamp': int,
+ 'upload_date': r're:^\d{8}$',
'uploader': 'CBS',
'subtitles': {
'en': 'mincount:5',
diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py
index 1ee35b5..78b7a92 100755
--- a/youtube_dl/extractor/cda.py
+++ b/youtube_dl/extractor/cda.py
@@ -9,7 +9,10 @@ from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
+ multipart_encode,
parse_duration,
+ random_birthday,
+ urljoin,
)
@@ -27,7 +30,8 @@ class CDAIE(InfoExtractor):
'description': 'md5:269ccd135d550da90d1662651fcb9772',
'thumbnail': r're:^https?://.*\.jpg$',
'average_rating': float,
- 'duration': 39
+ 'duration': 39,
+ 'age_limit': 0,
}
}, {
'url': 'http://www.cda.pl/video/57413289',
@@ -41,13 +45,41 @@ class CDAIE(InfoExtractor):
'uploader': 'crash404',
'view_count': int,
'average_rating': float,
- 'duration': 137
+ 'duration': 137,
+ 'age_limit': 0,
}
}, {
+ # Age-restricted
+ 'url': 'http://www.cda.pl/video/1273454c4',
+ 'info_dict': {
+ 'id': '1273454c4',
+ 'ext': 'mp4',
+ 'title': 'Bronson (2008) napisy HD 1080p',
+ 'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c',
+ 'height': 1080,
+ 'uploader': 'boniek61',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 5554,
+ 'age_limit': 18,
+ 'view_count': int,
+ 'average_rating': float,
+ },
+ }, {
'url': 'http://ebd.cda.pl/0x0/5749950c',
'only_matching': True,
}]
+ def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
+ form_data = random_birthday('rok', 'miesiac', 'dzien')
+ form_data.update({'return': url, 'module': 'video', 'module_id': video_id})
+ data, content_type = multipart_encode(form_data)
+ return self._download_webpage(
+ urljoin(url, '/a/validatebirth'), video_id, *args,
+ data=data, headers={
+ 'Referer': url,
+ 'Content-Type': content_type,
+ }, **kwargs)
+
def _real_extract(self, url):
video_id = self._match_id(url)
self._set_cookie('cda.pl', 'cda.player', 'html5')
@@ -57,6 +89,13 @@ class CDAIE(InfoExtractor):
if 'Ten film jest dostępny dla użytkowników premium' in webpage:
raise ExtractorError('This video is only available for premium users.', expected=True)
+ need_confirm_age = False
+ if self._html_search_regex(r'(<form[^>]+action="/a/validatebirth")',
+ webpage, 'birthday validate form', default=None):
+ webpage = self._download_age_confirm_page(
+ url, video_id, note='Confirming age')
+ need_confirm_age = True
+
formats = []
uploader = self._search_regex(r'''(?x)
@@ -81,6 +120,7 @@ class CDAIE(InfoExtractor):
'thumbnail': self._og_search_thumbnail(webpage),
'formats': formats,
'duration': None,
+ 'age_limit': 18 if need_confirm_age else 0,
}
def extract_format(page, version):
@@ -121,7 +161,12 @@ class CDAIE(InfoExtractor):
for href, resolution in re.findall(
r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
webpage):
- webpage = self._download_webpage(
+ if need_confirm_age:
+ handler = self._download_age_confirm_page
+ else:
+ handler = self._download_webpage
+
+ webpage = handler(
self._BASE_URL + href, video_id,
'Downloading %s version information' % resolution, fatal=False)
if not webpage:
@@ -129,6 +174,7 @@ class CDAIE(InfoExtractor):
# invalid version is requested.
self.report_warning('Unable to download %s version information' % resolution)
continue
+
extract_format(webpage, resolution)
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py
index dd2529a..e250de1 100644
--- a/youtube_dl/extractor/ceskatelevize.py
+++ b/youtube_dl/extractor/ceskatelevize.py
@@ -12,13 +12,14 @@ from ..utils import (
ExtractorError,
float_or_none,
sanitized_Request,
+ unescapeHTML,
urlencode_postdata,
USER_AGENTS,
)
class CeskaTelevizeIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(porady|ivysilani)/(?:[^/]+/)*(?P<id>[^/#?]+)/*(?:[#?].*)?$'
+ _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
_TESTS = [{
'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220',
'info_dict': {
@@ -62,40 +63,12 @@ class CeskaTelevizeIE(InfoExtractor):
},
'skip': 'Georestricted to Czech Republic',
}, {
- # video with 18+ caution trailer
- 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
- 'info_dict': {
- 'id': '215562210900007-bogotart',
- 'title': 'Queer: Bogotart',
- 'description': 'Alternativní průvodce současným queer světem',
- },
- 'playlist': [{
- 'info_dict': {
- 'id': '61924494876844842',
- 'ext': 'mp4',
- 'title': 'Queer: Bogotart (Varování 18+)',
- 'duration': 10.2,
- },
- }, {
- 'info_dict': {
- 'id': '61924494877068022',
- 'ext': 'mp4',
- 'title': 'Queer: Bogotart (Queer)',
- 'thumbnail': r're:^https?://.*\.jpg',
- 'duration': 1558.3,
- },
- }],
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
+ 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- url = url.replace('/porady/', '/ivysilani/').replace('/video/', '')
-
- mobj = re.match(self._VALID_URL, url)
- playlist_id = mobj.group('id')
+ playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
@@ -103,13 +76,28 @@ class CeskaTelevizeIE(InfoExtractor):
if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
- typ = self._html_search_regex(
- r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type')
- episode_id = self._html_search_regex(
- r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id')
+ type_ = None
+ episode_id = None
+
+ playlist = self._parse_json(
+ self._search_regex(
+ r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist',
+ default='{}'), playlist_id)
+ if playlist:
+ type_ = playlist.get('type')
+ episode_id = playlist.get('id')
+
+ if not type_:
+ type_ = self._html_search_regex(
+ r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],',
+ webpage, 'type')
+ if not episode_id:
+ episode_id = self._html_search_regex(
+ r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],',
+ webpage, 'episode_id')
data = {
- 'playlist[0][type]': typ,
+ 'playlist[0][type]': type_,
'playlist[0][id]': episode_id,
'requestUrl': compat_urllib_parse_urlparse(url).path,
'requestSource': 'iVysilani',
@@ -245,3 +233,47 @@ class CeskaTelevizeIE(InfoExtractor):
yield line
return '\r\n'.join(_fix_subtitle(subtitles))
+
+
+class CeskaTelevizePoradyIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
+ _TESTS = [{
+ # video with 18+ caution trailer
+ 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
+ 'info_dict': {
+ 'id': '215562210900007-bogotart',
+ 'title': 'Queer: Bogotart',
+ 'description': 'Alternativní průvodce současným queer světem',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '61924494876844842',
+ 'ext': 'mp4',
+ 'title': 'Queer: Bogotart (Varování 18+)',
+ 'duration': 10.2,
+ },
+ }, {
+ 'info_dict': {
+ 'id': '61924494877068022',
+ 'ext': 'mp4',
+ 'title': 'Queer: Bogotart (Queer)',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'duration': 1558.3,
+ },
+ }],
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ data_url = unescapeHTML(self._search_regex(
+ r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
+ webpage, 'iframe player url', group='url'))
+
+ return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key())
diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py
index 8fbc91c..e3eba4b 100644
--- a/youtube_dl/extractor/chaturbate.py
+++ b/youtube_dl/extractor/chaturbate.py
@@ -33,10 +33,17 @@ class ChaturbateIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- m3u8_formats = [(m.group('id').lower(), m.group('url')) for m in re.finditer(
- r'hlsSource(?P<id>.+?)\s*=\s*(?P<q>["\'])(?P<url>http.+?)(?P=q)', webpage)]
+ m3u8_urls = []
- if not m3u8_formats:
+ for m in re.finditer(
+ r'(["\'])(?P<url>http.+?\.m3u8.*?)\1', webpage):
+ m3u8_fast_url, m3u8_no_fast_url = m.group('url'), m.group(
+ 'url').replace('_fast', '')
+ for m3u8_url in (m3u8_fast_url, m3u8_no_fast_url):
+ if m3u8_url not in m3u8_urls:
+ m3u8_urls.append(m3u8_url)
+
+ if not m3u8_urls:
error = self._search_regex(
[r'<span[^>]+class=(["\'])desc_span\1[^>]*>(?P<error>[^<]+)</span>',
r'<div[^>]+id=(["\'])defchat\1[^>]*>\s*<p><strong>(?P<error>[^<]+)<'],
@@ -50,7 +57,8 @@ class ChaturbateIE(InfoExtractor):
raise ExtractorError('Unable to find stream URL')
formats = []
- for m3u8_id, m3u8_url in m3u8_formats:
+ for m3u8_url in m3u8_urls:
+ m3u8_id = 'fast' if '_fast' in m3u8_url else 'slow'
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, ext='mp4',
# ffmpeg skips segments for fast m3u8
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
index bb52e0c..0920f62 100644
--- a/youtube_dl/extractor/clipfish.py
+++ b/youtube_dl/extractor/clipfish.py
@@ -12,7 +12,7 @@ class ClipfishIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.clipfish.de/special/ugly-americans/video/4343170/s01-e01-ugly-americans-date-in-der-hoelle/',
- 'md5': '720563e467b86374c194bdead08d207d',
+ 'md5': 'b9a5dc46294154c1193e2d10e0c95693',
'info_dict': {
'id': '4343170',
'ext': 'mp4',
diff --git a/youtube_dl/extractor/collegerama.py b/youtube_dl/extractor/collegerama.py
index 18c7347..6a41db8 100644
--- a/youtube_dl/extractor/collegerama.py
+++ b/youtube_dl/extractor/collegerama.py
@@ -21,7 +21,7 @@ class CollegeRamaIE(InfoExtractor):
'ext': 'mp4',
'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.',
'description': '',
- 'thumbnail': r're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$',
'duration': 7713.088,
'timestamp': 1413309600,
'upload_date': '20141014',
@@ -35,6 +35,7 @@ class CollegeRamaIE(InfoExtractor):
'ext': 'wmv',
'title': '64ste Vakantiecursus: Afvalwater',
'description': 'md5:7fd774865cc69d972f542b157c328305',
+ 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$',
'duration': 10853,
'timestamp': 1326446400,
'upload_date': '20120113',
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 6c3c095..fec39da 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1,3 +1,4 @@
+# coding: utf-8
from __future__ import unicode_literals
import base64
@@ -244,6 +245,10 @@ class InfoExtractor(object):
specified in the URL.
end_time: Time in seconds where the reproduction should end, as
specified in the URL.
+ chapters: A list of dictionaries, with the following entries:
+ * "start_time" - The start time of the chapter in seconds
+ * "end_time" - The end time of the chapter in seconds
+ * "title" (optional, string)
The following fields should only be used when the video belongs to some logical
chapter or section:
@@ -547,6 +552,34 @@ class InfoExtractor(object):
return encoding
+ def __check_blocked(self, content):
+ first_block = content[:512]
+ if ('<title>Access to this site is blocked</title>' in content and
+ 'Websense' in first_block):
+ msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
+ blocked_iframe = self._html_search_regex(
+ r'<iframe src="([^"]+)"', content,
+ 'Websense information URL', default=None)
+ if blocked_iframe:
+ msg += ' Visit %s for more details' % blocked_iframe
+ raise ExtractorError(msg, expected=True)
+ if '<title>The URL you requested has been blocked</title>' in first_block:
+ msg = (
+ 'Access to this webpage has been blocked by Indian censorship. '
+ 'Use a VPN or proxy server (with --proxy) to route around it.')
+ block_msg = self._html_search_regex(
+ r'</h1><p>(.*?)</p>',
+ content, 'block message', default=None)
+ if block_msg:
+ msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
+ raise ExtractorError(msg, expected=True)
+ if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
+ 'blocklist.rkn.gov.ru' in content):
+ raise ExtractorError(
+ 'Access to this webpage has been blocked by decision of the Russian government. '
+ 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
+ expected=True)
+
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
content_type = urlh.headers.get('Content-Type', '')
webpage_bytes = urlh.read()
@@ -588,25 +621,7 @@ class InfoExtractor(object):
except LookupError:
content = webpage_bytes.decode('utf-8', 'replace')
- if ('<title>Access to this site is blocked</title>' in content and
- 'Websense' in content[:512]):
- msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
- blocked_iframe = self._html_search_regex(
- r'<iframe src="([^"]+)"', content,
- 'Websense information URL', default=None)
- if blocked_iframe:
- msg += ' Visit %s for more details' % blocked_iframe
- raise ExtractorError(msg, expected=True)
- if '<title>The URL you requested has been blocked</title>' in content[:512]:
- msg = (
- 'Access to this webpage has been blocked by Indian censorship. '
- 'Use a VPN or proxy server (with --proxy) to route around it.')
- block_msg = self._html_search_regex(
- r'</h1><p>(.*?)</p>',
- content, 'block message', default=None)
- if block_msg:
- msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
- raise ExtractorError(msg, expected=True)
+ self.__check_blocked(content)
return content
@@ -965,6 +980,23 @@ class InfoExtractor(object):
return info
if isinstance(json_ld, dict):
json_ld = [json_ld]
+
+ def extract_video_object(e):
+ assert e['@type'] == 'VideoObject'
+ info.update({
+ 'url': e.get('contentUrl'),
+ 'title': unescapeHTML(e.get('name')),
+ 'description': unescapeHTML(e.get('description')),
+ 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
+ 'duration': parse_duration(e.get('duration')),
+ 'timestamp': unified_timestamp(e.get('uploadDate')),
+ 'filesize': float_or_none(e.get('contentSize')),
+ 'tbr': int_or_none(e.get('bitrate')),
+ 'width': int_or_none(e.get('width')),
+ 'height': int_or_none(e.get('height')),
+ 'view_count': int_or_none(e.get('interactionCount')),
+ })
+
for e in json_ld:
if e.get('@context') == 'http://schema.org':
item_type = e.get('@type')
@@ -989,18 +1021,11 @@ class InfoExtractor(object):
'description': unescapeHTML(e.get('articleBody')),
})
elif item_type == 'VideoObject':
- info.update({
- 'url': e.get('contentUrl'),
- 'title': unescapeHTML(e.get('name')),
- 'description': unescapeHTML(e.get('description')),
- 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
- 'duration': parse_duration(e.get('duration')),
- 'timestamp': unified_timestamp(e.get('uploadDate')),
- 'filesize': float_or_none(e.get('contentSize')),
- 'tbr': int_or_none(e.get('bitrate')),
- 'width': int_or_none(e.get('width')),
- 'height': int_or_none(e.get('height')),
- })
+ extract_video_object(e)
+ elif item_type == 'WebPage':
+ video = e.get('video')
+ if isinstance(video, dict) and video.get('@type') == 'VideoObject':
+ extract_video_object(video)
break
return dict((k, v) for k, v in info.items() if v is not None)
@@ -1292,40 +1317,50 @@ class InfoExtractor(object):
entry_protocol='m3u8', preference=None,
m3u8_id=None, note=None, errnote=None,
fatal=True, live=False):
-
res = self._download_webpage_handle(
m3u8_url, video_id,
note=note or 'Downloading m3u8 information',
errnote=errnote or 'Failed to download m3u8 information',
fatal=fatal)
+
if res is False:
return []
+
m3u8_doc, urlh = res
m3u8_url = urlh.geturl()
+ return self._parse_m3u8_formats(
+ m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
+ preference=preference, m3u8_id=m3u8_id, live=live)
+
+ def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
+ entry_protocol='m3u8', preference=None,
+ m3u8_id=None, live=False):
if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
return []
- formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
+ formats = []
format_url = lambda u: (
u
if re.match(r'^https?://', u)
else compat_urlparse.urljoin(m3u8_url, u))
- # We should try extracting formats only from master playlists [1], i.e.
- # playlists that describe available qualities. On the other hand media
- # playlists [2] should be returned as is since they contain just the media
- # without qualities renditions.
+ # References:
+ # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
+ # 2. https://github.com/rg3/youtube-dl/issues/12211
+
+ # We should try extracting formats only from master playlists [1, 4.3.4],
+ # i.e. playlists that describe available qualities. On the other hand
+ # media playlists [1, 4.3.3] should be returned as is since they contain
+ # just the media without qualities renditions.
# Fortunately, master playlist can be easily distinguished from media
- # playlist based on particular tags availability. As of [1, 2] master
- # playlist tags MUST NOT appear in a media playist and vice versa.
- # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
- # and MUST NOT appear in master playlist thus we can clearly detect media
- # playlist with this criterion.
- # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
- # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
- # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
+ # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
+ # master playlist tags MUST NOT appear in a media playlist and vice versa.
+ # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
+ # media playlist and MUST NOT appear in master playlist thus we can
+ # clearly detect media playlist with this criterion.
+
if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
return [{
'url': m3u8_url,
@@ -1334,52 +1369,72 @@ class InfoExtractor(object):
'protocol': entry_protocol,
'preference': preference,
}]
- audio_in_video_stream = {}
- last_info = {}
- last_media = {}
+
+ groups = {}
+ last_stream_inf = {}
+
+ def extract_media(x_media_line):
+ media = parse_m3u8_attributes(x_media_line)
+ # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
+ media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
+ if not (media_type and group_id and name):
+ return
+ groups.setdefault(group_id, []).append(media)
+ if media_type not in ('VIDEO', 'AUDIO'):
+ return
+ media_url = media.get('URI')
+ if media_url:
+ format_id = []
+ for v in (group_id, name):
+ if v:
+ format_id.append(v)
+ f = {
+ 'format_id': '-'.join(format_id),
+ 'url': format_url(media_url),
+ 'manifest_url': m3u8_url,
+ 'language': media.get('LANGUAGE'),
+ 'ext': ext,
+ 'protocol': entry_protocol,
+ 'preference': preference,
+ }
+ if media_type == 'AUDIO':
+ f['vcodec'] = 'none'
+ formats.append(f)
+
+ def build_stream_name():
+ # Despite specification does not mention NAME attribute for
+ # EXT-X-STREAM-INF tag it still sometimes may be present (see [1]
+ # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
+ # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
+ stream_name = last_stream_inf.get('NAME')
+ if stream_name:
+ return stream_name
+ # If there is no NAME in EXT-X-STREAM-INF it will be obtained
+ # from corresponding rendition group
+ stream_group_id = last_stream_inf.get('VIDEO')
+ if not stream_group_id:
+ return
+ stream_group = groups.get(stream_group_id)
+ if not stream_group:
+ return stream_group_id
+ rendition = stream_group[0]
+ return rendition.get('NAME') or stream_group_id
+
for line in m3u8_doc.splitlines():
if line.startswith('#EXT-X-STREAM-INF:'):
- last_info = parse_m3u8_attributes(line)
+ last_stream_inf = parse_m3u8_attributes(line)
elif line.startswith('#EXT-X-MEDIA:'):
- media = parse_m3u8_attributes(line)
- media_type = media.get('TYPE')
- if media_type in ('VIDEO', 'AUDIO'):
- group_id = media.get('GROUP-ID')
- media_url = media.get('URI')
- if media_url:
- format_id = []
- for v in (group_id, media.get('NAME')):
- if v:
- format_id.append(v)
- f = {
- 'format_id': '-'.join(format_id),
- 'url': format_url(media_url),
- 'language': media.get('LANGUAGE'),
- 'ext': ext,
- 'protocol': entry_protocol,
- 'preference': preference,
- }
- if media_type == 'AUDIO':
- f['vcodec'] = 'none'
- if group_id and not audio_in_video_stream.get(group_id):
- audio_in_video_stream[group_id] = False
- formats.append(f)
- else:
- # When there is no URI in EXT-X-MEDIA let this tag's
- # data be used by regular URI lines below
- last_media = media
- if media_type == 'AUDIO' and group_id:
- audio_in_video_stream[group_id] = True
+ extract_media(line)
elif line.startswith('#') or not line.strip():
continue
else:
- tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
+ tbr = float_or_none(
+ last_stream_inf.get('AVERAGE-BANDWIDTH') or
+ last_stream_inf.get('BANDWIDTH'), scale=1000)
format_id = []
if m3u8_id:
format_id.append(m3u8_id)
- # Despite specification does not mention NAME attribute for
- # EXT-X-STREAM-INF it still sometimes may be present
- stream_name = last_info.get('NAME') or last_media.get('NAME')
+ stream_name = build_stream_name()
# Bandwidth of live streams may differ over time thus making
# format_id unpredictable. So it's better to keep provided
# format_id intact.
@@ -1389,14 +1444,14 @@ class InfoExtractor(object):
f = {
'format_id': '-'.join(format_id),
'url': manifest_url,
- 'manifest_url': manifest_url,
+ 'manifest_url': m3u8_url,
'tbr': tbr,
'ext': ext,
- 'fps': float_or_none(last_info.get('FRAME-RATE')),
+ 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
'protocol': entry_protocol,
'preference': preference,
}
- resolution = last_info.get('RESOLUTION')
+ resolution = last_stream_inf.get('RESOLUTION')
if resolution:
mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
if mobj:
@@ -1412,13 +1467,26 @@ class InfoExtractor(object):
'vbr': vbr,
'abr': abr,
})
- f.update(parse_codecs(last_info.get('CODECS')))
- if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
- # TODO: update acodec for audio only formats with the same GROUP-ID
- f['acodec'] = 'none'
+ codecs = parse_codecs(last_stream_inf.get('CODECS'))
+ f.update(codecs)
+ audio_group_id = last_stream_inf.get('AUDIO')
+ # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
+ # references a rendition group MUST have a CODECS attribute.
+ # However, this is not always respected, for example, [2]
+ # contains EXT-X-STREAM-INF tag which references AUDIO
+ # rendition group but does not have CODECS and despite
+ # referencing audio group an audio group, it represents
+ # a complete (with audio and video) format. So, for such cases
+ # we will ignore references to rendition groups and treat them
+ # as complete formats.
+ if audio_group_id and codecs and f.get('vcodec') != 'none':
+ audio_group = groups.get(audio_group_id)
+ if audio_group and audio_group[0].get('URI'):
+ # TODO: update acodec for audio only formats with
+ # the same GROUP-ID
+ f['acodec'] = 'none'
formats.append(f)
- last_info = {}
- last_media = {}
+ last_stream_inf = {}
return formats
@staticmethod
@@ -1768,7 +1836,7 @@ class InfoExtractor(object):
if content_type == 'text':
# TODO implement WebVTT downloading
pass
- elif content_type == 'video' or content_type == 'audio':
+ elif content_type in ('video', 'audio'):
base_url = ''
for element in (representation, adaptation_set, period, mpd_doc):
base_url_e = element.find(_add_ns('BaseURL'))
@@ -1792,7 +1860,7 @@ class InfoExtractor(object):
'ext': mimetype2ext(mime_type),
'width': int_or_none(representation_attrib.get('width')),
'height': int_or_none(representation_attrib.get('height')),
- 'tbr': int_or_none(bandwidth, 1000),
+ 'tbr': float_or_none(bandwidth, 1000),
'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
'fps': int_or_none(representation_attrib.get('frameRate')),
'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
@@ -1933,6 +2001,12 @@ class InfoExtractor(object):
compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
+ """
+ Parse formats from ISM manifest.
+ References:
+ 1. [MS-SSTR]: Smooth Streaming Protocol,
+ https://msdn.microsoft.com/en-us/library/ff469518.aspx
+ """
if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
return []
@@ -1954,8 +2028,11 @@ class InfoExtractor(object):
self.report_warning('%s is not a supported codec' % fourcc)
continue
tbr = int(track.attrib['Bitrate']) // 1000
- width = int_or_none(track.get('MaxWidth'))
- height = int_or_none(track.get('MaxHeight'))
+ # [1] does not mention Width and Height attributes. However,
+ # they're often present while MaxWidth and MaxHeight are
+ # missing, so should be used as fallbacks
+ width = int_or_none(track.get('MaxWidth') or track.get('Width'))
+ height = int_or_none(track.get('MaxHeight') or track.get('Height'))
sampling_rate = int_or_none(track.get('SamplingRate'))
track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
@@ -2106,7 +2183,7 @@ class InfoExtractor(object):
def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
formats = []
hdcore_sign = 'hdcore=3.7.0'
- f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
+ f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
hds_host = hosts.get('hds')
if hds_host:
f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
@@ -2128,8 +2205,9 @@ class InfoExtractor(object):
def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
- url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
- http_base_url = 'http' + url_base
+ url_base = self._search_regex(
+ r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
+ http_base_url = '%s:%s' % ('http', url_base)
formats = []
if 'm3u8' not in skip_protocols:
formats.extend(self._extract_m3u8_formats(
@@ -2163,7 +2241,7 @@ class InfoExtractor(object):
for protocol in ('rtmp', 'rtsp'):
if protocol not in skip_protocols:
formats.append({
- 'url': protocol + url_base,
+ 'url': '%s:%s' % (protocol, url_base),
'format_id': protocol,
'protocol': protocol,
})
@@ -2171,7 +2249,7 @@ class InfoExtractor(object):
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
mobj = re.search(
- r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
+ r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
webpage)
if mobj:
try:
@@ -2247,11 +2325,17 @@ class InfoExtractor(object):
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
+ urls = []
formats = []
for source in jwplayer_sources_data:
- source_url = self._proto_relative_url(source['file'])
+ source_url = self._proto_relative_url(source.get('file'))
+ if not source_url:
+ continue
if base_url:
source_url = compat_urlparse.urljoin(base_url, source_url)
+ if source_url in urls:
+ continue
+ urls.append(source_url)
source_type = source.get('type') or ''
ext = mimetype2ext(source_type) or determine_ext(source_url)
if source_type == 'hls' or ext == 'm3u8':
diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py
index d3463b8..0c3f0c0 100644
--- a/youtube_dl/extractor/condenast.py
+++ b/youtube_dl/extractor/condenast.py
@@ -16,7 +16,6 @@ from ..utils import (
mimetype2ext,
orderedSet,
parse_iso8601,
- remove_end,
)
@@ -50,10 +49,17 @@ class CondeNastIE(InfoExtractor):
'wmagazine': 'W Magazine',
}
- _VALID_URL = r'https?://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed(?:js)?)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys())
+ _VALID_URL = r'''(?x)https?://(?:video|www|player(?:-backend)?)\.(?:%s)\.com/
+ (?:
+ (?:
+ embed(?:js)?|
+ (?:script|inline)/video
+ )/(?P<id>[0-9a-f]{24})(?:/(?P<player_id>[0-9a-f]{24}))?(?:.+?\btarget=(?P<target>[^&]+))?|
+ (?P<type>watch|series|video)/(?P<display_id>[^/?#]+)
+ )''' % '|'.join(_SITES.keys())
IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))
- EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed(?:js)?)/.+?' % '|'.join(_SITES.keys())
+ EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' % '|'.join(_SITES.keys())
_TESTS = [{
'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',
@@ -89,6 +95,12 @@ class CondeNastIE(InfoExtractor):
'upload_date': '20150916',
'timestamp': 1442434955,
}
+ }, {
+ 'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://player-backend.cnevids.com/script/video/59138decb57ac36b83000005.js',
+ 'only_matching': True,
}]
def _extract_series(self, url, webpage):
@@ -104,7 +116,7 @@ class CondeNastIE(InfoExtractor):
entries = [self.url_result(build_url(path), 'CondeNast') for path in paths]
return self.playlist_result(entries, playlist_title=title)
- def _extract_video(self, webpage, url_type):
+ def _extract_video_params(self, webpage):
query = {}
params = self._search_regex(
r'(?s)var params = {(.+?)}[;,]', webpage, 'player params', default=None)
@@ -123,17 +135,30 @@ class CondeNastIE(InfoExtractor):
'playerId': params['data-player'],
'target': params['id'],
})
- video_id = query['videoId']
+ return query
+
+ def _extract_video(self, params):
+ video_id = params['videoId']
+
video_info = None
- info_page = self._download_json(
- 'http://player.cnevids.com/player/video.js',
- video_id, 'Downloading video info', fatal=False, query=query)
- if info_page:
- video_info = info_page.get('video')
- if not video_info:
+ if params.get('playerId'):
+ info_page = self._download_json(
+ 'http://player.cnevids.com/player/video.js',
+ video_id, 'Downloading video info', fatal=False, query=params)
+ if info_page:
+ video_info = info_page.get('video')
+ if not video_info:
+ info_page = self._download_webpage(
+ 'http://player.cnevids.com/player/loader.js',
+ video_id, 'Downloading loader info', query=params)
+ else:
info_page = self._download_webpage(
- 'http://player.cnevids.com/player/loader.js',
- video_id, 'Downloading loader info', query=query)
+ 'https://player.cnevids.com/inline/video/%s.js' % video_id,
+ video_id, 'Downloading inline info', query={
+ 'target': params.get('target', 'embedplayer')
+ })
+
+ if not video_info:
video_info = self._parse_json(
self._search_regex(
r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'),
@@ -161,9 +186,7 @@ class CondeNastIE(InfoExtractor):
})
self._sort_formats(formats)
- info = self._search_json_ld(
- webpage, video_id, fatal=False) if url_type != 'embed' else {}
- info.update({
+ return {
'id': video_id,
'formats': formats,
'title': title,
@@ -174,22 +197,26 @@ class CondeNastIE(InfoExtractor):
'series': video_info.get('series_title'),
'season': video_info.get('season_title'),
'timestamp': parse_iso8601(video_info.get('premiere_date')),
- })
- return info
+ 'categories': video_info.get('categories'),
+ }
def _real_extract(self, url):
- site, url_type, item_id = re.match(self._VALID_URL, url).groups()
+ video_id, player_id, target, url_type, display_id = re.match(self._VALID_URL, url).groups()
- # Convert JS embed to regular embed
- if url_type == 'embedjs':
- parsed_url = compat_urlparse.urlparse(url)
- url = compat_urlparse.urlunparse(parsed_url._replace(
- path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/')))
- url_type = 'embed'
+ if video_id:
+ return self._extract_video({
+ 'videoId': video_id,
+ 'playerId': player_id,
+ 'target': target,
+ })
- webpage = self._download_webpage(url, item_id)
+ webpage = self._download_webpage(url, display_id)
if url_type == 'series':
return self._extract_series(url, webpage)
else:
- return self._extract_video(webpage, url_type)
+ params = self._extract_video_params(webpage)
+ info = self._search_json_ld(
+ webpage, display_id, fatal=False)
+ info.update(self._extract_video(params))
+ return info
diff --git a/youtube_dl/extractor/coub.py b/youtube_dl/extractor/coub.py
index 5fa1f00..6ea03e6 100644
--- a/youtube_dl/extractor/coub.py
+++ b/youtube_dl/extractor/coub.py
@@ -24,12 +24,11 @@ class CoubIE(InfoExtractor):
'duration': 4.6,
'timestamp': 1428527772,
'upload_date': '20150408',
- 'uploader': 'Артём Лоскутников',
+ 'uploader': 'Artyom Loskutnikov',
'uploader_id': 'artyom.loskutnikov',
'view_count': int,
'like_count': int,
'repost_count': int,
- 'comment_count': int,
'age_limit': 0,
},
}, {
@@ -118,7 +117,6 @@ class CoubIE(InfoExtractor):
view_count = int_or_none(coub.get('views_count') or coub.get('views_increase_count'))
like_count = int_or_none(coub.get('likes_count'))
repost_count = int_or_none(coub.get('recoubs_count'))
- comment_count = int_or_none(coub.get('comments_count'))
age_restricted = coub.get('age_restricted', coub.get('age_restricted_by_admin'))
if age_restricted is not None:
@@ -137,7 +135,6 @@ class CoubIE(InfoExtractor):
'view_count': view_count,
'like_count': like_count,
'repost_count': repost_count,
- 'comment_count': comment_count,
'age_limit': age_limit,
'formats': formats,
}
diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py
index f919ed2..13f425b 100644
--- a/youtube_dl/extractor/crackle.py
+++ b/youtube_dl/extractor/crackle.py
@@ -21,9 +21,10 @@ class CrackleIE(InfoExtractor):
'season_number': 8,
'episode_number': 4,
'subtitles': {
- 'en-US': [{
- 'ext': 'ttml',
- }]
+ 'en-US': [
+ {'ext': 'vtt'},
+ {'ext': 'tt'},
+ ]
},
},
'params': {
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index d15fd37..2ffa4a7 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -171,7 +171,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
'info_dict': {
'id': '727589',
'ext': 'mp4',
- 'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance from this Judicial Injustice!",
+ 'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance From This Judicial Injustice!",
'description': 'md5:cbcf05e528124b0f3a0a419fc805ea7d',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Kadokawa Pictures Inc.',
@@ -179,7 +179,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
'series': "KONOSUBA -God's blessing on this wonderful world!",
'season': "KONOSUBA -God's blessing on this wonderful world! 2",
'season_number': 2,
- 'episode': 'Give Me Deliverance from this Judicial Injustice!',
+ 'episode': 'Give Me Deliverance From This Judicial Injustice!',
'episode_number': 1,
},
'params': {
@@ -390,7 +390,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
else:
webpage_url = 'http://www.' + mobj.group('url')
- webpage = self._download_webpage(self._add_skip_wall(webpage_url), video_id, 'Downloading webpage')
+ webpage = self._download_webpage(
+ self._add_skip_wall(webpage_url), video_id,
+ headers=self.geo_verification_headers())
note_m = self._html_search_regex(
r'<div class="showmedia-trailer-notice">(.+?)</div>',
webpage, 'trailer-notice', default='')
@@ -565,7 +567,9 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
def _real_extract(self, url):
show_id = self._match_id(url)
- webpage = self._download_webpage(self._add_skip_wall(url), show_id)
+ webpage = self._download_webpage(
+ self._add_skip_wall(url), show_id,
+ headers=self.geo_verification_headers())
title = self._html_search_regex(
r'(?s)<h1[^>]*>\s*<span itemprop="name">(.*?)</span>',
webpage, 'title')
diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py
index d457616..171820e 100644
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@@ -10,6 +10,7 @@ from ..utils import (
smuggle_url,
determine_ext,
ExtractorError,
+ extract_attributes,
)
from .senateisvp import SenateISVPIE
from .ustream import UstreamIE
@@ -68,6 +69,7 @@ class CSpanIE(InfoExtractor):
'uploader_id': '12987475',
},
}]
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -78,6 +80,19 @@ class CSpanIE(InfoExtractor):
if ustream_url:
return self.url_result(ustream_url, UstreamIE.ie_key())
+ if '&vod' not in url:
+ bc = self._search_regex(
+ r"(<[^>]+id='brightcove-player-embed'[^>]+>)",
+ webpage, 'brightcove embed', default=None)
+ if bc:
+ bc_attr = extract_attributes(bc)
+ bc_url = self.BRIGHTCOVE_URL_TEMPLATE % (
+ bc_attr.get('data-bcaccountid', '3162030207001'),
+ bc_attr.get('data-noprebcplayerid', 'SyGGpuJy3g'),
+ bc_attr.get('data-newbcplayerid', 'default'),
+ bc_attr['data-bcid'])
+ return self.url_result(smuggle_url(bc_url, {'source_url': url}))
+
# We first look for clipid, because clipprog always appears before
patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')]
results = list(filter(None, (re.search(p, webpage) for p in patterns)))
diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py
index e3c9946..8e45923 100644
--- a/youtube_dl/extractor/curiositystream.py
+++ b/youtube_dl/extractor/curiositystream.py
@@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
int_or_none,
@@ -46,9 +48,50 @@ class CuriosityStreamBaseIE(InfoExtractor):
def _extract_media_info(self, media):
video_id = compat_str(media['id'])
- limelight_media_id = media['limelight_media_id']
title = media['title']
+ formats = []
+ for encoding in media.get('encodings', []):
+ m3u8_url = encoding.get('master_playlist_url')
+ if m3u8_url:
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+ encoding_url = encoding.get('url')
+ file_url = encoding.get('file_url')
+ if not encoding_url and not file_url:
+ continue
+ f = {
+ 'width': int_or_none(encoding.get('width')),
+ 'height': int_or_none(encoding.get('height')),
+ 'vbr': int_or_none(encoding.get('video_bitrate')),
+ 'abr': int_or_none(encoding.get('audio_bitrate')),
+ 'filesize': int_or_none(encoding.get('size_in_bytes')),
+ 'vcodec': encoding.get('video_codec'),
+ 'acodec': encoding.get('audio_codec'),
+ 'container': encoding.get('container_type'),
+ }
+ for f_url in (encoding_url, file_url):
+ if not f_url:
+ continue
+ fmt = f.copy()
+ rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', f_url)
+ if rtmp:
+ fmt.update({
+ 'url': rtmp.group('url'),
+ 'play_path': rtmp.group('playpath'),
+ 'app': rtmp.group('app'),
+ 'ext': 'flv',
+ 'format_id': 'rtmp',
+ })
+ else:
+ fmt.update({
+ 'url': f_url,
+ 'format_id': 'http',
+ })
+ formats.append(fmt)
+ self._sort_formats(formats)
+
subtitles = {}
for closed_caption in media.get('closed_captions', []):
sub_url = closed_caption.get('file')
@@ -60,16 +103,14 @@ class CuriosityStreamBaseIE(InfoExtractor):
})
return {
- '_type': 'url_transparent',
'id': video_id,
- 'url': 'limelight:media:' + limelight_media_id,
+ 'formats': formats,
'title': title,
'description': media.get('description'),
'thumbnail': media.get('image_large') or media.get('image_medium') or media.get('image_small'),
'duration': int_or_none(media.get('duration')),
'tags': media.get('tags'),
'subtitles': subtitles,
- 'ie_key': 'LimelightMedia',
}
@@ -78,14 +119,12 @@ class CuriosityStreamIE(CuriosityStreamBaseIE):
_VALID_URL = r'https?://app\.curiositystream\.com/video/(?P<id>\d+)'
_TEST = {
'url': 'https://app.curiositystream.com/video/2',
- 'md5': 'a0074c190e6cddaf86900b28d3e9ee7a',
+ 'md5': '262bb2f257ff301115f1973540de8983',
'info_dict': {
'id': '2',
'ext': 'mp4',
'title': 'How Did You Develop The Internet?',
'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.',
- 'timestamp': 1448388615,
- 'upload_date': '20151124',
}
}
@@ -105,7 +144,7 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE):
'title': 'Curious Minds: The Internet',
'description': 'How is the internet shaping our lives in the 21st Century?',
},
- 'playlist_mincount': 17,
+ 'playlist_mincount': 12,
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py
index 1ab9333..f4cf0f1 100644
--- a/youtube_dl/extractor/cwtv.py
+++ b/youtube_dl/extractor/cwtv.py
@@ -82,6 +82,11 @@ class CWTVIE(InfoExtractor):
'url': quality_url,
'tbr': tbr,
})
+ video_metadata = video_data['assetFields']
+ ism_url = video_metadata.get('smoothStreamingUrl')
+ if ism_url:
+ formats.extend(self._extract_ism_formats(
+ ism_url, video_id, ism_id='mss', fatal=False))
self._sort_formats(formats)
thumbnails = [{
@@ -90,8 +95,6 @@ class CWTVIE(InfoExtractor):
'height': image.get('height'),
} for image_id, image in video_data['images'].items() if image.get('uri')] if video_data.get('images') else None
- video_metadata = video_data['assetFields']
-
subtitles = {
'en': [{
'url': video_metadata['UnicornCcUrl'],
diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py
index 98c835b..538565c 100644
--- a/youtube_dl/extractor/dailymail.py
+++ b/youtube_dl/extractor/dailymail.py
@@ -2,9 +2,11 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
int_or_none,
determine_protocol,
+ try_get,
unescapeHTML,
)
@@ -28,8 +30,14 @@ class DailyMailIE(InfoExtractor):
video_data = self._parse_json(self._search_regex(
r"data-opts='({.+?})'", webpage, 'video data'), video_id)
title = unescapeHTML(video_data['title'])
- video_sources = self._download_json(video_data.get(
- 'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id)
+
+ sources_url = (try_get(
+ video_data,
+ (lambda x: x['plugins']['sources']['url'],
+ lambda x: x['sources']['url']), compat_str) or
+ 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id)
+
+ video_sources = self._download_json(sources_url, video_id)
formats = []
for rendition in video_sources['renditions']:
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 246efde..f8db76c 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -38,7 +38,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
class DailymotionIE(DailymotionBaseInfoExtractor):
- _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:embed|swf|#)/)?video/(?P<id>[^/?_]+)'
+ _VALID_URL = r'(?i)https?://(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|#)/)?video|swf)/(?P<id>[^/?_]+)'
IE_NAME = 'dailymotion'
_FORMATS = [
@@ -49,68 +49,82 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
('stream_h264_hd1080_url', 'hd180'),
]
- _TESTS = [
- {
- 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
- 'md5': '2137c41a8e78554bb09225b8eb322406',
- 'info_dict': {
- 'id': 'x2iuewm',
- 'ext': 'mp4',
- 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',
- 'description': 'Several come bundled with the Steam Controller.',
- 'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
- 'duration': 74,
- 'timestamp': 1425657362,
- 'upload_date': '20150306',
- 'uploader': 'IGN',
- 'uploader_id': 'xijv66',
- 'age_limit': 0,
- 'view_count': int,
- }
+ _TESTS = [{
+ 'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news',
+ 'md5': '074b95bdee76b9e3654137aee9c79dfe',
+ 'info_dict': {
+ 'id': 'x5kesuj',
+ 'ext': 'mp4',
+ 'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller',
+ 'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. Miller',
+ 'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
+ 'duration': 187,
+ 'timestamp': 1493651285,
+ 'upload_date': '20170501',
+ 'uploader': 'Deadline',
+ 'uploader_id': 'x1xm8ri',
+ 'age_limit': 0,
+ 'view_count': int,
+ },
+ }, {
+ 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
+ 'md5': '2137c41a8e78554bb09225b8eb322406',
+ 'info_dict': {
+ 'id': 'x2iuewm',
+ 'ext': 'mp4',
+ 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',
+ 'description': 'Several come bundled with the Steam Controller.',
+ 'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
+ 'duration': 74,
+ 'timestamp': 1425657362,
+ 'upload_date': '20150306',
+ 'uploader': 'IGN',
+ 'uploader_id': 'xijv66',
+ 'age_limit': 0,
+ 'view_count': int,
},
+ 'skip': 'video gone',
+ }, {
# Vevo video
- {
- 'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi',
- 'info_dict': {
- 'title': 'Roar (Official)',
- 'id': 'USUV71301934',
- 'ext': 'mp4',
- 'uploader': 'Katy Perry',
- 'upload_date': '20130905',
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'VEVO is only available in some countries',
+ 'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi',
+ 'info_dict': {
+ 'title': 'Roar (Official)',
+ 'id': 'USUV71301934',
+ 'ext': 'mp4',
+ 'uploader': 'Katy Perry',
+ 'upload_date': '20130905',
+ },
+ 'params': {
+ 'skip_download': True,
},
+ 'skip': 'VEVO is only available in some countries',
+ }, {
# age-restricted video
- {
- 'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
- 'md5': '0d667a7b9cebecc3c89ee93099c4159d',
- 'info_dict': {
- 'id': 'xyh2zz',
- 'ext': 'mp4',
- 'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
- 'uploader': 'HotWaves1012',
- 'age_limit': 18,
- },
- 'skip': 'video gone',
+ 'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
+ 'md5': '0d667a7b9cebecc3c89ee93099c4159d',
+ 'info_dict': {
+ 'id': 'xyh2zz',
+ 'ext': 'mp4',
+ 'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
+ 'uploader': 'HotWaves1012',
+ 'age_limit': 18,
},
+ 'skip': 'video gone',
+ }, {
# geo-restricted, player v5
- {
- 'url': 'http://www.dailymotion.com/video/xhza0o',
- 'only_matching': True,
- },
+ 'url': 'http://www.dailymotion.com/video/xhza0o',
+ 'only_matching': True,
+ }, {
# with subtitles
- {
- 'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news',
- 'only_matching': True,
- },
- {
- 'url': 'http://www.dailymotion.com/swf/video/x3n92nf',
- 'only_matching': True,
- }
- ]
+ 'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.dailymotion.com/swf/video/x3n92nf',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.dailymotion.com/swf/x3ss1m_funny-magic-trick-barry-and-stuart_fun',
+ 'only_matching': True,
+ }]
@staticmethod
def _extract_urls(webpage):
diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py
index bdfe638..5c9c0ec 100644
--- a/youtube_dl/extractor/democracynow.py
+++ b/youtube_dl/extractor/democracynow.py
@@ -21,7 +21,8 @@ class DemocracynowIE(InfoExtractor):
'info_dict': {
'id': '2015-0703-001',
'ext': 'mp4',
- 'title': 'Daily Show',
+ 'title': 'Daily Show for July 03, 2015',
+ 'description': 'md5:80eb927244d6749900de6072c7cc2c86',
},
}, {
'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree',
diff --git a/youtube_dl/extractor/discoveryvr.py b/youtube_dl/extractor/discoveryvr.py
new file mode 100644
index 0000000..cb63c26
--- /dev/null
+++ b/youtube_dl/extractor/discoveryvr.py
@@ -0,0 +1,59 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import parse_duration
+
+
+class DiscoveryVRIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?discoveryvr\.com/watch/(?P<id>[^/?#]+)'
+ _TEST = {
+ 'url': 'http://www.discoveryvr.com/watch/discovery-vr-an-introduction',
+ 'md5': '32b1929798c464a54356378b7912eca4',
+ 'info_dict': {
+ 'id': 'discovery-vr-an-introduction',
+ 'ext': 'mp4',
+ 'title': 'Discovery VR - An Introduction',
+ 'description': 'md5:80d418a10efb8899d9403e61d8790f06',
+ }
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ bootstrap_data = self._search_regex(
+ r'root\.DVR\.bootstrapData\s+=\s+"({.+?})";',
+ webpage, 'bootstrap data')
+ bootstrap_data = self._parse_json(
+ bootstrap_data.encode('utf-8').decode('unicode_escape'),
+ display_id)
+ videos = self._parse_json(bootstrap_data['videos'], display_id)['allVideos']
+ video_data = next(video for video in videos if video.get('slug') == display_id)
+
+ series = video_data.get('showTitle')
+ title = episode = video_data.get('title') or series
+ if series and series != title:
+ title = '%s - %s' % (series, title)
+
+ formats = []
+ for f, format_id in (('cdnUriM3U8', 'mobi'), ('webVideoUrlSd', 'sd'), ('webVideoUrlHd', 'hd')):
+ f_url = video_data.get(f)
+ if not f_url:
+ continue
+ formats.append({
+ 'format_id': format_id,
+ 'url': f_url,
+ })
+
+ return {
+ 'id': display_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': video_data.get('description'),
+ 'thumbnail': video_data.get('thumbnail'),
+ 'duration': parse_duration(video_data.get('runTime')),
+ 'formats': formats,
+ 'episode': episode,
+ 'series': series,
+ }
diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py
index 1f75352..148605c 100644
--- a/youtube_dl/extractor/dotsub.py
+++ b/youtube_dl/extractor/dotsub.py
@@ -35,7 +35,7 @@ class DotsubIE(InfoExtractor):
'thumbnail': 're:^https?://dotsub.com/media/747bcf58-bd59-45b7-8c8c-ac312d084ee6/p',
'duration': 290,
'timestamp': 1476767794.2809999,
- 'upload_date': '20160525',
+ 'upload_date': '20161018',
'uploader': 'parthivi001',
'uploader_id': 'user52596202',
'view_count': int,
diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py
index 82d8a04..9757f44 100644
--- a/youtube_dl/extractor/douyutv.py
+++ b/youtube_dl/extractor/douyutv.py
@@ -3,11 +3,14 @@ from __future__ import unicode_literals
import time
import hashlib
+import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
unescapeHTML,
+ unified_strdate,
+ urljoin,
)
@@ -20,7 +23,7 @@ class DouyuTVIE(InfoExtractor):
'id': '17732',
'display_id': 'iseven',
'ext': 'flv',
- 'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': r're:.*m7show@163\.com.*',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': '7师傅',
@@ -51,7 +54,7 @@ class DouyuTVIE(InfoExtractor):
'id': '17732',
'display_id': '17732',
'ext': 'flv',
- 'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': r're:.*m7show@163\.com.*',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': '7师傅',
@@ -117,3 +120,82 @@ class DouyuTVIE(InfoExtractor):
'uploader': uploader,
'is_live': True,
}
+
+
+class DouyuShowIE(InfoExtractor):
+ _VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)'
+
+ _TESTS = [{
+ 'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw',
+ 'md5': '0c2cfd068ee2afe657801269b2d86214',
+ 'info_dict': {
+ 'id': 'rjNBdvnVXNzvE2yw',
+ 'ext': 'mp4',
+ 'title': '陈一发儿:砒霜 我有个室友系列!04-01 22点场',
+ 'duration': 7150.08,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': '陈一发儿',
+ 'uploader_id': 'XrZwYelr5wbK',
+ 'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK',
+ 'upload_date': '20170402',
+ },
+ }, {
+ 'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ url = url.replace('vmobile.', 'v.')
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ room_info = self._parse_json(self._search_regex(
+ r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info'), video_id)
+
+ video_info = None
+
+ for trial in range(5):
+ # Sometimes Douyu rejects our request. Let's try it more times
+ try:
+ video_info = self._download_json(
+ 'https://vmobile.douyu.com/video/getInfo', video_id,
+ query={'vid': video_id},
+ headers={
+ 'Referer': url,
+ 'x-requested-with': 'XMLHttpRequest',
+ })
+ break
+ except ExtractorError:
+ self._sleep(1, video_id)
+
+ if not video_info:
+ raise ExtractorError('Can\'t fetch video info')
+
+ formats = self._extract_m3u8_formats(
+ video_info['data']['video_url'], video_id,
+ entry_protocol='m3u8_native', ext='mp4')
+
+ upload_date = unified_strdate(self._html_search_regex(
+ r'<em>上传时间:</em><span>([^<]+)</span>', webpage,
+ 'upload date', fatal=False))
+
+ uploader = uploader_id = uploader_url = None
+ mobj = re.search(
+ r'(?m)<a[^>]+href="/author/([0-9a-zA-Z]+)".+?<strong[^>]+title="([^"]+)"',
+ webpage)
+ if mobj:
+ uploader_id, uploader = mobj.groups()
+ uploader_url = urljoin(url, '/author/' + uploader_id)
+
+ return {
+ 'id': video_id,
+ 'title': room_info['name'],
+ 'formats': formats,
+ 'duration': room_info.get('duration'),
+ 'thumbnail': room_info.get('pic'),
+ 'upload_date': upload_date,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'uploader_url': uploader_url,
+ }
diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py
index e491701..c84624f 100644
--- a/youtube_dl/extractor/drtv.py
+++ b/youtube_dl/extractor/drtv.py
@@ -20,7 +20,7 @@ class DRTVIE(InfoExtractor):
IE_NAME = 'drtv'
_TESTS = [{
'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10',
- 'md5': '25e659cccc9a2ed956110a299fdf5983',
+ 'md5': '7ae17b4e18eb5d29212f424a7511c184',
'info_dict': {
'id': 'klassen-darlig-taber-10',
'ext': 'mp4',
@@ -30,21 +30,37 @@ class DRTVIE(InfoExtractor):
'upload_date': '20160823',
'duration': 606.84,
},
- 'params': {
- 'skip_download': True,
- },
}, {
+ # embed
'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang',
- 'md5': '2c37175c718155930f939ef59952474a',
'info_dict': {
'id': 'christiania-pusher-street-ryddes-drdkrjpo',
'ext': 'mp4',
'title': 'LIVE Christianias rydning af Pusher Street er i gang',
- 'description': '- Det er det fedeste, der er sket i 20 år, fortæller christianit til DR Nyheder.',
+ 'description': 'md5:2a71898b15057e9b97334f61d04e6eb5',
'timestamp': 1472800279,
'upload_date': '20160902',
'duration': 131.4,
},
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # with SignLanguage formats
+ 'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder',
+ 'info_dict': {
+ 'id': 'historien-om-danmark-stenalder',
+ 'ext': 'mp4',
+ 'title': 'Historien om Danmark: Stenalder (1)',
+ 'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a',
+ 'timestamp': 1490401996,
+ 'upload_date': '20170325',
+ 'duration': 3502.04,
+ 'formats': 'mincount:20',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
@@ -88,7 +104,7 @@ class DRTVIE(InfoExtractor):
elif kind in ('VideoResource', 'AudioResource'):
duration = float_or_none(asset.get('DurationInMilliseconds'), 1000)
restricted_to_denmark = asset.get('RestrictedToDenmark')
- spoken_subtitles = asset.get('Target') == 'SpokenSubtitles'
+ asset_target = asset.get('Target')
for link in asset.get('Links', []):
uri = link.get('Uri')
if not uri:
@@ -96,9 +112,9 @@ class DRTVIE(InfoExtractor):
target = link.get('Target')
format_id = target or ''
preference = None
- if spoken_subtitles:
+ if asset_target in ('SpokenSubtitles', 'SignLanguage'):
preference = -1
- format_id += '-spoken-subtitles'
+ format_id += '-%s' % asset_target
if target == 'HDS':
f4m_formats = self._extract_f4m_formats(
uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 6a7028a..ed603eb 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -19,6 +19,7 @@ from .acast import (
ACastChannelIE,
)
from .addanime import AddAnimeIE
+from .adn import ADNIE
from .adobetv import (
AdobeTVIE,
AdobeTVShowIE,
@@ -40,6 +41,7 @@ from .alphaporno import AlphaPornoIE
from .amcnetworks import AMCNetworksIE
from .animeondemand import AnimeOnDemandIE
from .anitube import AnitubeIE
+from .anvato import AnvatoIE
from .anysex import AnySexIE
from .aol import AolIE
from .allocine import AllocineIE
@@ -86,7 +88,6 @@ from .azmedien import (
AZMedienPlaylistIE,
AZMedienShowPlaylistIE,
)
-from .azubu import AzubuIE, AzubuLiveIE
from .baidu import BaiduVideoIE
from .bambuser import BambuserIE, BambuserChannelIE
from .bandcamp import BandcampIE, BandcampAlbumIE
@@ -164,7 +165,10 @@ from .ccc import CCCIE
from .ccma import CCMAIE
from .cctv import CCTVIE
from .cda import CDAIE
-from .ceskatelevize import CeskaTelevizeIE
+from .ceskatelevize import (
+ CeskaTelevizeIE,
+ CeskaTelevizePoradyIE,
+)
from .channel9 import Channel9IE
from .charlierose import CharlieRoseIE
from .chaturbate import ChaturbateIE
@@ -247,7 +251,10 @@ from .democracynow import DemocracynowIE
from .dfb import DFBIE
from .dhm import DHMIE
from .dotsub import DotsubIE
-from .douyutv import DouyuTVIE
+from .douyutv import (
+ DouyuShowIE,
+ DouyuTVIE,
+)
from .dplay import (
DPlayIE,
DPlayItIE,
@@ -272,6 +279,7 @@ from .discoverygo import (
DiscoveryGoPlaylistIE,
)
from .discoverynetworks import DiscoveryNetworksDeIE
+from .discoveryvr import DiscoveryVRIE
from .disney import DisneyIE
from .dispeak import DigitallySpeakingIE
from .dropbox import DropboxIE
@@ -345,9 +353,9 @@ from .foxsports import FoxSportsIE
from .franceculture import FranceCultureIE
from .franceinter import FranceInterIE
from .francetv import (
- PluzzIE,
- FranceTvInfoIE,
FranceTVIE,
+ FranceTVEmbedIE,
+ FranceTVInfoIE,
GenerationQuoiIE,
CultureboxIE,
)
@@ -379,6 +387,7 @@ from .globo import (
GloboArticleIE,
)
from .go import GoIE
+from .go90 import Go90IE
from .godtube import GodTubeIE
from .godtv import GodTVIE
from .golem import GolemIE
@@ -536,6 +545,8 @@ from .mangomolo import (
)
from .matchtv import MatchTVIE
from .mdr import MDRIE
+from .mediaset import MediasetIE
+from .medici import MediciIE
from .meipai import MeipaiIE
from .melonvod import MelonVODIE
from .meta import METAIE
@@ -656,6 +667,8 @@ from .nintendo import NintendoIE
from .njpwworld import NJPWWorldIE
from .nobelprize import NobelPrizeIE
from .noco import NocoIE
+from .nonktube import NonkTubeIE
+from .noovo import NoovoIE
from .normalboots import NormalbootsIE
from .nosvideo import NosVideoIE
from .nova import NovaIE
@@ -724,10 +737,14 @@ from .openload import OpenloadIE
from .ora import OraTVIE
from .orf import (
ORFTVthekIE,
- ORFOE1IE,
ORFFM4IE,
+ ORFOE1IE,
ORFIPTVIE,
)
+from .packtpub import (
+ PacktPubIE,
+ PacktPubCourseIE,
+)
from .pandatv import PandaTVIE
from .pandoratv import PandoraTVIE
from .parliamentliveuk import ParliamentLiveUKIE
@@ -797,7 +814,7 @@ from .radiojavan import RadioJavanIE
from .radiobremen import RadioBremenIE
from .radiofrance import RadioFranceIE
from .rai import (
- RaiTVIE,
+ RaiPlayIE,
RaiIE,
)
from .rbmaradio import RBMARadioIE
@@ -828,7 +845,11 @@ from .rozhlas import RozhlasIE
from .rtbf import RTBFIE
from .rte import RteIE, RteRadioIE
from .rtlnl import RtlNlIE
-from .rtl2 import RTL2IE
+from .rtl2 import (
+ RTL2IE,
+ RTL2YouIE,
+ RTL2YouSeriesIE,
+)
from .rtp import RTPIE
from .rts import RTSIE
from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE
@@ -924,6 +945,7 @@ from .srmediathek import SRMediathekIE
from .stanfordoc import StanfordOpenClassroomIE
from .steam import SteamIE
from .streamable import StreamableIE
+from .streamango import StreamangoIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
from .streetvoice import StreetVoiceIE
@@ -970,6 +992,7 @@ from .theplatform import (
from .thescene import TheSceneIE
from .thesixtyone import TheSixtyOneIE
from .thestar import TheStarIE
+from .thesun import TheSunIE
from .theweatherchannel import TheWeatherChannelIE
from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE
@@ -1016,8 +1039,10 @@ from .tv2 import (
TV2IE,
TV2ArticleIE,
)
+from .tv2hu import TV2HuIE
from .tv3 import TV3IE
from .tv4 import TV4IE
+from .tv5mondeplus import TV5MondePlusIE
from .tva import TVAIE
from .tvanouvelles import (
TVANouvellesIE,
@@ -1078,6 +1103,10 @@ from .uplynk import (
UplynkIE,
UplynkPreplayIE,
)
+from .upskill import (
+ UpskillIE,
+ UpskillCourseIE,
+)
from .urort import UrortIE
from .urplay import URPlayIE
from .usanetwork import USANetworkIE
@@ -1105,6 +1134,7 @@ from .vgtv import (
from .vh1 import VH1IE
from .vice import (
ViceIE,
+ ViceArticleIE,
ViceShowIE,
)
from .viceland import VicelandIE
@@ -1177,6 +1207,11 @@ from .voxmedia import VoxMediaIE
from .vporn import VpornIE
from .vrt import VRTIE
from .vrak import VrakIE
+from .vrv import (
+ VRVIE,
+ VRVSeriesIE,
+)
+from .vshare import VShareIE
from .medialaan import MedialaanIE
from .vube import VubeIE
from .vuclip import VuClipIE
@@ -1210,7 +1245,10 @@ from .wrzuta import (
WrzutaIE,
WrzutaPlaylistIE,
)
-from .wsj import WSJIE
+from .wsj import (
+ WSJIE,
+ WSJArticleIE,
+)
from .xbef import XBefIE
from .xboxclips import XboxClipsIE
from .xfileshare import XFileShareIE
@@ -1272,5 +1310,6 @@ from .youtube import (
YoutubeWatchLaterIE,
)
from .zapiks import ZapiksIE
+from .zaq1 import Zaq1IE
from .zdf import ZDFIE, ZDFChannelIE
from .zingmp3 import ZingMp3IE
diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py
index a3bb983..9855427 100644
--- a/youtube_dl/extractor/foxsports.py
+++ b/youtube_dl/extractor/foxsports.py
@@ -11,10 +11,10 @@ class FoxSportsIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*(?P<id>[^/]+)'
_TEST = {
- 'url': 'http://www.foxsports.com/video?vid=432609859715',
+ 'url': 'http://www.foxsports.com/tennessee/video/432609859715',
'md5': 'b49050e955bebe32c301972e4012ac17',
'info_dict': {
- 'id': 'i0qKWsk3qJaM',
+ 'id': 'bwduI3X_TgUB',
'ext': 'mp4',
'title': 'Courtney Lee on going up 2-0 in series vs. Blazers',
'description': 'Courtney Lee talks about Memphis being focused.',
@@ -31,8 +31,9 @@ class FoxSportsIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
config = self._parse_json(
- self._search_regex(
- r"data-player-config='([^']+)'", webpage, 'data player config'),
+ self._html_search_regex(
+ r"""class="[^"]*(?:fs-player|platformPlayer-wrapper)[^"]*".+?data-player-config='([^']+)'""",
+ webpage, 'data player config'),
video_id)
return self.url_result(smuggle_url(update_url_query(
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index 48d43ae..546d5ca 100644
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -21,11 +21,13 @@ from .dailymotion import (
class FranceTVBaseInfoExtractor(InfoExtractor):
- def _extract_video(self, video_id, catalogue):
+ def _extract_video(self, video_id, catalogue=None):
info = self._download_json(
- 'http://webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=%s&catalogue=%s'
- % (video_id, catalogue),
- video_id, 'Downloading video JSON')
+ 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/',
+ video_id, 'Downloading video JSON', query={
+ 'idDiffusion': video_id,
+ 'catalogue': catalogue or '',
+ })
if info.get('status') == 'NOK':
raise ExtractorError(
@@ -109,27 +111,97 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
}
-class PluzzIE(FranceTVBaseInfoExtractor):
- IE_NAME = 'pluzz.francetv.fr'
- _VALID_URL = r'https?://(?:m\.)?pluzz\.francetv\.fr/videos/(?P<id>.+?)\.html'
+class FranceTVIE(FranceTVBaseInfoExtractor):
+ _VALID_URL = r'https?://(?:(?:www\.)?france\.tv|mobile\.france\.tv)/(?:[^/]+/)+(?P<id>[^/]+)\.html'
- # Can't use tests, videos expire in 7 days
+ _TESTS = [{
+ 'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html',
+ 'info_dict': {
+ 'id': '157550144',
+ 'ext': 'mp4',
+ 'title': '13h15, le dimanche... - Les mystères de Jésus',
+ 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42',
+ 'timestamp': 1494156300,
+ 'upload_date': '20170507',
+ },
+ 'params': {
+ # m3u8 downloads
+ 'skip_download': True,
+ },
+ }, {
+ # france3
+ 'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html',
+ 'only_matching': True,
+ }, {
+ # france4
+ 'url': 'https://www.france.tv/france-4/hero-corp/saison-1/134151-apres-le-calme.html',
+ 'only_matching': True,
+ }, {
+ # france5
+ 'url': 'https://www.france.tv/france-5/c-a-dire/saison-10/137013-c-a-dire.html',
+ 'only_matching': True,
+ }, {
+ # franceo
+ 'url': 'https://www.france.tv/france-o/archipels/132249-mon-ancetre-l-esclave.html',
+ 'only_matching': True,
+ }, {
+ # france2 live
+ 'url': 'https://www.france.tv/france-2/direct.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.france.tv/documentaires/histoire/136517-argentine-les-500-bebes-voles-de-la-dictature.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.france.tv/jeux-et-divertissements/divertissements/133965-le-web-contre-attaque.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://mobile.france.tv/france-5/c-dans-l-air/137347-emission-du-vendredi-12-mai-2017.html',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- video_id = self._html_search_meta(
- 'id_video', webpage, 'video id', default=None)
+ catalogue = None
+ video_id = self._search_regex(
+ r'data-main-video=(["\'])(?P<id>(?:(?!\1).)+)\1',
+ webpage, 'video id', default=None, group='id')
+
if not video_id:
- video_id = self._search_regex(
- r'data-diffusion=["\'](\d+)', webpage, 'video id')
+ video_id, catalogue = self._html_search_regex(
+ r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"',
+ webpage, 'video ID').split('@')
+ return self._extract_video(video_id, catalogue)
- return self._extract_video(video_id, 'Pluzz')
+class FranceTVEmbedIE(FranceTVBaseInfoExtractor):
+ _VALID_URL = r'https?://embed\.francetv\.fr/*\?.*?\bue=(?P<id>[^&]+)'
-class FranceTvInfoIE(FranceTVBaseInfoExtractor):
+ _TEST = {
+ 'url': 'http://embed.francetv.fr/?ue=7fd581a2ccf59d2fc5719c5c13cf6961',
+ 'info_dict': {
+ 'id': 'NI_983319',
+ 'ext': 'mp4',
+ 'title': 'Le Pen Reims',
+ 'upload_date': '20170505',
+ 'timestamp': 1493981780,
+ 'duration': 16,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ video = self._download_json(
+ 'http://api-embed.webservices.francetelevisions.fr/key/%s' % video_id,
+ video_id)
+
+ return self._extract_video(video['video_id'], video.get('catalog'))
+
+
+class FranceTVInfoIE(FranceTVBaseInfoExtractor):
IE_NAME = 'francetvinfo.fr'
_VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<title>[^/?#&.]+)'
@@ -233,124 +305,6 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
return self._extract_video(video_id, catalogue)
-class FranceTVIE(FranceTVBaseInfoExtractor):
- IE_NAME = 'francetv'
- IE_DESC = 'France 2, 3, 4, 5 and Ô'
- _VALID_URL = r'''(?x)
- https?://
- (?:
- (?:www\.)?france[2345o]\.fr/
- (?:
- emissions/[^/]+/(?:videos|diffusions)|
- emission/[^/]+|
- videos|
- jt
- )
- /|
- embed\.francetv\.fr/\?ue=
- )
- (?P<id>[^/?]+)
- '''
-
- _TESTS = [
- # france2
- {
- 'url': 'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104',
- 'md5': 'c03fc87cb85429ffd55df32b9fc05523',
- 'info_dict': {
- 'id': '109169362',
- 'ext': 'flv',
- 'title': '13h15, le dimanche...',
- 'description': 'md5:9a0932bb465f22d377a449be9d1a0ff7',
- 'upload_date': '20140914',
- 'timestamp': 1410693600,
- },
- },
- # france3
- {
- 'url': 'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575',
- 'md5': '679bb8f8921f8623bd658fa2f8364da0',
- 'info_dict': {
- 'id': '000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au',
- 'ext': 'mp4',
- 'title': 'Le scandale du prix des médicaments',
- 'description': 'md5:1384089fbee2f04fc6c9de025ee2e9ce',
- 'upload_date': '20131113',
- 'timestamp': 1384380000,
- },
- },
- # france4
- {
- 'url': 'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4',
- 'md5': 'a182bf8d2c43d88d46ec48fbdd260c1c',
- 'info_dict': {
- 'id': 'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4',
- 'ext': 'mp4',
- 'title': 'Hero Corp Making of - Extrait 1',
- 'description': 'md5:c87d54871b1790679aec1197e73d650a',
- 'upload_date': '20131106',
- 'timestamp': 1383766500,
- },
- },
- # france5
- {
- 'url': 'http://www.france5.fr/emissions/c-a-dire/videos/quels_sont_les_enjeux_de_cette_rentree_politique__31-08-2015_908948?onglet=tous&page=1',
- 'md5': 'f6c577df3806e26471b3d21631241fd0',
- 'info_dict': {
- 'id': '123327454',
- 'ext': 'flv',
- 'title': 'C à dire ?! - Quels sont les enjeux de cette rentrée politique ?',
- 'description': 'md5:4a0d5cb5dce89d353522a84462bae5a4',
- 'upload_date': '20150831',
- 'timestamp': 1441035120,
- },
- },
- # franceo
- {
- 'url': 'http://www.franceo.fr/jt/info-soir/18-07-2015',
- 'md5': '47d5816d3b24351cdce512ad7ab31da8',
- 'info_dict': {
- 'id': '125377621',
- 'ext': 'flv',
- 'title': 'Infô soir',
- 'description': 'md5:01b8c6915a3d93d8bbbd692651714309',
- 'upload_date': '20150718',
- 'timestamp': 1437241200,
- 'duration': 414,
- },
- },
- {
- # francetv embed
- 'url': 'http://embed.francetv.fr/?ue=8d7d3da1e3047c42ade5a5d7dfd3fc87',
- 'info_dict': {
- 'id': 'EV_30231',
- 'ext': 'flv',
- 'title': 'Alcaline, le concert avec Calogero',
- 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff',
- 'upload_date': '20150226',
- 'timestamp': 1424989860,
- 'duration': 5400,
- },
- },
- {
- 'url': 'http://www.france4.fr/emission/highlander/diffusion-du-17-07-2015-04h05',
- 'only_matching': True,
- },
- {
- 'url': 'http://www.franceo.fr/videos/125377617',
- 'only_matching': True,
- }
- ]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- video_id, catalogue = self._html_search_regex(
- r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"',
- webpage, 'video ID').split('@')
- return self._extract_video(video_id, catalogue)
-
-
class GenerationQuoiIE(InfoExtractor):
IE_NAME = 'france2.fr:generation-quoi'
_VALID_URL = r'https?://generation-quoi\.france2\.fr/portrait/(?P<id>[^/?#]+)'
diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py
index eba00cd..8c37509 100644
--- a/youtube_dl/extractor/funimation.py
+++ b/youtube_dl/extractor/funimation.py
@@ -2,231 +2,148 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import (
- compat_HTTPError,
- compat_urllib_parse_unquote_plus,
-)
+from ..compat import compat_HTTPError
from ..utils import (
- clean_html,
determine_ext,
int_or_none,
- sanitized_Request,
+ js_to_json,
ExtractorError,
urlencode_postdata
)
class FunimationIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?funimation\.com/shows/[^/]+/videos/(?:official|promotional)/(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/shows/[^/]+/(?P<id>[^/?#&]+)'
_NETRC_MACHINE = 'funimation'
+ _TOKEN = None
_TESTS = [{
- 'url': 'http://www.funimation.com/shows/air/videos/official/breeze',
+ 'url': 'https://www.funimation.com/shows/hacksign/role-play/',
'info_dict': {
- 'id': '658',
- 'display_id': 'breeze',
- 'ext': 'mp4',
- 'title': 'Air - 1 - Breeze',
- 'description': 'md5:1769f43cd5fc130ace8fd87232207892',
- 'thumbnail': r're:https?://.*\.jpg',
- },
- 'skip': 'Access without user interaction is forbidden by CloudFlare, and video removed',
- }, {
- 'url': 'http://www.funimation.com/shows/hacksign/videos/official/role-play',
- 'info_dict': {
- 'id': '31128',
+ 'id': '91144',
'display_id': 'role-play',
'ext': 'mp4',
- 'title': '.hack//SIGN - 1 - Role Play',
+ 'title': '.hack//SIGN - Role Play',
'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd',
'thumbnail': r're:https?://.*\.jpg',
},
- 'skip': 'Access without user interaction is forbidden by CloudFlare',
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}, {
- 'url': 'http://www.funimation.com/shows/attack-on-titan-junior-high/videos/promotional/broadcast-dub-preview',
+ 'url': 'https://www.funimation.com/shows/attack-on-titan-junior-high/broadcast-dub-preview/',
'info_dict': {
- 'id': '9635',
+ 'id': '210051',
'display_id': 'broadcast-dub-preview',
'ext': 'mp4',
'title': 'Attack on Titan: Junior High - Broadcast Dub Preview',
- 'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803',
'thumbnail': r're:https?://.*\.(?:jpg|png)',
},
- 'skip': 'Access without user interaction is forbidden by CloudFlare',
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/',
+ 'only_matching': True,
}]
- _LOGIN_URL = 'http://www.funimation.com/login'
-
- def _download_webpage(self, *args, **kwargs):
- try:
- return super(FunimationIE, self)._download_webpage(*args, **kwargs)
- except ExtractorError as ee:
- if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
- response = ee.cause.read()
- if b'>Please complete the security check to access<' in response:
- raise ExtractorError(
- 'Access to funimation.com is blocked by CloudFlare. '
- 'Please browse to http://www.funimation.com/, solve '
- 'the reCAPTCHA, export browser cookies to a text file,'
- ' and then try again with --cookies YOUR_COOKIE_FILE.',
- expected=True)
- raise
-
- def _extract_cloudflare_session_ua(self, url):
- ci_session_cookie = self._get_cookies(url).get('ci_session')
- if ci_session_cookie:
- ci_session = compat_urllib_parse_unquote_plus(ci_session_cookie.value)
- # ci_session is a string serialized by PHP function serialize()
- # This case is simple enough to use regular expressions only
- return self._search_regex(
- r'"user_agent";s:\d+:"([^"]+)"', ci_session, 'user agent',
- default=None)
-
def _login(self):
(username, password) = self._get_login_info()
if username is None:
return
- data = urlencode_postdata({
- 'email_field': username,
- 'password_field': password,
- })
- user_agent = self._extract_cloudflare_session_ua(self._LOGIN_URL)
- if not user_agent:
- user_agent = 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0'
- login_request = sanitized_Request(self._LOGIN_URL, data, headers={
- 'User-Agent': user_agent,
- 'Content-Type': 'application/x-www-form-urlencoded'
- })
- login_page = self._download_webpage(
- login_request, None, 'Logging in as %s' % username)
- if any(p in login_page for p in ('funimation.com/logout', '>Log Out<')):
- return
- error = self._html_search_regex(
- r'(?s)<div[^>]+id=["\']errorMessages["\'][^>]*>(.+?)</div>',
- login_page, 'error messages', default=None)
- if error:
- raise ExtractorError('Unable to login: %s' % error, expected=True)
- raise ExtractorError('Unable to log in')
+ try:
+ data = self._download_json(
+ 'https://prod-api-funimationnow.dadcdigital.com/api/auth/login/',
+ None, 'Logging in as %s' % username, data=urlencode_postdata({
+ 'username': username,
+ 'password': password,
+ }))
+ self._TOKEN = data['token']
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ error = self._parse_json(e.cause.read().decode(), None)['error']
+ raise ExtractorError(error, expected=True)
+ raise
def _real_initialize(self):
self._login()
def _real_extract(self, url):
display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
- errors = []
- formats = []
-
- ERRORS_MAP = {
- 'ERROR_MATURE_CONTENT_LOGGED_IN': 'matureContentLoggedIn',
- 'ERROR_MATURE_CONTENT_LOGGED_OUT': 'matureContentLoggedOut',
- 'ERROR_SUBSCRIPTION_LOGGED_OUT': 'subscriptionLoggedOut',
- 'ERROR_VIDEO_EXPIRED': 'videoExpired',
- 'ERROR_TERRITORY_UNAVAILABLE': 'territoryUnavailable',
- 'SVODBASIC_SUBSCRIPTION_IN_PLAYER': 'basicSubscription',
- 'SVODNON_SUBSCRIPTION_IN_PLAYER': 'nonSubscription',
- 'ERROR_PLAYER_NOT_RESPONDING': 'playerNotResponding',
- 'ERROR_UNABLE_TO_CONNECT_TO_CDN': 'unableToConnectToCDN',
- 'ERROR_STREAM_NOT_FOUND': 'streamNotFound',
- }
-
- USER_AGENTS = (
- # PC UA is served with m3u8 that provides some bonus lower quality formats
- ('pc', 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0'),
- # Mobile UA allows to extract direct links and also does not fail when
- # PC UA fails with hulu error (e.g.
- # http://www.funimation.com/shows/hacksign/videos/official/role-play)
- ('mobile', 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36'),
- )
-
- user_agent = self._extract_cloudflare_session_ua(url)
- if user_agent:
- USER_AGENTS = ((None, user_agent),)
-
- for kind, user_agent in USER_AGENTS:
- request = sanitized_Request(url)
- request.add_header('User-Agent', user_agent)
- webpage = self._download_webpage(
- request, display_id,
- 'Downloading %s webpage' % kind if kind else 'Downloading webpage')
-
- playlist = self._parse_json(
- self._search_regex(
- r'var\s+playersData\s*=\s*(\[.+?\]);\n',
- webpage, 'players data'),
- display_id)[0]['playlist']
-
- items = next(item['items'] for item in playlist if item.get('items'))
- item = next(item for item in items if item.get('itemAK') == display_id)
-
- error_messages = {}
- video_error_messages = self._search_regex(
- r'var\s+videoErrorMessages\s*=\s*({.+?});\n',
- webpage, 'error messages', default=None)
- if video_error_messages:
- error_messages_json = self._parse_json(video_error_messages, display_id, fatal=False)
- if error_messages_json:
- for _, error in error_messages_json.items():
- type_ = error.get('type')
- description = error.get('description')
- content = error.get('content')
- if type_ == 'text' and description and content:
- error_message = ERRORS_MAP.get(description)
- if error_message:
- error_messages[error_message] = content
-
- for video in item.get('videoSet', []):
- auth_token = video.get('authToken')
- if not auth_token:
- continue
- funimation_id = video.get('FUNImationID') or video.get('videoId')
- preference = 1 if video.get('languageMode') == 'dub' else 0
- if not auth_token.startswith('?'):
- auth_token = '?%s' % auth_token
- for quality, height in (('sd', 480), ('hd', 720), ('hd1080', 1080)):
- format_url = video.get('%sUrl' % quality)
- if not format_url:
- continue
- if not format_url.startswith(('http', '//')):
- errors.append(format_url)
- continue
- if determine_ext(format_url) == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- format_url + auth_token, display_id, 'mp4', entry_protocol='m3u8_native',
- preference=preference, m3u8_id='%s-hls' % funimation_id, fatal=False))
- else:
- tbr = int_or_none(self._search_regex(
- r'-(\d+)[Kk]', format_url, 'tbr', default=None))
- formats.append({
- 'url': format_url + auth_token,
- 'format_id': '%s-http-%dp' % (funimation_id, height),
- 'height': height,
- 'tbr': tbr,
- 'preference': preference,
- })
+ def _search_kane(name):
+ return self._search_regex(
+ r"KANE_customdimensions\.%s\s*=\s*'([^']+)';" % name,
+ webpage, name, default=None)
+
+ title_data = self._parse_json(self._search_regex(
+ r'TITLE_DATA\s*=\s*({[^}]+})',
+ webpage, 'title data', default=''),
+ display_id, js_to_json, fatal=False) or {}
+
+ video_id = title_data.get('id') or self._search_regex([
+ r"KANE_customdimensions.videoID\s*=\s*'(\d+)';",
+ r'<iframe[^>]+src="/player/(\d+)"',
+ ], webpage, 'video_id', default=None)
+ if not video_id:
+ player_url = self._html_search_meta([
+ 'al:web:url',
+ 'og:video:url',
+ 'og:video:secure_url',
+ ], webpage, fatal=True)
+ video_id = self._search_regex(r'/player/(\d+)', player_url, 'video id')
+
+ title = episode = title_data.get('title') or _search_kane('videoTitle') or self._og_search_title(webpage)
+ series = _search_kane('showName')
+ if series:
+ title = '%s - %s' % (series, title)
+ description = self._html_search_meta(['description', 'og:description'], webpage, fatal=True)
- if not formats and errors:
- raise ExtractorError(
- '%s returned error: %s'
- % (self.IE_NAME, clean_html(error_messages.get(errors[0], errors[0]))),
- expected=True)
+ try:
+ headers = {}
+ if self._TOKEN:
+ headers['Authorization'] = 'Token %s' % self._TOKEN
+ sources = self._download_json(
+ 'https://prod-api-funimationnow.dadcdigital.com/api/source/catalog/video/%s/signed/' % video_id,
+ video_id, headers=headers)['items']
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ error = self._parse_json(e.cause.read(), video_id)['errors'][0]
+ raise ExtractorError('%s said: %s' % (
+ self.IE_NAME, error.get('detail') or error.get('title')), expected=True)
+ raise
+ formats = []
+ for source in sources:
+ source_url = source.get('src')
+ if not source_url:
+ continue
+ source_type = source.get('videoType') or determine_ext(source_url)
+ if source_type == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ source_url, video_id, 'mp4',
+ m3u8_id='hls', fatal=False))
+ else:
+ formats.append({
+ 'format_id': source_type,
+ 'url': source_url,
+ })
self._sort_formats(formats)
- title = item['title']
- artist = item.get('artist')
- if artist:
- title = '%s - %s' % (artist, title)
- description = self._og_search_description(webpage) or item.get('description')
- thumbnail = self._og_search_thumbnail(webpage) or item.get('posterUrl')
- video_id = item.get('itemId') or display_id
-
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': description,
- 'thumbnail': thumbnail,
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'series': series,
+ 'season_number': int_or_none(title_data.get('seasonNum') or _search_kane('season')),
+ 'episode_number': int_or_none(title_data.get('episodeNum')),
+ 'episode': episode,
+ 'season_id': title_data.get('seriesId'),
'formats': formats,
}
diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py
index 81c0ce9..4940936 100644
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -58,8 +58,7 @@ class FunnyOrDieIE(InfoExtractor):
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False)
source_formats = list(filter(
- lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
- m3u8_formats))
+ lambda f: f.get('vcodec') != 'none', m3u8_formats))
bitrates = [int(bitrate) for bitrate in re.findall(r'[,/]v(\d+)(?=[,/])', m3u8_url)]
bitrates.sort()
diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py
index 682c49e..00d3111 100644
--- a/youtube_dl/extractor/gamespot.py
+++ b/youtube_dl/extractor/gamespot.py
@@ -78,8 +78,7 @@ class GameSpotIE(OnceIE):
if m3u8_formats:
self._sort_formats(m3u8_formats)
m3u8_formats = list(filter(
- lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
- m3u8_formats))
+ lambda f: f.get('vcodec') != 'none', m3u8_formats))
if len(qualities) == len(m3u8_formats):
for q, m3u8_format in zip(qualities, m3u8_formats):
f = m3u8_format.copy()
diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py
index 3136427..f71d909 100644
--- a/youtube_dl/extractor/gdcvault.py
+++ b/youtube_dl/extractor/gdcvault.py
@@ -75,6 +75,19 @@ class GDCVaultIE(InfoExtractor):
'format': 'jp', # The japanese audio
}
},
+ {
+ # gdc-player.html
+ 'url': 'http://www.gdcvault.com/play/1435/An-American-engine-in-Tokyo',
+ 'info_dict': {
+ 'id': '1435',
+ 'display_id': 'An-American-engine-in-Tokyo',
+ 'ext': 'flv',
+ 'title': 'An American Engine in Tokyo:/nThe collaboration of Epic Games and Square Enix/nFor THE LAST REMINANT',
+ },
+ 'params': {
+ 'skip_download': True, # Requires rtmpdump
+ },
+ },
]
def _login(self, webpage_url, display_id):
@@ -128,7 +141,7 @@ class GDCVaultIE(InfoExtractor):
'title': title,
}
- PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/player.*?\.html.*?".*?</iframe>'
+ PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/(?:gdc-)?player.*?\.html.*?".*?</iframe>'
xml_root = self._html_search_regex(
PLAYER_REGEX, start_page, 'xml root', default=None)
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 274f817..c108d4a 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -85,6 +85,11 @@ from .ustream import UstreamIE
from .openload import OpenloadIE
from .videopress import VideoPressIE
from .rutube import RutubeIE
+from .limelight import LimelightBaseIE
+from .anvato import AnvatoIE
+from .washingtonpost import WashingtonPostIE
+from .wistia import WistiaIE
+from .mediaset import MediasetIE
class GenericIE(InfoExtractor):
@@ -430,6 +435,22 @@ class GenericIE(InfoExtractor):
},
},
{
+ # Brightcove video in <iframe>
+ 'url': 'http://www.un.org/chinese/News/story.asp?NewsID=27724',
+ 'md5': '36d74ef5e37c8b4a2ce92880d208b968',
+ 'info_dict': {
+ 'id': '5360463607001',
+ 'ext': 'mp4',
+ 'title': '叙利亚失明儿童在废墟上演唱《心跳》 呼吁获得正常童年生活',
+ 'description': '联合国儿童基金会中东和北非区域大使、作曲家扎德·迪拉尼(Zade Dirani)在3月15日叙利亚冲突爆发7周年纪念日之际发布了为叙利亚谱写的歌曲《心跳》(HEARTBEAT),为受到六年冲突影响的叙利亚儿童发出强烈呐喊,呼吁世界做出共同努力,使叙利亚儿童重新获得享有正常童年生活的权利。',
+ 'uploader': 'United Nations',
+ 'uploader_id': '1362235914001',
+ 'timestamp': 1489593889,
+ 'upload_date': '20170315',
+ },
+ 'add_ie': ['BrightcoveLegacy'],
+ },
+ {
# Brightcove with alternative playerID key
'url': 'http://www.nature.com/nmeth/journal/v9/n7/fig_tab/nmeth.2062_SV1.html',
'info_dict': {
@@ -465,6 +486,59 @@ class GenericIE(InfoExtractor):
'params': {
'skip_download': True, # m3u8 download
},
+ 'skip': 'video rotates...weekly?',
+ },
+ {
+ # Brightcove:new type [2].
+ 'url': 'http://www.delawaresportszone.com/video-st-thomas-more-earns-first-trip-to-basketball-semis',
+ 'md5': '2b35148fcf48da41c9fb4591650784f3',
+ 'info_dict': {
+ 'id': '5348741021001',
+ 'ext': 'mp4',
+ 'upload_date': '20170306',
+ 'uploader_id': '4191638492001',
+ 'timestamp': 1488769918,
+ 'title': 'VIDEO: St. Thomas More earns first trip to basketball semis',
+
+ },
+ },
+ {
+ # Alternative brightcove <video> attributes
+ 'url': 'http://www.programme-tv.net/videos/extraits/81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche/',
+ 'info_dict': {
+ 'id': '81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche',
+ 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche, Extraits : toutes les vidéos avec Télé-Loisirs",
+ },
+ 'playlist': [{
+ 'md5': '732d22ba3d33f2f3fc253c39f8f36523',
+ 'info_dict': {
+ 'id': '5311302538001',
+ 'ext': 'mp4',
+ 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche",
+ 'description': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche (France 2, 5 février 2017)",
+ 'timestamp': 1486321708,
+ 'upload_date': '20170205',
+ 'uploader_id': '800000640001',
+ },
+ 'only_matching': True,
+ }],
+ },
+ {
+ # Brightcove with UUID in videoPlayer
+ 'url': 'http://www8.hp.com/cn/zh/home.html',
+ 'info_dict': {
+ 'id': '5255815316001',
+ 'ext': 'mp4',
+ 'title': 'Sprocket Video - China',
+ 'description': 'Sprocket Video - China',
+ 'uploader': 'HP-Video Gallery',
+ 'timestamp': 1482263210,
+ 'upload_date': '20161220',
+ 'uploader_id': '1107601872001',
+ },
+ 'params': {
+ 'skip_download': True, # m3u8 download
+ },
},
# ooyala video
{
@@ -730,6 +804,21 @@ class GenericIE(InfoExtractor):
'skip_download': True,
}
},
+ # YouTube <object> embed
+ {
+ 'url': 'http://www.improbable.com/2017/04/03/untrained-modern-youths-and-ancient-masters-in-selfie-portraits/',
+ 'md5': '516718101ec834f74318df76259fb3cc',
+ 'info_dict': {
+ 'id': 'msN87y-iEx0',
+ 'ext': 'webm',
+ 'title': 'Feynman: Mirrors FUN TO IMAGINE 6',
+ 'upload_date': '20080526',
+ 'description': 'md5:0ffc78ea3f01b2e2c247d5f8d1d3c18d',
+ 'uploader': 'Christopher Sykes',
+ 'uploader_id': 'ChristopherJSykes',
+ },
+ 'add_ie': ['Youtube'],
+ },
# Camtasia studio
{
'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
@@ -1080,6 +1169,21 @@ class GenericIE(InfoExtractor):
},
'add_ie': ['Kaltura'],
},
+ {
+ # Kaltura iframe embed
+ 'url': 'http://www.gsd.harvard.edu/event/i-m-pei-a-centennial-celebration/',
+ 'md5': 'ae5ace8eb09dc1a35d03b579a9c2cc44',
+ 'info_dict': {
+ 'id': '0_f2cfbpwy',
+ 'ext': 'mp4',
+ 'title': 'I. M. Pei: A Centennial Celebration',
+ 'description': 'md5:1db8f40c69edc46ca180ba30c567f37c',
+ 'upload_date': '20170403',
+ 'uploader_id': 'batchUser',
+ 'timestamp': 1491232186,
+ },
+ 'add_ie': ['Kaltura'],
+ },
# Eagle.Platform embed (generic URL)
{
'url': 'http://lenta.ru/news/2015/03/06/navalny/',
@@ -1327,6 +1431,22 @@ class GenericIE(InfoExtractor):
'skip_download': True,
},
},
+ {
+ # Brightcove embed with whitespace around attribute names
+ 'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill',
+ 'info_dict': {
+ 'id': '3167554373001',
+ 'ext': 'mp4',
+ 'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill",
+ 'description': 'md5:57bacb0e0f29349de4972bfda3191713',
+ 'uploader_id': '1079349493',
+ 'upload_date': '20140207',
+ 'timestamp': 1391810548,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
# Another form of arte.tv embed
{
'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html',
@@ -1568,6 +1688,51 @@ class GenericIE(InfoExtractor):
},
'add_ie': [SenateISVPIE.ie_key()],
},
+ {
+ # Limelight embeds (1 channel embed + 4 media embeds)
+ 'url': 'http://www.sedona.com/FacilitatorTraining2017',
+ 'info_dict': {
+ 'id': 'FacilitatorTraining2017',
+ 'title': 'Facilitator Training 2017',
+ },
+ 'playlist_mincount': 5,
+ },
+ {
+ 'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/',
+ 'info_dict': {
+ 'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest',
+ 'title': 'Standoff with Walnut Creek murder suspect ends',
+ 'description': 'md5:3ccc48a60fc9441eeccfc9c469ebf788',
+ },
+ 'playlist_mincount': 4,
+ },
+ {
+ # WashingtonPost embed
+ 'url': 'http://www.vanityfair.com/hollywood/2017/04/donald-trump-tv-pitches',
+ 'info_dict': {
+ 'id': '8caf6e88-d0ec-11e5-90d3-34c2c42653ac',
+ 'ext': 'mp4',
+ 'title': "No one has seen the drama series based on Trump's life \u2014 until now",
+ 'description': 'Donald Trump wanted a weekly TV drama based on his life. It never aired. But The Washington Post recently obtained a scene from the pilot script — and enlisted actors.',
+ 'timestamp': 1455216756,
+ 'uploader': 'The Washington Post',
+ 'upload_date': '20160211',
+ },
+ 'add_ie': [WashingtonPostIE.ie_key()],
+ },
+ {
+ # Mediaset embed
+ 'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml',
+ 'info_dict': {
+ 'id': '720642',
+ 'ext': 'mp4',
+ 'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': [MediasetIE.ie_key()],
+ },
# {
# # TODO: find another test
# # http://schema.org/VideoObject
@@ -1610,7 +1775,7 @@ class GenericIE(InfoExtractor):
continue
entries.append({
- '_type': 'url',
+ '_type': 'url_transparent',
'url': next_url,
'title': it.find('title').text,
})
@@ -1870,7 +2035,6 @@ class GenericIE(InfoExtractor):
# Look for Brightcove Legacy Studio embeds
bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
if bc_urls:
- self.to_screen('Brightcove video detected.')
entries = [{
'_type': 'url',
'url': smuggle_url(bc_url, {'Referer': url}),
@@ -1885,7 +2049,7 @@ class GenericIE(InfoExtractor):
}
# Look for Brightcove New Studio embeds
- bc_urls = BrightcoveNewIE._extract_urls(webpage)
+ bc_urls = BrightcoveNewIE._extract_urls(self, webpage)
if bc_urls:
return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')
@@ -1923,6 +2087,7 @@ class GenericIE(InfoExtractor):
data-video-url=|
<embed[^>]+?src=|
embedSWF\(?:\s*|
+ <object[^>]+data=|
new\s+SWFObject\(
)
(["\'])
@@ -1961,57 +2126,20 @@ class GenericIE(InfoExtractor):
playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
# Look for embedded Wistia player
- match = re.search(
- r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
- if match:
- embed_url = self._proto_relative_url(
- unescapeHTML(match.group('url')))
+ wistia_url = WistiaIE._extract_url(webpage)
+ if wistia_url:
return {
'_type': 'url_transparent',
- 'url': embed_url,
- 'ie_key': 'Wistia',
+ 'url': self._proto_relative_url(wistia_url),
+ 'ie_key': WistiaIE.ie_key(),
'uploader': video_uploader,
}
- match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
- if match:
- return {
- '_type': 'url_transparent',
- 'url': 'wistia:%s' % match.group('id'),
- 'ie_key': 'Wistia',
- 'uploader': video_uploader,
- }
-
- match = re.search(
- r'''(?sx)
- <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*?
- <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2
- ''', webpage)
- if match:
- return self.url_result(self._proto_relative_url(
- 'wistia:%s' % match.group('id')), 'Wistia')
-
# Look for SVT player
svt_url = SVTIE._extract_url(webpage)
if svt_url:
return self.url_result(svt_url, 'SVT')
- # Look for embedded condenast player
- matches = re.findall(
- r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
- webpage)
- if matches:
- return {
- '_type': 'playlist',
- 'entries': [{
- '_type': 'url',
- 'ie_key': 'CondeNast',
- 'url': ma,
- } for ma in matches],
- 'title': video_title,
- 'id': video_id,
- }
-
# Look for Bandcamp pages with custom domain
mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
if mobj is not None:
@@ -2400,28 +2528,16 @@ class GenericIE(InfoExtractor):
return self.url_result(piksel_url, PikselIE.ie_key())
# Look for Limelight embeds
- mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage)
- if mobj:
- lm = {
- 'Media': 'media',
- 'Channel': 'channel',
- 'ChannelList': 'channel_list',
- }
- return self.url_result(smuggle_url('limelight:%s:%s' % (
- lm[mobj.group(1)], mobj.group(2)), {'source_url': url}),
- 'Limelight%s' % mobj.group(1), mobj.group(2))
+ limelight_urls = LimelightBaseIE._extract_urls(webpage, url)
+ if limelight_urls:
+ return self.playlist_result(
+ limelight_urls, video_id, video_title, video_description)
- mobj = re.search(
- r'''(?sx)
- <object[^>]+class=(["\'])LimelightEmbeddedPlayerFlash\1[^>]*>.*?
- <param[^>]+
- name=(["\'])flashVars\2[^>]+
- value=(["\'])(?:(?!\3).)*mediaId=(?P<id>[a-z0-9]{32})
- ''', webpage)
- if mobj:
- return self.url_result(smuggle_url(
- 'limelight:media:%s' % mobj.group('id'),
- {'source_url': url}), 'LimelightMedia', mobj.group('id'))
+ # Look for Anvato embeds
+ anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id)
+ if anvato_urls:
+ return self.playlist_result(
+ anvato_urls, video_id, video_title, video_description)
# Look for AdobeTVVideo embeds
mobj = re.search(
@@ -2540,6 +2656,18 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
rutube_urls, ie=RutubeIE.ie_key())
+ # Look for WashingtonPost embeds
+ wapo_urls = WashingtonPostIE._extract_urls(webpage)
+ if wapo_urls:
+ return self.playlist_from_matches(
+ wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key())
+
+ # Look for Mediaset embeds
+ mediaset_urls = MediasetIE._extract_urls(webpage)
+ if mediaset_urls:
+ return self.playlist_from_matches(
+ mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key())
+
# Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld(
webpage, video_id, default={}, expected_type='VideoObject')
@@ -2568,7 +2696,7 @@ class GenericIE(InfoExtractor):
webpage, video_id, transform_source=js_to_json)
if jwplayer_data:
info = self._parse_jwplayer_data(
- jwplayer_data, video_id, require_title=False)
+ jwplayer_data, video_id, require_title=False, base_url=url)
if not info.get('title'):
info['title'] = video_title
return info
@@ -2580,7 +2708,7 @@ class GenericIE(InfoExtractor):
return True
vpath = compat_urlparse.urlparse(vurl).path
vext = determine_ext(vpath)
- return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js')
+ return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml')
def filter_video(urls):
return list(filter(check_video, urls))
diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py
index 4c9be47..9c7b1bd 100644
--- a/youtube_dl/extractor/go.py
+++ b/youtube_dl/extractor/go.py
@@ -36,22 +36,26 @@ class GoIE(AdobePassIE):
'requestor_id': 'DisneyXD',
}
}
- _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys())
+ _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:(?:[^/]+/)*(?P<id>vdka\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys())
_TESTS = [{
- 'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx',
+ 'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643',
'info_dict': {
- 'id': '0_g86w5onx',
+ 'id': 'VDKA3807643',
'ext': 'mp4',
- 'title': 'Sneak Peek: Language Arts',
- 'description': 'md5:7dcdab3b2d17e5217c953256af964e9c',
+ 'title': 'The Traitor in the White House',
+ 'description': 'md5:05b009d2d145a1e85d25111bd37222e8',
},
'params': {
# m3u8 download
'skip_download': True,
},
}, {
- 'url': 'http://abc.go.com/shows/after-paradise/video/most-recent/vdka3335601',
- 'only_matching': True,
+ 'url': 'http://watchdisneyxd.go.com/doraemon',
+ 'info_dict': {
+ 'title': 'Doraemon',
+ 'id': 'SH55574025',
+ },
+ 'playlist_mincount': 51,
}, {
'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding',
'only_matching': True,
@@ -60,19 +64,36 @@ class GoIE(AdobePassIE):
'only_matching': True,
}]
+ def _extract_videos(self, brand, video_id='-1', show_id='-1'):
+ display_id = video_id if video_id != '-1' else show_id
+ return self._download_json(
+ 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/%s/-1/%s/-1/-1.json' % (brand, show_id, video_id),
+ display_id)['video']
+
def _real_extract(self, url):
sub_domain, video_id, display_id = re.match(self._VALID_URL, url).groups()
+ site_info = self._SITE_INFO[sub_domain]
+ brand = site_info['brand']
if not video_id:
webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(
# There may be inner quotes, e.g. data-video-id="'VDKA3609139'"
# from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood
- r'data-video-id=["\']*VDKA(\w+)', webpage, 'video id')
- site_info = self._SITE_INFO[sub_domain]
- brand = site_info['brand']
- video_data = self._download_json(
- 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/-1/-1/%s/-1/-1.json' % (brand, video_id),
- video_id)['video'][0]
+ r'data-video-id=["\']*(VDKA\w+)', webpage, 'video id', default=None)
+ if not video_id:
+ # show extraction works for Disney, DisneyJunior and DisneyXD
+ # ABC and Freeform have a different layout
+ show_id = self._search_regex(r'data-show-id=["\']*(SH\d+)', webpage, 'show id')
+ videos = self._extract_videos(brand, show_id=show_id)
+ show_title = self._search_regex(r'data-show-title="([^"]+)"', webpage, 'show title', fatal=False)
+ entries = []
+ for video in videos:
+ entries.append(self.url_result(
+ video['url'], 'Go', video.get('id'), video.get('title')))
+ entries.reverse()
+ return self.playlist_result(entries, show_id, show_title)
+ video_data = self._extract_videos(brand, video_id)[0]
+ video_id = video_data['id']
title = video_data['title']
formats = []
@@ -105,7 +126,7 @@ class GoIE(AdobePassIE):
self._initialize_geo_bypass(['US'])
entitlement = self._download_json(
'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json',
- video_id, data=urlencode_postdata(data), headers=self.geo_verification_headers())
+ video_id, data=urlencode_postdata(data))
errors = entitlement.get('errors', {}).get('errors', [])
if errors:
for error in errors:
diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py
new file mode 100644
index 0000000..9b2e1c1
--- /dev/null
+++ b/youtube_dl/extractor/go90.py
@@ -0,0 +1,126 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ parse_iso8601,
+)
+
+
+class Go90IE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?go90\.com/videos/(?P<id>[0-9a-zA-Z]+)'
+ _TEST = {
+ 'url': 'https://www.go90.com/videos/84BUqjLpf9D',
+ 'md5': 'efa7670dbbbf21a7b07b360652b24a32',
+ 'info_dict': {
+ 'id': '84BUqjLpf9D',
+ 'ext': 'mp4',
+ 'title': 'Daily VICE - Inside The Utah Coalition Against Pornography Convention',
+ 'description': 'VICE\'s Karley Sciortino meets with activists who discuss the state\'s strong anti-porn stance. Then, VICE Sports explains NFL contracts.',
+ 'timestamp': 1491868800,
+ 'upload_date': '20170411',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ video_data = self._download_json(
+ 'https://www.go90.com/api/view/items/' + video_id,
+ video_id, headers={
+ 'Content-Type': 'application/json; charset=utf-8',
+ }, data=b'{"client":"web","device_type":"pc"}')
+ main_video_asset = video_data['main_video_asset']
+
+ episode_number = int_or_none(video_data.get('episode_number'))
+ series = None
+ season = None
+ season_id = None
+ season_number = None
+ for metadata in video_data.get('__children', {}).get('Item', {}).values():
+ if metadata.get('type') == 'show':
+ series = metadata.get('title')
+ elif metadata.get('type') == 'season':
+ season = metadata.get('title')
+ season_id = metadata.get('id')
+ season_number = int_or_none(metadata.get('season_number'))
+
+ title = episode = video_data.get('title') or series
+ if series and series != title:
+ title = '%s - %s' % (series, title)
+
+ thumbnails = []
+ formats = []
+ subtitles = {}
+ for asset in video_data.get('assets'):
+ if asset.get('id') == main_video_asset:
+ for source in asset.get('sources', []):
+ source_location = source.get('location')
+ if not source_location:
+ continue
+ source_type = source.get('type')
+ if source_type == 'hls':
+ m3u8_formats = self._extract_m3u8_formats(
+ source_location, video_id, 'mp4',
+ 'm3u8_native', m3u8_id='hls', fatal=False)
+ for f in m3u8_formats:
+ mobj = re.search(r'/hls-(\d+)-(\d+)K', f['url'])
+ if mobj:
+ height, tbr = mobj.groups()
+ height = int_or_none(height)
+ f.update({
+ 'height': f.get('height') or height,
+ 'width': f.get('width') or int_or_none(height / 9.0 * 16.0 if height else None),
+ 'tbr': f.get('tbr') or int_or_none(tbr),
+ })
+ formats.extend(m3u8_formats)
+ elif source_type == 'dash':
+ formats.extend(self._extract_mpd_formats(
+ source_location, video_id, mpd_id='dash', fatal=False))
+ else:
+ formats.append({
+ 'format_id': source.get('name'),
+ 'url': source_location,
+ 'width': int_or_none(source.get('width')),
+ 'height': int_or_none(source.get('height')),
+ 'tbr': int_or_none(source.get('bitrate')),
+ })
+
+ for caption in asset.get('caption_metadata', []):
+ caption_url = caption.get('source_url')
+ if not caption_url:
+ continue
+ subtitles.setdefault(caption.get('language', 'en'), []).append({
+ 'url': caption_url,
+ 'ext': determine_ext(caption_url, 'vtt'),
+ })
+ elif asset.get('type') == 'image':
+ asset_location = asset.get('location')
+ if not asset_location:
+ continue
+ thumbnails.append({
+ 'url': asset_location,
+ 'width': int_or_none(asset.get('width')),
+ 'height': int_or_none(asset.get('height')),
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ 'description': video_data.get('short_description'),
+ 'like_count': int_or_none(video_data.get('like_count')),
+ 'timestamp': parse_iso8601(video_data.get('released_at')),
+ 'series': series,
+ 'episode': episode,
+ 'season': season,
+ 'season_id': season_id,
+ 'season_number': season_number,
+ 'episode_number': episode_number,
+ 'subtitles': subtitles,
+ }
diff --git a/youtube_dl/extractor/hbo.py b/youtube_dl/extractor/hbo.py
index 931f71a..859ad54 100644
--- a/youtube_dl/extractor/hbo.py
+++ b/youtube_dl/extractor/hbo.py
@@ -92,12 +92,14 @@ class HBOBaseIE(InfoExtractor):
video_url.replace('.tar', '/base_index_w8.m3u8'),
video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
elif source.tag == 'hls':
- # #EXT-X-BYTERANGE is not supported by native hls downloader
- # and ffmpeg (#10955)
- # formats.extend(self._extract_m3u8_formats(
- # video_url.replace('.tar', '/base_index.m3u8'),
- # video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
- continue
+ m3u8_formats = self._extract_m3u8_formats(
+ video_url.replace('.tar', '/base_index.m3u8'),
+ video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
+ for f in m3u8_formats:
+ if f.get('vcodec') == 'none' and not f.get('tbr'):
+ f['tbr'] = int_or_none(self._search_regex(
+ r'-(\d+)k/', f['url'], 'tbr', default=None))
+ formats.extend(m3u8_formats)
elif source.tag == 'dash':
formats.extend(self._extract_mpd_formats(
video_url.replace('.tar', '/manifest.mpd'),
@@ -110,7 +112,7 @@ class HBOBaseIE(InfoExtractor):
'width': format_info.get('width'),
'height': format_info.get('height'),
})
- self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id'))
+ self._sort_formats(formats)
thumbnails = []
card_sizes = xpath_element(video_data, 'titleCardSizes')
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index f95c00c..3ff672a 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -13,7 +13,7 @@ from ..utils import (
class ImdbIE(InfoExtractor):
IE_NAME = 'imdb'
IE_DESC = 'Internet Movie Database trailers'
- _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video/[^/]+/|title/tt\d+.*?#lb-|videoplayer/)vi(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title).+?[/-]vi(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.imdb.com/video/imdb/vi2524815897',
@@ -35,6 +35,9 @@ class ImdbIE(InfoExtractor):
}, {
'url': 'http://www.imdb.com/videoplayer/vi1562949145',
'only_matching': True,
+ }, {
+ 'url': 'http://www.imdb.com/title/tt4218696/videoplayer/vi2608641561',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py
index 9fb71e8..fe425e7 100644
--- a/youtube_dl/extractor/infoq.py
+++ b/youtube_dl/extractor/infoq.py
@@ -87,8 +87,8 @@ class InfoQIE(BokeCCBaseIE):
def _extract_http_audio(self, webpage, video_id):
fields = self._hidden_inputs(webpage)
- http_audio_url = fields['filename']
- if http_audio_url is None:
+ http_audio_url = fields.get('filename')
+ if not http_audio_url:
return []
cookies_header = {'Cookie': self._extract_cookies(webpage)}
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py
index c1921cb..4667335 100644
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -112,7 +112,8 @@ class InstagramIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
(video_url, description, thumbnail, timestamp, uploader,
- uploader_id, like_count, comment_count, height, width) = [None] * 10
+ uploader_id, like_count, comment_count, comments, height,
+ width) = [None] * 11
shared_data = self._parse_json(
self._search_regex(
@@ -121,7 +122,10 @@ class InstagramIE(InfoExtractor):
video_id, fatal=False)
if shared_data:
media = try_get(
- shared_data, lambda x: x['entry_data']['PostPage'][0]['media'], dict)
+ shared_data,
+ (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'],
+ lambda x: x['entry_data']['PostPage'][0]['media']),
+ dict)
if media:
video_url = media.get('video_url')
height = int_or_none(media.get('dimensions', {}).get('height'))
diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py
index 2af6a6d..fdfa7de 100644
--- a/youtube_dl/extractor/iqiyi.py
+++ b/youtube_dl/extractor/iqiyi.py
@@ -189,7 +189,11 @@ class IqiyiIE(InfoExtractor):
'only_matching': True,
}, {
'url': 'http://yule.iqiyi.com/pcb.html',
- 'only_matching': True,
+ 'info_dict': {
+ 'id': '4a0af228fddb55ec96398a364248ed7f',
+ 'ext': 'mp4',
+ 'title': '第2017-04-21期 女艺人频遭极端粉丝骚扰',
+ },
}, {
# VIP-only video. The first 2 parts (6 minutes) are available without login
# MD5 sums omitted as values are different on Travis CI and my machine
@@ -337,15 +341,18 @@ class IqiyiIE(InfoExtractor):
url, 'temp_id', note='download video page')
# There's no simple way to determine whether an URL is a playlist or not
- # So detect it
- playlist_result = self._extract_playlist(webpage)
- if playlist_result:
- return playlist_result
-
+ # Sometimes there are playlist links in individual videos, so treat it
+ # as a single video first
tvid = self._search_regex(
- r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
+ r'data-(?:player|shareplattrigger)-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid', default=None)
+ if tvid is None:
+ playlist_result = self._extract_playlist(webpage)
+ if playlist_result:
+ return playlist_result
+ raise ExtractorError('Can\'t find any video')
+
video_id = self._search_regex(
- r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
+ r'data-(?:player|shareplattrigger)-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
formats = []
for _ in range(5):
@@ -377,7 +384,8 @@ class IqiyiIE(InfoExtractor):
self._sort_formats(formats)
title = (get_element_by_id('widget-videotitle', webpage) or
- clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)))
+ clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)) or
+ self._html_search_regex(r'<span[^>]+data-videochanged-title="word"[^>]*>([^<]+)</span>', webpage, 'title'))
return {
'id': video_id,
diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py
index 021c6b2..f315680 100644
--- a/youtube_dl/extractor/itv.py
+++ b/youtube_dl/extractor/itv.py
@@ -116,13 +116,25 @@ class ITVIE(InfoExtractor):
if not play_path:
continue
tbr = int_or_none(media_file.get('bitrate'), 1000)
- formats.append({
+ f = {
'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''),
- 'url': rtmp_url,
'play_path': play_path,
+ # Providing this swfVfy allows avoiding truncated downloads
+ 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf',
+ 'page_url': url,
'tbr': tbr,
'ext': 'flv',
- })
+ }
+ app = self._search_regex(
+ 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None)
+ if app:
+ f.update({
+ 'url': rtmp_url.split('?', 1)[0],
+ 'app': app,
+ })
+ else:
+ f['url'] = rtmp_url
+ formats.append(f)
ios_playlist_url = params.get('data-video-playlist')
hmac = params.get('data-video-hmac')
@@ -172,7 +184,9 @@ class ITVIE(InfoExtractor):
href = ios_base_url + href
ext = determine_ext(href)
if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(href, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ formats.extend(self._extract_m3u8_formats(
+ href, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
else:
formats.append({
'url': href,
@@ -189,7 +203,8 @@ class ITVIE(InfoExtractor):
'ext': 'ttml' if ext == 'xml' else ext,
})
- return {
+ info = self._search_json_ld(webpage, video_id, default={})
+ info.update({
'id': video_id,
'title': title,
'formats': formats,
@@ -198,4 +213,5 @@ class ITVIE(InfoExtractor):
'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')),
'series': xpath_text(playlist, 'ProgrammeTitle'),
'duartion': parse_duration(xpath_text(playlist, 'Duration')),
- }
+ })
+ return info
diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py
index 54374ea..41c1f3d 100644
--- a/youtube_dl/extractor/kaltura.py
+++ b/youtube_dl/extractor/kaltura.py
@@ -91,6 +91,7 @@ class KalturaIE(InfoExtractor):
}],
},
},
+ 'skip': 'Gone. Maybe https://www.safaribooksonline.com/library/tutorials/introduction-to-python-anon/3469/',
'params': {
'skip_download': True,
},
@@ -107,27 +108,37 @@ class KalturaIE(InfoExtractor):
@staticmethod
def _extract_url(webpage):
+ # Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site
mobj = (
re.search(
r"""(?xs)
kWidget\.(?:thumb)?[Ee]mbed\(
\{.*?
- (?P<q1>['\"])wid(?P=q1)\s*:\s*
- (?P<q2>['\"])_?(?P<partner_id>(?:(?!(?P=q2)).)+)(?P=q2),.*?
- (?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*
- (?P<q4>['\"])(?P<id>(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\})
+ (?P<q1>['"])wid(?P=q1)\s*:\s*
+ (?P<q2>['"])_?(?P<partner_id>(?:(?!(?P=q2)).)+)(?P=q2),.*?
+ (?P<q3>['"])entry_?[Ii]d(?P=q3)\s*:\s*
+ (?P<q4>['"])(?P<id>(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\})
""", webpage) or
re.search(
r'''(?xs)
- (?P<q1>["\'])
+ (?P<q1>["'])
(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+)(?:(?!(?P=q1)).)*
(?P=q1).*?
(?:
entry_?[Ii]d|
- (?P<q2>["\'])entry_?[Ii]d(?P=q2)
+ (?P<q2>["'])entry_?[Ii]d(?P=q2)
)\s*:\s*
- (?P<q3>["\'])(?P<id>(?:(?!(?P=q3)).)+)(?P=q3)
- ''', webpage))
+ (?P<q3>["'])(?P<id>(?:(?!(?P=q3)).)+)(?P=q3)
+ ''', webpage) or
+ re.search(
+ r'''(?xs)
+ <iframe[^>]+src=(?P<q1>["'])
+ (?:https?:)?//(?:www\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+)
+ (?:(?!(?P=q1)).)*
+ [?&]entry_id=(?P<id>(?:(?!(?P=q1))[^&])+)
+ (?P=q1)
+ ''', webpage)
+ )
if mobj:
embed_info = mobj.groupdict()
url = 'kaltura:%(partner_id)s:%(id)s' % embed_info
diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py
index 3190b18..1f91ba0 100644
--- a/youtube_dl/extractor/laola1tv.py
+++ b/youtube_dl/extractor/laola1tv.py
@@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
+import json
+
from .common import InfoExtractor
from ..utils import (
ExtractorError,
@@ -8,15 +10,15 @@ from ..utils import (
urlencode_postdata,
xpath_element,
xpath_text,
- urljoin,
update_url_query,
+ js_to_json,
)
class Laola1TvEmbedIE(InfoExtractor):
IE_NAME = 'laola1tv:embed'
_VALID_URL = r'https?://(?:www\.)?laola1\.tv/titanplayer\.php\?.*?\bvideoid=(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
# flashvars.premium = "false";
'url': 'https://www.laola1.tv/titanplayer.php?videoid=708065&type=V&lang=en&portal=int&customer=1024',
'info_dict': {
@@ -26,7 +28,30 @@ class Laola1TvEmbedIE(InfoExtractor):
'uploader': 'ITTF - International Table Tennis Federation',
'upload_date': '20161211',
},
- }
+ }]
+
+ def _extract_token_url(self, stream_access_url, video_id, data):
+ return self._download_json(
+ stream_access_url, video_id, headers={
+ 'Content-Type': 'application/json',
+ }, data=json.dumps(data).encode())['data']['stream-access'][0]
+
+ def _extract_formats(self, token_url, video_id):
+ token_doc = self._download_xml(
+ token_url, video_id, 'Downloading token',
+ headers=self.geo_verification_headers())
+
+ token_attrib = xpath_element(token_doc, './/token').attrib
+
+ if token_attrib['status'] != '0':
+ raise ExtractorError(
+ 'Token error: %s' % token_attrib['comment'], expected=True)
+
+ formats = self._extract_akamai_formats(
+ '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']),
+ video_id)
+ self._sort_formats(formats)
+ return formats
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -68,29 +93,16 @@ class Laola1TvEmbedIE(InfoExtractor):
else:
data_abo = urlencode_postdata(
dict((i, v) for i, v in enumerate(_v('req_liga_abos').split(','))))
- token_url = self._download_json(
- 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access',
- video_id, query={
+ stream_access_url = update_url_query(
+ 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access', {
'videoId': _v('id'),
'target': self._search_regex(r'vs_target = (\d+);', webpage, 'vs target'),
'label': _v('label'),
'area': _v('area'),
- }, data=data_abo)['data']['stream-access'][0]
-
- token_doc = self._download_xml(
- token_url, video_id, 'Downloading token',
- headers=self.geo_verification_headers())
-
- token_attrib = xpath_element(token_doc, './/token').attrib
-
- if token_attrib['status'] != '0':
- raise ExtractorError(
- 'Token error: %s' % token_attrib['comment'], expected=True)
+ })
+ token_url = self._extract_token_url(stream_access_url, video_id, data_abo)
- formats = self._extract_akamai_formats(
- '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']),
- video_id)
- self._sort_formats(formats)
+ formats = self._extract_formats(token_url, video_id)
categories_str = _v('meta_sports')
categories = categories_str.split(',') if categories_str else []
@@ -107,7 +119,7 @@ class Laola1TvEmbedIE(InfoExtractor):
}
-class Laola1TvIE(InfoExtractor):
+class Laola1TvIE(Laola1TvEmbedIE):
IE_NAME = 'laola1tv'
_VALID_URL = r'https?://(?:www\.)?laola1\.tv/[a-z]+-[a-z]+/[^/]+/(?P<id>[^/?#&]+)'
_TESTS = [{
@@ -164,13 +176,42 @@ class Laola1TvIE(InfoExtractor):
if 'Dieser Livestream ist bereits beendet.' in webpage:
raise ExtractorError('This live stream has already finished.', expected=True)
- iframe_url = urljoin(url, self._search_regex(
- r'<iframe[^>]*?id="videoplayer"[^>]*?src="([^"]+)"',
- webpage, 'iframe url'))
+ conf = self._parse_json(self._search_regex(
+ r'(?s)conf\s*=\s*({.+?});', webpage, 'conf'),
+ display_id, js_to_json)
+
+ video_id = conf['videoid']
+
+ config = self._download_json(conf['configUrl'], video_id, query={
+ 'videoid': video_id,
+ 'partnerid': conf['partnerid'],
+ 'language': conf.get('language', ''),
+ 'portal': conf.get('portalid', ''),
+ })
+ error = config.get('error')
+ if error:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+
+ video_data = config['video']
+ title = video_data['title']
+ is_live = video_data.get('isLivestream') and video_data.get('isLive')
+ meta = video_data.get('metaInformation')
+ sports = meta.get('sports')
+ categories = sports.split(',') if sports else []
+
+ token_url = self._extract_token_url(
+ video_data['streamAccess'], video_id,
+ video_data['abo']['required'])
+
+ formats = self._extract_formats(token_url, video_id)
return {
- '_type': 'url',
+ 'id': video_id,
'display_id': display_id,
- 'url': iframe_url,
- 'ie_key': 'Laola1TvEmbed',
+ 'title': self._live_title(title) if is_live else title,
+ 'description': video_data.get('description'),
+ 'thumbnail': video_data.get('image'),
+ 'categories': categories,
+ 'formats': formats,
+ 'is_live': is_live,
}
diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py
index 9eda956..0a07c13 100644
--- a/youtube_dl/extractor/leeco.py
+++ b/youtube_dl/extractor/leeco.py
@@ -23,7 +23,6 @@ from ..utils import (
str_or_none,
url_basename,
urshift,
- update_url_query,
)
@@ -51,7 +50,7 @@ class LeIE(InfoExtractor):
'id': '1415246',
'ext': 'mp4',
'title': '美人天下01',
- 'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda',
+ 'description': 'md5:28942e650e82ed4fcc8e4de919ee854d',
},
'params': {
'hls_prefer_native': True,
@@ -69,7 +68,6 @@ class LeIE(InfoExtractor):
'params': {
'hls_prefer_native': True,
},
- 'skip': 'Only available in China',
}, {
'url': 'http://sports.le.com/video/25737697.html',
'only_matching': True,
@@ -81,7 +79,7 @@ class LeIE(InfoExtractor):
'only_matching': True,
}]
- # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf
+    # ror() and calc_time_key() are reversed from an embedded SWF file in LetvPlayer.swf
def ror(self, param1, param2):
_loc3_ = 0
while _loc3_ < param2:
@@ -90,15 +88,8 @@ class LeIE(InfoExtractor):
return param1
def calc_time_key(self, param1):
- _loc2_ = 773625421
- _loc3_ = self.ror(param1, _loc2_ % 13)
- _loc3_ = _loc3_ ^ _loc2_
- _loc3_ = self.ror(_loc3_, _loc2_ % 17)
- return _loc3_
-
- # reversed from http://jstatic.letvcdn.com/sdk/player.js
- def get_mms_key(self, time):
- return self.ror(time, 8) ^ 185025305
+ _loc2_ = 185025305
+ return self.ror(param1, _loc2_ % 17) ^ _loc2_
# see M3U8Encryption class in KLetvPlayer.swf
@staticmethod
@@ -122,7 +113,7 @@ class LeIE(InfoExtractor):
def _check_errors(self, play_json):
# Check for errors
- playstatus = play_json['playstatus']
+ playstatus = play_json['msgs']['playstatus']
if playstatus['status'] == 0:
flag = playstatus['flag']
if flag == 1:
@@ -134,58 +125,31 @@ class LeIE(InfoExtractor):
media_id = self._match_id(url)
page = self._download_webpage(url, media_id)
- play_json_h5 = self._download_json(
- 'http://api.le.com/mms/out/video/playJsonH5',
- media_id, 'Downloading html5 playJson data', query={
- 'id': media_id,
- 'platid': 3,
- 'splatid': 304,
- 'format': 1,
- 'tkey': self.get_mms_key(int(time.time())),
- 'domain': 'www.le.com',
- 'tss': 'no',
- },
- headers=self.geo_verification_headers())
- self._check_errors(play_json_h5)
-
play_json_flash = self._download_json(
- 'http://api.le.com/mms/out/video/playJson',
+ 'http://player-pc.le.com/mms/out/video/playJson',
media_id, 'Downloading flash playJson data', query={
'id': media_id,
'platid': 1,
'splatid': 101,
'format': 1,
+ 'source': 1000,
'tkey': self.calc_time_key(int(time.time())),
'domain': 'www.le.com',
+ 'region': 'cn',
},
headers=self.geo_verification_headers())
self._check_errors(play_json_flash)
- def get_h5_urls(media_url, format_id):
- location = self._download_json(
- media_url, media_id,
- 'Download JSON metadata for format %s' % format_id, query={
- 'format': 1,
- 'expect': 3,
- 'tss': 'no',
- })['location']
-
- return {
- 'http': update_url_query(location, {'tss': 'no'}),
- 'hls': update_url_query(location, {'tss': 'ios'}),
- }
-
def get_flash_urls(media_url, format_id):
- media_url += '&' + compat_urllib_parse_urlencode({
- 'm3v': 1,
- 'format': 1,
- 'expect': 3,
- 'rateid': format_id,
- })
-
nodes_data = self._download_json(
media_url, media_id,
- 'Download JSON metadata for format %s' % format_id)
+ 'Download JSON metadata for format %s' % format_id,
+ query={
+ 'm3v': 1,
+ 'format': 1,
+ 'expect': 3,
+ 'tss': 'ios',
+ })
req = self._request_webpage(
nodes_data['nodelist'][0]['location'], media_id,
@@ -199,29 +163,28 @@ class LeIE(InfoExtractor):
extracted_formats = []
formats = []
- for play_json, get_urls in ((play_json_h5, get_h5_urls), (play_json_flash, get_flash_urls)):
- playurl = play_json['playurl']
- play_domain = playurl['domain'][0]
-
- for format_id, format_data in playurl.get('dispatch', []).items():
- if format_id in extracted_formats:
- continue
- extracted_formats.append(format_id)
-
- media_url = play_domain + format_data[0]
- for protocol, format_url in get_urls(media_url, format_id).items():
- f = {
- 'url': format_url,
- 'ext': determine_ext(format_data[1]),
- 'format_id': '%s-%s' % (protocol, format_id),
- 'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
- 'quality': int_or_none(format_id),
- }
-
- if format_id[-1:] == 'p':
- f['height'] = int_or_none(format_id[:-1])
-
- formats.append(f)
+ playurl = play_json_flash['msgs']['playurl']
+ play_domain = playurl['domain'][0]
+
+ for format_id, format_data in playurl.get('dispatch', []).items():
+ if format_id in extracted_formats:
+ continue
+ extracted_formats.append(format_id)
+
+ media_url = play_domain + format_data[0]
+ for protocol, format_url in get_flash_urls(media_url, format_id).items():
+ f = {
+ 'url': format_url,
+ 'ext': determine_ext(format_data[1]),
+ 'format_id': '%s-%s' % (protocol, format_id),
+ 'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
+ 'quality': int_or_none(format_id),
+ }
+
+ if format_id[-1:] == 'p':
+ f['height'] = int_or_none(format_id[:-1])
+
+ formats.append(f)
self._sort_formats(formats, ('height', 'quality', 'format_id'))
publish_time = parse_iso8601(self._html_search_regex(
diff --git a/youtube_dl/extractor/lego.py b/youtube_dl/extractor/lego.py
index d3bca64..b312e77 100644
--- a/youtube_dl/extractor/lego.py
+++ b/youtube_dl/extractor/lego.py
@@ -86,7 +86,7 @@ class LEGOIE(InfoExtractor):
formats = self._extract_akamai_formats(
'%si/s/public/%s_,%s,.mp4.csmil/master.m3u8' % (streaming_base, path, streaming_path), video_id)
m3u8_formats = list(filter(
- lambda f: f.get('protocol') == 'm3u8_native' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ lambda f: f.get('protocol') == 'm3u8_native' and f.get('vcodec') != 'none',
formats))
if len(m3u8_formats) == len(self._BITRATES):
self._sort_formats(m3u8_formats)
diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py
index 422be25..0a5a395 100644
--- a/youtube_dl/extractor/limelight.py
+++ b/youtube_dl/extractor/limelight.py
@@ -9,6 +9,7 @@ from ..utils import (
determine_ext,
float_or_none,
int_or_none,
+ smuggle_url,
unsmuggle_url,
ExtractorError,
)
@@ -18,6 +19,42 @@ class LimelightBaseIE(InfoExtractor):
_PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s'
_API_URL = 'http://api.video.limelight.com/rest/organizations/%s/%s/%s/%s.json'
+ @classmethod
+ def _extract_urls(cls, webpage, source_url):
+ lm = {
+ 'Media': 'media',
+ 'Channel': 'channel',
+ 'ChannelList': 'channel_list',
+ }
+ entries = []
+ for kind, video_id in re.findall(
+ r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})',
+ webpage):
+ entries.append(cls.url_result(
+ smuggle_url(
+ 'limelight:%s:%s' % (lm[kind], video_id),
+ {'source_url': source_url}),
+ 'Limelight%s' % kind, video_id))
+ for mobj in re.finditer(
+                # As per [1] the class attribute should be exactly equal to
+                # LimelightEmbeddedPlayerFlash, but numerous examples have
+                # been seen that don't exactly match it (e.g. [2]).
+ # 1. http://support.3playmedia.com/hc/en-us/articles/227732408-Limelight-Embedding-the-Captions-Plugin-with-the-Limelight-Player-on-Your-Webpage
+ # 2. http://www.sedona.com/FacilitatorTraining2017
+ r'''(?sx)
+ <object[^>]+class=(["\'])(?:(?!\1).)*\bLimelightEmbeddedPlayerFlash\b(?:(?!\1).)*\1[^>]*>.*?
+ <param[^>]+
+ name=(["\'])flashVars\2[^>]+
+ value=(["\'])(?:(?!\3).)*(?P<kind>media|channel(?:List)?)Id=(?P<id>[a-z0-9]{32})
+ ''', webpage):
+ kind, video_id = mobj.group('kind'), mobj.group('id')
+ entries.append(cls.url_result(
+ smuggle_url(
+ 'limelight:%s:%s' % (kind, video_id),
+ {'source_url': source_url}),
+ 'Limelight%s' % kind.capitalize(), video_id))
+ return entries
+
def _call_playlist_service(self, item_id, method, fatal=True, referer=None):
headers = {}
if referer:
@@ -62,13 +99,21 @@ class LimelightBaseIE(InfoExtractor):
fmt = {
'url': stream_url,
'abr': float_or_none(stream.get('audioBitRate')),
- 'vbr': float_or_none(stream.get('videoBitRate')),
'fps': float_or_none(stream.get('videoFrameRate')),
- 'width': int_or_none(stream.get('videoWidthInPixels')),
- 'height': int_or_none(stream.get('videoHeightInPixels')),
'ext': ext,
}
- rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp4:.+)$', stream_url)
+ width = int_or_none(stream.get('videoWidthInPixels'))
+ height = int_or_none(stream.get('videoHeightInPixels'))
+ vbr = float_or_none(stream.get('videoBitRate'))
+ if width or height or vbr:
+ fmt.update({
+ 'width': width,
+ 'height': height,
+ 'vbr': vbr,
+ })
+ else:
+ fmt['vcodec'] = 'none'
+ rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', stream_url)
if rtmp:
format_id = 'rtmp'
if stream.get('videoBitRate'):
diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py
index c7de653..c545196 100644
--- a/youtube_dl/extractor/liveleak.py
+++ b/youtube_dl/extractor/liveleak.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import json
import re
from .common import InfoExtractor
@@ -11,10 +10,10 @@ class LiveLeakIE(InfoExtractor):
_VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)'
_TESTS = [{
'url': 'http://www.liveleak.com/view?i=757_1364311680',
- 'md5': '50f79e05ba149149c1b4ea961223d5b3',
+ 'md5': '0813c2430bea7a46bf13acf3406992f4',
'info_dict': {
'id': '757_1364311680',
- 'ext': 'flv',
+ 'ext': 'mp4',
'description': 'extremely bad day for this guy..!',
'uploader': 'ljfriel2',
'title': 'Most unlucky car accident',
@@ -22,7 +21,7 @@ class LiveLeakIE(InfoExtractor):
}
}, {
'url': 'http://www.liveleak.com/view?i=f93_1390833151',
- 'md5': 'b13a29626183c9d33944e6a04f41aafc',
+ 'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf',
'info_dict': {
'id': 'f93_1390833151',
'ext': 'mp4',
@@ -32,6 +31,7 @@ class LiveLeakIE(InfoExtractor):
'thumbnail': r're:^https?://.*\.jpg$'
}
}, {
+ # Prochan embed
'url': 'http://www.liveleak.com/view?i=4f7_1392687779',
'md5': '42c6d97d54f1db107958760788c5f48f',
'info_dict': {
@@ -41,11 +41,13 @@ class LiveLeakIE(InfoExtractor):
'uploader': 'CapObveus',
'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck',
'age_limit': 18,
- }
+ },
+ 'skip': 'Video is dead',
}, {
# Covers https://github.com/rg3/youtube-dl/pull/5983
+ # Multiple resolutions
'url': 'http://www.liveleak.com/view?i=801_1409392012',
- 'md5': '0b3bec2d888c20728ca2ad3642f0ef15',
+ 'md5': 'c3a449dbaca5c0d1825caecd52a57d7b',
'info_dict': {
'id': '801_1409392012',
'ext': 'mp4',
@@ -93,57 +95,38 @@ class LiveLeakIE(InfoExtractor):
webpage, 'age limit', default=None))
video_thumbnail = self._og_search_thumbnail(webpage)
- sources_raw = self._search_regex(
- r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None)
- if sources_raw is None:
- alt_source = self._search_regex(
- r'(file: ".*?"),', webpage, 'video URL', default=None)
- if alt_source:
- sources_raw = '[{ %s}]' % alt_source
- else:
- # Maybe an embed?
- embed_url = self._search_regex(
- r'<iframe[^>]+src="(https?://(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"',
- webpage, 'embed URL')
- return {
- '_type': 'url_transparent',
- 'url': embed_url,
- 'id': video_id,
- 'title': video_title,
- 'description': video_description,
- 'uploader': video_uploader,
- 'age_limit': age_limit,
- }
+ entries = self._parse_html5_media_entries(url, webpage, video_id)
+ if not entries:
+ # Maybe an embed?
+ embed_url = self._search_regex(
+ r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"',
+ webpage, 'embed URL')
+ return {
+ '_type': 'url_transparent',
+ 'url': embed_url,
+ 'id': video_id,
+ 'title': video_title,
+ 'description': video_description,
+ 'uploader': video_uploader,
+ 'age_limit': age_limit,
+ }
- sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw)
- sources = json.loads(sources_json)
+ info_dict = entries[0]
- formats = [{
- 'format_id': '%s' % i,
- 'format_note': s.get('label'),
- 'url': s['file'],
- } for i, s in enumerate(sources)]
+ for a_format in info_dict['formats']:
+ if not a_format.get('height'):
+ a_format['height'] = self._search_regex(
+ r'([0-9]+)p\.mp4', a_format['url'], 'height label', default=None)
- for i, s in enumerate(sources):
- # Removing '.h264_*.mp4' gives the raw video, which is essentially
- # the same video without the LiveLeak logo at the top (see
- # https://github.com/rg3/youtube-dl/pull/4768)
- orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file'])
- if s['file'] != orig_url:
- formats.append({
- 'format_id': 'original-%s' % i,
- 'format_note': s.get('label'),
- 'url': orig_url,
- 'preference': 1,
- })
- self._sort_formats(formats)
+ self._sort_formats(info_dict['formats'])
- return {
+ info_dict.update({
'id': video_id,
'title': video_title,
'description': video_description,
'uploader': video_uploader,
- 'formats': formats,
'age_limit': age_limit,
'thumbnail': video_thumbnail,
- }
+ })
+
+ return info_dict
diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py
new file mode 100644
index 0000000..9760eaf
--- /dev/null
+++ b/youtube_dl/extractor/mediaset.py
@@ -0,0 +1,118 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ determine_ext,
+ parse_duration,
+ try_get,
+ unified_strdate,
+)
+
+
+class MediasetIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ (?:
+ mediaset:|
+ https?://
+ (?:www\.)?video\.mediaset\.it/
+ (?:
+ (?:video|on-demand)/(?:[^/]+/)+[^/]+_|
+ player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=
+ )
+ )(?P<id>[0-9]+)
+ '''
+ _TESTS = [{
+ # full episode
+ 'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html',
+ 'md5': '9b75534d42c44ecef7bf1ffeacb7f85d',
+ 'info_dict': {
+ 'id': '661824',
+ 'ext': 'mp4',
+ 'title': 'Quarta puntata',
+ 'description': 'md5:7183696d6df570e3412a5ef74b27c5e2',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 1414,
+ 'creator': 'mediaset',
+ 'upload_date': '20161107',
+ 'series': 'Hello Goodbye',
+ 'categories': ['reality'],
+ },
+ 'expected_warnings': ['is not a supported codec'],
+ }, {
+ # clip
+ 'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html',
+ 'only_matching': True,
+ }, {
+ # iframe simple
+ 'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true',
+ 'only_matching': True,
+ }, {
+ # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/)
+ 'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true',
+ 'only_matching': True,
+ }, {
+ 'url': 'mediaset:661824',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return [
+ mobj.group('url')
+ for mobj in re.finditer(
+ r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>https?://(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=\d+.*?)\1',
+ webpage)]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ video_list = self._download_json(
+ 'http://cdnsel01.mediaset.net/GetCdn.aspx',
+ video_id, 'Downloading video CDN JSON', query={
+ 'streamid': video_id,
+ 'format': 'json',
+ })['videoList']
+
+ formats = []
+ for format_url in video_list:
+ if '.ism' in format_url:
+ formats.extend(self._extract_ism_formats(
+ format_url, video_id, ism_id='mss', fatal=False))
+ else:
+ formats.append({
+ 'url': format_url,
+ 'format_id': determine_ext(format_url),
+ })
+ self._sort_formats(formats)
+
+ mediainfo = self._download_json(
+ 'http://plr.video.mediaset.it/html/metainfo.sjson',
+ video_id, 'Downloading video info JSON', query={
+ 'id': video_id,
+ })['video']
+
+ title = mediainfo['title']
+
+ creator = try_get(
+ mediainfo, lambda x: x['brand-info']['publisher'], compat_str)
+ category = try_get(
+ mediainfo, lambda x: x['brand-info']['category'], compat_str)
+ categories = [category] if category else None
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': mediainfo.get('short-description'),
+ 'thumbnail': mediainfo.get('thumbnail'),
+ 'duration': parse_duration(mediainfo.get('duration')),
+ 'creator': creator,
+ 'upload_date': unified_strdate(mediainfo.get('production-date')),
+ 'webpage_url': mediainfo.get('url'),
+ 'series': mediainfo.get('brand-value'),
+ 'categories': categories,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/medici.py b/youtube_dl/extractor/medici.py
new file mode 100644
index 0000000..cd91023
--- /dev/null
+++ b/youtube_dl/extractor/medici.py
@@ -0,0 +1,70 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ unified_strdate,
+ update_url_query,
+ urlencode_postdata,
+)
+
+
+class MediciIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?medici\.tv/#!/(?P<id>[^?#&]+)'
+ _TEST = {
+ 'url': 'http://www.medici.tv/#!/daniel-harding-frans-helmerson-verbier-festival-music-camp',
+ 'md5': '004c21bb0a57248085b6ff3fec72719d',
+ 'info_dict': {
+ 'id': '3059',
+ 'ext': 'flv',
+ 'title': 'Daniel Harding conducts the Verbier Festival Music Camp \u2013 With Frans Helmerson',
+ 'description': 'md5:322a1e952bafb725174fd8c1a8212f58',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'upload_date': '20170408',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ # Sets csrftoken cookie
+ self._download_webpage(url, video_id)
+
+ MEDICI_URL = 'http://www.medici.tv/'
+
+ data = self._download_json(
+ MEDICI_URL, video_id,
+ data=urlencode_postdata({
+ 'json': 'true',
+ 'page': '/%s' % video_id,
+ 'timezone_offset': -420,
+ }), headers={
+ 'X-CSRFToken': self._get_cookies(url)['csrftoken'].value,
+ 'X-Requested-With': 'XMLHttpRequest',
+ 'Referer': MEDICI_URL,
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ })
+
+ video = data['video']['videos']['video1']
+
+ title = video.get('nom') or data['title']
+
+ video_id = video.get('id') or video_id
+ formats = self._extract_f4m_formats(
+ update_url_query(video['url_akamai'], {
+ 'hdcore': '3.1.0',
+ 'plugin=aasp': '3.1.0.43.124',
+ }), video_id, f4m_id='hds')
+
+ description = data.get('meta_description')
+ thumbnail = video.get('url_thumbnail') or data.get('main_image')
+ upload_date = unified_strdate(data['video'].get('date'))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index a24b316..0efbe66 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -97,7 +97,7 @@ class MixcloudIE(InfoExtractor):
view_count = str_to_int(self._search_regex(
[r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
r'/listeners/?">([0-9,.]+)</a>',
- r'm-tooltip=["\']([\d,.]+) plays'],
+ r'(?:m|data)-tooltip=["\']([\d,.]+) plays'],
webpage, 'play count', default=None))
return {
@@ -138,12 +138,12 @@ class MixcloudPlaylistBaseIE(InfoExtractor):
def _get_user_description(self, page_content):
return self._html_search_regex(
- r'<div[^>]+class="description-text"[^>]*>(.+?)</div>',
+ r'<div[^>]+class="profile-bio"[^>]*>(.+?)</div>',
page_content, 'user description', fatal=False)
class MixcloudUserIE(MixcloudPlaylistBaseIE):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/(?P<type>uploads|favorites|listens)?/?$'
+ _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/(?P<type>uploads|favorites|listens)?/?$'
IE_NAME = 'mixcloud:user'
_TESTS = [{
@@ -151,7 +151,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
'info_dict': {
'id': 'dholbach_uploads',
'title': 'Daniel Holbach (uploads)',
- 'description': 'md5:327af72d1efeb404a8216c27240d1370',
+ 'description': 'md5:def36060ac8747b3aabca54924897e47',
},
'playlist_mincount': 11,
}, {
@@ -159,7 +159,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
'info_dict': {
'id': 'dholbach_uploads',
'title': 'Daniel Holbach (uploads)',
- 'description': 'md5:327af72d1efeb404a8216c27240d1370',
+ 'description': 'md5:def36060ac8747b3aabca54924897e47',
},
'playlist_mincount': 11,
}, {
@@ -167,7 +167,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
'info_dict': {
'id': 'dholbach_favorites',
'title': 'Daniel Holbach (favorites)',
- 'description': 'md5:327af72d1efeb404a8216c27240d1370',
+ 'description': 'md5:def36060ac8747b3aabca54924897e47',
},
'params': {
'playlist_items': '1-100',
@@ -178,7 +178,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
'info_dict': {
'id': 'dholbach_listens',
'title': 'Daniel Holbach (listens)',
- 'description': 'md5:327af72d1efeb404a8216c27240d1370',
+ 'description': 'md5:def36060ac8747b3aabca54924897e47',
},
'params': {
'playlist_items': '1-100',
@@ -216,7 +216,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/playlists/(?P<playlist>[^/]+)/?$'
+ _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/playlists/(?P<playlist>[^/]+)/?$'
IE_NAME = 'mixcloud:playlist'
_TESTS = [{
@@ -229,12 +229,7 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
'playlist_mincount': 16,
}, {
'url': 'https://www.mixcloud.com/maxvibes/playlists/jazzcat-on-ness-radio/',
- 'info_dict': {
- 'id': 'maxvibes_jazzcat-on-ness-radio',
- 'title': 'Jazzcat on Ness Radio',
- 'description': 'md5:7bbbf0d6359a0b8cda85224be0f8f263',
- },
- 'playlist_mincount': 23
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -243,15 +238,16 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
playlist_id = mobj.group('playlist')
video_id = '%s_%s' % (user_id, playlist_id)
- profile = self._download_webpage(
+ webpage = self._download_webpage(
url, user_id,
note='Downloading playlist page',
errnote='Unable to download playlist page')
- description = self._get_user_description(profile)
- playlist_title = self._html_search_regex(
- r'<span[^>]+class="[^"]*list-playlist-title[^"]*"[^>]*>(.*?)</span>',
- profile, 'playlist title')
+ title = self._html_search_regex(
+ r'<a[^>]+class="parent active"[^>]*><b>\d+</b><span[^>]*>([^<]+)',
+ webpage, 'playlist title',
+ default=None) or self._og_search_title(webpage, fatal=False)
+ description = self._get_user_description(webpage)
entries = OnDemandPagedList(
functools.partial(
@@ -259,11 +255,11 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
'%s/playlists/%s' % (user_id, playlist_id), video_id, 'tracklist'),
self._PAGE_SIZE)
- return self.playlist_result(entries, video_id, playlist_title, description)
+ return self.playlist_result(entries, video_id, title, description)
class MixcloudStreamIE(MixcloudPlaylistBaseIE):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/stream/?$'
+ _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/stream/?$'
IE_NAME = 'mixcloud:stream'
_TEST = {
diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py
index f281238..e164d59 100644
--- a/youtube_dl/extractor/myspace.py
+++ b/youtube_dl/extractor/myspace.py
@@ -12,64 +12,62 @@ from ..utils import (
class MySpaceIE(InfoExtractor):
- _VALID_URL = r'https?://myspace\.com/([^/]+)/(?P<mediatype>video/[^/]+/|music/song/.*?)(?P<id>\d+)'
+ _VALID_URL = r'''(?x)
+ https?://
+ myspace\.com/[^/]+/
+ (?P<mediatype>
+ video/[^/]+/(?P<video_id>\d+)|
+ music/song/[^/?#&]+-(?P<song_id>\d+)-\d+(?:[/?#&]|$)
+ )
+ '''
- _TESTS = [
- {
- 'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919',
- 'md5': '9c1483c106f4a695c47d2911feed50a7',
- 'info_dict': {
- 'id': '109594919',
- 'ext': 'mp4',
- 'title': 'Little Big Town',
- 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.',
- 'uploader': 'Five Minutes to the Stage',
- 'uploader_id': 'fiveminutestothestage',
- 'timestamp': 1414108751,
- 'upload_date': '20141023',
- },
+ _TESTS = [{
+ 'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919',
+ 'md5': '9c1483c106f4a695c47d2911feed50a7',
+ 'info_dict': {
+ 'id': '109594919',
+ 'ext': 'mp4',
+ 'title': 'Little Big Town',
+ 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.',
+ 'uploader': 'Five Minutes to the Stage',
+ 'uploader_id': 'fiveminutestothestage',
+ 'timestamp': 1414108751,
+ 'upload_date': '20141023',
},
+ }, {
# songs
- {
- 'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681',
- 'md5': '1d7ee4604a3da226dd69a123f748b262',
- 'info_dict': {
- 'id': '93388656',
- 'ext': 'm4a',
- 'title': 'Of weakened soul...',
- 'uploader': 'Killsorrow',
- 'uploader_id': 'killsorrow',
- },
- }, {
- 'add_ie': ['Youtube'],
- 'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041',
- 'info_dict': {
- 'id': 'xqds0B_meys',
- 'ext': 'webm',
- 'title': 'Three Days Grace - Animal I Have Become',
- 'description': 'md5:8bd86b3693e72a077cf863a8530c54bb',
- 'uploader': 'ThreeDaysGraceVEVO',
- 'uploader_id': 'ThreeDaysGraceVEVO',
- 'upload_date': '20091002',
- },
- }, {
- 'add_ie': ['Youtube'],
- 'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426',
- 'info_dict': {
- 'id': 'ypWvQgnJrSU',
- 'ext': 'mp4',
- 'title': 'Starset - First Light',
- 'description': 'md5:2d5db6c9d11d527683bcda818d332414',
- 'uploader': 'Yumi K',
- 'uploader_id': 'SorenPromotions',
- 'upload_date': '20140725',
- }
+ 'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681',
+ 'md5': '1d7ee4604a3da226dd69a123f748b262',
+ 'info_dict': {
+ 'id': '93388656',
+ 'ext': 'm4a',
+ 'title': 'Of weakened soul...',
+ 'uploader': 'Killsorrow',
+ 'uploader_id': 'killsorrow',
},
- ]
+ }, {
+ 'add_ie': ['Youtube'],
+ 'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041',
+ 'info_dict': {
+ 'id': 'xqds0B_meys',
+ 'ext': 'webm',
+ 'title': 'Three Days Grace - Animal I Have Become',
+ 'description': 'md5:8bd86b3693e72a077cf863a8530c54bb',
+ 'uploader': 'ThreeDaysGraceVEVO',
+ 'uploader_id': 'ThreeDaysGraceVEVO',
+ 'upload_date': '20091002',
+ },
+ }, {
+ 'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://myspace.com/thelargemouthbassband/music/song/02-pure-eyes.mp3-94422330-105113388',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = mobj.group('video_id') or mobj.group('song_id')
is_song = mobj.group('mediatype').startswith('music/song')
webpage = self._download_webpage(url, video_id)
player_url = self._search_regex(
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index d2a44d0..62db70b 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -5,10 +5,8 @@ import re
from .common import InfoExtractor
from .theplatform import ThePlatformIE
from .adobepass import AdobePassIE
-from ..compat import compat_urllib_parse_urlparse
from ..utils import (
find_xpath_attr,
- lowercase_escape,
smuggle_url,
unescapeHTML,
update_url_query,
@@ -17,7 +15,7 @@ from ..utils import (
class NBCIE(AdobePassIE):
- _VALID_URL = r'https?://(?:www\.)?nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)'
+ _VALID_URL = r'(?P<permalink>https?://(?:www\.)?nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+))'
_TESTS = [
{
@@ -37,16 +35,6 @@ class NBCIE(AdobePassIE):
},
},
{
- 'url': 'http://www.nbc.com/the-tonight-show/episodes/176',
- 'info_dict': {
- 'id': '176',
- 'ext': 'flv',
- 'title': 'Ricky Gervais, Steven Van Zandt, ILoveMakonnen',
- 'description': 'A brand new episode of The Tonight Show welcomes Ricky Gervais, Steven Van Zandt and ILoveMakonnen.',
- },
- 'skip': '404 Not Found',
- },
- {
'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821',
'info_dict': {
'id': '2832821',
@@ -64,11 +52,6 @@ class NBCIE(AdobePassIE):
'skip': 'Only works from US',
},
{
- # This video has expired but with an escaped embedURL
- 'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515',
- 'only_matching': True,
- },
- {
# HLS streams requires the 'hdnea3' cookie
'url': 'http://www.nbc.com/Kings/video/goliath/n1806',
'info_dict': {
@@ -88,59 +71,38 @@ class NBCIE(AdobePassIE):
]
def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- info = {
+ permalink, video_id = re.match(self._VALID_URL, url).groups()
+ video_data = self._download_json(
+ 'https://api.nbc.com/v3/videos', video_id, query={
+ 'filter[permalink]': permalink,
+ })['data'][0]['attributes']
+ query = {
+ 'mbr': 'true',
+ 'manifest': 'm3u',
+ }
+ video_id = video_data['guid']
+ title = video_data['title']
+ if video_data.get('entitlement') == 'auth':
+ resource = self._get_mvpd_resource(
+ 'nbcentertainment', title, video_id,
+ video_data.get('vChipRating'))
+ query['auth'] = self._extract_mvpd_auth(
+ url, video_id, 'nbcentertainment', resource)
+ theplatform_url = smuggle_url(update_url_query(
+ 'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id,
+ query), {'force_smil_url': True})
+ return {
'_type': 'url_transparent',
- 'ie_key': 'ThePlatform',
'id': video_id,
+ 'title': title,
+ 'url': theplatform_url,
+ 'description': video_data.get('description'),
+ 'keywords': video_data.get('keywords'),
+ 'season_number': int_or_none(video_data.get('seasonNumber')),
+ 'episode_number': int_or_none(video_data.get('episodeNumber')),
+ 'series': video_data.get('showName'),
+ 'ie_key': 'ThePlatform',
}
- video_data = None
- preload = self._search_regex(
- r'PRELOAD\s*=\s*({.+})', webpage, 'preload data', default=None)
- if preload:
- preload_data = self._parse_json(preload, video_id)
- path = compat_urllib_parse_urlparse(url).path.rstrip('/')
- entity_id = preload_data.get('xref', {}).get(path)
- video_data = preload_data.get('entities', {}).get(entity_id)
- if video_data:
- query = {
- 'mbr': 'true',
- 'manifest': 'm3u',
- }
- video_id = video_data['guid']
- title = video_data['title']
- if video_data.get('entitlement') == 'auth':
- resource = self._get_mvpd_resource(
- 'nbcentertainment', title, video_id,
- video_data.get('vChipRating'))
- query['auth'] = self._extract_mvpd_auth(
- url, video_id, 'nbcentertainment', resource)
- theplatform_url = smuggle_url(update_url_query(
- 'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id,
- query), {'force_smil_url': True})
- info.update({
- 'id': video_id,
- 'title': title,
- 'url': theplatform_url,
- 'description': video_data.get('description'),
- 'keywords': video_data.get('keywords'),
- 'season_number': int_or_none(video_data.get('seasonNumber')),
- 'episode_number': int_or_none(video_data.get('episodeNumber')),
- 'series': video_data.get('showName'),
- })
- else:
- theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex(
- [
- r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
- r'<iframe[^>]+src="((?:https?:)?//player\.theplatform\.com/[^"]+)"',
- r'"embedURL"\s*:\s*"([^"]+)"'
- ],
- webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/')))
- if theplatform_url.startswith('//'):
- theplatform_url = 'http:' + theplatform_url
- info['url'] = smuggle_url(theplatform_url, {'source_url': url})
- return info
class NBCSportsVPlayerIE(InfoExtractor):
diff --git a/youtube_dl/extractor/nonktube.py b/youtube_dl/extractor/nonktube.py
new file mode 100644
index 0000000..63e58aa
--- /dev/null
+++ b/youtube_dl/extractor/nonktube.py
@@ -0,0 +1,33 @@
+from __future__ import unicode_literals
+
+from .nuevo import NuevoBaseIE
+
+
+class NonkTubeIE(NuevoBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?nonktube\.com/(?:(?:video|embed)/|media/nuevo/embed\.php\?.*?\bid=)(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.nonktube.com/video/118636/sensual-wife-uncensored-fucked-in-hairy-pussy-and-facialized',
+ 'info_dict': {
+ 'id': '118636',
+ 'ext': 'mp4',
+ 'title': 'Sensual Wife Uncensored Fucked In Hairy Pussy And Facialized',
+ 'age_limit': 18,
+ 'duration': 1150.98,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'https://www.nonktube.com/embed/118636',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ info = self._extract_nuevo(
+ 'https://www.nonktube.com/media/nuevo/econfig.php?key=%s'
+ % video_id, video_id)
+
+ info['age_limit'] = 18
+ return info
diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py
new file mode 100644
index 0000000..f7fa098
--- /dev/null
+++ b/youtube_dl/extractor/noovo.py
@@ -0,0 +1,97 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .brightcove import BrightcoveNewIE
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ smuggle_url,
+ try_get,
+)
+
+
+class NoovoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:[^/]+\.)?noovo\.ca/videos/(?P<id>[^/]+/[^/?#&]+)'
+ _TESTS = [{
+ # clip
+ 'url': 'http://noovo.ca/videos/rpm-plus/chrysler-imperial',
+ 'info_dict': {
+ 'id': '5386045029001',
+ 'ext': 'mp4',
+ 'title': 'Chrysler Imperial',
+ 'description': 'md5:de3c898d1eb810f3e6243e08c8b4a056',
+ 'timestamp': 1491399228,
+ 'upload_date': '20170405',
+ 'uploader_id': '618566855001',
+ 'creator': 'vtele',
+ 'view_count': int,
+ 'series': 'RPM+',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # episode
+ 'url': 'http://noovo.ca/videos/l-amour-est-dans-le-pre/episode-13-8',
+ 'info_dict': {
+ 'id': '5395865725001',
+ 'title': 'Épisode 13 : Les retrouvailles',
+ 'description': 'md5:336d5ebc5436534e61d16e63ddfca327',
+ 'ext': 'mp4',
+ 'timestamp': 1492019320,
+ 'upload_date': '20170412',
+ 'uploader_id': '618566855001',
+ 'creator': 'vtele',
+ 'view_count': int,
+ 'series': "L'amour est dans le pré",
+ 'season_number': 5,
+ 'episode': 'Épisode 13',
+ 'episode_number': 13,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/618566855001/default_default/index.html?videoId=%s'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ data = self._download_json(
+ 'http://api.noovo.ca/api/v1/pages/single-episode/%s' % video_id,
+ video_id)['data']
+
+ content = try_get(data, lambda x: x['contents'][0])
+
+ brightcove_id = data.get('brightcoveId') or content['brightcoveId']
+
+ series = try_get(
+ data, (
+ lambda x: x['show']['title'],
+ lambda x: x['season']['show']['title']),
+ compat_str)
+
+ episode = None
+ og = data.get('og')
+ if isinstance(og, dict) and og.get('type') == 'video.episode':
+ episode = og.get('title')
+
+ video = content or data
+
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': BrightcoveNewIE.ie_key(),
+ 'url': smuggle_url(
+ self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+ {'geo_countries': ['CA']}),
+ 'id': brightcove_id,
+ 'title': video.get('title'),
+ 'creator': video.get('source'),
+ 'view_count': int_or_none(video.get('viewsCount')),
+ 'series': series,
+ 'season_number': int_or_none(try_get(
+ data, lambda x: x['season']['seasonNumber'])),
+ 'episode': episode,
+ 'episode_number': int_or_none(data.get('episodeNumber')),
+ }
diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py
index b6c5ee6..f26dafb 100644
--- a/youtube_dl/extractor/nowness.py
+++ b/youtube_dl/extractor/nowness.py
@@ -28,7 +28,7 @@ class NownessBaseIE(InfoExtractor):
bc_url = BrightcoveLegacyIE._extract_brightcove_url(player_code)
if bc_url:
return self.url_result(bc_url, BrightcoveLegacyIE.ie_key())
- bc_url = BrightcoveNewIE._extract_url(player_code)
+ bc_url = BrightcoveNewIE._extract_url(self, player_code)
if bc_url:
return self.url_result(bc_url, BrightcoveNewIE.ie_key())
raise ExtractorError('Could not find player definition')
diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py
index 38fefe4..79296f0 100644
--- a/youtube_dl/extractor/npo.py
+++ b/youtube_dl/extractor/npo.py
@@ -313,9 +313,9 @@ class NPOIE(NPOBaseIE):
class NPOLiveIE(NPOBaseIE):
IE_NAME = 'npo.nl:live'
- _VALID_URL = r'https?://(?:www\.)?npo\.nl/live/(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?npo\.nl/live(?:/(?P<id>[^/?#&]+))?'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.npo.nl/live/npo-1',
'info_dict': {
'id': 'LI_NL1_4188102',
@@ -327,10 +327,13 @@ class NPOLiveIE(NPOBaseIE):
'params': {
'skip_download': True,
}
- }
+ }, {
+ 'url': 'http://www.npo.nl/live',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- display_id = self._match_id(url)
+ display_id = self._match_id(url) or 'npo-1'
webpage = self._download_webpage(url, display_id)
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
index 7fe79cb..3b4f51f 100644
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -148,13 +148,34 @@ class NRKBaseIE(InfoExtractor):
vcodec = 'none' if data.get('mediaType') == 'Audio' else None
- # TODO: extract chapters when https://github.com/rg3/youtube-dl/pull/9409 is merged
-
for entry in entries:
entry.update(common_info)
for f in entry['formats']:
f['vcodec'] = vcodec
+ points = data.get('shortIndexPoints')
+ if isinstance(points, list):
+ chapters = []
+ for next_num, point in enumerate(points, start=1):
+ if not isinstance(point, dict):
+ continue
+ start_time = parse_duration(point.get('startPoint'))
+ if start_time is None:
+ continue
+ end_time = parse_duration(
+ data.get('duration')
+ if next_num == len(points)
+ else points[next_num].get('startPoint'))
+ if end_time is None:
+ continue
+ chapters.append({
+ 'start_time': start_time,
+ 'end_time': end_time,
+ 'title': point.get('title'),
+ })
+ if chapters and len(entries) == 1:
+ entries[0]['chapters'] = chapters
+
return self.playlist_result(entries, video_id, title, description)
diff --git a/youtube_dl/extractor/nuevo.py b/youtube_dl/extractor/nuevo.py
index 87fb94d..be1e09d 100644
--- a/youtube_dl/extractor/nuevo.py
+++ b/youtube_dl/extractor/nuevo.py
@@ -10,9 +10,10 @@ from ..utils import (
class NuevoBaseIE(InfoExtractor):
- def _extract_nuevo(self, config_url, video_id):
+ def _extract_nuevo(self, config_url, video_id, headers={}):
config = self._download_xml(
- config_url, video_id, transform_source=lambda s: s.strip())
+ config_url, video_id, transform_source=lambda s: s.strip(),
+ headers=headers)
title = xpath_text(config, './title', 'title', fatal=True).strip()
video_id = xpath_text(config, './mediaid', default=video_id)
diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py
index 986708e..854b680 100644
--- a/youtube_dl/extractor/odnoklassniki.py
+++ b/youtube_dl/extractor/odnoklassniki.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
+ compat_etree_fromstring,
compat_parse_qs,
compat_urllib_parse_unquote,
compat_urllib_parse_urlparse,
@@ -37,7 +38,7 @@ class OdnoklassnikiIE(InfoExtractor):
}, {
# metadataUrl
'url': 'http://ok.ru/video/63567059965189-0?fromTime=5',
- 'md5': '9676cf86eff5391d35dea675d224e131',
+ 'md5': '6ff470ea2dd51d5d18c295a355b0b6bc',
'info_dict': {
'id': '63567059965189-0',
'ext': 'mp4',
@@ -53,7 +54,7 @@ class OdnoklassnikiIE(InfoExtractor):
}, {
# YouTube embed (metadataUrl, provider == USER_YOUTUBE)
'url': 'http://ok.ru/video/64211978996595-1',
- 'md5': '5d7475d428845cd2e13bae6f1a992278',
+ 'md5': '2f206894ffb5dbfcce2c5a14b909eea5',
'info_dict': {
'id': '64211978996595-1',
'ext': 'mp4',
@@ -61,8 +62,8 @@ class OdnoklassnikiIE(InfoExtractor):
'description': 'md5:848eb8b85e5e3471a3a803dae1343ed0',
'duration': 440,
'upload_date': '20150826',
- 'uploader_id': '750099571',
- 'uploader': 'Алина П',
+ 'uploader_id': 'tvroscosmos',
+ 'uploader': 'Телестудия Роскосмоса',
'age_limit': 0,
},
}, {
@@ -81,6 +82,7 @@ class OdnoklassnikiIE(InfoExtractor):
'params': {
'skip_download': True,
},
+ 'skip': 'Video has not been found',
}, {
'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
'only_matching': True,
@@ -176,14 +178,32 @@ class OdnoklassnikiIE(InfoExtractor):
})
return info
- quality = qualities(('mobile', 'lowest', 'low', 'sd', 'hd'))
+ quality = qualities(('4', '0', '1', '2', '3', '5'))
formats = [{
'url': f['url'],
'ext': 'mp4',
'format_id': f['name'],
- 'quality': quality(f['name']),
} for f in metadata['videos']]
+
+ m3u8_url = metadata.get('hlsManifestUrl')
+ if m3u8_url:
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+
+ dash_manifest = metadata.get('metadataEmbedded')
+ if dash_manifest:
+ formats.extend(self._parse_mpd_formats(
+ compat_etree_fromstring(dash_manifest), 'mpd'))
+
+ for fmt in formats:
+ fmt_type = self._search_regex(
+ r'\btype[/=](\d)', fmt['url'],
+ 'format type', default=None)
+ if fmt_type:
+ fmt['quality'] = quality(fmt_type)
+
self._sort_formats(formats)
info['formats'] = formats
diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py
index 58ffde5..d8036b5 100644
--- a/youtube_dl/extractor/openload.py
+++ b/youtube_dl/extractor/openload.py
@@ -75,51 +75,38 @@ class OpenloadIE(InfoExtractor):
'<span[^>]+id="[^"]+"[^>]*>([0-9A-Za-z]+)</span>',
webpage, 'openload ID')
- video_url_chars = []
-
- first_char = ord(ol_id[0])
- key = first_char - 55
- maxKey = max(2, key)
- key = min(maxKey, len(ol_id) - 38)
- t = ol_id[key:key + 36]
-
- hashMap = {}
- v = ol_id.replace(t, '')
- h = 0
-
- while h < len(t):
- f = t[h:h + 3]
- i = int(f, 8)
- hashMap[h / 3] = i
- h += 3
-
- h = 0
- H = 0
- while h < len(v):
- B = ''
- C = ''
- if len(v) >= h + 2:
- B = v[h:h + 2]
- if len(v) >= h + 3:
- C = v[h:h + 3]
- i = int(B, 16)
- h += 2
- if H % 3 == 0:
- i = int(C, 8)
- h += 1
- elif H % 2 == 0 and H != 0 and ord(v[H - 1]) < 60:
- i = int(C, 10)
- h += 1
- index = H % 7
-
- A = hashMap[index]
- i ^= 213
- i ^= A
- video_url_chars.append(compat_chr(i))
- H += 1
+ decoded = ''
+ a = ol_id[0:24]
+ b = []
+ for i in range(0, len(a), 8):
+ b.append(int(a[i:i + 8] or '0', 16))
+ ol_id = ol_id[24:]
+ j = 0
+ k = 0
+ while j < len(ol_id):
+ c = 128
+ d = 0
+ e = 0
+ f = 0
+ _more = True
+ while _more:
+ if j + 1 >= len(ol_id):
+ c = 143
+ f = int(ol_id[j:j + 2] or '0', 16)
+ j += 2
+ d += (f & 127) << e
+ e += 7
+ _more = f >= c
+ g = d ^ b[k % 3]
+ for i in range(4):
+ char_dec = (g >> 8 * i) & (c + 127)
+ char = compat_chr(char_dec)
+ if char != '#':
+ decoded += char
+ k += 1
video_url = 'https://openload.co/stream/%s?mime=true'
- video_url = video_url % (''.join(video_url_chars))
+ video_url = video_url % decoded
title = self._og_search_title(webpage, default=None) or self._search_regex(
r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage,
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
index 1e2c54e..cc296ea 100644
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@@ -2,8 +2,6 @@
from __future__ import unicode_literals
import re
-import calendar
-import datetime
from .common import InfoExtractor
from ..compat import compat_str
@@ -144,77 +142,25 @@ class ORFTVthekIE(InfoExtractor):
}
-class ORFOE1IE(InfoExtractor):
- IE_NAME = 'orf:oe1'
- IE_DESC = 'Radio Österreich 1'
- _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole\?.*?\btrack_id=)(?P<id>[0-9]+)'
-
- # Audios on ORF radio are only available for 7 days, so we can't add tests.
- _TESTS = [{
- 'url': 'http://oe1.orf.at/konsole?show=on_demand#?track_id=394211',
- 'only_matching': True,
- }, {
- 'url': 'http://oe1.orf.at/konsole?show=ondemand&track_id=443608&load_day=/programm/konsole/tag/20160726',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- show_id = self._match_id(url)
- data = self._download_json(
- 'http://oe1.orf.at/programm/%s/konsole' % show_id,
- show_id
- )
-
- timestamp = datetime.datetime.strptime('%s %s' % (
- data['item']['day_label'],
- data['item']['time']
- ), '%d.%m.%Y %H:%M')
- unix_timestamp = calendar.timegm(timestamp.utctimetuple())
-
- return {
- 'id': show_id,
- 'title': data['item']['title'],
- 'url': data['item']['url_stream'],
- 'ext': 'mp3',
- 'description': data['item'].get('info'),
- 'timestamp': unix_timestamp
- }
-
-
-class ORFFM4IE(InfoExtractor):
- IE_NAME = 'orf:fm4'
- IE_DESC = 'radio FM4'
- _VALID_URL = r'https?://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)'
-
- _TEST = {
- 'url': 'http://fm4.orf.at/player/20160110/IS/',
- 'md5': '01e736e8f1cef7e13246e880a59ad298',
- 'info_dict': {
- 'id': '2016-01-10_2100_tl_54_7DaysSun13_11244',
- 'ext': 'mp3',
- 'title': 'Im Sumpf',
- 'description': 'md5:384c543f866c4e422a55f66a62d669cd',
- 'duration': 7173,
- 'timestamp': 1452456073,
- 'upload_date': '20160110',
- },
- 'skip': 'Live streams on FM4 got deleted soon',
- }
-
+class ORFRadioIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
+ station = mobj.group('station')
show_date = mobj.group('date')
show_id = mobj.group('show')
+ if station == 'fm4':
+ show_id = '4%s' % show_id
+
data = self._download_json(
- 'http://audioapi.orf.at/fm4/json/2.0/broadcasts/%s/4%s' % (show_date, show_id),
+ 'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s' % (station, show_id, show_date),
show_id
)
def extract_entry_dict(info, title, subtitle):
return {
'id': info['loopStreamId'].replace('.mp3', ''),
- 'url': 'http://loopstream01.apa.at/?channel=fm4&id=%s' % info['loopStreamId'],
+ 'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (station, info['loopStreamId']),
'title': title,
'description': subtitle,
'duration': (info['end'] - info['start']) / 1000,
@@ -233,6 +179,47 @@ class ORFFM4IE(InfoExtractor):
}
+class ORFFM4IE(ORFRadioIE):
+ IE_NAME = 'orf:fm4'
+ IE_DESC = 'radio FM4'
+ _VALID_URL = r'https?://(?P<station>fm4)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
+
+ _TEST = {
+ 'url': 'http://fm4.orf.at/player/20170107/CC',
+ 'md5': '2b0be47375432a7ef104453432a19212',
+ 'info_dict': {
+ 'id': '2017-01-07_2100_tl_54_7DaysSat18_31295',
+ 'ext': 'mp3',
+ 'title': 'Solid Steel Radioshow',
+ 'description': 'Die Mixshow von Coldcut und Ninja Tune.',
+ 'duration': 3599,
+ 'timestamp': 1483819257,
+ 'upload_date': '20170107',
+ },
+ 'skip': 'Shows from ORF radios are only available for 7 days.'
+ }
+
+
+class ORFOE1IE(ORFRadioIE):
+ IE_NAME = 'orf:oe1'
+ IE_DESC = 'Radio Österreich 1'
+ _VALID_URL = r'https?://(?P<station>oe1)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
+
+ _TEST = {
+ 'url': 'http://oe1.orf.at/player/20170108/456544',
+ 'md5': '34d8a6e67ea888293741c86a099b745b',
+ 'info_dict': {
+ 'id': '2017-01-08_0759_tl_51_7DaysSun6_256141',
+ 'ext': 'mp3',
+ 'title': 'Morgenjournal',
+ 'duration': 609,
+ 'timestamp': 1483858796,
+ 'upload_date': '20170108',
+ },
+ 'skip': 'Shows from ORF radios are only available for 7 days.'
+ }
+
+
class ORFIPTVIE(InfoExtractor):
IE_NAME = 'orf:iptv'
IE_DESC = 'iptv.ORF.at'
diff --git a/youtube_dl/extractor/packtpub.py b/youtube_dl/extractor/packtpub.py
new file mode 100644
index 0000000..bb668c9
--- /dev/null
+++ b/youtube_dl/extractor/packtpub.py
@@ -0,0 +1,171 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+ compat_HTTPError,
+)
+from ..utils import (
+ clean_html,
+ ExtractorError,
+ remove_end,
+ strip_or_none,
+ unified_timestamp,
+ urljoin,
+ urlencode_postdata,
+)
+
+
+class PacktPubBaseIE(InfoExtractor):
+ _PACKT_BASE = 'https://www.packtpub.com'
+ _MAPT_REST = '%s/mapt-rest' % _PACKT_BASE
+
+
+class PacktPubIE(PacktPubBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?packtpub\.com/mapt/video/[^/]+/(?P<course_id>\d+)/(?P<chapter_id>\d+)/(?P<id>\d+)'
+
+ _TEST = {
+ 'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215/20528/20530/Project+Intro',
+ 'md5': '1e74bd6cfd45d7d07666f4684ef58f70',
+ 'info_dict': {
+ 'id': '20530',
+ 'ext': 'mp4',
+ 'title': 'Project Intro',
+ 'thumbnail': r're:(?i)^https?://.*\.jpg',
+ 'timestamp': 1490918400,
+ 'upload_date': '20170331',
+ },
+ }
+ _NETRC_MACHINE = 'packtpub'
+ _TOKEN = None
+
+ def _real_initialize(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+ webpage = self._download_webpage(self._PACKT_BASE, None)
+ login_form = self._form_hidden_inputs(
+ 'packt-user-login-form', webpage)
+ login_form.update({
+ 'email': username,
+ 'password': password,
+ })
+ self._download_webpage(
+ self._PACKT_BASE, None, 'Logging in as %s' % username,
+ data=urlencode_postdata(login_form))
+ try:
+ self._TOKEN = self._download_json(
+ '%s/users/tokens/sessions' % self._MAPT_REST, None,
+ 'Downloading Authorization Token')['data']['token']
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 404):
+ message = self._parse_json(e.cause.read().decode(), None)['message']
+ raise ExtractorError(message, expected=True)
+ raise
+
+    def _handle_error(self, response):
+        if response.get('status') != 'success':
+            raise ExtractorError(
+                '%s said: %s' % (self.IE_NAME, response['message']),
+                expected=True)
+
+ def _download_json(self, *args, **kwargs):
+ response = super(PacktPubIE, self)._download_json(*args, **kwargs)
+ self._handle_error(response)
+ return response
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ course_id, chapter_id, video_id = mobj.group(
+ 'course_id', 'chapter_id', 'id')
+
+ headers = {}
+ if self._TOKEN:
+ headers['Authorization'] = self._TOKEN
+ video = self._download_json(
+ '%s/users/me/products/%s/chapters/%s/sections/%s'
+ % (self._MAPT_REST, course_id, chapter_id, video_id), video_id,
+ 'Downloading JSON video', headers=headers)['data']
+
+ content = video.get('content')
+ if not content:
+ self.raise_login_required('This video is locked')
+
+ video_url = content['file']
+
+ metadata = self._download_json(
+ '%s/products/%s/chapters/%s/sections/%s/metadata'
+ % (self._MAPT_REST, course_id, chapter_id, video_id),
+ video_id)['data']
+
+ title = metadata['pageTitle']
+ course_title = metadata.get('title')
+ if course_title:
+ title = remove_end(title, ' - %s' % course_title)
+ timestamp = unified_timestamp(metadata.get('publicationDate'))
+ thumbnail = urljoin(self._PACKT_BASE, metadata.get('filepath'))
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ }
+
+
+class PacktPubCourseIE(PacktPubBaseIE):
+ _VALID_URL = r'(?P<url>https?://(?:www\.)?packtpub\.com/mapt/video/[^/]+/(?P<id>\d+))'
+ _TEST = {
+ 'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215',
+ 'info_dict': {
+ 'id': '9781787122215',
+ 'title': 'Learn Nodejs by building 12 projects [Video]',
+ },
+ 'playlist_count': 90,
+ }
+
+ @classmethod
+ def suitable(cls, url):
+ return False if PacktPubIE.suitable(url) else super(
+ PacktPubCourseIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ url, course_id = mobj.group('url', 'id')
+
+ course = self._download_json(
+ '%s/products/%s/metadata' % (self._MAPT_REST, course_id),
+ course_id)['data']
+
+ entries = []
+ for chapter_num, chapter in enumerate(course['tableOfContents'], 1):
+ if chapter.get('type') != 'chapter':
+ continue
+ children = chapter.get('children')
+ if not isinstance(children, list):
+ continue
+ chapter_info = {
+ 'chapter': chapter.get('title'),
+ 'chapter_number': chapter_num,
+ 'chapter_id': chapter.get('id'),
+ }
+ for section in children:
+ if section.get('type') != 'section':
+ continue
+ section_url = section.get('seoUrl')
+ if not isinstance(section_url, compat_str):
+ continue
+ entry = {
+ '_type': 'url_transparent',
+ 'url': urljoin(url + '/', section_url),
+ 'title': strip_or_none(section.get('title')),
+ 'description': clean_html(section.get('summary')),
+ 'ie_key': PacktPubIE.ie_key(),
+ }
+ entry.update(chapter_info)
+ entries.append(entry)
+
+ return self.playlist_result(entries, course_id, course.get('title'))
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index 3e51b4d..16cc667 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -8,7 +8,9 @@ from ..utils import (
ExtractorError,
determine_ext,
int_or_none,
+ float_or_none,
js_to_json,
+ orderedSet,
strip_jsonp,
strip_or_none,
unified_strdate,
@@ -264,6 +266,13 @@ class PBSIE(InfoExtractor):
'playlist_count': 2,
},
{
+ 'url': 'http://www.pbs.org/wgbh/americanexperience/films/great-war/',
+ 'info_dict': {
+ 'id': 'great-war',
+ },
+ 'playlist_count': 3,
+ },
+ {
'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/',
'info_dict': {
'id': '2276541483',
@@ -381,10 +390,10 @@ class PBSIE(InfoExtractor):
# tabbed frontline videos
MULTI_PART_REGEXES = (
r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"',
- r'<a[^>]+href=["\']#video-\d+["\'][^>]+data-coveid=["\'](\d+)',
+ r'<a[^>]+href=["\']#(?:video-|part)\d+["\'][^>]+data-cove[Ii]d=["\'](\d+)',
)
for p in MULTI_PART_REGEXES:
- tabbed_videos = re.findall(p, webpage)
+ tabbed_videos = orderedSet(re.findall(p, webpage))
if tabbed_videos:
return tabbed_videos, presumptive_id, upload_date, description
@@ -464,6 +473,7 @@ class PBSIE(InfoExtractor):
redirects.append(redirect)
redirect_urls.add(redirect_url)
+ chapters = []
# Player pages may also serve different qualities
for page in ('widget/partnerplayer', 'portalplayer'):
player = self._download_webpage(
@@ -479,6 +489,20 @@ class PBSIE(InfoExtractor):
extract_redirect_urls(video_info)
if not info:
info = video_info
+ if not chapters:
+ for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player):
+ chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False)
+ if not chapter:
+ continue
+ start_time = float_or_none(chapter.get('start_time'), 1000)
+ duration = float_or_none(chapter.get('duration'), 1000)
+ if start_time is None or duration is None:
+ continue
+ chapters.append({
+ 'start_time': start_time,
+ 'end_time': start_time + duration,
+ 'title': chapter.get('title'),
+ })
formats = []
http_url = None
@@ -515,7 +539,7 @@ class PBSIE(InfoExtractor):
http_url = format_url
self._remove_duplicate_formats(formats)
m3u8_formats = list(filter(
- lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
formats))
if http_url:
for m3u8_format in m3u8_formats:
@@ -588,4 +612,5 @@ class PBSIE(InfoExtractor):
'upload_date': upload_date,
'formats': formats,
'subtitles': subtitles,
+ 'chapters': chapters,
}
diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py
index 0e36230..1add6b8 100644
--- a/youtube_dl/extractor/periscope.py
+++ b/youtube_dl/extractor/periscope.py
@@ -20,7 +20,7 @@ class PeriscopeBaseIE(InfoExtractor):
class PeriscopeIE(PeriscopeBaseIE):
IE_DESC = 'Periscope'
IE_NAME = 'periscope'
- _VALID_URL = r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P<id>[^/?#]+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P<id>[^/?#]+)'
# Alive example URLs can be found here http://onperiscope.com/
_TESTS = [{
'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==',
@@ -41,6 +41,9 @@ class PeriscopeIE(PeriscopeBaseIE):
}, {
'url': 'https://www.periscope.tv/bastaakanoggano/1OdKrlkZZjOJX',
'only_matching': True,
+ }, {
+ 'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv',
+ 'only_matching': True,
}]
@staticmethod
@@ -103,7 +106,7 @@ class PeriscopeIE(PeriscopeBaseIE):
class PeriscopeUserIE(PeriscopeBaseIE):
- _VALID_URL = r'https?://(?:www\.)?periscope\.tv/(?P<id>[^/]+)/?$'
+ _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/(?P<id>[^/]+)/?$'
IE_DESC = 'Periscope user videos'
IE_NAME = 'periscope:user'
diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py
index 073fc3e..24c3600 100644
--- a/youtube_dl/extractor/porn91.py
+++ b/youtube_dl/extractor/porn91.py
@@ -1,10 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-from ..compat import (
- compat_urllib_parse_unquote,
- compat_urllib_parse_urlencode,
-)
from .common import InfoExtractor
from ..utils import (
parse_duration,
@@ -19,7 +15,7 @@ class Porn91IE(InfoExtractor):
_TEST = {
'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134',
- 'md5': '6df8f6d028bc8b14f5dbd73af742fb20',
+ 'md5': '7fcdb5349354f40d41689bd0fa8db05a',
'info_dict': {
'id': '7e42283b4f5ab36da134',
'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!',
@@ -43,24 +39,7 @@ class Porn91IE(InfoExtractor):
r'<div id="viewvideo-title">([^<]+)</div>', webpage, 'title')
title = title.replace('\n', '')
- # get real url
- file_id = self._search_regex(
- r'so.addVariable\(\'file\',\'(\d+)\'', webpage, 'file id')
- sec_code = self._search_regex(
- r'so.addVariable\(\'seccode\',\'([^\']+)\'', webpage, 'sec code')
- max_vid = self._search_regex(
- r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage, 'max vid')
- url_params = compat_urllib_parse_urlencode({
- 'VID': file_id,
- 'mp4': '1',
- 'seccode': sec_code,
- 'max_vid': max_vid,
- })
- info_cn = self._download_webpage(
- 'http://91porn.com/getfile.php?' + url_params, video_id,
- 'Downloading real video url')
- video_url = compat_urllib_parse_unquote(self._search_regex(
- r'file=([^&]+)&', info_cn, 'url'))
+ info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0]
duration = parse_duration(self._search_regex(
r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False))
@@ -68,11 +47,12 @@ class Porn91IE(InfoExtractor):
comment_count = int_or_none(self._search_regex(
r'留言:\s*</span>\s*(\d+)', webpage, 'comment count', fatal=False))
- return {
+ info_dict.update({
'id': video_id,
'title': title,
- 'url': video_url,
'duration': duration,
'comment_count': comment_count,
'age_limit': self._rta_search(webpage),
- }
+ })
+
+ return info_dict
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index b25f1f1..1dcc8df 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -33,7 +33,7 @@ class PornHubIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:
- (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)|
+ (?:[a-z]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
(?:www\.)?thumbzilla\.com/video/
)
(?P<id>[\da-z]+)
@@ -97,6 +97,9 @@ class PornHubIE(InfoExtractor):
}, {
'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
'only_matching': True,
+ }, {
+ 'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
+ 'only_matching': True,
}]
@staticmethod
diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py
index ed38c77..e2202d6 100644
--- a/youtube_dl/extractor/r7.py
+++ b/youtube_dl/extractor/r7.py
@@ -62,8 +62,7 @@ class R7IE(InfoExtractor):
# m3u8 format always matches the http format, let's copy metadata from
# one to another
m3u8_formats = list(filter(
- lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
- formats))
+ lambda f: f.get('vcodec') != 'none', formats))
if len(m3u8_formats) == 1:
f_copy = m3u8_formats[0].copy()
f_copy.update(f)
diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py
index 41afbd9..81eb9db 100644
--- a/youtube_dl/extractor/rai.py
+++ b/youtube_dl/extractor/rai.py
@@ -1,23 +1,40 @@
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
-from ..compat import compat_urlparse
+from ..compat import (
+ compat_urlparse,
+ compat_str,
+)
from ..utils import (
- determine_ext,
ExtractorError,
+ determine_ext,
find_xpath_attr,
fix_xml_ampersands,
+ GeoRestrictedError,
int_or_none,
parse_duration,
+ strip_or_none,
+ try_get,
unified_strdate,
+ unified_timestamp,
update_url_query,
+ urljoin,
xpath_text,
)
class RaiBaseIE(InfoExtractor):
- def _extract_relinker_formats(self, relinker_url, video_id):
+ _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
+ _GEO_COUNTRIES = ['IT']
+ _GEO_BYPASS = False
+
+ def _extract_relinker_info(self, relinker_url, video_id):
formats = []
+ geoprotection = None
+ is_live = None
+ duration = None
for platform in ('mon', 'flash', 'native'):
relinker = self._download_xml(
@@ -27,9 +44,27 @@ class RaiBaseIE(InfoExtractor):
query={'output': 45, 'pl': platform},
headers=self.geo_verification_headers())
- media_url = find_xpath_attr(relinker, './url', 'type', 'content').text
+ if not geoprotection:
+ geoprotection = xpath_text(
+ relinker, './geoprotection', default=None) == 'Y'
+
+ if not is_live:
+ is_live = xpath_text(
+ relinker, './is_live', default=None) == 'Y'
+ if not duration:
+ duration = parse_duration(xpath_text(
+ relinker, './duration', default=None))
+
+ url_elem = find_xpath_attr(relinker, './url', 'type', 'content')
+ if url_elem is None:
+ continue
+
+ media_url = url_elem.text
+
+ # This does not imply geo restriction (e.g.
+ # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html)
if media_url == 'http://download.rai.it/video_no_available.mp4':
- self.raise_geo_restricted()
+ continue
ext = determine_ext(media_url)
if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'):
@@ -53,215 +88,333 @@ class RaiBaseIE(InfoExtractor):
'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http',
})
- return formats
+ if not formats and geoprotection is True:
+ self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+
+ return dict((k, v) for k, v in {
+ 'is_live': is_live,
+ 'duration': duration,
+ 'formats': formats,
+ }.items() if v is not None)
+
+ @staticmethod
+ def _extract_subtitles(url, subtitle_url):
+ subtitles = {}
+ if subtitle_url and isinstance(subtitle_url, compat_str):
+ subtitle_url = urljoin(url, subtitle_url)
+ STL_EXT = '.stl'
+ SRT_EXT = '.srt'
+ subtitles['it'] = [{
+ 'ext': 'stl',
+ 'url': subtitle_url,
+ }]
+ if subtitle_url.endswith(STL_EXT):
+ srt_url = subtitle_url[:-len(STL_EXT)] + SRT_EXT
+ subtitles['it'].append({
+ 'ext': 'srt',
+ 'url': srt_url,
+ })
+ return subtitles
+
+
+class RaiPlayIE(RaiBaseIE):
+ _VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.html)' % RaiBaseIE._UUID_RE
+ _TESTS = [{
+ 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter',
+ 'md5': '340aa3b7afb54bfd14a8c11786450d76',
+ 'info_dict': {
+ 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66',
+ 'ext': 'mp4',
+ 'title': 'La Casa Bianca',
+ 'alt_title': 'S2016 - Puntata del 23/10/2016',
+ 'description': 'md5:a09d45890850458077d1f68bb036e0a5',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Rai 3',
+ 'creator': 'Rai 3',
+ 'duration': 3278,
+ 'timestamp': 1477764300,
+ 'upload_date': '20161029',
+ 'series': 'La Casa Bianca',
+ 'season': '2016',
+ },
+ }, {
+ 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
+ 'md5': '8970abf8caf8aef4696e7b1f2adfc696',
+ 'info_dict': {
+ 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',
+ 'ext': 'mp4',
+ 'title': 'Report del 07/04/2014',
+ 'alt_title': 'S2013/14 - Puntata del 07/04/2014',
+ 'description': 'md5:f27c544694cacb46a078db84ec35d2d9',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Rai 5',
+ 'creator': 'Rai 5',
+ 'duration': 6160,
+ 'series': 'Report',
+ 'season_number': 5,
+ 'season': '2013/14',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ url, video_id = mobj.group('url', 'id')
- def _extract_from_content_id(self, content_id, base_url):
+ media = self._download_json(
+ '%s?json' % url, video_id, 'Downloading video JSON')
+
+ title = media['name']
+
+ video = media['video']
+
+ relinker_info = self._extract_relinker_info(video['contentUrl'], video_id)
+ self._sort_formats(relinker_info['formats'])
+
+ thumbnails = []
+ if 'images' in media:
+ for _, value in media.get('images').items():
+ if value:
+ thumbnails.append({
+ 'url': value.replace('[RESOLUTION]', '600x400')
+ })
+
+ timestamp = unified_timestamp(try_get(
+ media, lambda x: x['availabilities'][0]['start'], compat_str))
+
+ subtitles = self._extract_subtitles(url, video.get('subtitles'))
+
+ info = {
+ 'id': video_id,
+ 'title': title,
+ 'alt_title': media.get('subtitle'),
+ 'description': media.get('description'),
+ 'uploader': media.get('channel'),
+ 'creator': media.get('editor'),
+ 'duration': parse_duration(video.get('duration')),
+ 'timestamp': timestamp,
+ 'thumbnails': thumbnails,
+ 'series': try_get(
+ media, lambda x: x['isPartOf']['name'], compat_str),
+ 'season_number': int_or_none(try_get(
+ media, lambda x: x['isPartOf']['numeroStagioni'])),
+ 'season': media.get('stagione') or None,
+ 'subtitles': subtitles,
+ }
+
+ info.update(relinker_info)
+
+ return info
+
+
+class RaiIE(RaiBaseIE):
+ _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE
+ _TESTS = [{
+ # var uniquename = "ContentItem-..."
+ # data-id="ContentItem-..."
+ 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html',
+ 'info_dict': {
+ 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9',
+ 'ext': 'mp4',
+ 'title': 'TG PRIMO TEMPO',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 1758,
+ 'upload_date': '20140612',
+ }
+ }, {
+ # with ContentItem in many metas
+ 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html',
+ 'info_dict': {
+ 'id': '1632c009-c843-4836-bb65-80c33084a64b',
+ 'ext': 'mp4',
+ 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"',
+ 'description': 'I film in uscita questa settimana.',
+ 'thumbnail': r're:^https?://.*\.png$',
+ 'duration': 833,
+ 'upload_date': '20161103',
+ }
+ }, {
+ # with ContentItem in og:url
+ 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html',
+ 'md5': '11959b4e44fa74de47011b5799490adf',
+ 'info_dict': {
+ 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c',
+ 'ext': 'mp4',
+ 'title': 'TG1 ore 20:00 del 03/11/2016',
+ 'description': 'TG1 edizione integrale ore 20:00 del giorno 03/11/2016',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 2214,
+ 'upload_date': '20161103',
+ }
+ }, {
+ # drawMediaRaiTV(...)
+ 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html',
+ 'md5': '2dd727e61114e1ee9c47f0da6914e178',
+ 'info_dict': {
+ 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af',
+ 'ext': 'mp4',
+ 'title': 'Il pacco',
+ 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'upload_date': '20141221',
+ },
+ }, {
+ # initEdizione('ContentItem-...'
+ 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined',
+ 'info_dict': {
+ 'id': 'c2187016-8484-4e3a-8ac8-35e475b07303',
+ 'ext': 'mp4',
+ 'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}',
+ 'duration': 2274,
+ 'upload_date': '20170401',
+ },
+ 'skip': 'Changes daily',
+ }, {
+ # HDS live stream with only relinker URL
+ 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews',
+ 'info_dict': {
+ 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc',
+ 'ext': 'flv',
+ 'title': 'EuroNews',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # HLS live stream with ContentItem in og:url
+ 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html',
+ 'info_dict': {
+ 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9',
+ 'ext': 'mp4',
+ 'title': 'La diretta di Rainews24',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
+
+ def _extract_from_content_id(self, content_id, url):
media = self._download_json(
'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id,
content_id, 'Downloading video JSON')
- thumbnails = []
- for image_type in ('image', 'image_medium', 'image_300'):
- thumbnail_url = media.get(image_type)
- if thumbnail_url:
- thumbnails.append({
- 'url': compat_urlparse.urljoin(base_url, thumbnail_url),
- })
+ title = media['name'].strip()
- formats = []
media_type = media['type']
if 'Audio' in media_type:
- formats.append({
- 'format_id': media.get('formatoAudio'),
- 'url': media['audioUrl'],
- 'ext': media.get('formatoAudio'),
- })
+ relinker_info = {
+ 'formats': {
+ 'format_id': media.get('formatoAudio'),
+ 'url': media['audioUrl'],
+ 'ext': media.get('formatoAudio'),
+ }
+ }
elif 'Video' in media_type:
- formats.extend(self._extract_relinker_formats(media['mediaUri'], content_id))
- self._sort_formats(formats)
+ relinker_info = self._extract_relinker_info(media['mediaUri'], content_id)
else:
raise ExtractorError('not a media file')
- subtitles = {}
- captions = media.get('subtitlesUrl')
- if captions:
- STL_EXT = '.stl'
- SRT_EXT = '.srt'
- if captions.endswith(STL_EXT):
- captions = captions[:-len(STL_EXT)] + SRT_EXT
- subtitles['it'] = [{
- 'ext': 'srt',
- 'url': captions,
- }]
+ self._sort_formats(relinker_info['formats'])
+
+ thumbnails = []
+ for image_type in ('image', 'image_medium', 'image_300'):
+ thumbnail_url = media.get(image_type)
+ if thumbnail_url:
+ thumbnails.append({
+ 'url': compat_urlparse.urljoin(url, thumbnail_url),
+ })
- return {
+ subtitles = self._extract_subtitles(url, media.get('subtitlesUrl'))
+
+ info = {
'id': content_id,
- 'title': media['name'],
- 'description': media.get('desc'),
+ 'title': title,
+ 'description': strip_or_none(media.get('desc')),
'thumbnails': thumbnails,
'uploader': media.get('author'),
'upload_date': unified_strdate(media.get('date')),
'duration': parse_duration(media.get('length')),
- 'formats': formats,
'subtitles': subtitles,
}
+ info.update(relinker_info)
-class RaiTVIE(RaiBaseIE):
- _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+(?:media|ondemand)/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
- _TESTS = [
- {
- 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
- 'md5': '8970abf8caf8aef4696e7b1f2adfc696',
- 'info_dict': {
- 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',
- 'ext': 'mp4',
- 'title': 'Report del 07/04/2014',
- 'description': 'md5:f27c544694cacb46a078db84ec35d2d9',
- 'upload_date': '20140407',
- 'duration': 6160,
- 'thumbnail': r're:^https?://.*\.jpg$',
- }
- },
- {
- # no m3u8 stream
- 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html',
- # HDS download, MD5 is unstable
- 'info_dict': {
- 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9',
- 'ext': 'flv',
- 'title': 'TG PRIMO TEMPO',
- 'upload_date': '20140612',
- 'duration': 1758,
- 'thumbnail': r're:^https?://.*\.jpg$',
- },
- 'skip': 'Geo-restricted to Italy',
- },
- {
- 'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html',
- 'md5': '35cf7c229f22eeef43e48b5cf923bef0',
- 'info_dict': {
- 'id': '7aafdea9-0e5d-49d5-88a6-7e65da67ae13',
- 'ext': 'mp4',
- 'title': 'State of the Net, Antonella La Carpia: regole virali',
- 'description': 'md5:b0ba04a324126903e3da7763272ae63c',
- 'upload_date': '20140613',
- },
- 'skip': 'Error 404',
- },
- {
- 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-b4a49761-e0cc-4b14-8736-2729f6f73132-tg2.html',
- 'info_dict': {
- 'id': 'b4a49761-e0cc-4b14-8736-2729f6f73132',
- 'ext': 'mp4',
- 'title': 'Alluvione in Sardegna e dissesto idrogeologico',
- 'description': 'Edizione delle ore 20:30 ',
- },
- 'skip': 'invalid urls',
- },
- {
- 'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html',
- 'md5': 'e57493e1cb8bc7c564663f363b171847',
- 'info_dict': {
- 'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6',
- 'ext': 'mp4',
- 'title': 'Il Candidato - Primo episodio: "Le Primarie"',
- 'description': 'md5:364b604f7db50594678f483353164fb8',
- 'upload_date': '20140923',
- 'duration': 386,
- 'thumbnail': r're:^https?://.*\.jpg$',
- }
- },
- ]
+ return info
def _real_extract(self, url):
video_id = self._match_id(url)
- return self._extract_from_content_id(video_id, url)
+ webpage = self._download_webpage(url, video_id)
+ content_item_id = None
-class RaiIE(RaiBaseIE):
- _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
- _TESTS = [
- {
- 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html',
- 'md5': '2dd727e61114e1ee9c47f0da6914e178',
- 'info_dict': {
- 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af',
- 'ext': 'mp4',
- 'title': 'Il pacco',
- 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a',
- 'upload_date': '20141221',
- },
- },
- {
- # Direct relinker URL
- 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews',
- # HDS live stream, MD5 is unstable
- 'info_dict': {
- 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc',
- 'ext': 'flv',
- 'title': 'EuroNews',
- },
- 'skip': 'Geo-restricted to Italy',
- },
- {
- # Embedded content item ID
- 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined',
- 'md5': '84c1135ce960e8822ae63cec34441d63',
- 'info_dict': {
- 'id': '0960e765-62c8-474a-ac4b-7eb3e2be39c8',
- 'ext': 'mp4',
- 'title': 'TG1 ore 20:00 del 02/07/2016',
- 'upload_date': '20160702',
- },
- },
- {
- 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html',
- # HDS live stream, MD5 is unstable
- 'info_dict': {
- 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9',
- 'ext': 'flv',
- 'title': 'La diretta di Rainews24',
- },
- },
- ]
+ content_item_url = self._html_search_meta(
+ ('og:url', 'og:video', 'og:video:secure_url', 'twitter:url',
+ 'twitter:player', 'jsonlink'), webpage, default=None)
+ if content_item_url:
+ content_item_id = self._search_regex(
+ r'ContentItem-(%s)' % self._UUID_RE, content_item_url,
+ 'content item id', default=None)
- @classmethod
- def suitable(cls, url):
- return False if RaiTVIE.suitable(url) else super(RaiIE, cls).suitable(url)
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ if not content_item_id:
+ content_item_id = self._search_regex(
+ r'''(?x)
+ (?:
+ (?:initEdizione|drawMediaRaiTV)\(|
+ <(?:[^>]+\bdata-id|var\s+uniquename)=
+ )
+ (["\'])
+ (?:(?!\1).)*\bContentItem-(?P<id>%s)
+ ''' % self._UUID_RE,
+ webpage, 'content item id', default=None, group='id')
- iframe_url = self._search_regex(
- [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"',
- r'drawMediaRaiTV\(["\'](.+?)["\']'],
- webpage, 'iframe', default=None)
- if iframe_url:
- if not iframe_url.startswith('http'):
- iframe_url = compat_urlparse.urljoin(url, iframe_url)
- return self.url_result(iframe_url)
-
- content_item_id = self._search_regex(
- r'initEdizione\((?P<q1>[\'"])ContentItem-(?P<content_id>[^\'"]+)(?P=q1)',
- webpage, 'content item ID', group='content_id', default=None)
+ content_item_ids = set()
if content_item_id:
- return self._extract_from_content_id(content_item_id, url)
+ content_item_ids.add(content_item_id)
+ if video_id not in content_item_ids:
+ content_item_ids.add(video_id)
+
+ for content_item_id in content_item_ids:
+ try:
+ return self._extract_from_content_id(content_item_id, url)
+ except GeoRestrictedError:
+ raise
+ except ExtractorError:
+ pass
+
+ relinker_url = self._search_regex(
+ r'''(?x)
+ (?:
+ var\s+videoURL|
+ mediaInfo\.mediaUri
+ )\s*=\s*
+ ([\'"])
+ (?P<url>
+ (?:https?:)?
+ //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\?
+ (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1
+ ''',
+ webpage, 'relinker URL', group='url')
- relinker_url = compat_urlparse.urljoin(url, self._search_regex(
- r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)',
- webpage, 'relinker URL', group='url'))
- formats = self._extract_relinker_formats(relinker_url, video_id)
- self._sort_formats(formats)
+ relinker_info = self._extract_relinker_info(
+ urljoin(url, relinker_url), video_id)
+ self._sort_formats(relinker_info['formats'])
title = self._search_regex(
r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1',
- webpage, 'title', group='title', default=None) or self._og_search_title(webpage)
+ webpage, 'title', group='title',
+ default=None) or self._og_search_title(webpage)
- return {
+ info = {
'id': video_id,
'title': title,
- 'formats': formats,
}
+
+ info.update(relinker_info)
+
+ return info
diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py
index 53b82fb..afa7b91 100644
--- a/youtube_dl/extractor/rbmaradio.py
+++ b/youtube_dl/extractor/rbmaradio.py
@@ -13,15 +13,15 @@ from ..utils import (
class RBMARadioIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<show_id>[^/]+)/episodes/(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:rbmaradio|redbullradio)\.com/shows/(?P<show_id>[^/]+)/episodes/(?P<id>[^/?#&]+)'
_TEST = {
'url': 'https://www.rbmaradio.com/shows/main-stage/episodes/ford-lopatin-live-at-primavera-sound-2011',
'md5': '6bc6f9bcb18994b4c983bc3bf4384d95',
'info_dict': {
'id': 'ford-lopatin-live-at-primavera-sound-2011',
'ext': 'mp3',
- 'title': 'Main Stage - Ford & Lopatin',
- 'description': 'md5:4f340fb48426423530af5a9d87bd7b91',
+ 'title': 'Main Stage - Ford & Lopatin at Primavera Sound',
+ 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
'thumbnail': r're:^https?://.*\.jpg',
'duration': 2452,
'timestamp': 1307103164,
diff --git a/youtube_dl/extractor/rmcdecouverte.py b/youtube_dl/extractor/rmcdecouverte.py
index 2340dae..e921ca3 100644
--- a/youtube_dl/extractor/rmcdecouverte.py
+++ b/youtube_dl/extractor/rmcdecouverte.py
@@ -13,21 +13,20 @@ class RMCDecouverteIE(InfoExtractor):
_VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/mediaplayer-replay.*?\bid=(?P<id>\d+)'
_TEST = {
- 'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=1430&title=LES%20HEROS%20DU%2088e%20ETAGE',
+ 'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=13502&title=AQUAMEN:LES%20ROIS%20DES%20AQUARIUMS%20:UN%20DELICIEUX%20PROJET',
'info_dict': {
- 'id': '5111223049001',
+ 'id': '5419055995001',
'ext': 'mp4',
- 'title': ': LES HEROS DU 88e ETAGE',
- 'description': 'Découvrez comment la bravoure de deux hommes dans la Tour Nord du World Trade Center a sauvé la vie d\'innombrables personnes le 11 septembre 2001.',
+ 'title': 'UN DELICIEUX PROJET',
+ 'description': 'md5:63610df7c8b1fc1698acd4d0d90ba8b5',
'uploader_id': '1969646226001',
- 'upload_date': '20160904',
- 'timestamp': 1472951103,
+ 'upload_date': '20170502',
+ 'timestamp': 1493745308,
},
'params': {
- # rtmp download
'skip_download': True,
},
- 'skip': 'Only works from France',
+ 'skip': 'only available for a week',
}
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s'
@@ -35,5 +34,12 @@ class RMCDecouverteIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
- brightcove_id = compat_parse_qs(compat_urlparse.urlparse(brightcove_legacy_url).query)['@videoPlayer'][0]
- return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)
+ if brightcove_legacy_url:
+ brightcove_id = compat_parse_qs(compat_urlparse.urlparse(
+ brightcove_legacy_url).query)['@videoPlayer'][0]
+ else:
+ brightcove_id = self._search_regex(
+ r'data-video-id=["\'](\d+)', webpage, 'brightcove id')
+ return self.url_result(
+ self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew',
+ brightcove_id)
diff --git a/youtube_dl/extractor/rtl2.py b/youtube_dl/extractor/rtl2.py
index 721ee73..666e90e 100644
--- a/youtube_dl/extractor/rtl2.py
+++ b/youtube_dl/extractor/rtl2.py
@@ -1,13 +1,26 @@
# coding: utf-8
from __future__ import unicode_literals
+import base64
import re
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..aes import aes_cbc_decrypt
+from ..compat import (
+ compat_ord,
+ compat_str,
+)
+from ..utils import (
+ bytes_to_intlist,
+ ExtractorError,
+ intlist_to_bytes,
+ int_or_none,
+ strip_or_none,
+)
class RTL2IE(InfoExtractor):
+ IE_NAME = 'rtl2'
_VALID_URL = r'http?://(?:www\.)?rtl2\.de/[^?#]*?/(?P<id>[^?#/]*?)(?:$|/(?:$|[?#]))'
_TESTS = [{
'url': 'http://www.rtl2.de/sendung/grip-das-motormagazin/folge/folge-203-0',
@@ -98,3 +111,98 @@ class RTL2IE(InfoExtractor):
'duration': int_or_none(video_info.get('duration')),
'formats': formats,
}
+
+
+class RTL2YouBaseIE(InfoExtractor):
+ _BACKWERK_BASE_URL = 'https://p-you-backwerk.rtl2apps.de/'
+
+
+class RTL2YouIE(RTL2YouBaseIE):
+ IE_NAME = 'rtl2:you'
+ _VALID_URL = r'http?://you\.rtl2\.de/(?:video/\d+/|youplayer/index\.html\?.*?\bvid=)(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://you.rtl2.de/video/3002/15740/MJUNIK%20%E2%80%93%20Home%20of%20YOU/307-hirn-wo-bist-du',
+ 'info_dict': {
+ 'id': '15740',
+ 'ext': 'mp4',
+ 'title': 'MJUNIK – Home of YOU - #307 Hirn, wo bist du?!',
+ 'description': 'md5:ddaa95c61b372b12b66e115b2772fe01',
+ 'age_limit': 12,
+ },
+ }, {
+ 'url': 'http://you.rtl2.de/youplayer/index.html?vid=15712',
+ 'only_matching': True,
+ }]
+ _AES_KEY = b'\xe9W\xe4.<*\xb8\x1a\xd2\xb6\x92\xf3C\xd3\xefL\x1b\x03*\xbbbH\xc0\x03\xffo\xc2\xf2(\xaa\xaa!'
+ _GEO_COUNTRIES = ['DE']
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ stream_data = self._download_json(
+ self._BACKWERK_BASE_URL + 'stream/video/' + video_id, video_id)
+
+ data, iv = base64.b64decode(stream_data['streamUrl']).decode().split(':')
+ stream_url = intlist_to_bytes(aes_cbc_decrypt(
+ bytes_to_intlist(base64.b64decode(data)),
+ bytes_to_intlist(self._AES_KEY),
+ bytes_to_intlist(base64.b64decode(iv))
+ ))
+ if b'rtl2_you_video_not_found' in stream_url:
+ raise ExtractorError('video not found', expected=True)
+
+ formats = self._extract_m3u8_formats(
+ stream_url[:-compat_ord(stream_url[-1])].decode(),
+ video_id, 'mp4', 'm3u8_native')
+ self._sort_formats(formats)
+
+ video_data = self._download_json(
+ self._BACKWERK_BASE_URL + 'video/' + video_id, video_id)
+
+ series = video_data.get('formatTitle')
+ title = episode = video_data.get('title') or series
+ if series and series != title:
+ title = '%s - %s' % (series, title)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'description': strip_or_none(video_data.get('description')),
+ 'thumbnail': video_data.get('image'),
+ 'duration': int_or_none(stream_data.get('duration') or video_data.get('duration'), 1000),
+ 'series': series,
+ 'episode': episode,
+ 'age_limit': int_or_none(video_data.get('minimumAge')),
+ }
+
+
+class RTL2YouSeriesIE(RTL2YouBaseIE):
+ IE_NAME = 'rtl2:you:series'
+ _VALID_URL = r'http?://you\.rtl2\.de/videos/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://you.rtl2.de/videos/115/dragon-ball',
+ 'info_dict': {
+ 'id': '115',
+ },
+ 'playlist_mincount': 5,
+ }
+
+ def _real_extract(self, url):
+ series_id = self._match_id(url)
+ stream_data = self._download_json(
+ self._BACKWERK_BASE_URL + 'videos',
+ series_id, query={
+ 'formatId': series_id,
+ 'limit': 1000000000,
+ })
+
+ entries = []
+ for video in stream_data.get('videos', []):
+ video_id = compat_str(video['videoId'])
+ if not video_id:
+ continue
+ entries.append(self.url_result(
+ 'http://you.rtl2.de/video/%s/%s' % (series_id, video_id),
+ 'RTL2You', video_id))
+ return self.playlist_result(entries, series_id)
diff --git a/youtube_dl/extractor/rudo.py b/youtube_dl/extractor/rudo.py
index 5164401..f036f67 100644
--- a/youtube_dl/extractor/rudo.py
+++ b/youtube_dl/extractor/rudo.py
@@ -26,7 +26,7 @@ class RudoIE(InfoExtractor):
}
@classmethod
- def _extract_url(self, webpage):
+ def _extract_url(cls, webpage):
mobj = re.search(
r'<iframe[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//rudo\.video/vod/[0-9a-zA-Z]+)(?P=q1)',
webpage)
diff --git a/youtube_dl/extractor/streamable.py b/youtube_dl/extractor/streamable.py
index 9f5c237..3472527 100644
--- a/youtube_dl/extractor/streamable.py
+++ b/youtube_dl/extractor/streamable.py
@@ -12,7 +12,7 @@ from ..utils import (
class StreamableIE(InfoExtractor):
- _VALID_URL = r'https?://streamable\.com/(?:e/)?(?P<id>\w+)'
+ _VALID_URL = r'https?://streamable\.com/(?:[es]/)?(?P<id>\w+)'
_TESTS = [
{
'url': 'https://streamable.com/dnd1',
@@ -47,6 +47,10 @@ class StreamableIE(InfoExtractor):
{
'url': 'https://streamable.com/e/dnd1',
'only_matching': True,
+ },
+ {
+ 'url': 'https://streamable.com/s/okkqk/drxjds',
+ 'only_matching': True,
}
]
diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py
new file mode 100644
index 0000000..aa4fad1
--- /dev/null
+++ b/youtube_dl/extractor/streamango.py
@@ -0,0 +1,64 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ js_to_json,
+)
+
+
+class StreamangoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?streamango\.com/(?:f|embed)/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4',
+ 'md5': 'e992787515a182f55e38fc97588d802a',
+ 'info_dict': {
+ 'id': 'clapasobsptpkdfe',
+ 'ext': 'mp4',
+ 'title': '20170315_150006.mp4',
+ }
+ }, {
+ 'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._og_search_title(webpage)
+
+ formats = []
+ for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage):
+ video = self._parse_json(
+ format_, video_id, transform_source=js_to_json, fatal=False)
+ if not video:
+ continue
+ src = video.get('src')
+ if not src:
+ continue
+ ext = determine_ext(src, default_ext=None)
+ if video.get('type') == 'application/dash+xml' or ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ src, video_id, mpd_id='dash', fatal=False))
+ else:
+ formats.append({
+ 'url': src,
+ 'ext': ext or 'mp4',
+ 'width': int_or_none(video.get('width')),
+ 'height': int_or_none(video.get('height')),
+ 'tbr': int_or_none(video.get('bitrate')),
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'url': url,
+ 'title': title,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index 1b1afab..3f3c681 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -210,7 +210,7 @@ class TEDIE(InfoExtractor):
resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False))
m3u8_formats = list(filter(
- lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
formats))
if http_url:
for m3u8_format in m3u8_formats:
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index 9a424b1..de236bb 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -80,14 +80,33 @@ class ThePlatformBaseIE(OnceIE):
'url': src,
})
+ duration = info.get('duration')
+ tp_chapters = info.get('chapters', [])
+ chapters = []
+ if tp_chapters:
+ def _add_chapter(start_time, end_time):
+ start_time = float_or_none(start_time, 1000)
+ end_time = float_or_none(end_time, 1000)
+ if start_time is None or end_time is None:
+ return
+ chapters.append({
+ 'start_time': start_time,
+ 'end_time': end_time,
+ })
+
+ for chapter in tp_chapters[:-1]:
+ _add_chapter(chapter.get('startTime'), chapter.get('endTime'))
+ _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration)
+
return {
'title': info['title'],
'subtitles': subtitles,
'description': info['description'],
'thumbnail': info['defaultThumbnailUrl'],
- 'duration': int_or_none(info.get('duration'), 1000),
+ 'duration': float_or_none(duration, 1000),
'timestamp': int_or_none(info.get('pubDate'), 1000) or None,
'uploader': info.get('billingCode'),
+ 'chapters': chapters,
}
def _extract_theplatform_metadata(self, path, video_id):
diff --git a/youtube_dl/extractor/thescene.py b/youtube_dl/extractor/thescene.py
index b8504f0..cd64235 100644
--- a/youtube_dl/extractor/thescene.py
+++ b/youtube_dl/extractor/thescene.py
@@ -3,10 +3,6 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_urlparse
-from ..utils import (
- int_or_none,
- qualities,
-)
class TheSceneIE(InfoExtractor):
@@ -24,6 +20,9 @@ class TheSceneIE(InfoExtractor):
'season': 'Ready To Wear Spring 2013',
'tags': list,
'categories': list,
+ 'upload_date': '20120913',
+ 'timestamp': 1347512400,
+ 'uploader': 'vogue',
},
}
@@ -37,32 +36,9 @@ class TheSceneIE(InfoExtractor):
self._html_search_regex(
r'id=\'js-player-script\'[^>]+src=\'(.+?)\'', webpage, 'player url'))
- player = self._download_webpage(player_url, display_id)
- info = self._parse_json(
- self._search_regex(
- r'(?m)video\s*:\s*({.+?}),$', player, 'info json'),
- display_id)
-
- video_id = info['id']
- title = info['title']
-
- qualities_order = qualities(('low', 'high'))
- formats = [{
- 'format_id': '{0}-{1}'.format(f['type'].split('/')[0], f['quality']),
- 'url': f['src'],
- 'quality': qualities_order(f['quality']),
- } for f in info['sources']]
- self._sort_formats(formats)
-
return {
- 'id': video_id,
+ '_type': 'url_transparent',
'display_id': display_id,
- 'title': title,
- 'formats': formats,
- 'thumbnail': info.get('poster_frame'),
- 'duration': int_or_none(info.get('duration')),
- 'series': info.get('series_title'),
- 'season': info.get('season_title'),
- 'tags': info.get('tags'),
- 'categories': info.get('categories'),
+ 'url': player_url,
+ 'ie_key': 'CondeNast',
}
diff --git a/youtube_dl/extractor/thesun.py b/youtube_dl/extractor/thesun.py
new file mode 100644
index 0000000..22d0037
--- /dev/null
+++ b/youtube_dl/extractor/thesun.py
@@ -0,0 +1,32 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+
+
+class TheSunIE(InfoExtractor):
+ _VALID_URL = r'https://(?:www\.)?thesun\.co\.uk/[^/]+/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://www.thesun.co.uk/tvandshowbiz/2261604/orlando-bloom-and-katy-perry-post-adorable-instagram-video-together-celebrating-thanksgiving-after-split-rumours/',
+ 'info_dict': {
+ 'id': '2261604',
+ 'title': 'md5:cba22f48bad9218b64d5bbe0e16afddf',
+ },
+ 'playlist_count': 2,
+ }
+
+ def _real_extract(self, url):
+ article_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, article_id)
+
+ entries = []
+ for ooyala_id in re.findall(
+ r'<[^>]+\b(?:id\s*=\s*"thesun-ooyala-player-|data-content-id\s*=\s*")([^"]+)',
+ webpage):
+ entries.append(OoyalaIE._build_url_result(ooyala_id))
+
+ return self.playlist_result(
+ entries, article_id, self._og_search_title(webpage, fatal=False))
diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py
index 1c0be9f..efeb677 100644
--- a/youtube_dl/extractor/turner.py
+++ b/youtube_dl/extractor/turner.py
@@ -13,6 +13,7 @@ from ..utils import (
xpath_attr,
update_url_query,
ExtractorError,
+ strip_or_none,
)
@@ -163,17 +164,21 @@ class TurnerBaseIE(AdobePassIE):
'height': int_or_none(image.get('height')),
} for image in video_data.findall('images/image')]
+ is_live = xpath_text(video_data, 'isLive') == 'true'
+
return {
'id': video_id,
- 'title': title,
+ 'title': self._live_title(title) if is_live else title,
'formats': formats,
'subtitles': subtitles,
'thumbnails': thumbnails,
- 'description': xpath_text(video_data, 'description'),
+ 'thumbnail': xpath_text(video_data, 'poster'),
+ 'description': strip_or_none(xpath_text(video_data, 'description')),
'duration': parse_duration(xpath_text(video_data, 'length') or xpath_text(video_data, 'trt')),
'timestamp': self._extract_timestamp(video_data),
'upload_date': xpath_attr(video_data, 'metas', 'version'),
'series': xpath_text(video_data, 'showTitle'),
'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')),
'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')),
+ 'is_live': is_live,
}
diff --git a/youtube_dl/extractor/tv2hu.py b/youtube_dl/extractor/tv2hu.py
new file mode 100644
index 0000000..86017b7
--- /dev/null
+++ b/youtube_dl/extractor/tv2hu.py
@@ -0,0 +1,62 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
class TV2HuIE(InfoExtractor):
    IE_NAME = 'tv2.hu'
    _VALID_URL = r'https?://(?:www\.)?tv2\.hu/(?:[^/]+/)+(?P<id>\d+)_[^/?#]+?\.html'
    _TESTS = [{
        'url': 'http://tv2.hu/ezek_megorultek/217679_ezek-megorultek---1.-adas-1.-resz.html',
        'md5': '585e58e2e090f34603804bb2c48e98d8',
        'info_dict': {
            'id': '217679',
            'ext': 'mp4',
            'title': 'Ezek megőrültek! - 1. adás 1. rész',
            'upload_date': '20160826',
            'thumbnail': r're:^https?://.*\.jpg$'
        }
    }, {
        'url': 'http://tv2.hu/ezek_megorultek/teljes_adasok/217677_ezek-megorultek---1.-adas-2.-resz.html',
        'only_matching': True
    }, {
        'url': 'http://tv2.hu/musoraink/aktiv/aktiv_teljes_adas/217963_aktiv-teljes-adas---2016.08.30..html',
        'only_matching': True
    }]

    def _real_extract(self, url):
        """Extract HLS and progressive MP4 formats from the page's JSON manifest."""
        video_id = self._match_id(url)
        webpage = self._download_webpage(url, video_id)
        json_url = self._search_regex(
            r'jsonUrl\s*=\s*"([^"]+)"', webpage, 'json url')
        json_data = self._download_json(json_url, video_id)

        formats = []
        # Primary and backup stream sets share the same layout.
        for key in ('bitrates', 'backupBitrates'):
            bitrates = json_data.get(key, {})
            m3u8_url = bitrates.get('hls')
            if m3u8_url:
                formats.extend(self._extract_wowza_formats(
                    m3u8_url, video_id, skip_protocols=['rtmp', 'rtsp']))
            for mp4_url in bitrates.get('mp4', []):
                # Height is encoded in the file name, e.g. foo.720p.mp4.
                height = int_or_none(self._search_regex(
                    r'\.(\d+)p\.mp4', mp4_url, 'height', default=None))
                formats.append({
                    'format_id': 'http-%d' % height if height else 'http',
                    'url': mp4_url,
                    'height': height,
                    # Width is inferred assuming a 16:9 aspect ratio.
                    'width': int_or_none(height / 9.0 * 16.0 if height else None),
                })
        self._sort_formats(formats)

        return {
            'id': video_id,
            'title': self._og_search_title(webpage).strip(),
            'thumbnail': self._og_search_thumbnail(webpage),
            'upload_date': self._search_regex(
                r'/vod/(\d{8})/', json_url, 'upload_date', default=None),
            'formats': formats,
        }
diff --git a/youtube_dl/extractor/tv5mondeplus.py b/youtube_dl/extractor/tv5mondeplus.py
new file mode 100644
index 0000000..88b6baa
--- /dev/null
+++ b/youtube_dl/extractor/tv5mondeplus.py
@@ -0,0 +1,79 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ determine_ext,
+ extract_attributes,
+ get_element_by_class,
+ int_or_none,
+ parse_duration,
+ parse_iso8601,
+)
+
+
class TV5MondePlusIE(InfoExtractor):
    IE_DESC = 'TV5MONDE+'
    _VALID_URL = r'https?://(?:www\.)?tv5mondeplus\.com/toutes-les-videos/[^/]+/(?P<id>[^/?#]+)'
    _TEST = {
        'url': 'http://www.tv5mondeplus.com/toutes-les-videos/documentaire/tdah-mon-amour-tele-quebec-tdah-mon-amour-ep001-enfants',
        'md5': '12130fc199f020673138a83466542ec6',
        'info_dict': {
            'id': 'tdah-mon-amour-tele-quebec-tdah-mon-amour-ep001-enfants',
            'ext': 'mp4',
            'title': 'Tdah, mon amour - Enfants',
            'description': 'md5:230e3aca23115afcf8006d1bece6df74',
            'upload_date': '20170401',
            'timestamp': 1491022860,
        }
    }
    # Geo restriction is detected from page content, not HTTP errors,
    # so generic geo bypass does not apply.
    _GEO_BYPASS = False

    def _real_extract(self, url):
        """Extract a TV5MONDE+ video, raising when geo-blocked outside France."""
        display_id = self._match_id(url)
        webpage = self._download_webpage(url, display_id)

        if ">Ce programme n'est malheureusement pas disponible pour votre zone géographique.<" in webpage:
            self.raise_geo_restricted(countries=['FR'])

        series = get_element_by_class('video-detail__title', webpage)
        episode = get_element_by_class(
            'video-detail__subtitle', webpage) or series
        title = episode
        # Prefix the series name unless it already equals the title.
        if series and series != title:
            title = '%s - %s' % (series, title)
        vpl_data = extract_attributes(self._search_regex(
            r'(<[^>]+class="video_player_loader"[^>]+>)',
            webpage, 'video player loader'))

        broadcast = self._parse_json(vpl_data['data-broadcast'], display_id)
        formats = []
        for video_file in broadcast.get('files', []):
            file_url = video_file.get('url')
            if not file_url:
                continue
            file_format = video_file.get('format') or determine_ext(file_url)
            if file_format == 'm3u8':
                formats.extend(self._extract_m3u8_formats(
                    file_url, display_id, 'mp4', 'm3u8_native',
                    m3u8_id='hls', fatal=False))
            else:
                formats.append({
                    'url': file_url,
                    'format_id': file_format,
                })
        self._sort_formats(formats)

        return {
            'id': display_id,
            'display_id': display_id,
            'title': title,
            'description': clean_html(get_element_by_class('video-detail__description', webpage)),
            'thumbnail': vpl_data.get('data-image'),
            'duration': int_or_none(vpl_data.get('data-duration')) or parse_duration(self._html_search_meta('duration', webpage)),
            'timestamp': parse_iso8601(self._html_search_meta('uploadDate', webpage)),
            'formats': formats,
            'episode': episode,
            'series': series,
        }
diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py
index 06ea2b4..c5b3288 100644
--- a/youtube_dl/extractor/tvp.py
+++ b/youtube_dl/extractor/tvp.py
@@ -150,8 +150,7 @@ class TVPEmbedIE(InfoExtractor):
'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
self._sort_formats(m3u8_formats)
m3u8_formats = list(filter(
- lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
- m3u8_formats))
+ lambda f: f.get('vcodec') != 'none', m3u8_formats))
formats.extend(m3u8_formats)
for i, m3u8_format in enumerate(m3u8_formats, 2):
http_url = '%s-%d.mp4' % (video_url_base, i)
diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py
index 3eda0a3..99ff82a 100644
--- a/youtube_dl/extractor/tvplay.py
+++ b/youtube_dl/extractor/tvplay.py
@@ -225,7 +225,11 @@ class TVPlayIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
-
+ geo_country = self._search_regex(
+ r'https?://[^/]+\.([a-z]{2})', url,
+ 'geo country', default=None)
+ if geo_country:
+ self._initialize_geo_bypass([geo_country.upper()])
video = self._download_json(
'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON')
diff --git a/youtube_dl/extractor/tvplayer.py b/youtube_dl/extractor/tvplayer.py
index b653714..ebde605 100644
--- a/youtube_dl/extractor/tvplayer.py
+++ b/youtube_dl/extractor/tvplayer.py
@@ -2,9 +2,13 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..compat import (
+ compat_HTTPError,
+ compat_str,
+)
from ..utils import (
extract_attributes,
+ try_get,
urlencode_postdata,
ExtractorError,
)
@@ -34,25 +38,32 @@ class TVPlayerIE(InfoExtractor):
webpage, 'channel element'))
title = current_channel['data-name']
- resource_id = self._search_regex(
- r'resourceId\s*=\s*"(\d+)"', webpage, 'resource id')
- platform = self._search_regex(
- r'platform\s*=\s*"([^"]+)"', webpage, 'platform')
+ resource_id = current_channel['data-id']
+
token = self._search_regex(
- r'token\s*=\s*"([^"]+)"', webpage, 'token', default='null')
- validate = self._search_regex(
- r'validate\s*=\s*"([^"]+)"', webpage, 'validate', default='null')
+ r'data-token=(["\'])(?P<token>(?!\1).+)\1', webpage,
+ 'token', group='token')
+
+ context = self._download_json(
+ 'https://tvplayer.com/watch/context', display_id,
+ 'Downloading JSON context', query={
+ 'resource': resource_id,
+ 'nonce': token,
+ })
+
+ validate = context['validate']
+ platform = try_get(
+ context, lambda x: x['platform']['key'], compat_str) or 'firefox'
try:
response = self._download_json(
'http://api.tvplayer.com/api/v2/stream/live',
- resource_id, headers={
+ display_id, 'Downloading JSON stream', headers={
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
}, data=urlencode_postdata({
+ 'id': resource_id,
'service': 1,
'platform': platform,
- 'id': resource_id,
- 'token': token,
'validate': validate,
}))['tvplayer']['response']
except ExtractorError as e:
@@ -63,7 +74,7 @@ class TVPlayerIE(InfoExtractor):
'%s said: %s' % (self.IE_NAME, response['error']), expected=True)
raise
- formats = self._extract_m3u8_formats(response['stream'], resource_id, 'mp4')
+ formats = self._extract_m3u8_formats(response['stream'], display_id, 'mp4')
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py
index cce29c6..dae1aa3 100644
--- a/youtube_dl/extractor/udemy.py
+++ b/youtube_dl/extractor/udemy.py
@@ -212,12 +212,15 @@ class UdemyIE(InfoExtractor):
thumbnail = asset.get('thumbnail_url') or asset.get('thumbnailUrl')
duration = float_or_none(asset.get('data', {}).get('duration'))
+ subtitles = {}
+ automatic_captions = {}
+
formats = []
- def extract_output_format(src):
+ def extract_output_format(src, f_id):
return {
'url': src['url'],
- 'format_id': '%sp' % (src.get('height') or format_id),
+ 'format_id': '%sp' % (src.get('height') or f_id),
'width': int_or_none(src.get('width')),
'height': int_or_none(src.get('height')),
'vbr': int_or_none(src.get('video_bitrate_in_kbps')),
@@ -237,30 +240,33 @@ class UdemyIE(InfoExtractor):
def add_output_format_meta(f, key):
output = outputs.get(key)
if isinstance(output, dict):
- output_format = extract_output_format(output)
+ output_format = extract_output_format(output, key)
output_format.update(f)
return output_format
return f
+ def extract_formats(source_list):
+ if not isinstance(source_list, list):
+ return
+ for source in source_list:
+ video_url = source.get('file') or source.get('src')
+ if not video_url or not isinstance(video_url, compat_str):
+ continue
+ format_id = source.get('label')
+ f = {
+ 'url': video_url,
+ 'format_id': '%sp' % format_id,
+ 'height': int_or_none(format_id),
+ }
+ if format_id:
+ # Some videos contain additional metadata (e.g.
+ # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208)
+ f = add_output_format_meta(f, format_id)
+ formats.append(f)
+
download_urls = asset.get('download_urls')
if isinstance(download_urls, dict):
- video = download_urls.get('Video')
- if isinstance(video, list):
- for format_ in video:
- video_url = format_.get('file')
- if not video_url:
- continue
- format_id = format_.get('label')
- f = {
- 'url': format_['file'],
- 'format_id': '%sp' % format_id,
- 'height': int_or_none(format_id),
- }
- if format_id:
- # Some videos contain additional metadata (e.g.
- # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208)
- f = add_output_format_meta(f, format_id)
- formats.append(f)
+ extract_formats(download_urls.get('Video'))
view_html = lecture.get('view_html')
if view_html:
@@ -294,6 +300,35 @@ class UdemyIE(InfoExtractor):
'height': height,
}, res))
+ # react rendition since 2017.04.15 (see
+ # https://github.com/rg3/youtube-dl/issues/12744)
+ data = self._parse_json(
+ self._search_regex(
+ r'videojs-setup-data=(["\'])(?P<data>{.+?})\1', view_html,
+ 'setup data', default='{}', group='data'), video_id,
+ transform_source=unescapeHTML, fatal=False)
+ if data and isinstance(data, dict):
+ extract_formats(data.get('sources'))
+ if not duration:
+ duration = int_or_none(data.get('duration'))
+ tracks = data.get('tracks')
+ if isinstance(tracks, list):
+ for track in tracks:
+ if not isinstance(track, dict):
+ continue
+ if track.get('kind') != 'captions':
+ continue
+ src = track.get('src')
+ if not src or not isinstance(src, compat_str):
+ continue
+ lang = track.get('language') or track.get(
+ 'srclang') or track.get('label')
+ sub_dict = automatic_captions if track.get(
+ 'autogenerated') is True else subtitles
+ sub_dict.setdefault(lang, []).append({
+ 'url': src,
+ })
+
self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id'))
return {
@@ -302,7 +337,9 @@ class UdemyIE(InfoExtractor):
'description': description,
'thumbnail': thumbnail,
'duration': duration,
- 'formats': formats
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'automatic_captions': automatic_captions,
}
diff --git a/youtube_dl/extractor/upskill.py b/youtube_dl/extractor/upskill.py
new file mode 100644
index 0000000..30297b4
--- /dev/null
+++ b/youtube_dl/extractor/upskill.py
@@ -0,0 +1,176 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .wistia import WistiaIE
+from ..compat import compat_str
+from ..utils import (
+ clean_html,
+ ExtractorError,
+ get_element_by_class,
+ urlencode_postdata,
+ urljoin,
+)
+
+
class UpskillBaseIE(InfoExtractor):
    _LOGIN_URL = 'http://upskillcourses.com/sign_in'
    _NETRC_MACHINE = 'upskill'

    def _real_initialize(self):
        self._login()

    def _login(self):
        """Sign in with the configured credentials; anonymous use is allowed."""
        username, password = self._get_login_info()
        if username is None:
            return

        login_page, urlh = self._download_webpage_handle(
            self._LOGIN_URL, None, 'Downloading login page')

        # The sign-in page may redirect; use the final URL as the referer
        # and as the fallback form target.
        login_url = compat_str(urlh.geturl())

        form = self._hidden_inputs(login_page)
        form['user[email]'] = username
        form['user[password]'] = password

        post_url = self._search_regex(
            r'<form[^>]+action=(["\'])(?P<url>(?:(?!\1).)+)\1', login_page,
            'post url', default=login_url, group='url')
        if not post_url.startswith('http'):
            post_url = urljoin(login_url, post_url)

        response = self._download_webpage(
            post_url, None, 'Logging in',
            data=urlencode_postdata(form),
            headers={
                'Content-Type': 'application/x-www-form-urlencoded',
                'Referer': login_url,
            })

        # Any of these markers in the response means the login succeeded.
        success_markers = (
            r'class=["\']user-signout',
            r'<a[^>]+\bhref=["\']/sign_out',
            r'>\s*Log out\s*<',
        )
        if any(re.search(marker, response) for marker in success_markers):
            return

        # Surface the site's own error message when one is shown.
        message = get_element_by_class('alert', response)
        if message is not None:
            raise ExtractorError(
                'Unable to login: %s' % clean_html(message), expected=True)

        raise ExtractorError('Unable to log in')
+
+
class UpskillIE(UpskillBaseIE):
    _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/[^/]+/lectures/(?P<id>\d+)'

    _TESTS = [{
        'url': 'http://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100',
        'info_dict': {
            'id': 'uzw6zw58or',
            'ext': 'mp4',
            'title': 'Welcome to the Course!',
            'description': 'md5:8d66c13403783370af62ca97a7357bdd',
            'duration': 138.763,
            'timestamp': 1479846621,
            'upload_date': '20161122',
        },
        'params': {
            'skip_download': True,
        },
    }, {
        'url': 'http://upskillcourses.com/courses/119763/lectures/1747100',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract a lecture by delegating to its embedded Wistia player.

        Raises a login-required error for locked lectures and a plain
        ExtractorError when no Wistia embed can be found at all.
        """
        video_id = self._match_id(url)

        webpage = self._download_webpage(url, video_id)

        wistia_url = WistiaIE._extract_url(webpage)
        if not wistia_url:
            # Lectures behind a paywall render a "locked" placeholder.
            if any(re.search(p, webpage) for p in (
                    r'class=["\']lecture-contents-locked',
                    r'>\s*Lecture contents locked',
                    r'id=["\']lecture-locked')):
                self.raise_login_required('Lecture contents locked')
            # Previously execution fell through here and returned
            # 'url': None, crashing downstream with an opaque TypeError;
            # fail loudly instead.
            raise ExtractorError('Unable to find Wistia embed URL')

        title = self._og_search_title(webpage, default=None)

        return {
            '_type': 'url_transparent',
            'url': wistia_url,
            'ie_key': WistiaIE.ie_key(),
            'title': title,
        }
+
+
class UpskillCourseIE(UpskillBaseIE):
    _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/(?:enrolled/)?(?P<id>[^/?#&]+)'
    _TESTS = [{
        'url': 'http://upskillcourses.com/courses/essential-web-developer-course/',
        'info_dict': {
            'id': '119763',
            'title': 'The Essential Web Developer Course (Free)',
        },
        'playlist_count': 192,
    }, {
        'url': 'http://upskillcourses.com/courses/119763/',
        'only_matching': True,
    }, {
        'url': 'http://upskillcourses.com/courses/enrolled/119763',
        'only_matching': True,
    }]

    @classmethod
    def suitable(cls, url):
        # Individual lecture URLs are handled by UpskillIE.
        if UpskillIE.suitable(url):
            return False
        return super(UpskillCourseIE, cls).suitable(url)

    def _real_extract(self, url):
        """Build a playlist of all video lectures listed on a course page."""
        course_id = self._match_id(url)

        webpage = self._download_webpage(url, course_id)

        # Prefer the numeric id embedded in the page over the URL slug.
        course_id = self._search_regex(
            r'data-course-id=["\'](\d+)', webpage, 'course id',
            default=course_id)

        entries = []
        for mobj in re.finditer(
                r'(?s)(?P<li><li[^>]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?</li>)',
                webpage):
            item = mobj.group('li')
            # Only items with a play icon are actual video lectures.
            if 'fa-youtube-play' not in item:
                continue
            lecture_url = self._search_regex(
                r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', item,
                'lecture url', default=None, group='url')
            if not lecture_url:
                continue
            lecture_id = self._search_regex(
                r'/lectures/(\d+)', lecture_url, 'lecture id', default=None)
            lecture_title = self._html_search_regex(
                r'<span[^>]+class=["\']lecture-name[^>]+>([^<]+)', item,
                'title', default=None)
            entries.append(self.url_result(
                urljoin('http://upskillcourses.com/', lecture_url),
                ie=UpskillIE.ie_key(), video_id=lecture_id,
                video_title=clean_html(lecture_title)))

        course_title = self._html_search_regex(
            (r'(?s)<img[^>]+class=["\']course-image[^>]+>\s*<h\d>(.+?)</h',
             r'(?s)<h\d[^>]+class=["\']course-title[^>]+>(.+?)</h'),
            webpage, 'course title', fatal=False)

        return self.playlist_result(entries, course_id, course_title)
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py
index 9aa38bc..890a149 100644
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -1,6 +1,7 @@
from __future__ import unicode_literals
import re
+import json
from .common import InfoExtractor
from ..compat import (
@@ -11,7 +12,6 @@ from ..compat import (
from ..utils import (
ExtractorError,
int_or_none,
- sanitized_Request,
parse_iso8601,
)
@@ -154,19 +154,24 @@ class VevoIE(VevoBaseIE):
}
def _initialize_api(self, video_id):
- req = sanitized_Request(
- 'http://www.vevo.com/auth', data=b'')
webpage = self._download_webpage(
- req, None,
+ 'https://accounts.vevo.com/token', None,
note='Retrieving oauth token',
- errnote='Unable to retrieve oauth token')
+ errnote='Unable to retrieve oauth token',
+ data=json.dumps({
+ 'client_id': 'SPupX1tvqFEopQ1YS6SS',
+ 'grant_type': 'urn:vevo:params:oauth:grant-type:anonymous',
+ }).encode('utf-8'),
+ headers={
+ 'Content-Type': 'application/json',
+ })
if re.search(r'(?i)THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION', webpage):
self.raise_geo_restricted(
'%s said: This page is currently unavailable in your region' % self.IE_NAME)
auth_info = self._parse_json(webpage, video_id)
- self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token']
+ self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['legacy_token']
def _call_api(self, path, *args, **kwargs):
try:
diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py
index f0a7fd7..54e207b 100644
--- a/youtube_dl/extractor/vice.py
+++ b/youtube_dl/extractor/vice.py
@@ -20,7 +20,7 @@ from ..utils import (
class ViceBaseIE(AdobePassIE):
- def _extract_preplay_video(self, url, webpage):
+ def _extract_preplay_video(self, url, locale, webpage):
watch_hub_data = extract_attributes(self._search_regex(
r'(?s)(<watch-hub\s*.+?</watch-hub>)', webpage, 'watch hub'))
video_id = watch_hub_data['vms-id']
@@ -32,7 +32,8 @@ class ViceBaseIE(AdobePassIE):
resource = self._get_mvpd_resource(
'VICELAND', title, video_id,
watch_hub_data.get('video-rating'))
- query['tvetoken'] = self._extract_mvpd_auth(url, video_id, 'VICELAND', resource)
+ query['tvetoken'] = self._extract_mvpd_auth(
+ url, video_id, 'VICELAND', resource)
# signature generation algorithm is reverse engineered from signatureGenerator in
# webpack:///../shared/~/vice-player/dist/js/vice-player.js in
@@ -45,11 +46,14 @@ class ViceBaseIE(AdobePassIE):
try:
host = 'www.viceland' if is_locked else self._PREPLAY_HOST
- preplay = self._download_json('https://%s.com/en_us/preplay/%s' % (host, video_id), video_id, query=query)
+ preplay = self._download_json(
+ 'https://%s.com/%s/preplay/%s' % (host, locale, video_id),
+ video_id, query=query)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
error = json.loads(e.cause.read().decode())
- raise ExtractorError('%s said: %s' % (self.IE_NAME, error['details']), expected=True)
+ raise ExtractorError('%s said: %s' % (
+ self.IE_NAME, error['details']), expected=True)
raise
video_data = preplay['video']
@@ -88,41 +92,30 @@ class ViceBaseIE(AdobePassIE):
class ViceIE(ViceBaseIE):
- _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?videos?/(?P<id>[^/?#&]+)'
+ IE_NAME = 'vice'
+ _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:(?P<locale>[^/]+)/)?videos?/(?P<id>[^/?#&]+)'
_TESTS = [{
- 'url': 'http://www.vice.com/video/cowboy-capitalists-part-1',
- 'md5': 'e9d77741f9e42ba583e683cd170660f7',
+ 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
+ 'md5': '7d3ae2f9ba5f196cdd9f9efd43657ac2',
'info_dict': {
- 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
+ 'id': 'N2bzkydjraWDGwnt8jAttCF6Y0PDv4Zj',
'ext': 'flv',
- 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
- 'duration': 725.983,
+ 'title': 'Monkey Labs of Holland',
+ 'description': 'md5:92b3c7dcbfe477f772dd4afa496c9149',
},
'add_ie': ['Ooyala'],
}, {
- 'url': 'http://www.vice.com/video/how-to-hack-a-car',
- 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2',
- 'info_dict': {
- 'id': '3jstaBeXgAs',
- 'ext': 'mp4',
- 'title': 'How to Hack a Car: Phreaked Out (Episode 2)',
- 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30',
- 'uploader_id': 'MotherboardTV',
- 'uploader': 'Motherboard',
- 'upload_date': '20140529',
- },
- 'add_ie': ['Youtube'],
- }, {
'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56',
- 'md5': '',
'info_dict': {
'id': '5816510690b70e6c5fd39a56',
'ext': 'mp4',
'uploader': 'Waypoint',
'title': 'The Signal From Tölva',
+ 'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5',
'uploader_id': '57f7d621e05ca860fa9ccaf9',
- 'timestamp': 1477941983938,
+ 'timestamp': 1477941983,
+ 'upload_date': '20161031',
},
'params': {
# m3u8 download
@@ -130,19 +123,31 @@ class ViceIE(ViceBaseIE):
},
'add_ie': ['UplynkPreplay'],
}, {
- 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
- 'only_matching': True,
- }, {
- 'url': 'http://www.vice.com/ru/video/big-night-out-ibiza-clive-martin-229',
- 'only_matching': True,
+ 'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f',
+ 'info_dict': {
+ 'id': '581b12b60a0e1f4c0fb6ea2f',
+ 'ext': 'mp4',
+ 'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1',
+ 'description': '<p>Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.</p>',
+ 'uploader': 'VICE',
+ 'uploader_id': '57a204088cb727dec794c67b',
+ 'timestamp': 1485368119,
+ 'upload_date': '20170125',
+ 'age_limit': 14,
+ },
+ 'params': {
+ # AES-encrypted m3u8
+ 'skip_download': True,
+ },
+ 'add_ie': ['UplynkPreplay'],
}, {
- 'url': 'https://munchies.vice.com/en/videos/watch-the-trailer-for-our-new-series-the-pizza-show',
+ 'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4',
'only_matching': True,
}]
_PREPLAY_HOST = 'video.vice'
def _real_extract(self, url):
- video_id = self._match_id(url)
+ locale, video_id = re.match(self._VALID_URL, url).groups()
webpage, urlh = self._download_webpage_handle(url, video_id)
embed_code = self._search_regex(
r'embedCode=([^&\'"]+)', webpage,
@@ -153,10 +158,11 @@ class ViceIE(ViceBaseIE):
r'data-youtube-id="([^"]+)"', webpage, 'youtube id', default=None)
if youtube_id:
return self.url_result(youtube_id, 'Youtube')
- return self._extract_preplay_video(urlh.geturl(), webpage)
+ return self._extract_preplay_video(urlh.geturl(), locale, webpage)
class ViceShowIE(InfoExtractor):
+ IE_NAME = 'vice:show'
_VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)'
_TEST = {
@@ -183,6 +189,86 @@ class ViceShowIE(InfoExtractor):
r'<title>(.+?)</title>', webpage, 'title', default=None)
if title:
title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip()
- description = self._html_search_meta('description', webpage, 'description')
+ description = self._html_search_meta(
+ 'description', webpage, 'description')
return self.playlist_result(entries, show_id, title, description)
+
+
class ViceArticleIE(InfoExtractor):
    IE_NAME = 'vice:article'
    # Dots escaped: the original r'https://www.vice.com/...' let '.' match
    # any character, so unintended hosts (e.g. wwwXvice-com) would match.
    _VALID_URL = r'https://www\.vice\.com/[^/]+/article/(?P<id>[^?#]+)'

    _TESTS = [{
        'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah',
        'info_dict': {
            'id': '58dc0a3dee202d2a0ccfcbd8',
            'ext': 'mp4',
            'title': 'Mormon War on Porn ',
            'description': 'md5:ad396a2481e7f8afb5ed486878421090',
            'uploader': 'VICE',
            'uploader_id': '57a204088cb727dec794c693',
            'timestamp': 1489160690,
            'upload_date': '20170310',
        },
        'params': {
            # AES-encrypted m3u8
            'skip_download': True,
        },
        'add_ie': ['UplynkPreplay'],
    }, {
        'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car',
        'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2',
        'info_dict': {
            'id': '3jstaBeXgAs',
            'ext': 'mp4',
            'title': 'How to Hack a Car: Phreaked Out (Episode 2)',
            'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30',
            'uploader_id': 'MotherboardTV',
            'uploader': 'Motherboard',
            'upload_date': '20140529',
        },
        'add_ie': ['Youtube'],
    }, {
        'url': 'https://www.vice.com/en_us/article/cowboy-capitalists-part-1',
        'only_matching': True,
    }, {
        'url': 'https://www.vice.com/ru/article/big-night-out-ibiza-clive-martin-229',
        'only_matching': True,
    }]

    def _real_extract(self, url):
        """Extract the video embedded in a Vice article.

        Tries, in order: an Ooyala embed code, a YouTube iframe, and
        finally the native Vice player URL from the prefetch data.
        """
        display_id = self._match_id(url)

        webpage = self._download_webpage(url, display_id)

        prefetch_data = self._parse_json(self._search_regex(
            r'window\.__PREFETCH_DATA\s*=\s*({.*});',
            webpage, 'prefetch data'), display_id)
        body = prefetch_data['body']

        def _url_res(video_url, ie_key):
            # Transparent url result so article metadata can be merged in.
            return {
                '_type': 'url_transparent',
                'url': video_url,
                'display_id': display_id,
                'ie_key': ie_key,
            }

        embed_code = self._search_regex(
            r'embedCode=([^&\'"]+)', body,
            'ooyala embed code', default=None)
        if embed_code:
            return _url_res('ooyala:%s' % embed_code, 'Ooyala')

        youtube_url = self._html_search_regex(
            r'<iframe[^>]+src="(.*youtube\.com/.*)"',
            body, 'YouTube URL', default=None)
        if youtube_url:
            return _url_res(youtube_url, 'Youtube')

        video_url = self._html_search_regex(
            r'data-video-url="([^"]+)"',
            prefetch_data['embed_code'], 'video URL')

        return _url_res(video_url, ViceIE.ie_key())
diff --git a/youtube_dl/extractor/viceland.py b/youtube_dl/extractor/viceland.py
index 87f9216..bd60235 100644
--- a/youtube_dl/extractor/viceland.py
+++ b/youtube_dl/extractor/viceland.py
@@ -1,11 +1,13 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .vice import ViceBaseIE
class VicelandIE(ViceBaseIE):
- _VALID_URL = r'https?://(?:www\.)?viceland\.com/[^/]+/video/[^/]+/(?P<id>[a-f0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?viceland\.com/(?P<locale>[^/]+)/video/[^/]+/(?P<id>[a-f0-9]+)'
_TEST = {
'url': 'https://www.viceland.com/en_us/video/trapped/588a70d0dba8a16007de7316',
'info_dict': {
@@ -24,10 +26,13 @@ class VicelandIE(ViceBaseIE):
'skip_download': True,
},
'add_ie': ['UplynkPreplay'],
+ 'skip': '404',
}
_PREPLAY_HOST = 'www.viceland'
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ locale = mobj.group('locale')
webpage = self._download_webpage(url, video_id)
- return self._extract_preplay_video(url, webpage)
+ return self._extract_preplay_video(url, locale, webpage)
diff --git a/youtube_dl/extractor/videopress.py b/youtube_dl/extractor/videopress.py
index 049db25..e5f964d 100644
--- a/youtube_dl/extractor/videopress.py
+++ b/youtube_dl/extractor/videopress.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import random
import re
from .common import InfoExtractor
@@ -11,6 +10,7 @@ from ..utils import (
float_or_none,
parse_age_limit,
qualities,
+ random_birthday,
try_get,
unified_timestamp,
urljoin,
@@ -47,13 +47,10 @@ class VideoPressIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
+ query = random_birthday('birth_year', 'birth_month', 'birth_day')
video = self._download_json(
'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id,
- video_id, query={
- 'birth_month': random.randint(1, 12),
- 'birth_day': random.randint(1, 31),
- 'birth_year': random.randint(1950, 1995),
- })
+ video_id, query=query)
title = video['title']
diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py
index 4e4b4e3..701bb1d 100644
--- a/youtube_dl/extractor/vidio.py
+++ b/youtube_dl/extractor/vidio.py
@@ -49,8 +49,11 @@ class VidioIE(InfoExtractor):
thumbnail = clip.get('image')
m3u8_url = m3u8_url or self._search_regex(
- r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>.+?)\1', webpage, 'hls url')
- formats = self._extract_m3u8_formats(m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native')
+ r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>(?!\1).+)\1',
+ webpage, 'hls url')
+ formats = self._extract_m3u8_formats(
+ m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native')
+ self._sort_formats(formats)
duration = int_or_none(duration or self._search_regex(
r'data-video-duration=(["\'])(?P<duartion>\d+)\1', webpage, 'duration'))
diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py
index d055629..e64873b 100644
--- a/youtube_dl/extractor/vidzi.py
+++ b/youtube_dl/extractor/vidzi.py
@@ -42,14 +42,15 @@ class VidziIE(InfoExtractor):
title = self._html_search_regex(
r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title')
- packed_codes = [mobj.group(0) for mobj in re.finditer(
- PACKED_CODES_RE, webpage)]
- for num, pc in enumerate(packed_codes, 1):
- code = decode_packed_codes(pc).replace('\\\'', '\'')
+ codes = [webpage]
+ codes.extend([
+ decode_packed_codes(mobj.group(0)).replace('\\\'', '\'')
+ for mobj in re.finditer(PACKED_CODES_RE, webpage)])
+ for num, code in enumerate(codes, 1):
jwplayer_data = self._parse_json(
self._search_regex(
r'setup\(([^)]+)\)', code, 'jwplayer data',
- default=NO_DEFAULT if num == len(packed_codes) else '{}'),
+ default=NO_DEFAULT if num == len(codes) else '{}'),
video_id, transform_source=js_to_json)
if jwplayer_data:
break
diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py
index 5086f59..3e67eb8 100644
--- a/youtube_dl/extractor/vier.py
+++ b/youtube_dl/extractor/vier.py
@@ -5,23 +5,30 @@ import re
import itertools
from .common import InfoExtractor
+from ..utils import (
+ urlencode_postdata,
+ int_or_none,
+ unified_strdate,
+)
class VierIE(InfoExtractor):
IE_NAME = 'vier'
+ IE_DESC = 'vier.be and vijf.be'
_VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))'
+ _NETRC_MACHINE = 'vier'
_TESTS = [{
'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129',
+ 'md5': 'e4ae2054a6b040ef1e289e20d111b46e',
'info_dict': {
'id': '16129',
'display_id': 'het-wordt-warm-de-moestuin',
'ext': 'mp4',
'title': 'Het wordt warm in De Moestuin',
'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
+ 'upload_date': '20121025',
+ 'series': 'Plan B',
+ 'tags': ['De Moestuin', 'Moestuin', 'meisjes', 'Tomaat', 'Wim', 'Droom'],
},
}, {
'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614',
@@ -29,32 +36,103 @@ class VierIE(InfoExtractor):
'id': '2561614',
'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas',
'ext': 'mp4',
- 'title': 'ZO grappig: Temptation Island hosts moeten kiezen tussen onmogelijke dilemma\'s',
- 'description': 'Het spel is simpel: Annelien Coorevits en Rick Brandsteder krijgen telkens 2 dilemma\'s voorgeschoteld en ze MOETEN een keuze maken.',
+ 'title': 'md5:84f45fe48b8c1fa296a7f6d208d080a7',
+ 'description': 'md5:0356d4981e58b8cbee19355cbd51a8fe',
+ 'upload_date': '20170228',
+ 'series': 'Temptation Island',
+ 'tags': list,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839',
+ 'info_dict': {
+ 'id': '2674839',
+ 'display_id': 'jani-gaat-naar-tokio-aflevering-4',
+ 'ext': 'mp4',
+ 'title': 'Jani gaat naar Tokio - Aflevering 4',
+ 'description': 'md5:aa8d611541db6ae9e863125704511f88',
+ 'upload_date': '20170501',
+ 'series': 'Jani gaat',
+ 'episode_number': 4,
+ 'tags': ['Jani Gaat', 'Volledige Aflevering'],
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'Requires account credentials',
+ }, {
+ # Requires account credentials but bypassed extraction via v3/embed page
+ # without metadata
+ 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839',
+ 'info_dict': {
+ 'id': '2674839',
+ 'display_id': 'jani-gaat-naar-tokio-aflevering-4',
+ 'ext': 'mp4',
+ 'title': 'jani-gaat-naar-tokio-aflevering-4',
},
'params': {
- # m3u8 download
'skip_download': True,
},
+ 'expected_warnings': ['Log in to extract metadata'],
}, {
- 'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen',
+ # Without video id in URL
+ 'url': 'http://www.vier.be/planb/videos/dit-najaar-plan-b',
'only_matching': True,
}, {
'url': 'http://www.vier.be/video/v3/embed/16129',
'only_matching': True,
}]
+ def _real_initialize(self):
+ self._logged_in = False
+
+ def _login(self, site):
+ username, password = self._get_login_info()
+ if username is None or password is None:
+ return
+
+ login_page = self._download_webpage(
+ 'http://www.%s.be/user/login' % site,
+ None, note='Logging in', errnote='Unable to log in',
+ data=urlencode_postdata({
+ 'form_id': 'user_login',
+ 'name': username,
+ 'pass': password,
+ }),
+ headers={'Content-Type': 'application/x-www-form-urlencoded'})
+
+ login_error = self._html_search_regex(
+ r'(?s)<div class="messages error">\s*<div>\s*<h2.+?</h2>(.+?)<',
+ login_page, 'login error', default=None)
+ if login_error:
+ self.report_warning('Unable to log in: %s' % login_error)
+ else:
+ self._logged_in = True
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
embed_id = mobj.group('embed_id')
display_id = mobj.group('display_id') or embed_id
+ video_id = mobj.group('id') or embed_id
site = mobj.group('site')
+ if not self._logged_in:
+ self._login(site)
+
webpage = self._download_webpage(url, display_id)
+ if r'id="user-login"' in webpage:
+ self.report_warning(
+ 'Log in to extract metadata', video_id=display_id)
+ webpage = self._download_webpage(
+ 'http://www.%s.be/video/v3/embed/%s' % (site, video_id),
+ display_id)
+
video_id = self._search_regex(
[r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'],
- webpage, 'video id')
+ webpage, 'video id', default=video_id or display_id)
application = self._search_regex(
[r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'],
webpage, 'application', default=site + '_vod')
@@ -63,12 +141,25 @@ class VierIE(InfoExtractor):
webpage, 'filename')
playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename)
- formats = self._extract_wowza_formats(playlist_url, display_id, skip_protocols=['dash'])
+ formats = self._extract_wowza_formats(
+ playlist_url, display_id, skip_protocols=['dash'])
self._sort_formats(formats)
title = self._og_search_title(webpage, default=display_id)
- description = self._og_search_description(webpage, default=None)
+ description = self._html_search_regex(
+ r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-type-text-with-summary\b[^>]*?\1[^>]*>.*?<p>(?P<value>.+?)</p>',
+ webpage, 'description', default=None, group='value')
thumbnail = self._og_search_thumbnail(webpage, default=None)
+ upload_date = unified_strdate(self._html_search_regex(
+ r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-name-post-date\b[^>]*?\1[^>]*>.*?(?P<value>\d{2}/\d{2}/\d{4})',
+ webpage, 'upload date', default=None, group='value'))
+
+ series = self._search_regex(
+ r'data-program=(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
+ 'series', default=None, group='value')
+ episode_number = int_or_none(self._search_regex(
+ r'(?i)aflevering (\d+)', title, 'episode number', default=None))
+ tags = re.findall(r'<a\b[^>]+\bhref=["\']/tags/[^>]+>([^<]+)<', webpage)
return {
'id': video_id,
@@ -76,6 +167,10 @@ class VierIE(InfoExtractor):
'title': title,
'description': description,
'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'series': series,
+ 'episode_number': episode_number,
+ 'tags': tags,
'formats': formats,
}
diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py
index 18735cf..1f29c27 100644
--- a/youtube_dl/extractor/viewlift.py
+++ b/youtube_dl/extractor/viewlift.py
@@ -68,7 +68,7 @@ class ViewLiftEmbedIE(ViewLiftBaseIE):
type_ = source.get('type')
ext = determine_ext(file_)
format_id = source.get('label') or ext
- if all(v == 'm3u8' or v == 'hls' for v in (type_, ext)):
+ if all(v in ('m3u8', 'hls') for v in (type_, ext)):
formats.extend(self._extract_m3u8_formats(
file_, video_id, 'mp4', m3u8_id='hls'))
else:
diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py
index fcf0cb1..d5d5b4c 100644
--- a/youtube_dl/extractor/viewster.py
+++ b/youtube_dl/extractor/viewster.py
@@ -176,8 +176,7 @@ class ViewsterIE(InfoExtractor):
if m3u8_formats:
self._sort_formats(m3u8_formats)
m3u8_formats = list(filter(
- lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
- m3u8_formats))
+ lambda f: f.get('vcodec') != 'none', m3u8_formats))
if len(qualities) == len(m3u8_formats):
for q, m3u8_format in zip(qualities, m3u8_formats):
f = m3u8_format.copy()
diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py
index b971890..e589406 100644
--- a/youtube_dl/extractor/vlive.py
+++ b/youtube_dl/extractor/vlive.py
@@ -70,9 +70,9 @@ class VLiveIE(InfoExtractor):
status, long_video_id, key = params[2], params[5], params[6]
status = remove_start(status, 'PRODUCT_')
- if status == 'LIVE_ON_AIR' or status == 'BIG_EVENT_ON_AIR':
+ if status in ('LIVE_ON_AIR', 'BIG_EVENT_ON_AIR'):
return self._live(video_id, webpage)
- elif status == 'VOD_ON_AIR' or status == 'BIG_EVENT_INTRO':
+ elif status in ('VOD_ON_AIR', 'BIG_EVENT_INTRO'):
if long_video_id and key:
return self._replay(video_id, webpage, long_video_id, key)
else:
diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py
index 00c72e3..444295d 100644
--- a/youtube_dl/extractor/vrt.py
+++ b/youtube_dl/extractor/vrt.py
@@ -10,6 +10,7 @@ from ..utils import (
class VRTIE(InfoExtractor):
+ IE_DESC = 'deredactie.be, sporza.be, cobra.be and cobra.canvas.be'
_VALID_URL = r'https?://(?:deredactie|sporza|cobra(?:\.canvas)?)\.be/cm/(?:[^/]+/)+(?P<id>[^/]+)/*'
_TESTS = [
# deredactie.be
diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py
new file mode 100644
index 0000000..9959627
--- /dev/null
+++ b/youtube_dl/extractor/vrv.py
@@ -0,0 +1,212 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import json
+import hashlib
+import hmac
+import random
+import string
+import time
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse_urlencode,
+ compat_urlparse,
+)
+from ..utils import (
+ float_or_none,
+ int_or_none,
+)
+
+
+class VRVBaseIE(InfoExtractor):
+ _API_DOMAIN = None
+ _API_PARAMS = {}
+ _CMS_SIGNING = {}
+
+ def _call_api(self, path, video_id, note, data=None):
+ base_url = self._API_DOMAIN + '/core/' + path
+ encoded_query = compat_urllib_parse_urlencode({
+ 'oauth_consumer_key': self._API_PARAMS['oAuthKey'],
+ 'oauth_nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]),
+ 'oauth_signature_method': 'HMAC-SHA1',
+ 'oauth_timestamp': int(time.time()),
+ 'oauth_version': '1.0',
+ })
+ headers = self.geo_verification_headers()
+ if data:
+ data = json.dumps(data).encode()
+ headers['Content-Type'] = 'application/json'
+ method = 'POST' if data else 'GET'
+ base_string = '&'.join([method, compat_urlparse.quote(base_url, ''), compat_urlparse.quote(encoded_query, '')])
+ oauth_signature = base64.b64encode(hmac.new(
+ (self._API_PARAMS['oAuthSecret'] + '&').encode('ascii'),
+ base_string.encode(), hashlib.sha1).digest()).decode()
+ encoded_query += '&oauth_signature=' + compat_urlparse.quote(oauth_signature, '')
+ return self._download_json(
+ '?'.join([base_url, encoded_query]), video_id,
+ note='Downloading %s JSON metadata' % note, headers=headers, data=data)
+
+ def _call_cms(self, path, video_id, note):
+ if not self._CMS_SIGNING:
+ self._CMS_SIGNING = self._call_api('index', video_id, 'CMS Signing')['cms_signing']
+ return self._download_json(
+ self._API_DOMAIN + path, video_id, query=self._CMS_SIGNING,
+ note='Downloading %s JSON metadata' % note, headers=self.geo_verification_headers())
+
+ def _set_api_params(self, webpage, video_id):
+ if not self._API_PARAMS:
+ self._API_PARAMS = self._parse_json(self._search_regex(
+ r'window\.__APP_CONFIG__\s*=\s*({.+?})</script>',
+ webpage, 'api config'), video_id)['cxApiParams']
+ self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co')
+
+ def _get_cms_resource(self, resource_key, video_id):
+ return self._call_api(
+ 'cms_resource', video_id, 'resource path', data={
+ 'resource_key': resource_key,
+ })['__links__']['cms_resource']['href']
+
+
+class VRVIE(VRVBaseIE):
+ IE_NAME = 'vrv'
+ _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P<id>[A-Z0-9]+)'
+ _TEST = {
+ 'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT',
+ 'info_dict': {
+ 'id': 'GR9PNZ396',
+ 'ext': 'mp4',
+ 'title': 'BOSTON: WHERE THE PAST IS THE PRESENT',
+ 'description': 'md5:4ec8844ac262ca2df9e67c0983c6b83f',
+ 'uploader_id': 'seeso',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(
+ url, video_id,
+ headers=self.geo_verification_headers())
+ media_resource = self._parse_json(self._search_regex(
+ r'window\.__INITIAL_STATE__\s*=\s*({.+?})</script>',
+ webpage, 'inital state'), video_id).get('watch', {}).get('mediaResource') or {}
+
+ video_data = media_resource.get('json')
+ if not video_data:
+ self._set_api_params(webpage, video_id)
+ episode_path = self._get_cms_resource(
+ 'cms:/episodes/' + video_id, video_id)
+ video_data = self._call_cms(episode_path, video_id, 'video')
+ title = video_data['title']
+
+ streams_json = media_resource.get('streams', {}).get('json', {})
+ if not streams_json:
+ self._set_api_params(webpage, video_id)
+ streams_path = video_data['__links__']['streams']['href']
+ streams_json = self._call_cms(streams_path, video_id, 'streams')
+
+ audio_locale = streams_json.get('audio_locale')
+ formats = []
+ for stream_type, streams in streams_json.get('streams', {}).items():
+ if stream_type in ('adaptive_hls', 'adaptive_dash'):
+ for stream in streams.values():
+ stream_url = stream.get('url')
+ if not stream_url:
+ continue
+ stream_id = stream.get('hardsub_locale') or audio_locale
+ format_id = '%s-%s' % (stream_type.split('_')[1], stream_id)
+ if stream_type == 'adaptive_hls':
+ adaptive_formats = self._extract_m3u8_formats(
+ stream_url, video_id, 'mp4', m3u8_id=format_id,
+ note='Downloading %s m3u8 information' % stream_id,
+ fatal=False)
+ else:
+ adaptive_formats = self._extract_mpd_formats(
+ stream_url, video_id, mpd_id=format_id,
+ note='Downloading %s MPD information' % stream_id,
+ fatal=False)
+ if audio_locale:
+ for f in adaptive_formats:
+ if f.get('acodec') != 'none':
+ f['language'] = audio_locale
+ formats.extend(adaptive_formats)
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for subtitle in streams_json.get('subtitles', {}).values():
+ subtitle_url = subtitle.get('url')
+ if not subtitle_url:
+ continue
+ subtitles.setdefault(subtitle.get('locale', 'en-US'), []).append({
+ 'url': subtitle_url,
+ 'ext': subtitle.get('format', 'ass'),
+ })
+
+ thumbnails = []
+ for thumbnail in video_data.get('images', {}).get('thumbnails', []):
+ thumbnail_url = thumbnail.get('source')
+ if not thumbnail_url:
+ continue
+ thumbnails.append({
+ 'url': thumbnail_url,
+ 'width': int_or_none(thumbnail.get('width')),
+ 'height': int_or_none(thumbnail.get('height')),
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'thumbnails': thumbnails,
+ 'description': video_data.get('description'),
+ 'duration': float_or_none(video_data.get('duration_ms'), 1000),
+ 'uploader_id': video_data.get('channel_id'),
+ 'series': video_data.get('series_title'),
+ 'season': video_data.get('season_title'),
+ 'season_number': int_or_none(video_data.get('season_number')),
+ 'season_id': video_data.get('season_id'),
+ 'episode': title,
+ 'episode_number': int_or_none(video_data.get('episode_number')),
+ 'episode_id': video_data.get('production_episode_id'),
+ }
+
+
+class VRVSeriesIE(VRVBaseIE):
+ IE_NAME = 'vrv:series'
+ _VALID_URL = r'https?://(?:www\.)?vrv\.co/series/(?P<id>[A-Z0-9]+)'
+ _TEST = {
+ 'url': 'https://vrv.co/series/G68VXG3G6/The-Perfect-Insider',
+ 'info_dict': {
+ 'id': 'G68VXG3G6',
+ },
+ 'playlist_mincount': 11,
+ }
+
+ def _real_extract(self, url):
+ series_id = self._match_id(url)
+ webpage = self._download_webpage(
+ url, series_id,
+ headers=self.geo_verification_headers())
+
+ self._set_api_params(webpage, series_id)
+ seasons_path = self._get_cms_resource(
+ 'cms:/seasons?series_id=' + series_id, series_id)
+ seasons_data = self._call_cms(seasons_path, series_id, 'seasons')
+
+ entries = []
+ for season in seasons_data.get('items', []):
+ episodes_path = season['__links__']['season/episodes']['href']
+ episodes = self._call_cms(episodes_path, series_id, 'episodes')
+ for episode in episodes.get('items', []):
+ episode_id = episode['id']
+ entries.append(self.url_result(
+ 'https://vrv.co/watch/' + episode_id,
+ 'VRV', episode_id, episode.get('title')))
+
+ return self.playlist_result(entries, series_id)
diff --git a/youtube_dl/extractor/vshare.py b/youtube_dl/extractor/vshare.py
new file mode 100644
index 0000000..5addbc2
--- /dev/null
+++ b/youtube_dl/extractor/vshare.py
@@ -0,0 +1,38 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class VShareIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?vshare\.io/[dv]/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://vshare.io/d/0f64ce6',
+ 'md5': '16d7b8fef58846db47419199ff1ab3e7',
+ 'info_dict': {
+ 'id': '0f64ce6',
+ 'title': 'vl14062007715967',
+ 'ext': 'mp4',
+ }
+ }, {
+ 'url': 'https://vshare.io/v/0f64ce6/width-650/height-430/1',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'https://vshare.io/d/%s' % video_id, video_id)
+
+ title = self._html_search_regex(
+ r'(?s)<div id="root-container">(.+?)<br/>', webpage, 'title')
+ video_url = self._search_regex(
+ r'<a[^>]+href=(["\'])(?P<url>(?:https?:)?//.+?)\1[^>]*>[Cc]lick\s+here',
+ webpage, 'video url', group='url')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ }
diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py
index 839cad9..625d0a1 100644
--- a/youtube_dl/extractor/washingtonpost.py
+++ b/youtube_dl/extractor/washingtonpost.py
@@ -13,6 +13,7 @@ from ..utils import (
class WashingtonPostIE(InfoExtractor):
IE_NAME = 'washingtonpost'
_VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+ _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
_TEST = {
'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
'md5': '6f537e1334b714eb15f9563bd4b9cdfa',
@@ -27,6 +28,11 @@ class WashingtonPostIE(InfoExtractor):
},
}
+ @classmethod
+ def _extract_urls(cls, webpage):
+ return re.findall(
+ r'<iframe[^>]+\bsrc=["\'](%s)' % cls._EMBED_URL, webpage)
+
def _real_extract(self, url):
video_id = self._match_id(url)
video_data = self._download_json(
diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py
index c634b8d..2182d6f 100644
--- a/youtube_dl/extractor/wistia.py
+++ b/youtube_dl/extractor/wistia.py
@@ -1,10 +1,13 @@
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
float_or_none,
+ unescapeHTML,
)
@@ -34,6 +37,25 @@ class WistiaIE(InfoExtractor):
'only_matching': True,
}]
+ @staticmethod
+ def _extract_url(webpage):
+ match = re.search(
+ r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
+ if match:
+ return unescapeHTML(match.group('url'))
+
+ match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
+ if match:
+ return 'wistia:%s' % match.group('id')
+
+ match = re.search(
+ r'''(?sx)
+ <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*?
+ <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2
+ ''', webpage)
+ if match:
+ return 'wistia:%s' % match.group('id')
+
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py
index 09415b5..82587b4 100644
--- a/youtube_dl/extractor/worldstarhiphop.py
+++ b/youtube_dl/extractor/worldstarhiphop.py
@@ -1,12 +1,10 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
class WorldStarHipHopIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/(?:videos|android)/video\.php\?v=(?P<id>.*)'
+ _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/(?:videos|android)/video\.php\?.*?\bv=(?P<id>[^&]+)'
_TESTS = [{
'url': 'http://www.worldstarhiphop.com/videos/video.php?v=wshh6a7q1ny0G34ZwuIO',
'md5': '9d04de741161603bf7071bbf4e883186',
@@ -17,48 +15,26 @@ class WorldStarHipHopIE(InfoExtractor):
}
}, {
'url': 'http://m.worldstarhiphop.com/android/video.php?v=wshh6a7q1ny0G34ZwuIO',
- 'md5': 'dc1c76c83ecc4190bb1eb143899b87d3',
- 'info_dict': {
- 'id': 'wshh6a7q1ny0G34ZwuIO',
- 'ext': 'mp4',
- 'title': 'KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!'
- }
+ 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- m_vevo_id = re.search(r'videoId=(.*?)&amp?', webpage)
- if m_vevo_id is not None:
- return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo')
-
- video_url = self._search_regex(
- [r'so\.addVariable\("file","(.*?)"\)',
- r'<div class="artlist">\s*<a[^>]+href="([^"]+)">'],
- webpage, 'video URL')
+ entries = self._parse_html5_media_entries(url, webpage, video_id)
- if 'youtube' in video_url:
- return self.url_result(video_url, ie='Youtube')
+ if not entries:
+ return self.url_result(url, 'Generic')
- video_title = self._html_search_regex(
+ title = self._html_search_regex(
[r'(?s)<div class="content-heading">\s*<h1>(.*?)</h1>',
r'<span[^>]+class="tc-sp-pinned-title">(.*)</span>'],
webpage, 'title')
- # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
- thumbnail = self._html_search_regex(
- r'rel="image_src" href="(.*)" />', webpage, 'thumbnail',
- default=None)
- if not thumbnail:
- _title = r'candytitles.*>(.*)</span>'
- mobj = re.search(_title, webpage)
- if mobj is not None:
- video_title = mobj.group(1)
-
- return {
+ info = entries[0]
+ info.update({
'id': video_id,
- 'url': video_url,
- 'title': video_title,
- 'thumbnail': thumbnail,
- }
+ 'title': title,
+ })
+ return info
diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py
index deb7483..45cfca7 100644
--- a/youtube_dl/extractor/wsj.py
+++ b/youtube_dl/extractor/wsj.py
@@ -10,12 +10,14 @@ from ..utils import (
class WSJIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://
- (?:
- video-api\.wsj\.com/api-video/player/iframe\.html\?guid=|
- (?:www\.)?wsj\.com/video/[^/]+/
- )
- (?P<id>[a-zA-Z0-9-]+)'''
+ _VALID_URL = r'''(?x)
+ (?:
+ https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=|
+ https?://(?:www\.)?wsj\.com/video/[^/]+/|
+ wsj:
+ )
+ (?P<id>[a-fA-F0-9-]{36})
+ '''
IE_DESC = 'Wall Street Journal'
_TESTS = [{
'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
@@ -38,12 +40,17 @@ class WSJIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- api_url = (
- 'http://video-api.wsj.com/api-video/find_all_videos.asp?'
- 'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,'
- 'thumbnailList,author,description,name,duration,videoURL,'
- 'titletag,formattedCreationDate,keywords,editor' % video_id)
- info = self._download_json(api_url, video_id)['items'][0]
+ info = self._download_json(
+ 'http://video-api.wsj.com/api-video/find_all_videos.asp', video_id,
+ query={
+ 'type': 'guid',
+ 'count': 1,
+ 'query': video_id,
+ 'fields': ','.join((
+ 'type', 'hls', 'videoMP4List', 'thumbnailList', 'author',
+ 'description', 'name', 'duration', 'videoURL', 'titletag',
+ 'formattedCreationDate', 'keywords', 'editor')),
+ })['items'][0]
title = info.get('name', info.get('titletag'))
formats = []
@@ -87,3 +94,24 @@ class WSJIE(InfoExtractor):
'title': title,
'categories': info.get('keywords'),
}
+
+
+class WSJArticleIE(InfoExtractor):
+ _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>[^/?#&]+)'
+ _TEST = {
+ 'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?',
+ 'info_dict': {
+ 'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362',
+ 'ext': 'mp4',
+ 'upload_date': '20170221',
+ 'uploader_id': 'ralcaraz',
+ 'title': 'Bao Bao the Panda Leaves for China',
+ }
+ }
+
+ def _real_extract(self, url):
+ article_id = self._match_id(url)
+ webpage = self._download_webpage(url, article_id)
+ video_id = self._search_regex(
+ r'data-src=["\']([a-fA-F0-9-]{36})', webpage, 'video id')
+ return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id)
diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py
index e616adc..13f8be6 100644
--- a/youtube_dl/extractor/xfileshare.py
+++ b/youtube_dl/extractor/xfileshare.py
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..utils import (
decode_packed_codes,
+ determine_ext,
ExtractorError,
int_or_none,
NO_DEFAULT,
@@ -16,21 +17,24 @@ from ..utils import (
class XFileShareIE(InfoExtractor):
_SITES = (
- ('daclips.in', 'DaClips'),
- ('filehoot.com', 'FileHoot'),
- ('gorillavid.in', 'GorillaVid'),
- ('movpod.in', 'MovPod'),
- ('powerwatch.pw', 'PowerWatch'),
- ('rapidvideo.ws', 'Rapidvideo.ws'),
- ('thevideobee.to', 'TheVideoBee'),
- ('vidto.me', 'Vidto'),
- ('streamin.to', 'Streamin.To'),
- ('xvidstage.com', 'XVIDSTAGE'),
+ (r'daclips\.(?:in|com)', 'DaClips'),
+ (r'filehoot\.com', 'FileHoot'),
+ (r'gorillavid\.(?:in|com)', 'GorillaVid'),
+ (r'movpod\.in', 'MovPod'),
+ (r'powerwatch\.pw', 'PowerWatch'),
+ (r'rapidvideo\.ws', 'Rapidvideo.ws'),
+ (r'thevideobee\.to', 'TheVideoBee'),
+ (r'vidto\.me', 'Vidto'),
+ (r'streamin\.to', 'Streamin.To'),
+ (r'xvidstage\.com', 'XVIDSTAGE'),
+ (r'vidabc\.com', 'Vid ABC'),
+ (r'vidbom\.com', 'VidBom'),
+ (r'vidlo\.us', 'vidlo'),
)
IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1])
_VALID_URL = (r'https?://(?P<host>(?:www\.)?(?:%s))/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
- % '|'.join(re.escape(site) for site in list(zip(*_SITES))[0]))
+ % '|'.join(site for site in list(zip(*_SITES))[0]))
_FILE_NOT_FOUND_REGEXES = (
r'>(?:404 - )?File Not Found<',
@@ -95,6 +99,16 @@ class XFileShareIE(InfoExtractor):
# removed by administrator
'url': 'http://xvidstage.com/amfy7atlkx25',
'only_matching': True,
+ }, {
+ 'url': 'http://vidabc.com/i8ybqscrphfv',
+ 'info_dict': {
+ 'id': 'i8ybqscrphfv',
+ 'ext': 'mp4',
+ 'title': 're:Beauty and the Beast 2017',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
@@ -133,31 +147,45 @@ class XFileShareIE(InfoExtractor):
webpage, 'title', default=None) or self._og_search_title(
webpage, default=None) or video_id).strip()
- def extract_video_url(default=NO_DEFAULT):
- return self._search_regex(
- (r'file\s*:\s*(["\'])(?P<url>http.+?)\1,',
- r'file_link\s*=\s*(["\'])(?P<url>http.+?)\1',
- r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http.+?)\2\)',
- r'<embed[^>]+src=(["\'])(?P<url>http.+?)\1'),
- webpage, 'file url', default=default, group='url')
-
- video_url = extract_video_url(default=None)
-
- if not video_url:
+ def extract_formats(default=NO_DEFAULT):
+ urls = []
+ for regex in (
+ r'file\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1',
+ r'file_link\s*=\s*(["\'])(?P<url>http(?:(?!\1).)+)\1',
+ r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http(?:(?!\2).)+)\2\)',
+ r'<embed[^>]+src=(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'):
+ for mobj in re.finditer(regex, webpage):
+ video_url = mobj.group('url')
+ if video_url not in urls:
+ urls.append(video_url)
+ formats = []
+ for video_url in urls:
+ if determine_ext(video_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls',
+ fatal=False))
+ else:
+ formats.append({
+ 'url': video_url,
+ 'format_id': 'sd',
+ })
+ if not formats and default is not NO_DEFAULT:
+ return default
+ self._sort_formats(formats)
+ return formats
+
+ formats = extract_formats(default=None)
+
+ if not formats:
webpage = decode_packed_codes(self._search_regex(
r"(}\('(.+)',(\d+),(\d+),'[^']*\b(?:file|embed)\b[^']*'\.split\('\|'\))",
webpage, 'packed code'))
- video_url = extract_video_url()
+ formats = extract_formats()
thumbnail = self._search_regex(
r'image\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'thumbnail', default=None)
- formats = [{
- 'format_id': 'sd',
- 'url': video_url,
- 'quality': 1,
- }]
-
return {
'id': video_id,
'title': title,
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
index 5584674..bea9b87 100644
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
+ js_to_json,
orderedSet,
parse_duration,
sanitized_Request,
@@ -38,6 +39,22 @@ class XTubeIE(InfoExtractor):
'age_limit': 18,
}
}, {
+ # FLV videos with duplicated formats
+ 'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752',
+ 'md5': 'a406963eb349dd43692ec54631efd88b',
+ 'info_dict': {
+ 'id': '9299752',
+ 'display_id': 'A-Super-Run-Part-1-YT',
+ 'ext': 'flv',
+ 'title': 'A Super Run - Part 1 (YT)',
+ 'description': 'md5:ca0d47afff4a9b2942e4b41aa970fd93',
+ 'uploader': 'tshirtguy59',
+ 'duration': 579,
+ 'view_count': int,
+ 'comment_count': int,
+ 'age_limit': 18,
+ },
+ }, {
# new URL schema
'url': 'http://www.xtube.com/video-watch/strange-erotica-625837',
'only_matching': True,
@@ -68,8 +85,9 @@ class XTubeIE(InfoExtractor):
})
sources = self._parse_json(self._search_regex(
- r'(["\'])sources\1\s*:\s*(?P<sources>{.+?}),',
- webpage, 'sources', group='sources'), video_id)
+ r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),',
+ webpage, 'sources', group='sources'), video_id,
+ transform_source=js_to_json)
formats = []
for format_id, format_url in sources.items():
@@ -78,6 +96,7 @@ class XTubeIE(InfoExtractor):
'format_id': format_id,
'height': int_or_none(format_id),
})
+ self._remove_duplicate_formats(formats)
self._sort_formats(formats)
title = self._search_regex(
diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py
index 30825da..eca6030 100644
--- a/youtube_dl/extractor/xvideos.py
+++ b/youtube_dl/extractor/xvideos.py
@@ -6,8 +6,10 @@ from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote
from ..utils import (
clean_html,
- ExtractorError,
determine_ext,
+ ExtractorError,
+ int_or_none,
+ parse_duration,
)
@@ -20,6 +22,7 @@ class XVideosIE(InfoExtractor):
'id': '4588838',
'ext': 'mp4',
'title': 'Biker Takes his Girl',
+ 'duration': 108,
'age_limit': 18,
}
}
@@ -36,6 +39,11 @@ class XVideosIE(InfoExtractor):
r'<title>(.*?)\s+-\s+XVID', webpage, 'title')
video_thumbnail = self._search_regex(
r'url_bigthumb=(.+?)&amp', webpage, 'thumbnail', fatal=False)
+ video_duration = int_or_none(self._og_search_property(
+ 'duration', webpage, default=None)) or parse_duration(
+ self._search_regex(
+ r'<span[^>]+class=["\']duration["\'][^>]*>.*?(\d[^<]+)',
+ webpage, 'duration', fatal=False))
formats = []
@@ -67,6 +75,7 @@ class XVideosIE(InfoExtractor):
'id': video_id,
'formats': formats,
'title': video_title,
+ 'duration': video_duration,
'thumbnail': video_thumbnail,
'age_limit': 18,
}
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index 4951414..38f82bf 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -258,7 +258,7 @@ class YahooIE(InfoExtractor):
return self.url_result(bc_url, BrightcoveLegacyIE.ie_key())
# Look for Brightcove New Studio embeds
- bc_url = BrightcoveNewIE._extract_url(webpage)
+ bc_url = BrightcoveNewIE._extract_url(self, webpage)
if bc_url:
return self.url_result(bc_url, BrightcoveNewIE.ie_key())
diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py
index fd6268b..eb10621 100644
--- a/youtube_dl/extractor/yandexmusic.py
+++ b/youtube_dl/extractor/yandexmusic.py
@@ -234,7 +234,8 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
'overembed': 'false',
})['playlist']
- tracks, track_ids = playlist['tracks'], map(compat_str, playlist['trackIds'])
+ tracks = playlist['tracks']
+ track_ids = [compat_str(track_id) for track_id in playlist['trackIds']]
# tracks dictionary shipped with playlist.jsx API is limited to 150 tracks,
# missing tracks should be retrieved manually.
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
index e37f237..73ebe57 100644
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -10,12 +10,14 @@ import time
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse_urlencode,
compat_ord,
+ compat_str,
+ compat_urllib_parse_urlencode,
)
from ..utils import (
ExtractorError,
get_element_by_attribute,
+ try_get,
)
@@ -105,7 +107,9 @@ class YoukuIE(InfoExtractor):
if stream.get('channel_type') == 'tail':
continue
format = stream.get('stream_type')
- fileid = stream['stream_fileid']
+ fileid = try_get(
+ stream, lambda x: x['segs'][0]['fileid'],
+ compat_str) or stream['stream_fileid']
fileid_dict[format] = fileid
def get_fileid(format, n):
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index ca40de5..44a3928 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -38,7 +38,6 @@ from ..utils import (
parse_duration,
remove_quotes,
remove_start,
- sanitized_Request,
smuggle_url,
str_to_int,
try_get,
@@ -54,7 +53,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
"""Provide base functions for Youtube extractors"""
_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
_TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
- _PASSWORD_CHALLENGE_URL = 'https://accounts.google.com/signin/challenge/sl/password'
+
+ _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
+ _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
+ _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
+
_NETRC_MACHINE = 'youtube'
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
@@ -96,72 +99,150 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
login_form = self._hidden_inputs(login_page)
- login_form.update({
- 'checkConnection': 'youtube',
- 'Email': username,
- 'Passwd': password,
- })
+ def req(url, f_req, note, errnote):
+ data = login_form.copy()
+ data.update({
+ 'pstMsg': 1,
+ 'checkConnection': 'youtube',
+ 'checkedDomains': 'youtube',
+ 'hl': 'en',
+ 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
+ 'f.req': json.dumps(f_req),
+ 'flowName': 'GlifWebSignIn',
+ 'flowEntry': 'ServiceLogin',
+ })
+ return self._download_json(
+ url, None, note=note, errnote=errnote,
+ transform_source=lambda s: re.sub(r'^[^[]*', '', s),
+ fatal=False,
+ data=urlencode_postdata(data), headers={
+ 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
+ 'Google-Accounts-XSRF': 1,
+ })
- login_results = self._download_webpage(
- self._PASSWORD_CHALLENGE_URL, None,
- note='Logging in', errnote='unable to log in', fatal=False,
- data=urlencode_postdata(login_form))
- if login_results is False:
- return False
+ def warn(message):
+ self._downloader.report_warning(message)
+
+ lookup_req = [
+ username,
+ None, [], None, 'US', None, None, 2, False, True,
+ [
+ None, None,
+ [2, 1, None, 1,
+ 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
+ None, [], 4],
+ 1, [None, None, []], None, None, None, True
+ ],
+ username,
+ ]
- error_msg = self._html_search_regex(
- r'<[^>]+id="errormsg_0_Passwd"[^>]*>([^<]+)<',
- login_results, 'error message', default=None)
- if error_msg:
- raise ExtractorError('Unable to login: %s' % error_msg, expected=True)
+ lookup_results = req(
+ self._LOOKUP_URL, lookup_req,
+ 'Looking up account info', 'Unable to look up account info')
- if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
- raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
+ if lookup_results is False:
+ return False
- # Two-Factor
- # TODO add SMS and phone call support - these require making a request and then prompting the user
+ user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
+ if not user_hash:
+ warn('Unable to extract user hash')
+ return False
- if re.search(r'(?i)<form[^>]+id="challenge"', login_results) is not None:
- tfa_code = self._get_tfa_info('2-step verification code')
+ challenge_req = [
+ user_hash,
+ None, 1, None, [1, None, None, None, [password, None, True]],
+ [
+ None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
+ 1, [None, None, []], None, None, None, True
+ ]]
- if not tfa_code:
- self._downloader.report_warning(
- 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
- '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
- return False
+ challenge_results = req(
+ self._CHALLENGE_URL, challenge_req,
+ 'Logging in', 'Unable to log in')
- tfa_code = remove_start(tfa_code, 'G-')
+ if challenge_results is False:
+ return
- tfa_form_strs = self._form_hidden_inputs('challenge', login_results)
+ login_res = try_get(challenge_results, lambda x: x[0][5], list)
+ if login_res:
+ login_msg = try_get(login_res, lambda x: x[5], compat_str)
+ warn(
+ 'Unable to login: %s' % 'Invalid password'
+ if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg)
+ return False
- tfa_form_strs.update({
- 'Pin': tfa_code,
- 'TrustDevice': 'on',
- })
+ res = try_get(challenge_results, lambda x: x[0][-1], list)
+ if not res:
+ warn('Unable to extract result entry')
+ return False
- tfa_data = urlencode_postdata(tfa_form_strs)
+ tfa = try_get(res, lambda x: x[0][0], list)
+ if tfa:
+ tfa_str = try_get(tfa, lambda x: x[2], compat_str)
+ if tfa_str == 'TWO_STEP_VERIFICATION':
+ # SEND_SUCCESS - TFA code has been successfully sent to phone
+ # QUOTA_EXCEEDED - reached the limit of TFA codes
+ status = try_get(tfa, lambda x: x[5], compat_str)
+ if status == 'QUOTA_EXCEEDED':
+ warn('Exceeded the limit of TFA codes, try later')
+ return False
+
+ tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
+ if not tl:
+ warn('Unable to extract TL')
+ return False
+
+ tfa_code = self._get_tfa_info('2-step verification code')
+
+ if not tfa_code:
+ warn(
+ 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
+ '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
+ return False
+
+ tfa_code = remove_start(tfa_code, 'G-')
+
+ tfa_req = [
+ user_hash, None, 2, None,
+ [
+ 9, None, None, None, None, None, None, None,
+ [None, tfa_code, True, 2]
+ ]]
+
+ tfa_results = req(
+ self._TFA_URL.format(tl), tfa_req,
+ 'Submitting TFA code', 'Unable to submit TFA code')
+
+ if tfa_results is False:
+ return False
+
+ tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
+ if tfa_res:
+ tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
+ warn(
+ 'Unable to finish TFA: %s' % 'Invalid TFA code'
+ if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg)
+ return False
+
+ check_cookie_url = try_get(
+ tfa_results, lambda x: x[0][-1][2], compat_str)
+ else:
+ check_cookie_url = try_get(res, lambda x: x[2], compat_str)
- tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data)
- tfa_results = self._download_webpage(
- tfa_req, None,
- note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
+ if not check_cookie_url:
+ warn('Unable to extract CheckCookie URL')
+ return False
- if tfa_results is False:
- return False
+ check_cookie_results = self._download_webpage(
+ check_cookie_url, None, 'Checking cookie', fatal=False)
- if re.search(r'(?i)<form[^>]+id="challenge"', tfa_results) is not None:
- self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
- return False
- if re.search(r'(?i)<form[^>]+id="gaia_loginform"', tfa_results) is not None:
- self._downloader.report_warning('unable to log in - did the page structure change?')
- return False
- if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
- self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
- return False
+ if check_cookie_results is False:
+ return False
- if re.search(r'(?i)<form[^>]+id="gaia_loginform"', login_results) is not None:
- self._downloader.report_warning('unable to log in: bad username or password')
+ if 'https://myaccount.google.com/' not in check_cookie_results:
+ warn('Unable to log in')
return False
+
return True
def _real_initialize(self):
@@ -317,60 +398,60 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
# DASH mp4 video
- '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
- '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
- '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
- '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
- '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
- '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
- '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
- '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
- '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
- '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},
- '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},
- '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
+ '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
+ '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
+ '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
+ '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
# Dash mp4 audio
- '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
- '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
- '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
- '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'},
- '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'},
- '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'preference': -50, 'container': 'm4a_dash'},
- '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'preference': -50, 'container': 'm4a_dash'},
+ '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
+ '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
+ '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
+ '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
+ '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
+ '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
+ '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
# Dash webm
- '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
- '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
- '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
- '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
- '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
- '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
- '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9', 'preference': -40},
- '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
- '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
- '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
- '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
- '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
- '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
- '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
- '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
+ '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
+ '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
- '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
- '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
- '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
- '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
- '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
- '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
+ '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+ '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+ '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+ '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
# Dash webm audio
- '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
- '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
+ '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
+ '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
# Dash webm audio with opus inside
- '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
- '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
- '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
+ '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
+ '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
+ '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
# RTMP (unnamed)
'_rtmp': {'protocol': 'rtmp'},
@@ -963,7 +1044,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _extract_signature_function(self, video_id, player_url, example_sig):
id_m = re.match(
- r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$',
+ r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
player_url)
if not id_m:
raise ExtractorError('Cannot identify player %r' % player_url)
@@ -1253,25 +1334,39 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_id = mobj.group(2)
return video_id
- def _extract_from_m3u8(self, manifest_url, video_id):
- url_map = {}
-
- def _get_urls(_manifest):
- lines = _manifest.split('\n')
- urls = filter(lambda l: l and not l.startswith('#'),
- lines)
- return urls
- manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
- formats_urls = _get_urls(manifest)
- for format_url in formats_urls:
- itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
- url_map[itag] = format_url
- return url_map
-
def _extract_annotations(self, video_id):
url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
+ @staticmethod
+ def _extract_chapters(description, duration):
+ if not description:
+ return None
+ chapter_lines = re.findall(
+ r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
+ description)
+ if not chapter_lines:
+ return None
+ chapters = []
+ for next_num, (chapter_line, time_point) in enumerate(
+ chapter_lines, start=1):
+ start_time = parse_duration(time_point)
+ if start_time is None:
+ continue
+ end_time = (duration if next_num == len(chapter_lines)
+ else parse_duration(chapter_lines[next_num][1]))
+ if end_time is None:
+ continue
+ chapter_title = re.sub(
+ r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
+ chapter_title = re.sub(r'\s+', ' ', chapter_title)
+ chapters.append({
+ 'start_time': start_time,
+ 'end_time': end_time,
+ 'title': chapter_title,
+ })
+ return chapters
+
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
@@ -1414,9 +1509,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_title = '_'
# description
- video_description = get_element_by_id("eow-description", video_webpage)
+ description_original = video_description = get_element_by_id("eow-description", video_webpage)
if video_description:
- video_description = re.sub(r'''(?x)
+ description_original = video_description = re.sub(r'''(?x)
<a\s+
(?:[a-zA-Z-]+="[^"]*"\s+)*?
(?:title|href)="([^"]+)"\s+
@@ -1573,18 +1668,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if self._downloader.params.get('writeannotations', False):
video_annotations = self._extract_annotations(video_id)
- def _map_to_format_list(urlmap):
- formats = []
- for itag, video_real_url in urlmap.items():
- dct = {
- 'format_id': itag,
- 'url': video_real_url,
- 'player_url': player_url,
- }
- if itag in self._formats:
- dct.update(self._formats[itag])
- formats.append(dct)
- return formats
+ chapters = self._extract_chapters(description_original, video_duration)
if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
self.report_rtmp_download()
@@ -1657,7 +1741,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
player_desc = 'flash player %s' % player_version
else:
player_version = self._search_regex(
- [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'],
+ [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
+ r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'],
player_url,
'html5 player', fatal=False)
player_desc = 'html5 player %s' % player_version
@@ -1718,11 +1803,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
formats.append(dct)
elif video_info.get('hlsvp'):
manifest_url = video_info['hlsvp'][0]
- url_map = self._extract_from_m3u8(manifest_url, video_id)
- formats = _map_to_format_list(url_map)
- # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
- for a_format in formats:
+ formats = []
+ m3u8_formats = self._extract_m3u8_formats(
+ manifest_url, video_id, 'mp4', fatal=False)
+ for a_format in m3u8_formats:
+ itag = self._search_regex(
+ r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
+ if itag:
+ a_format['format_id'] = itag
+ if itag in self._formats:
+ dct = self._formats[itag].copy()
+ dct.update(a_format)
+ a_format = dct
+ a_format['player_url'] = player_url
+ # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
+ formats.append(a_format)
else:
unavailable_message = self._html_search_regex(
r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
@@ -1806,6 +1902,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'duration': video_duration,
'age_limit': 18 if age_gate else 0,
'annotations': video_annotations,
+ 'chapters': chapters,
'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
'view_count': view_count,
'like_count': like_count,
diff --git a/youtube_dl/extractor/zaq1.py b/youtube_dl/extractor/zaq1.py
new file mode 100644
index 0000000..889aff5
--- /dev/null
+++ b/youtube_dl/extractor/zaq1.py
@@ -0,0 +1,101 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ unified_timestamp,
+)
+
+
+class Zaq1IE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?zaq1\.pl/video/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'http://zaq1.pl/video/xev0e',
+ 'md5': '24a5eb3f052e604ae597c4d0d19b351e',
+ 'info_dict': {
+ 'id': 'xev0e',
+ 'title': 'DJ NA WESELE. TANIEC Z FIGURAMI.węgrów/sokołów podlaski/siedlce/mińsk mazowiecki/warszawa',
+ 'description': 'www.facebook.com/weseledjKontakt: 728 448 199 / 505 419 147',
+ 'ext': 'mp4',
+ 'duration': 511,
+ 'timestamp': 1490896361,
+ 'uploader': 'Anonim',
+ 'upload_date': '20170330',
+ 'view_count': int,
+ }
+ }, {
+ # malformed JSON-LD
+ 'url': 'http://zaq1.pl/video/x81vn',
+ 'info_dict': {
+ 'id': 'x81vn',
+ 'title': 'SEKRETNE ŻYCIE WALTERA MITTY',
+ 'ext': 'mp4',
+ 'duration': 6234,
+ 'timestamp': 1493494860,
+ 'uploader': 'Anonim',
+ 'upload_date': '20170429',
+ 'view_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['Failed to parse JSON'],
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = self._search_regex(
+ r'data-video-url=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+ 'video url', group='url')
+
+ info = self._search_json_ld(webpage, video_id, fatal=False)
+
+ def extract_data(field, name, fatal=False):
+ return self._search_regex(
+ r'data-%s=(["\'])(?P<field>(?:(?!\1).)+)\1' % field,
+ webpage, field, fatal=fatal, group='field')
+
+ if not info.get('title'):
+ info['title'] = extract_data('file-name', 'title', fatal=True)
+
+ if not info.get('duration'):
+ info['duration'] = int_or_none(extract_data('duration', 'duration'))
+
+ if not info.get('thumbnail'):
+ info['thumbnail'] = extract_data('photo-url', 'thumbnail')
+
+ if not info.get('timestamp'):
+ info['timestamp'] = unified_timestamp(self._html_search_meta(
+ 'uploadDate', webpage, 'timestamp'))
+
+ if not info.get('interactionCount'):
+ info['view_count'] = int_or_none(self._html_search_meta(
+ 'interactionCount', webpage, 'view count'))
+
+ uploader = self._html_search_regex(
+ r'Wideo dodał:\s*<a[^>]*>([^<]+)</a>', webpage, 'uploader',
+ fatal=False)
+
+ width = int_or_none(self._html_search_meta(
+ 'width', webpage, fatal=False))
+ height = int_or_none(self._html_search_meta(
+ 'height', webpage, fatal=False))
+
+ info.update({
+ 'id': video_id,
+ 'formats': [{
+ 'url': video_url,
+ 'width': width,
+ 'height': height,
+ 'http_headers': {
+ 'Referer': url,
+ },
+ }],
+ 'uploader': uploader,
+ })
+
+ return info