author     Rogério Brito <rbrito@ime.usp.br>  2017-05-19 03:21:46 -0300
committer  Rogério Brito <rbrito@ime.usp.br>  2017-05-19 03:21:46 -0300
commit     532a08904ffbacc5e5ccf99edb660c5f37ddb213 (patch)
tree       12399229cfa94b847a3ba07fcdca2336e5adc5b0
parent     97a8fc3ae80fb363c69c2e6b8c29b5373ac72aea (diff)
parent     d9d7cd0e85dc712461d9185db9df9d6c900a573b (diff)
download   youtube-dl-532a08904ffbacc5e5ccf99edb660c5f37ddb213.zip
           youtube-dl-532a08904ffbacc5e5ccf99edb660c5f37ddb213.tar.gz
           youtube-dl-532a08904ffbacc5e5ccf99edb660c5f37ddb213.tar.bz2
Merge tag 'upstream/2017.05.18.1'
Upstream version 2017.05.18.1
-rw-r--r--  ChangeLog | 317
-rw-r--r--  Makefile | 2
-rw-r--r--  README.md | 25
-rw-r--r--  README.txt | 25
-rw-r--r--  docs/supportedsites.md | 48
-rw-r--r--  test/test_InfoExtractor.py | 315
-rw-r--r--  test/test_YoutubeDL.py | 4
-rw-r--r--  test/test_download.py | 18
-rw-r--r--  test/test_subtitles.py | 8
-rw-r--r--  test/test_utils.py | 54
-rw-r--r--  test/test_youtube_chapters.py | 268
-rwxr-xr-x  youtube-dl | bin 1497486 -> 1535435 bytes
-rw-r--r--  youtube-dl.1 | 26
-rw-r--r--  youtube-dl.bash-completion | 2
-rw-r--r--  youtube-dl.fish | 7
-rw-r--r--  youtube-dl.zsh | 2
-rwxr-xr-x  youtube_dl/YoutubeDL.py | 26
-rw-r--r--  youtube_dl/__init__.py | 1
-rw-r--r--  youtube_dl/compat.py | 2
-rw-r--r--  youtube_dl/downloader/common.py | 34
-rw-r--r--  youtube_dl/downloader/dash.py | 43
-rw-r--r--  youtube_dl/downloader/external.py | 12
-rw-r--r--  youtube_dl/downloader/f4m.py | 33
-rw-r--r--  youtube_dl/downloader/fragment.py | 122
-rw-r--r--  youtube_dl/downloader/hls.py | 51
-rw-r--r--  youtube_dl/downloader/ism.py | 34
-rw-r--r--  youtube_dl/downloader/rtmp.py | 2
-rw-r--r--  youtube_dl/extractor/adn.py | 136
-rw-r--r--  youtube_dl/extractor/adobepass.py | 69
-rw-r--r--  youtube_dl/extractor/adultswim.py | 281
-rw-r--r--  youtube_dl/extractor/aenetworks.py | 40
-rw-r--r--  youtube_dl/extractor/afreecatv.py | 142
-rw-r--r--  youtube_dl/extractor/airmozilla.py | 28
-rw-r--r--  youtube_dl/extractor/aljazeera.py | 9
-rw-r--r--  youtube_dl/extractor/allocine.py | 44
-rw-r--r--  youtube_dl/extractor/amp.py | 26
-rw-r--r--  youtube_dl/extractor/anvato.py | 66
-rw-r--r--  youtube_dl/extractor/appleconnect.py | 4
-rw-r--r--  youtube_dl/extractor/appletrailers.py | 5
-rw-r--r--  youtube_dl/extractor/archiveorg.py | 4
-rw-r--r--  youtube_dl/extractor/arte.py | 5
-rw-r--r--  youtube_dl/extractor/atresplayer.py | 2
-rw-r--r--  youtube_dl/extractor/audioboom.py | 2
-rw-r--r--  youtube_dl/extractor/azubu.py | 140
-rw-r--r--  youtube_dl/extractor/bandcamp.py | 12
-rw-r--r--  youtube_dl/extractor/bbc.py | 10
-rw-r--r--  youtube_dl/extractor/beeg.py | 2
-rw-r--r--  youtube_dl/extractor/bilibili.py | 5
-rw-r--r--  youtube_dl/extractor/bleacherreport.py | 10
-rw-r--r--  youtube_dl/extractor/br.py | 2
-rw-r--r--  youtube_dl/extractor/brightcove.py | 111
-rw-r--r--  youtube_dl/extractor/canalc2.py | 5
-rw-r--r--  youtube_dl/extractor/canalplus.py | 43
-rw-r--r--  youtube_dl/extractor/canvas.py | 1
-rw-r--r--  youtube_dl/extractor/cbc.py | 6
-rw-r--r--  youtube_dl/extractor/cbslocal.py | 4
-rwxr-xr-x  youtube_dl/extractor/cda.py | 52
-rw-r--r--  youtube_dl/extractor/ceskatelevize.py | 106
-rw-r--r--  youtube_dl/extractor/chaturbate.py | 16
-rw-r--r--  youtube_dl/extractor/clipfish.py | 2
-rw-r--r--  youtube_dl/extractor/collegerama.py | 3
-rw-r--r--  youtube_dl/extractor/common.py | 286
-rw-r--r--  youtube_dl/extractor/condenast.py | 81
-rw-r--r--  youtube_dl/extractor/coub.py | 5
-rw-r--r--  youtube_dl/extractor/crackle.py | 7
-rw-r--r--  youtube_dl/extractor/crunchyroll.py | 12
-rw-r--r--  youtube_dl/extractor/cspan.py | 15
-rw-r--r--  youtube_dl/extractor/curiositystream.py | 55
-rw-r--r--  youtube_dl/extractor/cwtv.py | 7
-rw-r--r--  youtube_dl/extractor/dailymail.py | 12
-rw-r--r--  youtube_dl/extractor/dailymotion.py | 126
-rw-r--r--  youtube_dl/extractor/democracynow.py | 3
-rw-r--r--  youtube_dl/extractor/discoveryvr.py | 59
-rw-r--r--  youtube_dl/extractor/dotsub.py | 2
-rw-r--r--  youtube_dl/extractor/douyutv.py | 86
-rw-r--r--  youtube_dl/extractor/drtv.py | 34
-rw-r--r--  youtube_dl/extractor/extractors.py | 57
-rw-r--r--  youtube_dl/extractor/foxsports.py | 9
-rw-r--r--  youtube_dl/extractor/francetv.py | 218
-rw-r--r--  youtube_dl/extractor/funimation.py | 273
-rw-r--r--  youtube_dl/extractor/funnyordie.py | 3
-rw-r--r--  youtube_dl/extractor/gamespot.py | 3
-rw-r--r--  youtube_dl/extractor/gdcvault.py | 15
-rw-r--r--  youtube_dl/extractor/generic.py | 262
-rw-r--r--  youtube_dl/extractor/go.py | 49
-rw-r--r--  youtube_dl/extractor/go90.py | 126
-rw-r--r--  youtube_dl/extractor/hbo.py | 16
-rw-r--r--  youtube_dl/extractor/imdb.py | 5
-rw-r--r--  youtube_dl/extractor/infoq.py | 4
-rw-r--r--  youtube_dl/extractor/instagram.py | 8
-rw-r--r--  youtube_dl/extractor/iqiyi.py | 26
-rw-r--r--  youtube_dl/extractor/itv.py | 28
-rw-r--r--  youtube_dl/extractor/kaltura.py | 27
-rw-r--r--  youtube_dl/extractor/laola1tv.py | 97
-rw-r--r--  youtube_dl/extractor/leeco.py | 111
-rw-r--r--  youtube_dl/extractor/lego.py | 2
-rw-r--r--  youtube_dl/extractor/limelight.py | 53
-rw-r--r--  youtube_dl/extractor/liveleak.py | 83
-rw-r--r--  youtube_dl/extractor/mediaset.py | 118
-rw-r--r--  youtube_dl/extractor/medici.py | 70
-rw-r--r--  youtube_dl/extractor/mixcloud.py | 38
-rw-r--r--  youtube_dl/extractor/myspace.py | 100
-rw-r--r--  youtube_dl/extractor/nbc.py | 98
-rw-r--r--  youtube_dl/extractor/nonktube.py | 33
-rw-r--r--  youtube_dl/extractor/noovo.py | 97
-rw-r--r--  youtube_dl/extractor/nowness.py | 2
-rw-r--r--  youtube_dl/extractor/npo.py | 11
-rw-r--r--  youtube_dl/extractor/nrk.py | 25
-rw-r--r--  youtube_dl/extractor/nuevo.py | 5
-rw-r--r--  youtube_dl/extractor/odnoklassniki.py | 32
-rw-r--r--  youtube_dl/extractor/openload.py | 73
-rw-r--r--  youtube_dl/extractor/orf.py | 109
-rw-r--r--  youtube_dl/extractor/packtpub.py | 171
-rw-r--r--  youtube_dl/extractor/pbs.py | 31
-rw-r--r--  youtube_dl/extractor/periscope.py | 7
-rw-r--r--  youtube_dl/extractor/porn91.py | 32
-rw-r--r--  youtube_dl/extractor/pornhub.py | 5
-rw-r--r--  youtube_dl/extractor/r7.py | 3
-rw-r--r--  youtube_dl/extractor/rai.py | 509
-rw-r--r--  youtube_dl/extractor/rbmaradio.py | 6
-rw-r--r--  youtube_dl/extractor/rmcdecouverte.py | 26
-rw-r--r--  youtube_dl/extractor/rtl2.py | 110
-rw-r--r--  youtube_dl/extractor/rudo.py | 2
-rw-r--r--  youtube_dl/extractor/streamable.py | 6
-rw-r--r--  youtube_dl/extractor/streamango.py | 64
-rw-r--r--  youtube_dl/extractor/ted.py | 2
-rw-r--r--  youtube_dl/extractor/theplatform.py | 21
-rw-r--r--  youtube_dl/extractor/thescene.py | 36
-rw-r--r--  youtube_dl/extractor/thesun.py | 32
-rw-r--r--  youtube_dl/extractor/turner.py | 9
-rw-r--r--  youtube_dl/extractor/tv2hu.py | 62
-rw-r--r--  youtube_dl/extractor/tv5mondeplus.py | 79
-rw-r--r--  youtube_dl/extractor/tvp.py | 3
-rw-r--r--  youtube_dl/extractor/tvplay.py | 6
-rw-r--r--  youtube_dl/extractor/tvplayer.py | 35
-rw-r--r--  youtube_dl/extractor/udemy.py | 79
-rw-r--r--  youtube_dl/extractor/upskill.py | 176
-rw-r--r--  youtube_dl/extractor/vevo.py | 17
-rw-r--r--  youtube_dl/extractor/vice.py | 154
-rw-r--r--  youtube_dl/extractor/viceland.py | 11
-rw-r--r--  youtube_dl/extractor/videopress.py | 9
-rw-r--r--  youtube_dl/extractor/vidio.py | 7
-rw-r--r--  youtube_dl/extractor/vidzi.py | 11
-rw-r--r--  youtube_dl/extractor/vier.py | 117
-rw-r--r--  youtube_dl/extractor/viewlift.py | 2
-rw-r--r--  youtube_dl/extractor/viewster.py | 3
-rw-r--r--  youtube_dl/extractor/vlive.py | 4
-rw-r--r--  youtube_dl/extractor/vrt.py | 1
-rw-r--r--  youtube_dl/extractor/vrv.py | 212
-rw-r--r--  youtube_dl/extractor/vshare.py | 38
-rw-r--r--  youtube_dl/extractor/washingtonpost.py | 6
-rw-r--r--  youtube_dl/extractor/wistia.py | 22
-rw-r--r--  youtube_dl/extractor/worldstarhiphop.py | 46
-rw-r--r--  youtube_dl/extractor/wsj.py | 52
-rw-r--r--  youtube_dl/extractor/xfileshare.py | 86
-rw-r--r--  youtube_dl/extractor/xtube.py | 23
-rw-r--r--  youtube_dl/extractor/xvideos.py | 11
-rw-r--r--  youtube_dl/extractor/yahoo.py | 2
-rw-r--r--  youtube_dl/extractor/yandexmusic.py | 3
-rw-r--r--  youtube_dl/extractor/youku.py | 8
-rw-r--r--  youtube_dl/extractor/youtube.py | 363
-rw-r--r--  youtube_dl/extractor/zaq1.py | 101
-rw-r--r--  youtube_dl/jsinterp.py | 38
-rw-r--r--  youtube_dl/options.py | 17
-rw-r--r--  youtube_dl/postprocessor/ffmpeg.py | 33
-rw-r--r--  youtube_dl/postprocessor/metadatafromtitle.py | 6
-rw-r--r--  youtube_dl/socks.py | 5
-rw-r--r--  youtube_dl/utils.py | 215
-rw-r--r--  youtube_dl/version.py | 2
169 files changed, 6639 insertions, 2448 deletions
diff --git a/ChangeLog b/ChangeLog
index 07725b1..1637876 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,320 @@
+version 2017.05.18.1
+
+Core
+* [jsinterp] Fix typo and cleanup regular expressions (#13134)
+
+
+version 2017.05.18
+
+Core
++ [jsinterp] Add support for quoted names and indexers (#13123, #13124, #13125,
+ #13126, #13128, #13129, #13130, #13131, #13132)
++ [extractor/common] Add support for schemeless URLs in _extract_wowza_formats
+ (#13088, #13092)
++ [utils] Recognize more audio codecs (#13081)
+
+Extractors
++ [vier] Extract more metadata (#12539)
+* [vier] Improve extraction (#12801)
+ + Add support for authentication
+ * Bypass authentication when no credentials provided
+ * Improve extraction robustness
+* [dailymail] Fix sources extraction (#13057)
+* [dailymotion] Extend URL regular expression (#13079)
+
+
+version 2017.05.14
+
+Core
++ [extractor/common] Respect Width and Height attributes in ISM manifests
++ [postprocessor/metadatafromtitle] Add support for regular expression syntax for
+ --metadata-from-title (#13065)
+
+Extractors
++ [mediaset] Add support for video.mediaset.it (#12708, #12964)
+* [orf:radio] Fix extraction (#11643, #12926)
+* [aljazeera] Extend URL regular expression (#13053)
+* [imdb] Relax URL regular expression (#13056)
++ [francetv] Add support for mobile.france.tv (#13068)
++ [upskill] Add support for upskillcourses.com (#13043)
+* [thescene] Fix extraction (#13061)
+* [condenast] Improve embed support
+* [liveleak] Fix extraction (#12053)
++ [douyu] Support Douyu shows (#12228)
+* [myspace] Improve URL regular expression (#13040)
+* [adultswim] Use desktop platform in assets URL (#13041)
+
+
+version 2017.05.09
+
+Core
+* [YoutubeDL] Force --restrict-filenames when no locale is set on all python
+ versions (#13027)
+
+Extractors
+* [francetv] Adapt to site redesign (#13034)
++ [packtpub] Add support for authentication (#12622)
+* [drtv] Lower preference for SignLanguage formats (#13013, #13016)
++ [cspan] Add support for brightcove live embeds (#13028)
+* [vrv] Extract DASH formats and subtitles
+* [funimation] Fix authentication (#13021)
+* [adultswim] Fix extraction (#8640, #10950, #11042, #12121)
+ + Add support for Adobe Pass authentication
+ + Add support for live streams
+ + Add support for show pages
+* [turner] Extract thumbnail, is_live and strip description
++ [nonktube] Add support for nonktube.com (#8647, #13024)
++ [nuevo] Pass headers to _extract_nuevo
+* [nbc] Improve extraction (#12364)
+
+
+version 2017.05.07
+
+Core
+* [extractor/common] Fix typo in _extract_akamai_formats
++ [postprocessor/ffmpeg] Embed chapters into media file with --add-metadata
++ [extractor/common] Introduce chapters meta field
+
+Extractors
+* [youtube] Fix authentication (#12820, #12927, #12973, #12992, #12993, #12995,
+ #13003)
+* [bilibili] Fix video downloading (#13001)
+* [rmcdecouverte] Fix extraction (#12937)
+* [theplatform] Extract chapters
+* [bandcamp] Fix thumbnail extraction (#12980)
+* [pornhub] Extend URL regular expression (#12996)
++ [youtube] Extract chapters
++ [nrk] Extract chapters
++ [vice] Add support for ooyala embeds in article pages
++ [vice] Support vice articles (#12968)
+* [vice] Fix extraction for non en_us videos (#12967)
+* [gdcvault] Fix extraction for some videos (#12733)
+* [pbs] Improve multipart video support (#12981)
+* [laola1tv] Fix extraction (#12880)
++ [cda] Support birthday verification (#12789)
+* [leeco] Fix extraction (#12974)
++ [pbs] Extract chapters
+* [amp] Improve thumbnail and subtitles extraction
+* [foxsports] Fix extraction (#12945)
+- [coub] Remove comment count extraction (#12941)
+
+
+version 2017.05.01
+
+Core
++ [extractor/common] Extract view count from JSON-LD
+* [utils] Improve unified_timestamp
++ [utils] Add video/mp2t to mimetype2ext
+* [downloader/external] Properly handle live stream downloading cancellation
+ (#8932)
++ [utils] Add support for unicode whitespace in clean_html on python 2 (#12906)
+
+Extractors
+* [infoq] Make audio format extraction non-fatal (#12938)
+* [brightcove] Allow whitespace around attribute names in embedded code
++ [zaq1] Add support for zaq1.pl (#12693)
++ [xvideos] Extract duration (#12828)
+* [vevo] Fix extraction (#12879)
++ [noovo] Add support for noovo.ca (#12792)
++ [washingtonpost] Add support for embeds (#12699)
+* [yandexmusic:playlist] Fix extraction for python 3 (#12888)
+* [anvato] Improve extraction (#12913)
+ * Promote to regular shortcut based extractor
+ * Add mcp to access key mapping table
+ * Add support for embeds extraction
+ * Add support for anvato embeds in generic extractor
+* [xtube] Fix extraction for older FLV videos (#12734)
+* [tvplayer] Fix extraction (#12908)
+
+
+version 2017.04.28
+
+Core
++ [adobepass] Use geo verification headers for all requests
+- [downloader/fragment] Remove assert for resume_len when no fragments
+ downloaded
++ [extractor/common] Add manifest_url for explicit group rendition formats
+* [extractor/common] Fix manifest_url for m3u8 formats
+- [extractor/common] Don't list master m3u8 playlists in format list (#12832)
+
+Extractors
+* [aenetworks] Fix extraction for shows with single season
++ [go] Add support for Disney, DisneyJunior and DisneyXD show pages
+* [youtube] Recognize new locale-based player URLs (#12885)
++ [streamable] Add support for new embedded URL schema (#12844)
+* [arte:+7] Relax URL regular expression (#12837)
+
+
+version 2017.04.26
+
+Core
+* Introduce --keep-fragments for keeping fragments of fragmented downloads
+ on disk after download is finished
+* [YoutubeDL] Fix output template for missing timestamp (#12796)
+* [socks] Handle cases where credentials are required but missing
+* [extractor/common] Improve HLS extraction (#12211)
+ * Extract m3u8 parsing to separate method
+ * Improve rendition groups extraction
+  * Build stream name according to stream GROUP-ID
+ * Ignore reference to AUDIO group without URI when stream has no CODECS
+ * Use float for scaled tbr in _parse_m3u8_formats
+* [utils] Add support for TTML styles in dfxp2srt
+* [downloader/hls] No need to download keys for fragments that have already
+  been downloaded
+* [downloader/fragment] Improve fragment downloading
+ * Resume immediately
+ * Don't concatenate fragments and decrypt them on every resume
+ * Optimize disk storage usage, don't store intermediate fragments on disk
+ * Store bookkeeping download state file
++ [extractor/common] Add support for multiple getters in try_get
++ [extractor/common] Add support for video of WebPage context in _json_ld
+ (#12778)
++ [extractor/common] Relax JWPlayer regular expression and remove
+ duplicate URLs (#12768)
+
+Extractors
+* [iqiyi] Fix extraction of Yule videos
+* [vidio] Improve extraction and sort formats
++ [brightcove] Match only video elements with data-video-id attribute
+* [iqiyi] Fix playlist detection (#12504)
+- [azubu] Remove extractor (#12813)
+* [porn91] Fix extraction (#12814)
+* [vidzi] Fix extraction (#12793)
++ [amp] Extract error message (#12795)
++ [xfileshare] Add support for gorillavid.com and daclips.com (#12776)
+* [instagram] Fix extraction (#12777)
++ [generic] Support Brightcove videos in <iframe> (#12482)
++ [brightcove] Support URLs with bcpid instead of playerID (#12482)
+* [brightcove] Fix _extract_url (#12782)
++ [odnoklassniki] Extract HLS formats
+
+
+version 2017.04.17
+
+Extractors
+* [limelight] Improve extraction of LimelightEmbeddedPlayerFlash media embeds and
+ add support for channel and channelList embeds
+* [generic] Extract multiple Limelight embeds (#12761)
++ [itv] Extract series metadata
+* [itv] Fix RTMP formats downloading (#12759)
+* [itv] Use native HLS downloader by default
++ [go90] Extract subtitles (#12752)
++ [go90] Extract series metadata (#12752)
+
+
+version 2017.04.16
+
+Core
+* [YoutubeDL] Apply expand_path after output template substitution
++ [YoutubeDL] Propagate overridden meta fields to extraction results of type
+ url (#11163)
+
+Extractors
++ [generic] Extract RSS entries as url_transparent (#11163)
++ [streamango] Add support for streamango.com (#12643)
++ [wsj:article] Add support for articles (#12558)
+* [brightcove] Relax video tag embeds extraction and validate ambiguous embeds'
+ URLs (#9163, #12005, #12178, #12480)
++ [udemy] Add support for react rendition (#12744)
+
+
+version 2017.04.15
+
+Extractors
+* [youku] Fix fileid extraction (#12741, #12743)
+
+
+version 2017.04.14
+
+Core
++ [downloader/hls] Add basic support for EXT-X-BYTERANGE tag (#10955)
++ [adobepass] Improve Comcast and Verizon login code (#10803)
++ [adobepass] Add support for Verizon (#10803)
+
+Extractors
++ [aenetworks] Add support for specials (#12723)
++ [hbo] Extract HLS formats
++ [go90] Add support for go90.com (#10127)
++ [tv2hu] Add support for tv2.hu (#10509)
++ [generic] Exclude URLs with xml ext from valid video URLs (#10768, #11654)
+* [youtube] Improve HLS formats extraction
+* [afreecatv] Fix extraction for videos with different key layout (#12718)
+- [youtube] Remove explicit preference for audio-only and video-only formats in
+ order not to break sorting when new formats appear
+* [canalplus] Bypass geo restriction
+
+
+version 2017.04.11
+
+Extractors
+* [afreecatv] Fix extraction (#12706)
++ [generic] Add support for <object> YouTube embeds (#12637)
+* [bbccouk] Treat bitrate as audio+video bitrate in media selector
++ [bbccouk] Skip unrecognized formats in media selector (#12701)
++ [bbccouk] Add support for https protocol in media selector (#12701)
+* [curiositystream] Fix extraction (#12638)
+* [adn] Update subtitle decryption key
+* [chaturbate] Fix extraction (#12665, #12688, #12690)
+
+
+version 2017.04.09
+
+Extractors
++ [medici] Add support for medici.tv (#3406)
++ [rbmaradio] Add support for redbullradio.com URLs (#12687)
++ [npo:live] Add support for default URL (#12555)
+* [mixcloud:playlist] Fix title, description and view count extraction (#12582)
++ [thesun] Add support for thesun.co.uk (#11298, #12674)
++ [ceskatelevize:porady] Add support for porady (#7411, #12645)
+* [ceskatelevize] Improve extraction and remove URL replacement hacks
++ [kaltura] Add support for iframe embeds (#12679)
+* [airmozilla] Fix extraction (#12670)
+* [wshh] Extract html5 entries and delegate to generic extractor (#12676)
++ [raiplay] Extract subtitles
++ [xfileshare] Add support for vidlo.us (#12660)
++ [xfileshare] Add support for vidbom.com (#12661)
++ [aenetworks] Add more video URL regular expressions (#12657)
++ [odnoklassniki] Fix format sorting for 1080p quality
++ [rtl2] Add support for you.rtl2.de (#10257)
++ [vshare] Add support for vshare.io (#12278)
+
+
+version 2017.04.03
+
+Core
++ [extractor/common] Add censorship check for TransTelekom ISP
+* [extractor/common] Move censorship checks to a separate method
+
+Extractors
++ [discoveryvr] Add support for discoveryvr.com (#12578)
++ [tv5mondeplus] Add support for tv5mondeplus.com (#11386)
++ [periscope] Add support for pscp.tv URLs (#12618, #12625)
+
+
+version 2017.04.02
+
+Core
+* [YoutubeDL] Return early when extraction of url_transparent fails
+
+Extractors
+* [rai] Fix and improve extraction (#11790)
++ [vrv] Add support for series pages
+* [limelight] Improve extraction for audio only formats
+* [funimation] Fix extraction (#10696, #11773)
++ [xfileshare] Add support for vidabc.com (#12589)
++ [xfileshare] Improve extraction and extract hls formats
++ [crunchyroll] Pass geo verification proxy
++ [cwtv] Extract ISM formats
++ [tvplay] Bypass geo restriction
++ [vrv] Add support for vrv.co
++ [packtpub] Add support for packtpub.com (#12610)
++ [generic] Pass base_url to _parse_jwplayer_data
++ [adn] Add support for animedigitalnetwork.fr (#4866)
++ [allocine] Extract more metadata
+* [allocine] Fix extraction (#12592)
+* [openload] Fix extraction
+
+
version 2017.03.26
Core
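One core change above lends itself to a quick illustration: the 2017.04.26 entry "Add support for multiple getters in try_get" means the helper now walks a list of accessor callables and returns the first value that resolves without raising. A minimal standalone sketch of that behaviour (a hypothetical re-implementation, not the actual code in youtube_dl/utils.py, which also accounts for compat string types):

    # Sketch of try_get with multiple getters (hypothetical re-implementation).
    def try_get(src, getter, expected_type=None):
        # Accept a single callable or a list/tuple of callables.
        if not isinstance(getter, (list, tuple)):
            getter = [getter]
        for get in getter:
            try:
                v = get(src)
            except (AttributeError, KeyError, TypeError, IndexError):
                pass
            else:
                if expected_type is None or isinstance(v, expected_type):
                    return v

    data = {'media': {'title': 'Example'}}
    # The first getter raises KeyError, so the second one supplies the value.
    title = try_get(data, [lambda x: x['video']['title'],
                           lambda x: x['media']['title']], str)
    assert title == 'Example'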
diff --git a/Makefile b/Makefile
index 9d1ddc9..0235563 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
all: youtube-dl README.md CONTRIBUTING.md README.txt youtube-dl.1 youtube-dl.bash-completion youtube-dl.zsh youtube-dl.fish supportedsites
clean:
- rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part* *.info.json *.mp4 *.m4a *.flv *.mp3 *.avi *.mkv *.webm *.3gp *.wav *.ape *.swf *.jpg *.png CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe
+ rm -rf youtube-dl.1.temp.md youtube-dl.1 youtube-dl.bash-completion README.txt MANIFEST build/ dist/ .coverage cover/ youtube-dl.tar.gz youtube-dl.zsh youtube-dl.fish youtube_dl/extractor/lazy_extractors.py *.dump *.part* *.ytdl *.info.json *.mp4 *.m4a *.flv *.mp3 *.avi *.mkv *.webm *.3gp *.wav *.ape *.swf *.jpg *.png CONTRIBUTING.md.tmp ISSUE_TEMPLATE.md.tmp youtube-dl youtube-dl.exe
find . -name "*.pyc" -delete
find . -name "*.class" -delete
diff --git a/README.md b/README.md
index 86b4478..dc0be1f 100644
--- a/README.md
+++ b/README.md
@@ -181,12 +181,15 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo
-R, --retries RETRIES Number of retries (default is 10), or
"infinite".
--fragment-retries RETRIES Number of retries for a fragment (default
- is 10), or "infinite" (DASH and hlsnative
- only)
- --skip-unavailable-fragments Skip unavailable fragments (DASH and
- hlsnative only)
+ is 10), or "infinite" (DASH, hlsnative and
+ ISM)
+ --skip-unavailable-fragments Skip unavailable fragments (DASH, hlsnative
+ and ISM)
--abort-on-unavailable-fragment Abort downloading when some fragment is not
available
+ --keep-fragments Keep downloaded fragments on disk after
+ downloading is finished; fragments are
+ erased by default
--buffer-size SIZE Size of download buffer (e.g. 1024 or 16K)
(default is 1024)
--no-resize-buffer Do not automatically adjust the buffer
@@ -397,12 +400,14 @@ Alternatively, refer to the [developer instructions](#developer-instructions) fo
--add-metadata Write metadata to the video file
--metadata-from-title FORMAT Parse additional metadata like song title /
artist from the video title. The format
- syntax is the same as --output, the parsed
- parameters replace existing values.
- Additional templates: %(album)s,
- %(artist)s. Example: --metadata-from-title
- "%(artist)s - %(title)s" matches a title
- like "Coldplay - Paradise"
+ syntax is the same as --output. Regular
+ expression with named capture groups may
+ also be used. The parsed parameters replace
+ existing values. Example: --metadata-from-
+ title "%(artist)s - %(title)s" matches a
+ title like "Coldplay - Paradise". Example
+ (regex): --metadata-from-title
+ "(?P<artist>.+?) - (?P<title>.+)"
--xattrs Write metadata to the video file's xattrs
(using dublin core and xdg standards)
--fixup POLICY Automatically correct known faults of the
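The regex form of --metadata-from-title documented above maps named capture groups to metadata fields. Plain Python shows the matching behaviour the help text describes (a sketch of the pattern semantics only, not the post-processor code):

    import re

    # The pattern from the README example; each named group becomes a
    # metadata field whose value replaces the existing one.
    m = re.match(r'(?P<artist>.+?) - (?P<title>.+)', 'Coldplay - Paradise')
    assert m.groupdict() == {'artist': 'Coldplay', 'title': 'Paradise'}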
diff --git a/README.txt b/README.txt
index e0f1c54..129756d 100644
--- a/README.txt
+++ b/README.txt
@@ -210,12 +210,15 @@ Download Options:
-R, --retries RETRIES Number of retries (default is 10), or
"infinite".
--fragment-retries RETRIES Number of retries for a fragment (default
- is 10), or "infinite" (DASH and hlsnative
- only)
- --skip-unavailable-fragments Skip unavailable fragments (DASH and
- hlsnative only)
+ is 10), or "infinite" (DASH, hlsnative and
+ ISM)
+ --skip-unavailable-fragments Skip unavailable fragments (DASH, hlsnative
+ and ISM)
--abort-on-unavailable-fragment Abort downloading when some fragment is not
available
+ --keep-fragments Keep downloaded fragments on disk after
+ downloading is finished; fragments are
+ erased by default
--buffer-size SIZE Size of download buffer (e.g. 1024 or 16K)
(default is 1024)
--no-resize-buffer Do not automatically adjust the buffer
@@ -444,12 +447,14 @@ Post-processing Options:
--add-metadata Write metadata to the video file
--metadata-from-title FORMAT Parse additional metadata like song title /
artist from the video title. The format
- syntax is the same as --output, the parsed
- parameters replace existing values.
- Additional templates: %(album)s,
- %(artist)s. Example: --metadata-from-title
- "%(artist)s - %(title)s" matches a title
- like "Coldplay - Paradise"
+ syntax is the same as --output. Regular
+ expression with named capture groups may
+ also be used. The parsed parameters replace
+ existing values. Example: --metadata-from-
+ title "%(artist)s - %(title)s" matches a
+ title like "Coldplay - Paradise". Example
+ (regex): --metadata-from-title
+ "(?P<artist>.+?) - (?P<title>.+)"
--xattrs Write metadata to the video file's xattrs
(using dublin core and xdg standards)
--fixup POLICY Automatically correct known faults of the
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index e9dbc02..aa6c118 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -28,6 +28,7 @@
- **acast**
- **acast:channel**
- **AddAnime**
+ - **ADN**: Anime Digital Network
- **AdobeTV**
- **AdobeTVChannel**
- **AdobeTVShow**
@@ -44,6 +45,7 @@
- **anderetijden**: npo.nl and ntr.nl
- **AnimeOnDemand**
- **anitube.se**
+ - **Anvato**
- **AnySex**
- **Aparat**
- **AppleConnect**
@@ -80,8 +82,6 @@
- **AZMedien**: AZ Medien videos
- **AZMedienPlaylist**: AZ Medien playlists
- **AZMedienShowPlaylist**: AZ Medien show playlists
- - **Azubu**
- - **AzubuLive**
- **BaiduVideo**: 百度视频
- **bambuser**
- **bambuser:channel**
@@ -126,7 +126,7 @@
- **CamWithHer**
- **canalc2.tv**
- **Canalplus**: canalplus.fr, piwiplus.fr and d8.tv
- - **Canvas**
+ - **Canvas**: canvas.be and een.be
- **CarambaTV**
- **CarambaTVPage**
- **CartoonNetwork**
@@ -144,6 +144,7 @@
- **CCTV**: 央视网
- **CDA**
- **CeskaTelevize**
+ - **CeskaTelevizePorady**
- **channel9**: Channel 9
- **CharlieRose**
- **Chaturbate**
@@ -212,8 +213,10 @@
- **DiscoveryGo**
- **DiscoveryGoPlaylist**
- **DiscoveryNetworksDe**
+ - **DiscoveryVR**
- **Disney**
- **Dotsub**
+ - **DouyuShow**
- **DouyuTV**: 斗鱼
- **DPlay**
- **DPlayIt**
@@ -279,7 +282,8 @@
- **france2.fr:generation-quoi**
- **FranceCulture**
- **FranceInter**
- - **francetv**: France 2, 3, 4, 5 and Ô
+ - **FranceTV**
+ - **FranceTVEmbed**
- **francetvinfo.fr**
- **Freesound**
- **freespeech.org**
@@ -305,6 +309,7 @@
- **Globo**
- **GloboArticle**
- **Go**
+ - **Go90**
- **GodTube**
- **GodTV**
- **Golem**
@@ -429,6 +434,8 @@
- **MDR**: MDR.DE and KiKA
- **media.ccc.de**
- **Medialaan**
+ - **Mediaset**
+ - **Medici**
- **Meipai**: 美拍
- **MelonVOD**
- **META**
@@ -526,6 +533,8 @@
- **NJPWWorld**: 新日本プロレスワールド
- **NobelPrize**
- **Noco**
+ - **NonkTube**
+ - **Noovo**
- **Normalboots**
- **NosVideo**
- **Nova**: TN.cz, Prásk.tv, Nova.cz, Novaplus.cz, FANDA.tv, Krásná.cz and Doma.cz
@@ -572,6 +581,8 @@
- **orf:iptv**: iptv.ORF.at
- **orf:oe1**: Radio Österreich 1
- **orf:tvthek**: ORF TVthek
+ - **PacktPub**
+ - **PacktPubCourse**
- **PandaTV**: 熊猫TV
- **pandora.tv**: 판도라TV
- **parliamentlive.tv**: UK parliament videos
@@ -595,7 +606,6 @@
- **pluralsight**
- **pluralsight:course**
- **plus.google**: Google Plus
- - **pluzz.francetv.fr**
- **podomatic**
- **Pokemon**
- **PolskieRadio**
@@ -629,7 +639,7 @@
- **radiofrance**
- **RadioJavan**
- **Rai**
- - **RaiTV**
+ - **RaiPlay**
- **RBMARadio**
- **RDS**: RDS.ca
- **RedBullTV**
@@ -654,7 +664,9 @@
- **rte**: Raidió Teilifís Éireann TV
- **rte:radio**: Raidió Teilifís Éireann radio
- **rtl.nl**: rtl.nl and rtlxl.nl
- - **RTL2**
+ - **rtl2**
+ - **rtl2:you**
+ - **rtl2:you:series**
- **RTP**
- **RTS**: RTS.ch
- **rtve.es:alacarta**: RTVE a la carta
@@ -736,6 +748,7 @@
- **Steam**
- **Stitcher**
- **Streamable**
+ - **Streamango**
- **streamcloud.eu**
- **StreamCZ**
- **StreetVoice**
@@ -776,6 +789,7 @@
- **TheScene**
- **TheSixtyOne**
- **TheStar**
+ - **TheSun**
- **TheWeatherChannel**
- **ThisAmericanLife**
- **ThisAV**
@@ -809,9 +823,11 @@
- **Tutv**
- **tv.dfb.de**
- **TV2**
+ - **tv2.hu**
- **TV2Article**
- **TV3**
- **TV4**: tv4.se and tv4play.se
+ - **TV5MondePlus**: TV5MONDE+
- **TVA**
- **TVANouvelles**
- **TVANouvellesArticle**
@@ -847,6 +863,8 @@
- **uol.com.br**
- **uplynk**
- **uplynk:preplay**
+ - **Upskill**
+ - **UpskillCourse**
- **Urort**: NRK P3 Urørt
- **URPlay**
- **USANetwork**
@@ -866,9 +884,10 @@
- **VGTV**: VGTV, BTTV, FTV, Aftenposten and Aftonbladet
- **vh1.com**
- **Viafree**
- - **Vice**
+ - **vice**
+ - **vice:article**
+ - **vice:show**
- **Viceland**
- - **ViceShow**
- **Vidbit**
- **Viddler**
- **Videa**
@@ -888,7 +907,7 @@
- **vidme:user**
- **vidme:user:likes**
- **Vidzi**
- - **vier**
+ - **vier**: vier.be and vijf.be
- **vier:videos**
- **ViewLift**
- **ViewLiftEmbed**
@@ -925,7 +944,10 @@
- **Vporn**
- **vpro**: npo.nl and ntr.nl
- **Vrak**
- - **VRT**
+ - **VRT**: deredactie.be, sporza.be, cobra.be and cobra.canvas.be
+ - **vrv**
+ - **vrv:series**
+ - **VShare**
- **vube**: Vube.com
- **VuClip**
- **VVVVID**
@@ -951,9 +973,10 @@
- **wrzuta.pl**
- **wrzuta.pl:playlist**
- **WSJ**: Wall Street Journal
+ - **WSJArticle**
- **XBef**
- **XboxClips**
- - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE
+ - **XFileShare**: XFileShare based sites: DaClips, FileHoot, GorillaVid, MovPod, PowerWatch, Rapidvideo.ws, TheVideoBee, Vidto, Streamin.To, XVIDSTAGE, Vid ABC, VidBom, vidlo
- **XHamster**
- **XHamsterEmbed**
- **xiami:album**: 虾米音乐 - 专辑
@@ -998,6 +1021,7 @@
- **youtube:user**: YouTube.com user videos (URL or "ytuser" keyword)
- **youtube:watchlater**: Youtube watch later list, ":ytwatchlater" for short (requires authentication)
- **Zapiks**
+ - **Zaq1**
- **ZDF**
- **ZDFChannel**
- **zingmp3**: mp3.zing.vn
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index 881197a..6f52e11 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -3,12 +3,13 @@
from __future__ import unicode_literals
# Allow direct execution
+import io
import os
import sys
import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-from test.helper import FakeYDL, expect_dict
+from test.helper import FakeYDL, expect_dict, expect_value
from youtube_dl.extractor.common import InfoExtractor
from youtube_dl.extractor import YoutubeIE, get_info_extractor
from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError
@@ -175,6 +176,318 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
}]
})
+ def test_parse_m3u8_formats(self):
+ _TEST_CASES = [
+ (
+ # https://github.com/rg3/youtube-dl/issues/11507
+ # http://pluzz.francetv.fr/videos/le_ministere.html
+ 'pluzz_francetv_11507',
+ 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais',
+ [{
+ 'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_0_av.m3u8?null=0',
+ 'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais',
+ 'ext': 'mp4',
+ 'format_id': '180',
+ 'protocol': 'm3u8',
+ 'acodec': 'mp4a.40.2',
+ 'vcodec': 'avc1.66.30',
+ 'tbr': 180,
+ 'width': 256,
+ 'height': 144,
+ }, {
+ 'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_1_av.m3u8?null=0',
+ 'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais',
+ 'ext': 'mp4',
+ 'format_id': '303',
+ 'protocol': 'm3u8',
+ 'acodec': 'mp4a.40.2',
+ 'vcodec': 'avc1.66.30',
+ 'tbr': 303,
+ 'width': 320,
+ 'height': 180,
+ }, {
+ 'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_2_av.m3u8?null=0',
+ 'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais',
+ 'ext': 'mp4',
+ 'format_id': '575',
+ 'protocol': 'm3u8',
+ 'acodec': 'mp4a.40.2',
+ 'vcodec': 'avc1.66.30',
+ 'tbr': 575,
+ 'width': 512,
+ 'height': 288,
+ }, {
+ 'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_3_av.m3u8?null=0',
+ 'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais',
+ 'ext': 'mp4',
+ 'format_id': '831',
+ 'protocol': 'm3u8',
+ 'acodec': 'mp4a.40.2',
+ 'vcodec': 'avc1.77.30',
+ 'tbr': 831,
+ 'width': 704,
+ 'height': 396,
+ }, {
+ 'url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/index_4_av.m3u8?null=0',
+ 'manifest_url': 'http://replayftv-vh.akamaihd.net/i/streaming-adaptatif_france-dom-tom/2017/S16/J2/156589847-58f59130c1f52-,standard1,standard2,standard3,standard4,standard5,.mp4.csmil/master.m3u8?caption=2017%2F16%2F156589847-1492488987.m3u8%3Afra%3AFrancais&audiotrack=0%3Afra%3AFrancais',
+ 'ext': 'mp4',
+ 'protocol': 'm3u8',
+ 'format_id': '1467',
+ 'acodec': 'mp4a.40.2',
+ 'vcodec': 'avc1.77.30',
+ 'tbr': 1467,
+ 'width': 1024,
+ 'height': 576,
+ }]
+ ),
+ (
+ # https://github.com/rg3/youtube-dl/issues/11995
+ # http://teamcoco.com/video/clueless-gamer-super-bowl-for-honor
+ 'teamcoco_11995',
+ 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8',
+ [{
+ 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-audio-160k_v4.m3u8',
+ 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8',
+ 'ext': 'mp4',
+ 'format_id': 'audio-0-Default',
+ 'protocol': 'm3u8',
+ 'vcodec': 'none',
+ }, {
+ 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-audio-64k_v4.m3u8',
+ 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8',
+ 'ext': 'mp4',
+ 'format_id': 'audio-1-Default',
+ 'protocol': 'm3u8',
+ 'vcodec': 'none',
+ }, {
+ 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-audio-64k_v4.m3u8',
+ 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8',
+ 'ext': 'mp4',
+ 'format_id': '71',
+ 'protocol': 'm3u8',
+ 'acodec': 'mp4a.40.5',
+ 'vcodec': 'none',
+ 'tbr': 71,
+ }, {
+ 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-400k_v4.m3u8',
+ 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8',
+ 'ext': 'mp4',
+ 'format_id': '413',
+ 'protocol': 'm3u8',
+ 'acodec': 'none',
+ 'vcodec': 'avc1.42001e',
+ 'tbr': 413,
+ 'width': 400,
+ 'height': 224,
+ }, {
+ 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-400k_v4.m3u8',
+ 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8',
+ 'ext': 'mp4',
+ 'format_id': '522',
+ 'protocol': 'm3u8',
+ 'acodec': 'none',
+ 'vcodec': 'avc1.42001e',
+ 'tbr': 522,
+ 'width': 400,
+ 'height': 224,
+ }, {
+ 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-1m_v4.m3u8',
+ 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8',
+ 'ext': 'mp4',
+ 'format_id': '1205',
+ 'protocol': 'm3u8',
+ 'acodec': 'none',
+ 'vcodec': 'avc1.4d001e',
+ 'tbr': 1205,
+ 'width': 640,
+ 'height': 360,
+ }, {
+ 'url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/hls/CONAN_020217_Highlight_show-2m_v4.m3u8',
+ 'manifest_url': 'http://ak.storage-w.teamcococdn.com/cdn/2017-02/98599/ed8f/main.m3u8',
+ 'ext': 'mp4',
+ 'format_id': '2374',
+ 'protocol': 'm3u8',
+ 'acodec': 'none',
+ 'vcodec': 'avc1.4d001f',
+ 'tbr': 2374,
+ 'width': 1024,
+ 'height': 576,
+ }]
+ ),
+ (
+ # https://github.com/rg3/youtube-dl/issues/12211
+ # http://video.toggle.sg/en/series/whoopie-s-world/ep3/478601
+ 'toggle_mobile_12211',
+ 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8',
+ [{
+ 'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_sa2ntrdg/name/a.mp4/index.m3u8',
+ 'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8',
+ 'ext': 'mp4',
+ 'format_id': 'audio-English',
+ 'protocol': 'm3u8',
+ 'language': 'eng',
+ 'vcodec': 'none',
+ }, {
+ 'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_r7y0nitg/name/a.mp4/index.m3u8',
+ 'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8',
+ 'ext': 'mp4',
+ 'format_id': 'audio-Undefined',
+ 'protocol': 'm3u8',
+ 'language': 'und',
+ 'vcodec': 'none',
+ }, {
+ 'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_qlk9hlzr/name/a.mp4/index.m3u8',
+ 'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8',
+ 'ext': 'mp4',
+ 'format_id': '155',
+ 'protocol': 'm3u8',
+ 'tbr': 155.648,
+ 'width': 320,
+ 'height': 180,
+ }, {
+ 'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/2/pv/1/flavorId/0_oefackmi/name/a.mp4/index.m3u8',
+ 'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8',
+ 'ext': 'mp4',
+ 'format_id': '502',
+ 'protocol': 'm3u8',
+ 'tbr': 502.784,
+ 'width': 480,
+ 'height': 270,
+ }, {
+ 'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/12/pv/1/flavorId/0_vyg9pj7k/name/a.mp4/index.m3u8',
+ 'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8',
+ 'ext': 'mp4',
+ 'format_id': '827',
+ 'protocol': 'm3u8',
+ 'tbr': 827.392,
+ 'width': 640,
+ 'height': 360,
+ }, {
+ 'url': 'http://k.toggle.sg/fhls/p/2082311/sp/208231100/serveFlavor/entryId/0_89q6e8ku/v/12/pv/1/flavorId/0_50n4psvx/name/a.mp4/index.m3u8',
+ 'manifest_url': 'http://cdnapi.kaltura.com/p/2082311/sp/208231100/playManifest/protocol/http/entryId/0_89q6e8ku/format/applehttp/tags/mobile_sd/f/a.m3u8',
+ 'ext': 'mp4',
+ 'format_id': '1396',
+ 'protocol': 'm3u8',
+ 'tbr': 1396.736,
+ 'width': 854,
+ 'height': 480,
+ }]
+ ),
+ (
+ # http://www.twitch.tv/riotgames/v/6528877
+ 'twitch_vod',
+ 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee',
+ [{
+ 'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/audio_only/index-muted-HM49I092CC.m3u8',
+ 'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee',
+ 'ext': 'mp4',
+ 'format_id': 'Audio Only',
+ 'protocol': 'm3u8',
+ 'acodec': 'mp4a.40.2',
+ 'vcodec': 'none',
+ 'tbr': 182.725,
+ }, {
+ 'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/mobile/index-muted-HM49I092CC.m3u8',
+ 'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee',
+ 'ext': 'mp4',
+ 'format_id': 'Mobile',
+ 'protocol': 'm3u8',
+ 'acodec': 'mp4a.40.2',
+ 'vcodec': 'avc1.42C00D',
+ 'tbr': 280.474,
+ 'width': 400,
+ 'height': 226,
+ }, {
+ 'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/low/index-muted-HM49I092CC.m3u8',
+ 'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee',
+ 'ext': 'mp4',
+ 'format_id': 'Low',
+ 'protocol': 'm3u8',
+ 'acodec': 'mp4a.40.2',
+ 'vcodec': 'avc1.42C01E',
+ 'tbr': 628.347,
+ 'width': 640,
+ 'height': 360,
+ }, {
+ 'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/medium/index-muted-HM49I092CC.m3u8',
+ 'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee',
+ 'ext': 'mp4',
+ 'format_id': 'Medium',
+ 'protocol': 'm3u8',
+ 'acodec': 'mp4a.40.2',
+ 'vcodec': 'avc1.42C01E',
+ 'tbr': 893.387,
+ 'width': 852,
+ 'height': 480,
+ }, {
+ 'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/high/index-muted-HM49I092CC.m3u8',
+ 'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee',
+ 'ext': 'mp4',
+ 'format_id': 'High',
+ 'protocol': 'm3u8',
+ 'acodec': 'mp4a.40.2',
+ 'vcodec': 'avc1.42C01F',
+ 'tbr': 1603.789,
+ 'width': 1280,
+ 'height': 720,
+ }, {
+ 'url': 'https://vod.edgecast.hls.ttvnw.net/e5da31ab49_riotgames_15001215120_261543898/chunked/index-muted-HM49I092CC.m3u8',
+ 'manifest_url': 'https://usher.ttvnw.net/vod/6528877?allow_source=true&allow_audio_only=true&allow_spectre=true&player=twitchweb&nauth=%7B%22user_id%22%3Anull%2C%22vod_id%22%3A6528877%2C%22expires%22%3A1492887874%2C%22chansub%22%3A%7B%22restricted_bitrates%22%3A%5B%5D%7D%2C%22privileged%22%3Afalse%2C%22https_required%22%3Afalse%7D&nauthsig=3e29296a6824a0f48f9e731383f77a614fc79bee',
+ 'ext': 'mp4',
+ 'format_id': 'Source',
+ 'protocol': 'm3u8',
+ 'acodec': 'mp4a.40.2',
+ 'vcodec': 'avc1.100.31',
+ 'tbr': 3214.134,
+ 'width': 1280,
+ 'height': 720,
+ }]
+ ),
+ (
+ # http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
+ # EXT-X-STREAM-INF tag with NAME attribute that is not defined
+ # in HLS specification
+ 'vidio',
+ 'https://www.vidio.com/videos/165683/playlist.m3u8',
+ [{
+ 'url': 'https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b300.mp4.m3u8',
+ 'manifest_url': 'https://www.vidio.com/videos/165683/playlist.m3u8',
+ 'ext': 'mp4',
+ 'format_id': '270p 3G',
+ 'protocol': 'm3u8',
+ 'tbr': 300,
+ 'width': 480,
+ 'height': 270,
+ }, {
+ 'url': 'https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b600.mp4.m3u8',
+ 'manifest_url': 'https://www.vidio.com/videos/165683/playlist.m3u8',
+ 'ext': 'mp4',
+ 'format_id': '360p SD',
+ 'protocol': 'm3u8',
+ 'tbr': 600,
+ 'width': 640,
+ 'height': 360,
+ }, {
+ 'url': 'https://cdn1-a.production.vidio.static6.com/uploads/165683/dj_ambred-4383-b1200.mp4.m3u8',
+ 'manifest_url': 'https://www.vidio.com/videos/165683/playlist.m3u8',
+ 'ext': 'mp4',
+ 'format_id': '720p HD',
+ 'protocol': 'm3u8',
+ 'tbr': 1200,
+ 'width': 1280,
+ 'height': 720,
+ }]
+ )
+ ]
+
+ for m3u8_file, m3u8_url, expected_formats in _TEST_CASES:
+ with io.open('./test/testdata/m3u8/%s.m3u8' % m3u8_file,
+ mode='r', encoding='utf-8') as f:
+ formats = self.ie._parse_m3u8_formats(
+ f.read(), m3u8_url, ext='mp4')
+ self.ie._sort_formats(formats)
+ expect_value(self, formats, expected_formats, None)
+
if __name__ == '__main__':
unittest.main()
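The new test above feeds saved master playlists through _parse_m3u8_formats and compares the resulting format dicts. At its core, parsing a master playlist means reading each EXT-X-STREAM-INF attribute list and pairing it with the URI on the following line; a deliberately simplified sketch follows (the real method in extractor/common.py additionally handles rendition groups, CODECS splitting and scaled tbr values):

    import re

    def parse_master_m3u8(manifest, manifest_url):
        # Toy parser: one format dict per EXT-X-STREAM-INF entry.
        formats = []
        attrs = None
        for line in manifest.splitlines():
            line = line.strip()
            if line.startswith('#EXT-X-STREAM-INF:'):
                # Attribute list: KEY=VALUE pairs, values possibly quoted.
                attrs = dict(re.findall(
                    r'([A-Z0-9-]+)=("[^"]*"|[^",]+)', line.partition(':')[2]))
            elif line and not line.startswith('#') and attrs is not None:
                tbr = float(attrs.get('BANDWIDTH', 0)) / 1000 or None
                width, _, height = attrs.get('RESOLUTION', '').partition('x')
                formats.append({
                    'url': line,
                    'manifest_url': manifest_url,
                    'format_id': '%d' % tbr if tbr else None,
                    'tbr': tbr,
                    'width': int(width) if width.isdigit() else None,
                    'height': int(height) if height.isdigit() else None,
                })
                attrs = None
        return formats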
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py
index 8491a88..75945e3 100644
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -755,6 +755,7 @@ class TestYoutubeDL(unittest.TestCase):
'_type': 'url_transparent',
'url': 'foo2:',
'ie_key': 'Foo2',
+ 'title': 'foo1 title'
}
class Foo2IE(InfoExtractor):
@@ -771,7 +772,7 @@ class TestYoutubeDL(unittest.TestCase):
_VALID_URL = r'foo3:'
def _real_extract(self, url):
- return _make_result([{'url': TEST_URL}])
+ return _make_result([{'url': TEST_URL}], title='foo3 title')
ydl.add_info_extractor(Foo1IE(ydl))
ydl.add_info_extractor(Foo2IE(ydl))
@@ -779,6 +780,7 @@ class TestYoutubeDL(unittest.TestCase):
ydl.extract_info('foo1:')
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['url'], TEST_URL)
+ self.assertEqual(downloaded['title'], 'foo1 title')
if __name__ == '__main__':
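The added assertions pin down the precedence rule behind "Propagate overridden meta fields to extraction results of type url" (2017.04.16): fields set explicitly on a url_transparent result override whatever the final extractor returns. Schematically (a hypothetical helper, not the actual YoutubeDL logic):

    def apply_url_transparent(transparent, final):
        # Start from the final extractor's result, then let fields the
        # url_transparent result set explicitly win (hypothetical sketch).
        merged = dict(final)
        for key, value in transparent.items():
            if key not in ('_type', 'url', 'ie_key') and value is not None:
                merged[key] = value
        return merged

    final = {'id': '1', 'title': 'foo3 title', 'url': 'https://example.com/v.mp4'}
    transparent = {'_type': 'url_transparent', 'url': 'foo2:',
                   'ie_key': 'Foo2', 'title': 'foo1 title'}
    assert apply_url_transparent(transparent, final)['title'] == 'foo1 title'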
diff --git a/test/test_download.py b/test/test_download.py
index 01a8bcb..209f5f6 100644
--- a/test/test_download.py
+++ b/test/test_download.py
@@ -151,7 +151,7 @@ def generator(test_case, tname):
try_num = 1
while True:
try:
- # We're not using .download here sine that is just a shim
+ # We're not using .download here since that is just a shim
# for outside error handling, and returns the exit code
# instead of the result dict.
res_dict = ydl.extract_info(
@@ -199,7 +199,16 @@ def generator(test_case, tname):
self.assertEqual(
test_case['playlist_duration_sum'], got_duration)
- for tc in test_cases:
+ # Generalize both playlists and single videos to unified format for
+ # simplicity
+ if 'entries' not in res_dict:
+ res_dict['entries'] = [res_dict]
+
+ for tc_num, tc in enumerate(test_cases):
+ tc_res_dict = res_dict['entries'][tc_num]
+ # First, check test cases' data against extracted data alone
+ expect_info_dict(self, tc_res_dict, tc.get('info_dict', {}))
+ # Now, check downloaded file consistency
tc_filename = get_tc_filename(tc)
if not test_case.get('params', {}).get('skip_download', False):
self.assertTrue(os.path.exists(tc_filename), msg='Missing file ' + tc_filename)
@@ -216,14 +225,15 @@ def generator(test_case, tname):
format_bytes(got_fsize)))
if 'md5' in tc:
md5_for_file = _file_md5(tc_filename)
- self.assertEqual(md5_for_file, tc['md5'])
+ self.assertEqual(tc['md5'], md5_for_file)
+ # Finally, check test cases' data again but this time against
+ # extracted data from info JSON file written during processing
info_json_fn = os.path.splitext(tc_filename)[0] + '.info.json'
self.assertTrue(
os.path.exists(info_json_fn),
'Missing info file %s' % info_json_fn)
with io.open(info_json_fn, encoding='utf-8') as infof:
info_dict = json.load(infof)
-
expect_info_dict(self, info_dict, tc.get('info_dict', {}))
finally:
try_rm_tcs_files()
diff --git a/test/test_subtitles.py b/test/test_subtitles.py
index 27e763e..1b8de82 100644
--- a/test/test_subtitles.py
+++ b/test/test_subtitles.py
@@ -21,7 +21,7 @@ from youtube_dl.extractor import (
NPOIE,
ComedyCentralIE,
NRKTVIE,
- RaiTVIE,
+ RaiPlayIE,
VikiIE,
ThePlatformIE,
ThePlatformFeedIE,
@@ -258,9 +258,9 @@ class TestNRKSubtitles(BaseTestSubtitles):
self.assertEqual(md5(subtitles['no']), '544fa917d3197fcbee64634559221cc2')
-class TestRaiSubtitles(BaseTestSubtitles):
- url = 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html'
- IE = RaiTVIE
+class TestRaiPlaySubtitles(BaseTestSubtitles):
+ url = 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html'
+ IE = RaiPlayIE
def test_allsubtitles(self):
self.DL.params['writesubtitles'] = True
diff --git a/test/test_utils.py b/test/test_utils.py
index aa4569b..f31559e 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -44,6 +44,7 @@ from youtube_dl.utils import (
limit_length,
mimetype2ext,
month_by_name,
+ multipart_encode,
ohdave_rsa_encrypt,
OnDemandPagedList,
orderedSet,
@@ -338,6 +339,7 @@ class TestUtil(unittest.TestCase):
self.assertEqual(unified_timestamp('UNKNOWN DATE FORMAT'), None)
self.assertEqual(unified_timestamp('May 16, 2016 11:15 PM'), 1463440500)
self.assertEqual(unified_timestamp('Feb 7, 2016 at 6:35 pm'), 1454870100)
+ self.assertEqual(unified_timestamp('2017-03-30T17:52:41Q'), 1490896361)
def test_determine_ext(self):
self.assertEqual(determine_ext('http://example.com/foo/bar.mp4/?download'), 'mp4')
@@ -619,6 +621,16 @@ class TestUtil(unittest.TestCase):
'http://example.com/path', {'test': '第二行тест'})),
query_dict('http://example.com/path?test=%E7%AC%AC%E4%BA%8C%E8%A1%8C%D1%82%D0%B5%D1%81%D1%82'))
+ def test_multipart_encode(self):
+ self.assertEqual(
+ multipart_encode({b'field': b'value'}, boundary='AAAAAA')[0],
+ b'--AAAAAA\r\nContent-Disposition: form-data; name="field"\r\n\r\nvalue\r\n--AAAAAA--\r\n')
+ self.assertEqual(
+ multipart_encode({'欄位'.encode('utf-8'): '值'.encode('utf-8')}, boundary='AAAAAA')[0],
+ b'--AAAAAA\r\nContent-Disposition: form-data; name="\xe6\xac\x84\xe4\xbd\x8d"\r\n\r\n\xe5\x80\xbc\r\n--AAAAAA--\r\n')
+ self.assertRaises(
+ ValueError, multipart_encode, {b'field': b'value'}, boundary='value')
+
def test_dict_get(self):
FALSE_VALUES = {
'none': None,
@@ -899,6 +911,7 @@ class TestUtil(unittest.TestCase):
def test_clean_html(self):
self.assertEqual(clean_html('a:\nb'), 'a: b')
self.assertEqual(clean_html('a:\n "b"'), 'a: "b"')
+ self.assertEqual(clean_html('a<br>\xa0b'), 'a\nb')
def test_intlist_to_bytes(self):
self.assertEqual(
@@ -1069,6 +1082,47 @@ The first line
'''
self.assertEqual(dfxp2srt(dfxp_data_no_default_namespace), srt_data)
+ dfxp_data_with_style = '''<?xml version="1.0" encoding="utf-8"?>
+<tt xmlns="http://www.w3.org/2006/10/ttaf1" xmlns:ttp="http://www.w3.org/2006/10/ttaf1#parameter" ttp:timeBase="media" xmlns:tts="http://www.w3.org/2006/10/ttaf1#style" xml:lang="en" xmlns:ttm="http://www.w3.org/2006/10/ttaf1#metadata">
+ <head>
+ <styling>
+ <style id="s2" style="s0" tts:color="cyan" tts:fontWeight="bold" />
+ <style id="s1" style="s0" tts:color="yellow" tts:fontStyle="italic" />
+ <style id="s3" style="s0" tts:color="lime" tts:textDecoration="underline" />
+ <style id="s0" tts:backgroundColor="black" tts:fontStyle="normal" tts:fontSize="16" tts:fontFamily="sansSerif" tts:color="white" />
+ </styling>
+ </head>
+ <body tts:textAlign="center" style="s0">
+ <div>
+ <p begin="00:00:02.08" id="p0" end="00:00:05.84">default style<span tts:color="red">custom style</span></p>
+ <p style="s2" begin="00:00:02.08" id="p0" end="00:00:05.84"><span tts:color="lime">part 1<br /></span><span tts:color="cyan">part 2</span></p>
+ <p style="s3" begin="00:00:05.84" id="p1" end="00:00:09.56">line 3<br />part 3</p>
+ <p style="s1" tts:textDecoration="underline" begin="00:00:09.56" id="p2" end="00:00:12.36"><span style="s2" tts:color="lime">inner<br /> </span>style</p>
+ </div>
+ </body>
+</tt>'''
+ srt_data = '''1
+00:00:02,080 --> 00:00:05,839
+<font color="white" face="sansSerif" size="16">default style<font color="red">custom style</font></font>
+
+2
+00:00:02,080 --> 00:00:05,839
+<b><font color="cyan" face="sansSerif" size="16"><font color="lime">part 1
+</font>part 2</font></b>
+
+3
+00:00:05,839 --> 00:00:09,560
+<u><font color="lime">line 3
+part 3</font></u>
+
+4
+00:00:09,560 --> 00:00:12,359
+<i><u><font color="yellow"><font color="lime">inner
+ </font>style</font></u></i>
+
+'''
+ self.assertEqual(dfxp2srt(dfxp_data_with_style), srt_data)
+
def test_cli_option(self):
self.assertEqual(cli_option({'proxy': '127.0.0.1:3128'}, '--proxy', 'proxy'), ['--proxy', '127.0.0.1:3128'])
self.assertEqual(cli_option({'proxy': None}, '--proxy', 'proxy'), [])
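The multipart_encode tests above fix the wire format of the new utils helper. Judging from the tests' [0] indexing, the helper returns the encoded body first; assuming the second tuple element is the matching Content-Type header value, usage looks roughly like:

    from youtube_dl.utils import multipart_encode

    # Encode one form field with a fixed boundary (as in the tests above).
    # That the second element carries the Content-Type value is an
    # assumption; the tests only pin down the body.
    body, content_type = multipart_encode({b'field': b'value'}, boundary='AAAAAA')
    assert body.startswith(b'--AAAAAA\r\n') and b'name="field"' in body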
diff --git a/test/test_youtube_chapters.py b/test/test_youtube_chapters.py
new file mode 100644
index 0000000..cb12f83
--- /dev/null
+++ b/test/test_youtube_chapters.py
@@ -0,0 +1,268 @@
+#!/usr/bin/env python
+# coding: utf-8
+from __future__ import unicode_literals
+
+# Allow direct execution
+import os
+import sys
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from test.helper import expect_value
+from youtube_dl.extractor import YoutubeIE
+
+
+class TestYoutubeChapters(unittest.TestCase):
+
+ _TEST_CASES = [
+ (
+ # https://www.youtube.com/watch?v=A22oy8dFjqc
+ # pattern: 00:00 - <title>
+            '''This is the absolute ULTIMATE experience of Queen's set at LIVE AID, this is the best video mixed to the absolutely superior stereo radio broadcast. This vastly superior audio mix takes a huge dump on all of the official mixes. Best viewed in 1080p. ENJOY! ***MAKE SURE TO READ THE DESCRIPTION***<br /><a href="#" onclick="yt.www.watch.player.seekTo(00*60+36);return false;">00:36</a> - Bohemian Rhapsody<br /><a href="#" onclick="yt.www.watch.player.seekTo(02*60+42);return false;">02:42</a> - Radio Ga Ga<br /><a href="#" onclick="yt.www.watch.player.seekTo(06*60+53);return false;">06:53</a> - Ay Oh!<br /><a href="#" onclick="yt.www.watch.player.seekTo(07*60+34);return false;">07:34</a> - Hammer To Fall<br /><a href="#" onclick="yt.www.watch.player.seekTo(12*60+08);return false;">12:08</a> - Crazy Little Thing Called Love<br /><a href="#" onclick="yt.www.watch.player.seekTo(16*60+03);return false;">16:03</a> - We Will Rock You<br /><a href="#" onclick="yt.www.watch.player.seekTo(17*60+18);return false;">17:18</a> - We Are The Champions<br /><a href="#" onclick="yt.www.watch.player.seekTo(21*60+12);return false;">21:12</a> - Is This The World We Created...?<br /><br />Short song analysis:<br /><br />- "Bohemian Rhapsody": Although it's a short medley version, it's one of the best performances of the ballad section, with Freddie nailing the Bb4s with the correct studio phrasing (for the first time ever!).<br /><br />- "Radio Ga Ga": Although it's missing one chorus, this is one of - if not the best - the best versions ever, Freddie nails all the Bb4s and sounds very clean! Spike Edney's Roland Jupiter 8 also really shines through on this mix, compared to the DVD releases!<br /><br />- "Audience Improv": A great improv, Freddie sounds strong and confident. You gotta love when he sustains that A4 for 4 seconds!<br /><br />- "Hammer To Fall": Despite missing a verse and a chorus, it's a strong version (possibly the best ever). Freddie sings the song amazingly, and even ad-libs a C#5 and a C5! Also notice how heavy Brian's guitar sounds compared to the thin DVD mixes - it roars!<br /><br />- "Crazy Little Thing Called Love": A great version, the crowd loves the song, the jam is great as well! Only downside to this is the slight feedback issues.<br /><br />- "We Will Rock You": Although cut down to the 1st verse and chorus, Freddie sounds strong. He nails the A4, and the solo from Dr. May is brilliant!<br /><br />- "We Are the Champions": Perhaps the high-light of the performance - Freddie is very daring on this version, he sustains the pre-chorus Bb4s, nails the 1st C5, belts great A4s, but most importantly: He nails the chorus Bb4s, in all 3 choruses! This is the only time he has ever done so! It has to be said though, the last one sounds a bit rough, but that's a side effect of belting high notes for the past 18 minutes, with nodules AND laryngitis!<br /><br />- "Is This The World We Created... ?": Freddie and Brian perform a beautiful version of this, and it is one of the best versions ever. It's both sad and hilarious that a couple of BBC engineers are talking over the song, one of them being completely oblivious of the fact that he is interrupting the performance, on live television... Which was being televised to almost 2 billion homes.<br /><br /><br />All rights go to their respective owners!<br />-----Copyright Disclaimer Under Section 107 of the Copyright Act 1976, allowance is made for fair use for purposes such as criticism, comment, news reporting, teaching, scholarship, and research. Fair use is a use permitted by copyright statute that might otherwise be infringing. Non-profit, educational or personal use tips the balance in favor of fair use''',
+ 1477,
+ [{
+ 'start_time': 36,
+ 'end_time': 162,
+ 'title': 'Bohemian Rhapsody',
+ }, {
+ 'start_time': 162,
+ 'end_time': 413,
+ 'title': 'Radio Ga Ga',
+ }, {
+ 'start_time': 413,
+ 'end_time': 454,
+ 'title': 'Ay Oh!',
+ }, {
+ 'start_time': 454,
+ 'end_time': 728,
+ 'title': 'Hammer To Fall',
+ }, {
+ 'start_time': 728,
+ 'end_time': 963,
+ 'title': 'Crazy Little Thing Called Love',
+ }, {
+ 'start_time': 963,
+ 'end_time': 1038,
+ 'title': 'We Will Rock You',
+ }, {
+ 'start_time': 1038,
+ 'end_time': 1272,
+ 'title': 'We Are The Champions',
+ }, {
+ 'start_time': 1272,
+ 'end_time': 1477,
+ 'title': 'Is This The World We Created...?',
+ }]
+ ),
+ (
+ # https://www.youtube.com/watch?v=ekYlRhALiRQ
+ # pattern: <num>. <title> 0:00
+ '1. Those Beaten Paths of Confusion <a href="#" onclick="yt.www.watch.player.seekTo(0*60+00);return false;">0:00</a><br />2. Beyond the Shadows of Emptiness & Nothingness <a href="#" onclick="yt.www.watch.player.seekTo(11*60+47);return false;">11:47</a><br />3. Poison Yourself...With Thought <a href="#" onclick="yt.www.watch.player.seekTo(26*60+30);return false;">26:30</a><br />4. The Agents of Transformation <a href="#" onclick="yt.www.watch.player.seekTo(35*60+57);return false;">35:57</a><br />5. Drowning in the Pain of Consciousness <a href="#" onclick="yt.www.watch.player.seekTo(44*60+32);return false;">44:32</a><br />6. Deny the Disease of Life <a href="#" onclick="yt.www.watch.player.seekTo(53*60+07);return false;">53:07</a><br /><br />More info/Buy: http://crepusculonegro.storenvy.com/products/257645-cn-03-arizmenda-within-the-vacuum-of-infinity<br /><br />No copyright is intended. The rights to this video are assumed by the owner and its affiliates.',
+ 4009,
+ [{
+ 'start_time': 0,
+ 'end_time': 707,
+ 'title': '1. Those Beaten Paths of Confusion',
+ }, {
+ 'start_time': 707,
+ 'end_time': 1590,
+ 'title': '2. Beyond the Shadows of Emptiness & Nothingness',
+ }, {
+ 'start_time': 1590,
+ 'end_time': 2157,
+ 'title': '3. Poison Yourself...With Thought',
+ }, {
+ 'start_time': 2157,
+ 'end_time': 2672,
+ 'title': '4. The Agents of Transformation',
+ }, {
+ 'start_time': 2672,
+ 'end_time': 3187,
+ 'title': '5. Drowning in the Pain of Consciousness',
+ }, {
+ 'start_time': 3187,
+ 'end_time': 4009,
+ 'title': '6. Deny the Disease of Life',
+ }]
+ ),
+ (
+ # https://www.youtube.com/watch?v=WjL4pSzog9w
+ # pattern: 00:00 <title>
+ '<a href="https://arizmenda.bandcamp.com/merch/despairs-depths-descended-cd" class="yt-uix-servicelink " data-target-new-window="True" data-servicelink="CDAQ6TgYACITCNf1raqT2dMCFdRjGAod_o0CBSj4HQ" data-url="https://arizmenda.bandcamp.com/merch/despairs-depths-descended-cd" rel="nofollow noopener" target="_blank">https://arizmenda.bandcamp.com/merch/...</a><br /><br /><a href="#" onclick="yt.www.watch.player.seekTo(00*60+00);return false;">00:00</a> Christening Unborn Deformities <br /><a href="#" onclick="yt.www.watch.player.seekTo(07*60+08);return false;">07:08</a> Taste of Purity<br /><a href="#" onclick="yt.www.watch.player.seekTo(16*60+16);return false;">16:16</a> Sculpting Sins of a Universal Tongue<br /><a href="#" onclick="yt.www.watch.player.seekTo(24*60+45);return false;">24:45</a> Birth<br /><a href="#" onclick="yt.www.watch.player.seekTo(31*60+24);return false;">31:24</a> Neves<br /><a href="#" onclick="yt.www.watch.player.seekTo(37*60+55);return false;">37:55</a> Libations in Limbo',
+ 2705,
+ [{
+ 'start_time': 0,
+ 'end_time': 428,
+ 'title': 'Christening Unborn Deformities',
+ }, {
+ 'start_time': 428,
+ 'end_time': 976,
+ 'title': 'Taste of Purity',
+ }, {
+ 'start_time': 976,
+ 'end_time': 1485,
+ 'title': 'Sculpting Sins of a Universal Tongue',
+ }, {
+ 'start_time': 1485,
+ 'end_time': 1884,
+ 'title': 'Birth',
+ }, {
+ 'start_time': 1884,
+ 'end_time': 2275,
+ 'title': 'Neves',
+ }, {
+ 'start_time': 2275,
+ 'end_time': 2705,
+ 'title': 'Libations in Limbo',
+ }]
+ ),
+ (
+ # https://www.youtube.com/watch?v=o3r1sn-t3is
+ # pattern: <title> 00:00 <note>
+ 'Download this show in MP3: <a href="http://sh.st/njZKK" class="yt-uix-servicelink " data-url="http://sh.st/njZKK" data-target-new-window="True" data-servicelink="CDAQ6TgYACITCK3j8_6o2dMCFVDCGAoduVAKKij4HQ" rel="nofollow noopener" target="_blank">http://sh.st/njZKK</a><br /><br />Setlist:<br />I-E-A-I-A-I-O <a href="#" onclick="yt.www.watch.player.seekTo(00*60+45);return false;">00:45</a><br />Suite-Pee <a href="#" onclick="yt.www.watch.player.seekTo(4*60+26);return false;">4:26</a> (Incomplete)<br />Attack <a href="#" onclick="yt.www.watch.player.seekTo(5*60+31);return false;">5:31</a> (First live performance since 2011)<br />Prison Song <a href="#" onclick="yt.www.watch.player.seekTo(8*60+42);return false;">8:42</a><br />Know <a href="#" onclick="yt.www.watch.player.seekTo(12*60+32);return false;">12:32</a> (First live performance since 2011)<br />Aerials <a href="#" onclick="yt.www.watch.player.seekTo(15*60+32);return false;">15:32</a><br />Soldier Side - Intro <a href="#" onclick="yt.www.watch.player.seekTo(19*60+13);return false;">19:13</a><br />B.Y.O.B. <a href="#" onclick="yt.www.watch.player.seekTo(20*60+09);return false;">20:09</a><br />Soil <a href="#" onclick="yt.www.watch.player.seekTo(24*60+32);return false;">24:32</a><br />Darts <a href="#" onclick="yt.www.watch.player.seekTo(27*60+48);return false;">27:48</a><br />Radio/Video <a href="#" onclick="yt.www.watch.player.seekTo(30*60+38);return false;">30:38</a><br />Hypnotize <a href="#" onclick="yt.www.watch.player.seekTo(35*60+05);return false;">35:05</a><br />Temper <a href="#" onclick="yt.www.watch.player.seekTo(38*60+08);return false;">38:08</a> (First live performance since 1999)<br />CUBErt <a href="#" onclick="yt.www.watch.player.seekTo(41*60+00);return false;">41:00</a><br />Needles <a href="#" onclick="yt.www.watch.player.seekTo(42*60+57);return false;">42:57</a><br />Deer Dance <a href="#" onclick="yt.www.watch.player.seekTo(46*60+27);return false;">46:27</a><br />Bounce <a href="#" onclick="yt.www.watch.player.seekTo(49*60+38);return false;">49:38</a><br />Suggestions <a href="#" onclick="yt.www.watch.player.seekTo(51*60+25);return false;">51:25</a><br />Psycho <a href="#" onclick="yt.www.watch.player.seekTo(53*60+52);return false;">53:52</a><br />Chop Suey! <a href="#" onclick="yt.www.watch.player.seekTo(58*60+13);return false;">58:13</a><br />Lonely Day <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+01*60+15);return false;">1:01:15</a><br />Question! <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+04*60+14);return false;">1:04:14</a><br />Lost in Hollywood <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+08*60+10);return false;">1:08:10</a><br />Vicinity of Obscenity <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+13*60+40);return false;">1:13:40</a>(First live performance since 2012)<br />Forest <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+16*60+17);return false;">1:16:17</a><br />Cigaro <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+20*60+02);return false;">1:20:02</a><br />Toxicity <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+23*60+57);return false;">1:23:57</a>(with Chino Moreno)<br />Sugar <a href="#" onclick="yt.www.watch.player.seekTo(1*3600+27*60+53);return false;">1:27:53</a>',
+ 5640,
+ [{
+ 'start_time': 45,
+ 'end_time': 266,
+ 'title': 'I-E-A-I-A-I-O',
+ }, {
+ 'start_time': 266,
+ 'end_time': 331,
+ 'title': 'Suite-Pee (Incomplete)',
+ }, {
+ 'start_time': 331,
+ 'end_time': 522,
+ 'title': 'Attack (First live performance since 2011)',
+ }, {
+ 'start_time': 522,
+ 'end_time': 752,
+ 'title': 'Prison Song',
+ }, {
+ 'start_time': 752,
+ 'end_time': 932,
+ 'title': 'Know (First live performance since 2011)',
+ }, {
+ 'start_time': 932,
+ 'end_time': 1153,
+ 'title': 'Aerials',
+ }, {
+ 'start_time': 1153,
+ 'end_time': 1209,
+ 'title': 'Soldier Side - Intro',
+ }, {
+ 'start_time': 1209,
+ 'end_time': 1472,
+ 'title': 'B.Y.O.B.',
+ }, {
+ 'start_time': 1472,
+ 'end_time': 1668,
+ 'title': 'Soil',
+ }, {
+ 'start_time': 1668,
+ 'end_time': 1838,
+ 'title': 'Darts',
+ }, {
+ 'start_time': 1838,
+ 'end_time': 2105,
+ 'title': 'Radio/Video',
+ }, {
+ 'start_time': 2105,
+ 'end_time': 2288,
+ 'title': 'Hypnotize',
+ }, {
+ 'start_time': 2288,
+ 'end_time': 2460,
+ 'title': 'Temper (First live performance since 1999)',
+ }, {
+ 'start_time': 2460,
+ 'end_time': 2577,
+ 'title': 'CUBErt',
+ }, {
+ 'start_time': 2577,
+ 'end_time': 2787,
+ 'title': 'Needles',
+ }, {
+ 'start_time': 2787,
+ 'end_time': 2978,
+ 'title': 'Deer Dance',
+ }, {
+ 'start_time': 2978,
+ 'end_time': 3085,
+ 'title': 'Bounce',
+ }, {
+ 'start_time': 3085,
+ 'end_time': 3232,
+ 'title': 'Suggestions',
+ }, {
+ 'start_time': 3232,
+ 'end_time': 3493,
+ 'title': 'Psycho',
+ }, {
+ 'start_time': 3493,
+ 'end_time': 3675,
+ 'title': 'Chop Suey!',
+ }, {
+ 'start_time': 3675,
+ 'end_time': 3854,
+ 'title': 'Lonely Day',
+ }, {
+ 'start_time': 3854,
+ 'end_time': 4090,
+ 'title': 'Question!',
+ }, {
+ 'start_time': 4090,
+ 'end_time': 4420,
+ 'title': 'Lost in Hollywood',
+ }, {
+ 'start_time': 4420,
+ 'end_time': 4577,
+ 'title': 'Vicinity of Obscenity (First live performance since 2012)',
+ }, {
+ 'start_time': 4577,
+ 'end_time': 4802,
+ 'title': 'Forest',
+ }, {
+ 'start_time': 4802,
+ 'end_time': 5037,
+ 'title': 'Cigaro',
+ }, {
+ 'start_time': 5037,
+ 'end_time': 5273,
+ 'title': 'Toxicity (with Chino Moreno)',
+ }, {
+ 'start_time': 5273,
+ 'end_time': 5640,
+ 'title': 'Sugar',
+ }]
+ ),
+ (
+ # https://www.youtube.com/watch?v=PkYLQbsqCE8
+ # pattern: <num> - <title> [<latinized title>] 0:00:00
+ '''Затемно (Zatemno) is an Obscure Black Metal Band from Russia.<br /><br />"Во прах (Vo prakh)'' Into The Ashes", Debut mini-album released may 6, 2016, by Death Knell Productions<br />Released on 6 panel digipak CD, limited to 100 copies only<br />And digital format on Bandcamp<br /><br />Tracklist<br /><br />1 - Во прах [Vo prakh] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+00*60+00);return false;">0:00:00</a><br />2 - Искупление [Iskupleniye] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+08*60+10);return false;">0:08:10</a><br />3 - Из серпов луны...[Iz serpov luny] <a href="#" onclick="yt.www.watch.player.seekTo(0*3600+14*60+30);return false;">0:14:30</a><br /><br />Links:<br /><a href="https://deathknellprod.bandcamp.com/album/--2" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://deathknellprod.bandcamp.com/album/--2" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://deathknellprod.bandcamp.com/a...</a><br /><a href="https://www.facebook.com/DeathKnellProd/" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://www.facebook.com/DeathKnellProd/" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://www.facebook.com/DeathKnellProd/</a><br /><br /><br />I don't have any right about this artifact, my only intention is to spread the music of the band, all rights are reserved to the Затемно (Zatemno) and his producers, Death Knell Productions.<br /><br />------------------------------------------------------------------<br /><br />Subscribe for more videos like this.<br />My link: <a href="https://web.facebook.com/AttackOfTheDragons" class="yt-uix-servicelink " data-target-new-window="True" data-url="https://web.facebook.com/AttackOfTheDragons" data-servicelink="CC8Q6TgYACITCNP234Kr2dMCFcNxGAodQqsIwSj4HQ" target="_blank" rel="nofollow noopener">https://web.facebook.com/AttackOfTheD...</a>''',
+ 1138,
+ [{
+ 'start_time': 0,
+ 'end_time': 490,
+ 'title': '1 - Во прах [Vo prakh]',
+ }, {
+ 'start_time': 490,
+ 'end_time': 870,
+ 'title': '2 - Искупление [Iskupleniye]',
+ }, {
+ 'start_time': 870,
+ 'end_time': 1138,
+ 'title': '3 - Из серпов луны...[Iz serpov luny]',
+ }]
+ ),
+ ]
+
+ def test_youtube_chapters(self):
+ for description, duration, expected_chapters in self._TEST_CASES:
+ ie = YoutubeIE()
+ expect_value(
+ self, ie._extract_chapters(description, duration),
+ expected_chapters, None)
+
+
+if __name__ == '__main__':
+ unittest.main()
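Every pattern exercised above boils down to the same procedure: find mm:ss / h:mm:ss timestamps in the description, pair each with a nearby title, and close each chapter at the next chapter's start (the last one ends at the video duration). The following is a minimal sketch of that idea, not the actual YoutubeIE._extract_chapters implementation (which also parses the seekTo() anchor markup and the different title/timestamp orderings):

    import re

    # Sketch only: naive timestamp/title pairing over <br />-separated lines.
    def extract_chapters_sketch(description, duration):
        chapters = []
        ts_re = re.compile(r'(?:(\d+):)?(\d{1,2}):(\d{2})')
        for line in re.split(r'<br\s*/?>|\n', description):
            m = ts_re.search(line)
            if not m:
                continue
            hours, minutes, seconds = m.groups()
            start = int(hours or 0) * 3600 + int(minutes) * 60 + int(seconds)
            # Crude title guess: the line minus the timestamp and any markup.
            title = re.sub(r'<[^>]+>', '', line.replace(m.group(0), '')).strip(' -')
            chapters.append({'start_time': start, 'title': title})
        # Each chapter ends where the next one begins.
        for i, chapter in enumerate(chapters):
            chapter['end_time'] = (
                chapters[i + 1]['start_time'] if i + 1 < len(chapters)
                else duration)
        return chapters

On the Queen description above this yields, e.g., start_time 36 and title 'Bohemian Rhapsody' for the first chapter; the real extractor is stricter about which lines count as chapters.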
diff --git a/youtube-dl b/youtube-dl
index d7fc66a..bf4b72a 100755
--- a/youtube-dl
+++ b/youtube-dl
Binary files differ
diff --git a/youtube-dl.1 b/youtube-dl.1
index d4c0ab5..49818dc 100644
--- a/youtube-dl.1
+++ b/youtube-dl.1
@@ -297,13 +297,13 @@ Number of retries (default is 10), or "infinite".
.RE
.TP
.B \-\-fragment\-retries \f[I]RETRIES\f[]
-Number of retries for a fragment (default is 10), or "infinite" (DASH
-and hlsnative only)
+Number of retries for a fragment (default is 10), or "infinite" (DASH,
+hlsnative and ISM)
.RS
.RE
.TP
.B \-\-skip\-unavailable\-fragments
-Skip unavailable fragments (DASH and hlsnative only)
+Skip unavailable fragments (DASH, hlsnative and ISM)
.RS
.RE
.TP
@@ -312,6 +312,12 @@ Abort downloading when some fragment is not available
.RS
.RE
.TP
+.B \-\-keep\-fragments
+Keep downloaded fragments on disk after downloading is finished;
+fragments are erased by default
+.RS
+.RE
+.TP
.B \-\-buffer\-size \f[I]SIZE\f[]
Size of download buffer (e.g.
1024 or 16K) (default is 1024)
@@ -841,12 +847,14 @@ Write metadata to the video file
.TP
.B \-\-metadata\-from\-title \f[I]FORMAT\f[]
Parse additional metadata like song title / artist from the video title.
-The format syntax is the same as \-\-output, the parsed parameters
-replace existing values.
-Additional templates: %(album)s, %(artist)s.
-Example: \-\-metadata\-from\-title "%(artist)s \- %(title)s" matches a
-title like "Coldplay \- Paradise"
-.RS
+The format syntax is the same as \-\-output.
+Regular expression with named capture groups may also be used.
+The parsed parameters replace existing values.
+Example: \-\-metadata\-from\-title "%(artist)s \- %(title)s" matches a
+title like "Coldplay \- Paradise".
+Example (regex): \-\-metadata\-from\-title "(?P<artist>.+?) \- (?P<title>
+\&.+)"
+.RS
.RE
.TP
.B \-\-xattrs
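Rendered roff makes the new regex example hard to read, so here is the same feature in plain Python (a sketch; in youtube-dl the matching is done by the MetadataFromTitlePP postprocessor): named capture groups in the pattern become metadata fields.

    import re

    # --metadata-from-title "(?P<artist>.+?) - (?P<title>.+)"
    m = re.match(r'(?P<artist>.+?) - (?P<title>.+)', 'Coldplay - Paradise')
    print(m.groupdict())  # {'artist': 'Coldplay', 'title': 'Paradise'}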
diff --git a/youtube-dl.bash-completion b/youtube-dl.bash-completion
index cf81e2c..2cf75de 100644
--- a/youtube-dl.bash-completion
+++ b/youtube-dl.bash-completion
@@ -4,7 +4,7 @@ __youtube_dl()
COMPREPLY=()
cur="${COMP_WORDS[COMP_CWORD]}"
prev="${COMP_WORDS[COMP_CWORD-1]}"
- opts="--help --version --update --ignore-errors --abort-on-error --dump-user-agent --list-extractors --extractor-descriptions --force-generic-extractor --default-search --ignore-config --config-location --flat-playlist --mark-watched --no-mark-watched --no-color --proxy --socket-timeout --source-address --force-ipv4 --force-ipv6 --geo-verification-proxy --cn-verification-proxy --geo-bypass --no-geo-bypass --geo-bypass-country --playlist-start --playlist-end --playlist-items --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --min-views --max-views --match-filter --no-playlist --yes-playlist --age-limit --download-archive --include-ads --limit-rate --retries --fragment-retries --skip-unavailable-fragments --abort-on-unavailable-fragment --buffer-size --no-resize-buffer --test --playlist-reverse --playlist-random --xattr-set-filesize --hls-prefer-native --hls-prefer-ffmpeg --hls-use-mpegts --external-downloader --external-downloader-args --batch-file --id --output --autonumber-size --autonumber-start --restrict-filenames --auto-number --title --literal --no-overwrites --continue --no-continue --no-part --no-mtime --write-description --write-info-json --write-annotations --load-info-json --cookies --cache-dir --no-cache-dir --rm-cache-dir --write-thumbnail --write-all-thumbnails --list-thumbnails --quiet --no-warnings --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-duration --get-filename --get-format --dump-json --dump-single-json --print-json --newline --no-progress --console-title --verbose --dump-pages --write-pages --youtube-print-sig-code --print-traffic --call-home --no-call-home --encoding --no-check-certificate --prefer-insecure --user-agent --referer --add-header --bidi-workaround --sleep-interval --max-sleep-interval --format --all-formats --prefer-free-formats --list-formats --youtube-include-dash-manifest --youtube-skip-dash-manifest --merge-output-format --write-sub --write-auto-sub --all-subs --list-subs --sub-format --sub-lang --username --password --twofactor --netrc --video-password --ap-mso --ap-username --ap-password --ap-list-mso --extract-audio --audio-format --audio-quality --recode-video --postprocessor-args --keep-video --no-post-overwrites --embed-subs --embed-thumbnail --add-metadata --metadata-from-title --xattrs --fixup --prefer-avconv --prefer-ffmpeg --ffmpeg-location --exec --convert-subs"
+ opts="--help --version --update --ignore-errors --abort-on-error --dump-user-agent --list-extractors --extractor-descriptions --force-generic-extractor --default-search --ignore-config --config-location --flat-playlist --mark-watched --no-mark-watched --no-color --proxy --socket-timeout --source-address --force-ipv4 --force-ipv6 --geo-verification-proxy --cn-verification-proxy --geo-bypass --no-geo-bypass --geo-bypass-country --playlist-start --playlist-end --playlist-items --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --min-views --max-views --match-filter --no-playlist --yes-playlist --age-limit --download-archive --include-ads --limit-rate --retries --fragment-retries --skip-unavailable-fragments --abort-on-unavailable-fragment --keep-fragments --buffer-size --no-resize-buffer --test --playlist-reverse --playlist-random --xattr-set-filesize --hls-prefer-native --hls-prefer-ffmpeg --hls-use-mpegts --external-downloader --external-downloader-args --batch-file --id --output --autonumber-size --autonumber-start --restrict-filenames --auto-number --title --literal --no-overwrites --continue --no-continue --no-part --no-mtime --write-description --write-info-json --write-annotations --load-info-json --cookies --cache-dir --no-cache-dir --rm-cache-dir --write-thumbnail --write-all-thumbnails --list-thumbnails --quiet --no-warnings --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-duration --get-filename --get-format --dump-json --dump-single-json --print-json --newline --no-progress --console-title --verbose --dump-pages --write-pages --youtube-print-sig-code --print-traffic --call-home --no-call-home --encoding --no-check-certificate --prefer-insecure --user-agent --referer --add-header --bidi-workaround --sleep-interval --max-sleep-interval --format --all-formats --prefer-free-formats --list-formats --youtube-include-dash-manifest --youtube-skip-dash-manifest --merge-output-format --write-sub --write-auto-sub --all-subs --list-subs --sub-format --sub-lang --username --password --twofactor --netrc --video-password --ap-mso --ap-username --ap-password --ap-list-mso --extract-audio --audio-format --audio-quality --recode-video --postprocessor-args --keep-video --no-post-overwrites --embed-subs --embed-thumbnail --add-metadata --metadata-from-title --xattrs --fixup --prefer-avconv --prefer-ffmpeg --ffmpeg-location --exec --convert-subs"
keywords=":ytfavorites :ytrecommended :ytsubscriptions :ytwatchlater :ythistory"
fileopts="-a|--batch-file|--download-archive|--cookies|--load-info"
diropts="--cache-dir"
diff --git a/youtube-dl.fish b/youtube-dl.fish
index 4921032..3778979 100644
--- a/youtube-dl.fish
+++ b/youtube-dl.fish
@@ -46,9 +46,10 @@ complete --command youtube-dl --long-option download-archive --description 'Down
complete --command youtube-dl --long-option include-ads --description 'Download advertisements as well (experimental)'
complete --command youtube-dl --long-option limit-rate --short-option r --description 'Maximum download rate in bytes per second (e.g. 50K or 4.2M)'
complete --command youtube-dl --long-option retries --short-option R --description 'Number of retries (default is %default), or "infinite".'
-complete --command youtube-dl --long-option fragment-retries --description 'Number of retries for a fragment (default is %default), or "infinite" (DASH and hlsnative only)'
-complete --command youtube-dl --long-option skip-unavailable-fragments --description 'Skip unavailable fragments (DASH and hlsnative only)'
+complete --command youtube-dl --long-option fragment-retries --description 'Number of retries for a fragment (default is %default), or "infinite" (DASH, hlsnative and ISM)'
+complete --command youtube-dl --long-option skip-unavailable-fragments --description 'Skip unavailable fragments (DASH, hlsnative and ISM)'
complete --command youtube-dl --long-option abort-on-unavailable-fragment --description 'Abort downloading when some fragment is not available'
+complete --command youtube-dl --long-option keep-fragments --description 'Keep downloaded fragments on disk after downloading is finished; fragments are erased by default'
complete --command youtube-dl --long-option buffer-size --description 'Size of download buffer (e.g. 1024 or 16K) (default is %default)'
complete --command youtube-dl --long-option no-resize-buffer --description 'Do not automatically adjust the buffer size. By default, the buffer size is automatically resized from an initial value of SIZE.'
complete --command youtube-dl --long-option test
@@ -151,7 +152,7 @@ complete --command youtube-dl --long-option no-post-overwrites --description 'Do
complete --command youtube-dl --long-option embed-subs --description 'Embed subtitles in the video (only for mp4, webm and mkv videos)'
complete --command youtube-dl --long-option embed-thumbnail --description 'Embed thumbnail in the audio as cover art'
complete --command youtube-dl --long-option add-metadata --description 'Write metadata to the video file'
-complete --command youtube-dl --long-option metadata-from-title --description 'Parse additional metadata like song title / artist from the video title. The format syntax is the same as --output, the parsed parameters replace existing values. Additional templates: %(album)s, %(artist)s. Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like "Coldplay - Paradise"'
+complete --command youtube-dl --long-option metadata-from-title --description 'Parse additional metadata like song title / artist from the video title. The format syntax is the same as --output. Regular expression with named capture groups may also be used. The parsed parameters replace existing values. Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like "Coldplay - Paradise". Example (regex): --metadata-from-title "(?P<artist>.+?) - (?P<title>.+)"'
complete --command youtube-dl --long-option xattrs --description 'Write metadata to the video file'"'"'s xattrs (using dublin core and xdg standards)'
complete --command youtube-dl --long-option fixup --description 'Automatically correct known faults of the file. One of never (do nothing), warn (only emit a warning), detect_or_warn (the default; fix file if we can, warn otherwise)'
complete --command youtube-dl --long-option prefer-avconv --description 'Prefer avconv over ffmpeg for running the postprocessors (default)'
diff --git a/youtube-dl.zsh b/youtube-dl.zsh
index 6b060d7..2d670ee 100644
--- a/youtube-dl.zsh
+++ b/youtube-dl.zsh
@@ -19,7 +19,7 @@ __youtube_dl() {
elif [[ ${prev} == "--recode-video" ]]; then
_arguments '*: :(mp4 flv ogg webm mkv)'
else
- _arguments '*: :(--help --version --update --ignore-errors --abort-on-error --dump-user-agent --list-extractors --extractor-descriptions --force-generic-extractor --default-search --ignore-config --config-location --flat-playlist --mark-watched --no-mark-watched --no-color --proxy --socket-timeout --source-address --force-ipv4 --force-ipv6 --geo-verification-proxy --cn-verification-proxy --geo-bypass --no-geo-bypass --geo-bypass-country --playlist-start --playlist-end --playlist-items --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --min-views --max-views --match-filter --no-playlist --yes-playlist --age-limit --download-archive --include-ads --limit-rate --retries --fragment-retries --skip-unavailable-fragments --abort-on-unavailable-fragment --buffer-size --no-resize-buffer --test --playlist-reverse --playlist-random --xattr-set-filesize --hls-prefer-native --hls-prefer-ffmpeg --hls-use-mpegts --external-downloader --external-downloader-args --batch-file --id --output --autonumber-size --autonumber-start --restrict-filenames --auto-number --title --literal --no-overwrites --continue --no-continue --no-part --no-mtime --write-description --write-info-json --write-annotations --load-info-json --cookies --cache-dir --no-cache-dir --rm-cache-dir --write-thumbnail --write-all-thumbnails --list-thumbnails --quiet --no-warnings --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-duration --get-filename --get-format --dump-json --dump-single-json --print-json --newline --no-progress --console-title --verbose --dump-pages --write-pages --youtube-print-sig-code --print-traffic --call-home --no-call-home --encoding --no-check-certificate --prefer-insecure --user-agent --referer --add-header --bidi-workaround --sleep-interval --max-sleep-interval --format --all-formats --prefer-free-formats --list-formats --youtube-include-dash-manifest --youtube-skip-dash-manifest --merge-output-format --write-sub --write-auto-sub --all-subs --list-subs --sub-format --sub-lang --username --password --twofactor --netrc --video-password --ap-mso --ap-username --ap-password --ap-list-mso --extract-audio --audio-format --audio-quality --recode-video --postprocessor-args --keep-video --no-post-overwrites --embed-subs --embed-thumbnail --add-metadata --metadata-from-title --xattrs --fixup --prefer-avconv --prefer-ffmpeg --ffmpeg-location --exec --convert-subs)'
+ _arguments '*: :(--help --version --update --ignore-errors --abort-on-error --dump-user-agent --list-extractors --extractor-descriptions --force-generic-extractor --default-search --ignore-config --config-location --flat-playlist --mark-watched --no-mark-watched --no-color --proxy --socket-timeout --source-address --force-ipv4 --force-ipv6 --geo-verification-proxy --cn-verification-proxy --geo-bypass --no-geo-bypass --geo-bypass-country --playlist-start --playlist-end --playlist-items --match-title --reject-title --max-downloads --min-filesize --max-filesize --date --datebefore --dateafter --min-views --max-views --match-filter --no-playlist --yes-playlist --age-limit --download-archive --include-ads --limit-rate --retries --fragment-retries --skip-unavailable-fragments --abort-on-unavailable-fragment --keep-fragments --buffer-size --no-resize-buffer --test --playlist-reverse --playlist-random --xattr-set-filesize --hls-prefer-native --hls-prefer-ffmpeg --hls-use-mpegts --external-downloader --external-downloader-args --batch-file --id --output --autonumber-size --autonumber-start --restrict-filenames --auto-number --title --literal --no-overwrites --continue --no-continue --no-part --no-mtime --write-description --write-info-json --write-annotations --load-info-json --cookies --cache-dir --no-cache-dir --rm-cache-dir --write-thumbnail --write-all-thumbnails --list-thumbnails --quiet --no-warnings --simulate --skip-download --get-url --get-title --get-id --get-thumbnail --get-description --get-duration --get-filename --get-format --dump-json --dump-single-json --print-json --newline --no-progress --console-title --verbose --dump-pages --write-pages --youtube-print-sig-code --print-traffic --call-home --no-call-home --encoding --no-check-certificate --prefer-insecure --user-agent --referer --add-header --bidi-workaround --sleep-interval --max-sleep-interval --format --all-formats --prefer-free-formats --list-formats --youtube-include-dash-manifest --youtube-skip-dash-manifest --merge-output-format --write-sub --write-auto-sub --all-subs --list-subs --sub-format --sub-lang --username --password --twofactor --netrc --video-password --ap-mso --ap-username --ap-password --ap-list-mso --extract-audio --audio-format --audio-quality --recode-video --postprocessor-args --keep-video --no-post-overwrites --embed-subs --embed-thumbnail --add-metadata --metadata-from-title --xattrs --fixup --prefer-avconv --prefer-ffmpeg --ffmpeg-location --exec --convert-subs)'
fi
;;
esac
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index 21586f0..4c33d49 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -370,10 +370,10 @@ class YoutubeDL(object):
else:
raise
- if (sys.version_info >= (3,) and sys.platform != 'win32' and
+ if (sys.platform != 'win32' and
sys.getfilesystemencoding() in ['ascii', 'ANSI_X3.4-1968'] and
not params.get('restrictfilenames', False)):
- # On Python 3, the Unicode filesystem API will throw errors (#1474)
+ # Unicode filesystem API will throw errors (#1474, #13027)
self.report_warning(
'Assuming --restrict-filenames since file system encoding '
'cannot encode all characters. '
@@ -640,7 +640,7 @@ class YoutubeDL(object):
NUMERIC_FIELDS = set((
'width', 'height', 'tbr', 'abr', 'asr', 'vbr', 'fps', 'filesize', 'filesize_approx',
- 'upload_year', 'upload_month', 'upload_day',
+ 'timestamp', 'upload_year', 'upload_month', 'upload_day',
'duration', 'view_count', 'like_count', 'dislike_count', 'repost_count',
'average_rating', 'comment_count', 'age_limit',
'start_time', 'end_time',
@@ -672,8 +672,7 @@ class YoutubeDL(object):
FORMAT_RE.format(numeric_field),
r'%({0})s'.format(numeric_field), outtmpl)
- tmpl = expand_path(outtmpl)
- filename = tmpl % template_dict
+ filename = expand_path(outtmpl % template_dict)
# Temporary fix for #4787
# 'Treat' all problem characters by passing filename through preferredencoding
# to workaround encoding issues with subprocess on python2 @ Windows
@@ -837,6 +836,12 @@ class YoutubeDL(object):
ie_result['url'], ie_key=ie_result.get('ie_key'),
extra_info=extra_info, download=False, process=False)
+ # extract_info may return None when ignoreerrors is enabled and
+ # extraction failed with an error, don't crash and return early
+ # in this case
+ if not info:
+ return info
+
force_properties = dict(
(k, v) for k, v in ie_result.items() if v is not None)
for f in ('_type', 'url', 'ie_key'):
@@ -845,11 +850,18 @@ class YoutubeDL(object):
new_result = info.copy()
new_result.update(force_properties)
- assert new_result.get('_type') != 'url_transparent'
+ # Extracted info may not be a video result (i.e.
+ # info.get('_type', 'video') != video) but rather an url or
+ # url_transparent. In such cases outer metadata (from ie_result)
+ # should be propagated to inner one (info). For this to happen
+ # _type of info should be overridden with url_transparent. This
+ # fixes issue from https://github.com/rg3/youtube-dl/pull/11163.
+ if new_result.get('_type') == 'url':
+ new_result['_type'] = 'url_transparent'
return self.process_ie_result(
new_result, download=download, extra_info=extra_info)
- elif result_type == 'playlist' or result_type == 'multi_video':
+ elif result_type in ('playlist', 'multi_video'):
# We process each entry in the playlist
playlist = ie_result.get('title') or ie_result.get('id')
self.to_screen('[download] Downloading playlist: %s' % playlist)
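For clarity, here is the url_transparent merge from the hunk above reduced to plain dictionaries (a self-contained sketch with made-up URLs): metadata from the outer url_transparent result overrides the inner extraction, and an inner 'url' result is re-typed so the merged metadata keeps propagating.

    # Outer result (what the first extractor returned) and inner result
    # (what extract_info returned for its URL); values are hypothetical.
    ie_result = {'_type': 'url_transparent', 'url': 'outer://clip', 'title': 'Outer title'}
    info = {'_type': 'url', 'url': 'inner://real', 'id': '42'}

    force_properties = dict((k, v) for k, v in ie_result.items() if v is not None)
    for f in ('_type', 'url', 'ie_key'):
        if f in force_properties:
            del force_properties[f]
    new_result = info.copy()
    new_result.update(force_properties)
    if new_result.get('_type') == 'url':
        new_result['_type'] = 'url_transparent'

    assert new_result == {'_type': 'url_transparent', 'url': 'inner://real',
                          'id': '42', 'title': 'Outer title'}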
diff --git a/youtube_dl/__init__.py b/youtube_dl/__init__.py
index f156065..c458941 100644
--- a/youtube_dl/__init__.py
+++ b/youtube_dl/__init__.py
@@ -343,6 +343,7 @@ def _real_main(argv=None):
'retries': opts.retries,
'fragment_retries': opts.fragment_retries,
'skip_unavailable_fragments': opts.skip_unavailable_fragments,
+ 'keep_fragments': opts.keep_fragments,
'buffersize': opts.buffersize,
'noresizebuffer': opts.noresizebuffer,
'continuedl': opts.continue_dl,
diff --git a/youtube_dl/compat.py b/youtube_dl/compat.py
index 0c119e4..3952711 100644
--- a/youtube_dl/compat.py
+++ b/youtube_dl/compat.py
@@ -2692,7 +2692,7 @@ else:
userhome = pwent.pw_dir
userhome = userhome.rstrip('/')
return (userhome + path[i:]) or '/'
- elif compat_os_name == 'nt' or compat_os_name == 'ce':
+ elif compat_os_name in ('nt', 'ce'):
def compat_expanduser(path):
"""Expand ~ and ~user constructs.
diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py
index 2c4470a..5d66211 100644
--- a/youtube_dl/downloader/common.py
+++ b/youtube_dl/downloader/common.py
@@ -187,6 +187,9 @@ class FileDownloader(object):
return filename[:-len('.part')]
return filename
+ def ytdl_filename(self, filename):
+ return filename + '.ytdl'
+
def try_rename(self, old_filename, new_filename):
try:
if old_filename == new_filename:
@@ -327,21 +330,22 @@ class FileDownloader(object):
os.path.exists(encodeFilename(filename))
)
- continuedl_and_exists = (
- self.params.get('continuedl', True) and
- os.path.isfile(encodeFilename(filename)) and
- not self.params.get('nopart', False)
- )
-
- # Check file already present
- if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists):
- self.report_file_already_downloaded(filename)
- self._hook_progress({
- 'filename': filename,
- 'status': 'finished',
- 'total_bytes': os.path.getsize(encodeFilename(filename)),
- })
- return True
+ if not hasattr(filename, 'write'):
+ continuedl_and_exists = (
+ self.params.get('continuedl', True) and
+ os.path.isfile(encodeFilename(filename)) and
+ not self.params.get('nopart', False)
+ )
+
+ # Check file already present
+ if filename != '-' and (nooverwrites_and_exists or continuedl_and_exists):
+ self.report_file_already_downloaded(filename)
+ self._hook_progress({
+ 'filename': filename,
+ 'status': 'finished',
+ 'total_bytes': os.path.getsize(encodeFilename(filename)),
+ })
+ return True
min_sleep_interval = self.params.get('sleep_interval')
if min_sleep_interval:
diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py
index e2ddc36..7491fda 100644
--- a/youtube_dl/downloader/dash.py
+++ b/youtube_dl/downloader/dash.py
@@ -1,13 +1,7 @@
from __future__ import unicode_literals
-import os
-
from .fragment import FragmentFD
from ..compat import compat_urllib_error
-from ..utils import (
- sanitize_open,
- encodeFilename,
-)
class DashSegmentsFD(FragmentFD):
@@ -28,31 +22,24 @@ class DashSegmentsFD(FragmentFD):
self._prepare_and_start_frag_download(ctx)
- segments_filenames = []
-
fragment_retries = self.params.get('fragment_retries', 0)
skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
- def process_segment(segment, tmp_filename, num):
- segment_url = segment['url']
- segment_name = 'Frag%d' % num
- target_filename = '%s-%s' % (tmp_filename, segment_name)
+ frag_index = 0
+ for i, segment in enumerate(segments):
+ frag_index += 1
+ if frag_index <= ctx['fragment_index']:
+ continue
# In DASH, the first segment contains necessary headers to
# generate a valid MP4 file, so always abort for the first segment
- fatal = num == 0 or not skip_unavailable_fragments
+ fatal = i == 0 or not skip_unavailable_fragments
count = 0
while count <= fragment_retries:
try:
- success = ctx['dl'].download(target_filename, {
- 'url': segment_url,
- 'http_headers': info_dict.get('http_headers'),
- })
+ success, frag_content = self._download_fragment(ctx, segment['url'], info_dict)
if not success:
return False
- down, target_sanitized = sanitize_open(target_filename, 'rb')
- ctx['dest_stream'].write(down.read())
- down.close()
- segments_filenames.append(target_sanitized)
+ self._append_fragment(ctx, frag_content)
break
except compat_urllib_error.HTTPError as err:
# YouTube may often return 404 HTTP error for a fragment causing the
@@ -63,22 +50,14 @@ class DashSegmentsFD(FragmentFD):
# HTTP error.
count += 1
if count <= fragment_retries:
- self.report_retry_fragment(err, segment_name, count, fragment_retries)
+ self.report_retry_fragment(err, frag_index, count, fragment_retries)
if count > fragment_retries:
if not fatal:
- self.report_skip_fragment(segment_name)
- return True
+ self.report_skip_fragment(frag_index)
+ continue
self.report_error('giving up after %s fragment retries' % fragment_retries)
return False
- return True
-
- for i, segment in enumerate(segments):
- if not process_segment(segment, ctx['tmpfilename'], i):
- return False
self._finish_frag_download(ctx)
- for segment_file in segments_filenames:
- os.remove(encodeFilename(segment_file))
-
return True
diff --git a/youtube_dl/downloader/external.py b/youtube_dl/downloader/external.py
index e13cf54..e78169a 100644
--- a/youtube_dl/downloader/external.py
+++ b/youtube_dl/downloader/external.py
@@ -29,7 +29,17 @@ class ExternalFD(FileDownloader):
self.report_destination(filename)
tmpfilename = self.temp_name(filename)
- retval = self._call_downloader(tmpfilename, info_dict)
+ try:
+ retval = self._call_downloader(tmpfilename, info_dict)
+ except KeyboardInterrupt:
+ if not info_dict.get('is_live'):
+ raise
+ # Live stream downloading cancellation should be considered as
+ # correct and expected termination thus all postprocessing
+ # should take place
+ retval = 0
+ self.to_screen('[%s] Interrupted by user' % self.get_basename())
+
if retval == 0:
fsize = os.path.getsize(encodeFilename(tmpfilename))
self.to_screen('\r[%s] Downloaded %s bytes' % (self.get_basename(), fsize))
diff --git a/youtube_dl/downloader/f4m.py b/youtube_dl/downloader/f4m.py
index 688e086..c8fde9a 100644
--- a/youtube_dl/downloader/f4m.py
+++ b/youtube_dl/downloader/f4m.py
@@ -3,7 +3,6 @@ from __future__ import division, unicode_literals
import base64
import io
import itertools
-import os
import time
from .fragment import FragmentFD
@@ -16,9 +15,7 @@ from ..compat import (
compat_struct_unpack,
)
from ..utils import (
- encodeFilename,
fix_xml_ampersands,
- sanitize_open,
xpath_text,
)
@@ -366,17 +363,21 @@ class F4mFD(FragmentFD):
dest_stream = ctx['dest_stream']
- write_flv_header(dest_stream)
- if not live:
- write_metadata_tag(dest_stream, metadata)
+ if ctx['complete_frags_downloaded_bytes'] == 0:
+ write_flv_header(dest_stream)
+ if not live:
+ write_metadata_tag(dest_stream, metadata)
base_url_parsed = compat_urllib_parse_urlparse(base_url)
self._start_frag_download(ctx)
- frags_filenames = []
+ frag_index = 0
while fragments_list:
seg_i, frag_i = fragments_list.pop(0)
+ frag_index += 1
+ if frag_index <= ctx['fragment_index']:
+ continue
name = 'Seg%d-Frag%d' % (seg_i, frag_i)
query = []
if base_url_parsed.query:
@@ -386,17 +387,10 @@ class F4mFD(FragmentFD):
if info_dict.get('extra_param_to_segment_url'):
query.append(info_dict['extra_param_to_segment_url'])
url_parsed = base_url_parsed._replace(path=base_url_parsed.path + name, query='&'.join(query))
- frag_filename = '%s-%s' % (ctx['tmpfilename'], name)
try:
- success = ctx['dl'].download(frag_filename, {
- 'url': url_parsed.geturl(),
- 'http_headers': info_dict.get('http_headers'),
- })
+ success, down_data = self._download_fragment(ctx, url_parsed.geturl(), info_dict)
if not success:
return False
- (down, frag_sanitized) = sanitize_open(frag_filename, 'rb')
- down_data = down.read()
- down.close()
reader = FlvReader(down_data)
while True:
try:
@@ -411,12 +405,8 @@ class F4mFD(FragmentFD):
break
raise
if box_type == b'mdat':
- dest_stream.write(box_data)
+ self._append_fragment(ctx, box_data)
break
- if live:
- os.remove(encodeFilename(frag_sanitized))
- else:
- frags_filenames.append(frag_sanitized)
except (compat_urllib_error.HTTPError, ) as err:
if live and (err.code == 404 or err.code == 410):
# We didn't keep up with the live window. Continue
@@ -436,7 +426,4 @@ class F4mFD(FragmentFD):
self._finish_frag_download(ctx)
- for frag_file in frags_filenames:
- os.remove(encodeFilename(frag_file))
-
return True
diff --git a/youtube_dl/downloader/fragment.py b/youtube_dl/downloader/fragment.py
index 56f9752..bccc8ec 100644
--- a/youtube_dl/downloader/fragment.py
+++ b/youtube_dl/downloader/fragment.py
@@ -2,6 +2,7 @@ from __future__ import division, unicode_literals
import os
import time
+import json
from .common import FileDownloader
from .http import HttpFD
@@ -28,15 +29,37 @@ class FragmentFD(FileDownloader):
and hlsnative only)
skip_unavailable_fragments:
Skip unavailable fragments (DASH and hlsnative only)
+ keep_fragments: Keep downloaded fragments on disk after downloading is
+ finished
+
+ For each incomplete fragment download youtube-dl keeps on disk a special
+ bookkeeping file with download state and metadata (in future such files will
+ be used for any incomplete download handled by youtube-dl). This file is
+ used to properly handle resuming, check download file consistency and detect
+ potential errors. The file has a .ytdl extension and represents a standard
+ JSON file of the following format:
+
+ extractor:
+ Dictionary of extractor related data. TBD.
+
+ downloader:
+ Dictionary of downloader related data. May contain following data:
+ current_fragment:
+ Dictionary with current (being downloaded) fragment data:
+ index: 0-based index of current fragment among all fragments
+ fragment_count:
+ Total count of fragments
+
+ This feature is experimental and file format may change in future.
"""
- def report_retry_fragment(self, err, fragment_name, count, retries):
+ def report_retry_fragment(self, err, frag_index, count, retries):
self.to_screen(
- '[download] Got server HTTP error: %s. Retrying fragment %s (attempt %d of %s)...'
- % (error_to_compat_str(err), fragment_name, count, self.format_retries(retries)))
+ '[download] Got server HTTP error: %s. Retrying fragment %d (attempt %d of %s)...'
+ % (error_to_compat_str(err), frag_index, count, self.format_retries(retries)))
- def report_skip_fragment(self, fragment_name):
- self.to_screen('[download] Skipping fragment %s...' % fragment_name)
+ def report_skip_fragment(self, frag_index):
+ self.to_screen('[download] Skipping fragment %d...' % frag_index)
def _prepare_url(self, info_dict, url):
headers = info_dict.get('http_headers')
@@ -46,6 +69,51 @@ class FragmentFD(FileDownloader):
self._prepare_frag_download(ctx)
self._start_frag_download(ctx)
+ @staticmethod
+ def __do_ytdl_file(ctx):
+ return not ctx['live'] and not ctx['tmpfilename'] == '-'
+
+ def _read_ytdl_file(self, ctx):
+ stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'r')
+ ctx['fragment_index'] = json.loads(stream.read())['downloader']['current_fragment']['index']
+ stream.close()
+
+ def _write_ytdl_file(self, ctx):
+ frag_index_stream, _ = sanitize_open(self.ytdl_filename(ctx['filename']), 'w')
+ downloader = {
+ 'current_fragment': {
+ 'index': ctx['fragment_index'],
+ },
+ }
+ if ctx.get('fragment_count') is not None:
+ downloader['fragment_count'] = ctx['fragment_count']
+ frag_index_stream.write(json.dumps({'downloader': downloader}))
+ frag_index_stream.close()
+
+ def _download_fragment(self, ctx, frag_url, info_dict, headers=None):
+ fragment_filename = '%s-Frag%d' % (ctx['tmpfilename'], ctx['fragment_index'])
+ success = ctx['dl'].download(fragment_filename, {
+ 'url': frag_url,
+ 'http_headers': headers or info_dict.get('http_headers'),
+ })
+ if not success:
+ return False, None
+ down, frag_sanitized = sanitize_open(fragment_filename, 'rb')
+ ctx['fragment_filename_sanitized'] = frag_sanitized
+ frag_content = down.read()
+ down.close()
+ return True, frag_content
+
+ def _append_fragment(self, ctx, frag_content):
+ try:
+ ctx['dest_stream'].write(frag_content)
+ finally:
+ if self.__do_ytdl_file(ctx):
+ self._write_ytdl_file(ctx)
+ if not self.params.get('keep_fragments', False):
+ os.remove(ctx['fragment_filename_sanitized'])
+ del ctx['fragment_filename_sanitized']
+
def _prepare_frag_download(self, ctx):
if 'live' not in ctx:
ctx['live'] = False
@@ -66,11 +134,36 @@ class FragmentFD(FileDownloader):
}
)
tmpfilename = self.temp_name(ctx['filename'])
- dest_stream, tmpfilename = sanitize_open(tmpfilename, 'wb')
+ open_mode = 'wb'
+ resume_len = 0
+
+ # Establish possible resume length
+ if os.path.isfile(encodeFilename(tmpfilename)):
+ open_mode = 'ab'
+ resume_len = os.path.getsize(encodeFilename(tmpfilename))
+
+ # Should be initialized before ytdl file check
+ ctx.update({
+ 'tmpfilename': tmpfilename,
+ 'fragment_index': 0,
+ })
+
+ if self.__do_ytdl_file(ctx):
+ if os.path.isfile(encodeFilename(self.ytdl_filename(ctx['filename']))):
+ self._read_ytdl_file(ctx)
+ else:
+ self._write_ytdl_file(ctx)
+ if ctx['fragment_index'] > 0:
+ assert resume_len > 0
+
+ dest_stream, tmpfilename = sanitize_open(tmpfilename, open_mode)
+
ctx.update({
'dl': dl,
'dest_stream': dest_stream,
'tmpfilename': tmpfilename,
+ # Total complete fragments downloaded so far in bytes
+ 'complete_frags_downloaded_bytes': resume_len,
})
def _start_frag_download(self, ctx):
@@ -79,9 +172,9 @@ class FragmentFD(FileDownloader):
# hook
state = {
'status': 'downloading',
- 'downloaded_bytes': 0,
- 'frag_index': 0,
- 'frag_count': total_frags,
+ 'downloaded_bytes': ctx['complete_frags_downloaded_bytes'],
+ 'fragment_index': ctx['fragment_index'],
+ 'fragment_count': total_frags,
'filename': ctx['filename'],
'tmpfilename': ctx['tmpfilename'],
}
@@ -89,8 +182,6 @@ class FragmentFD(FileDownloader):
start = time.time()
ctx.update({
'started': start,
- # Total complete fragments downloaded so far in bytes
- 'complete_frags_downloaded_bytes': 0,
# Amount of fragment's bytes downloaded by the time of the previous
# frag progress hook invocation
'prev_frag_downloaded_bytes': 0,
@@ -106,11 +197,12 @@ class FragmentFD(FileDownloader):
if not ctx['live']:
estimated_size = (
(ctx['complete_frags_downloaded_bytes'] + frag_total_bytes) /
- (state['frag_index'] + 1) * total_frags)
+ (state['fragment_index'] + 1) * total_frags)
state['total_bytes_estimate'] = estimated_size
if s['status'] == 'finished':
- state['frag_index'] += 1
+ state['fragment_index'] += 1
+ ctx['fragment_index'] = state['fragment_index']
state['downloaded_bytes'] += frag_total_bytes - ctx['prev_frag_downloaded_bytes']
ctx['complete_frags_downloaded_bytes'] = state['downloaded_bytes']
ctx['prev_frag_downloaded_bytes'] = 0
@@ -132,6 +224,10 @@ class FragmentFD(FileDownloader):
def _finish_frag_download(self, ctx):
ctx['dest_stream'].close()
+ if self.__do_ytdl_file(ctx):
+ ytdl_filename = encodeFilename(self.ytdl_filename(ctx['filename']))
+ if os.path.isfile(ytdl_filename):
+ os.remove(ytdl_filename)
elapsed = time.time() - ctx['started']
self.try_rename(ctx['tmpfilename'], ctx['filename'])
fsize = os.path.getsize(encodeFilename(ctx['filename']))
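To make the new bookkeeping format concrete: the .ytdl file is a single JSON object, and the _write_ytdl_file/_read_ytdl_file round-trip amounts to the following sketch (filename and values are illustrative; the real code goes through sanitize_open and ytdl_filename):

    import json

    # State after three of ten fragments have been appended.
    state = {'downloader': {'current_fragment': {'index': 3},
                            'fragment_count': 10}}
    with open('video.mp4.ytdl', 'w') as f:   # cf. _write_ytdl_file
        f.write(json.dumps(state))
    with open('video.mp4.ytdl') as f:        # cf. _read_ytdl_file
        ctx_fragment_index = json.loads(
            f.read())['downloader']['current_fragment']['index']
    # On resume, downloaders skip fragments with
    # frag_index <= ctx['fragment_index'] (see the dash/f4m/hls/ism hunks).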
diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py
index 4989abc..0e29c8a 100644
--- a/youtube_dl/downloader/hls.py
+++ b/youtube_dl/downloader/hls.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import os.path
import re
import binascii
try:
@@ -18,8 +17,6 @@ from ..compat import (
compat_struct_pack,
)
from ..utils import (
- encodeFilename,
- sanitize_open,
parse_m3u8_attributes,
update_url_query,
)
@@ -34,7 +31,7 @@ class HlsFD(FragmentFD):
def can_download(manifest, info_dict):
UNSUPPORTED_FEATURES = (
r'#EXT-X-KEY:METHOD=(?!NONE|AES-128)', # encrypted streams [1]
- r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2]
+ # r'#EXT-X-BYTERANGE', # playlists composed of byte ranges of media files [2]
# Live streams heuristic does not always work (e.g. geo restricted to Germany
# http://hls-geo.daserste.de/i/videoportal/Film/c_620000/622873/format,716451,716457,716450,716458,716459,.mp4.csmil/index_4_av.m3u8?null=0)
@@ -52,7 +49,9 @@ class HlsFD(FragmentFD):
# 4. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.5
)
check_results = [not re.search(feature, manifest) for feature in UNSUPPORTED_FEATURES]
- check_results.append(can_decrypt_frag or '#EXT-X-KEY:METHOD=AES-128' not in manifest)
+ is_aes128_enc = '#EXT-X-KEY:METHOD=AES-128' in manifest
+ check_results.append(can_decrypt_frag or not is_aes128_enc)
+ check_results.append(not (is_aes128_enc and r'#EXT-X-BYTERANGE' in manifest))
check_results.append(not info_dict.get('is_live'))
return all(check_results)
@@ -100,31 +99,31 @@ class HlsFD(FragmentFD):
i = 0
media_sequence = 0
decrypt_info = {'METHOD': 'NONE'}
- frags_filenames = []
+ byte_range = {}
+ frag_index = 0
for line in s.splitlines():
line = line.strip()
if line:
if not line.startswith('#'):
+ frag_index += 1
+ if frag_index <= ctx['fragment_index']:
+ continue
frag_url = (
line
if re.match(r'^https?://', line)
else compat_urlparse.urljoin(man_url, line))
- frag_name = 'Frag%d' % i
- frag_filename = '%s-%s' % (ctx['tmpfilename'], frag_name)
if extra_query:
frag_url = update_url_query(frag_url, extra_query)
count = 0
+ headers = info_dict.get('http_headers', {})
+ if byte_range:
+ headers['Range'] = 'bytes=%d-%d' % (byte_range['start'], byte_range['end'])
while count <= fragment_retries:
try:
- success = ctx['dl'].download(frag_filename, {
- 'url': frag_url,
- 'http_headers': info_dict.get('http_headers'),
- })
+ success, frag_content = self._download_fragment(
+ ctx, frag_url, info_dict, headers)
if not success:
return False
- down, frag_sanitized = sanitize_open(frag_filename, 'rb')
- frag_content = down.read()
- down.close()
break
except compat_urllib_error.HTTPError as err:
# Unavailable (possibly temporary) fragments may be served.
@@ -133,28 +132,29 @@ class HlsFD(FragmentFD):
# https://github.com/rg3/youtube-dl/issues/10448).
count += 1
if count <= fragment_retries:
- self.report_retry_fragment(err, frag_name, count, fragment_retries)
+ self.report_retry_fragment(err, frag_index, count, fragment_retries)
if count > fragment_retries:
if skip_unavailable_fragments:
i += 1
media_sequence += 1
- self.report_skip_fragment(frag_name)
+ self.report_skip_fragment(frag_index)
continue
self.report_error(
'giving up after %s fragment retries' % fragment_retries)
return False
if decrypt_info['METHOD'] == 'AES-128':
iv = decrypt_info.get('IV') or compat_struct_pack('>8xq', media_sequence)
+ decrypt_info['KEY'] = decrypt_info.get('KEY') or self.ydl.urlopen(decrypt_info['URI']).read()
frag_content = AES.new(
decrypt_info['KEY'], AES.MODE_CBC, iv).decrypt(frag_content)
- ctx['dest_stream'].write(frag_content)
- frags_filenames.append(frag_sanitized)
+ self._append_fragment(ctx, frag_content)
# We only download the first fragment during the test
if test:
break
i += 1
media_sequence += 1
elif line.startswith('#EXT-X-KEY'):
+ decrypt_url = decrypt_info.get('URI')
decrypt_info = parse_m3u8_attributes(line[11:])
if decrypt_info['METHOD'] == 'AES-128':
if 'IV' in decrypt_info:
@@ -164,13 +164,18 @@ class HlsFD(FragmentFD):
man_url, decrypt_info['URI'])
if extra_query:
decrypt_info['URI'] = update_url_query(decrypt_info['URI'], extra_query)
- decrypt_info['KEY'] = self.ydl.urlopen(decrypt_info['URI']).read()
+ if decrypt_url != decrypt_info['URI']:
+ decrypt_info['KEY'] = None
elif line.startswith('#EXT-X-MEDIA-SEQUENCE'):
media_sequence = int(line[22:])
+ elif line.startswith('#EXT-X-BYTERANGE'):
+ splitted_byte_range = line[17:].split('@')
+ sub_range_start = int(splitted_byte_range[1]) if len(splitted_byte_range) == 2 else byte_range['end']
+ byte_range = {
+ 'start': sub_range_start,
+ 'end': sub_range_start + int(splitted_byte_range[0]),
+ }
self._finish_frag_download(ctx)
- for frag_file in frags_filenames:
- os.remove(encodeFilename(frag_file))
-
return True
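The EXT-X-BYTERANGE tag parsed above has the form "<length>[@<offset>]"; when the offset is omitted, the sub-range starts where the previous one ended. A standalone sketch of the arithmetic and the resulting Range header, mirroring the hunk:

    byte_range = {}
    for line in ('#EXT-X-BYTERANGE:1000@2000', '#EXT-X-BYTERANGE:500'):
        splitted_byte_range = line[17:].split('@')
        sub_range_start = (int(splitted_byte_range[1])
                           if len(splitted_byte_range) == 2
                           else byte_range['end'])
        byte_range = {
            'start': sub_range_start,
            'end': sub_range_start + int(splitted_byte_range[0]),
        }
        print('Range: bytes=%d-%d' % (byte_range['start'], byte_range['end']))
    # Range: bytes=2000-3000
    # Range: bytes=3000-3500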
diff --git a/youtube_dl/downloader/ism.py b/youtube_dl/downloader/ism.py
index 63a636c..5f6f9fa 100644
--- a/youtube_dl/downloader/ism.py
+++ b/youtube_dl/downloader/ism.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import os
import time
import struct
import binascii
@@ -8,10 +7,6 @@ import io
from .fragment import FragmentFD
from ..compat import compat_urllib_error
-from ..utils import (
- sanitize_open,
- encodeFilename,
-)
u8 = struct.Struct(b'>B')
@@ -225,50 +220,39 @@ class IsmFD(FragmentFD):
self._prepare_and_start_frag_download(ctx)
- segments_filenames = []
-
fragment_retries = self.params.get('fragment_retries', 0)
skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
track_written = False
+ frag_index = 0
for i, segment in enumerate(segments):
- segment_url = segment['url']
- segment_name = 'Frag%d' % i
- target_filename = '%s-%s' % (ctx['tmpfilename'], segment_name)
+ frag_index += 1
+ if frag_index <= ctx['fragment_index']:
+ continue
count = 0
while count <= fragment_retries:
try:
- success = ctx['dl'].download(target_filename, {
- 'url': segment_url,
- 'http_headers': info_dict.get('http_headers'),
- })
+ success, frag_content = self._download_fragment(ctx, segment['url'], info_dict)
if not success:
return False
- down, target_sanitized = sanitize_open(target_filename, 'rb')
- down_data = down.read()
if not track_written:
- tfhd_data = extract_box_data(down_data, [b'moof', b'traf', b'tfhd'])
+ tfhd_data = extract_box_data(frag_content, [b'moof', b'traf', b'tfhd'])
info_dict['_download_params']['track_id'] = u32.unpack(tfhd_data[4:8])[0]
write_piff_header(ctx['dest_stream'], info_dict['_download_params'])
track_written = True
- ctx['dest_stream'].write(down_data)
- down.close()
- segments_filenames.append(target_sanitized)
+ self._append_fragment(ctx, frag_content)
break
except compat_urllib_error.HTTPError as err:
count += 1
if count <= fragment_retries:
- self.report_retry_fragment(err, segment_name, count, fragment_retries)
+ self.report_retry_fragment(err, frag_index, count, fragment_retries)
if count > fragment_retries:
if skip_unavailable_fragments:
- self.report_skip_fragment(segment_name)
+ self.report_skip_fragment(frag_index)
continue
self.report_error('giving up after %s fragment retries' % fragment_retries)
return False
self._finish_frag_download(ctx)
- for segment_file in segments_filenames:
- os.remove(encodeFilename(segment_file))
-
return True
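The track id handed to write_piff_header above comes straight out of the first fragment's tfhd box: tfhd is an ISO BMFF FullBox, so bytes 0-3 of its payload are version/flags and bytes 4-7 are the big-endian 32-bit track_ID. A sketch with a synthetic payload (u32 is defined at the top of ism.py):

    import struct

    u32 = struct.Struct(b'>I')
    tfhd_data = b'\x00\x00\x00\x00' + u32.pack(2)  # version/flags, then track_ID
    track_id = u32.unpack(tfhd_data[4:8])[0]
    assert track_id == 2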
diff --git a/youtube_dl/downloader/rtmp.py b/youtube_dl/downloader/rtmp.py
index 9de6e70..b823b51 100644
--- a/youtube_dl/downloader/rtmp.py
+++ b/youtube_dl/downloader/rtmp.py
@@ -169,7 +169,7 @@ class RtmpFD(FileDownloader):
self.report_error('[rtmpdump] Could not connect to RTMP server.')
return False
- while (retval == RD_INCOMPLETE or retval == RD_FAILED) and not test and not live:
+ while retval in (RD_INCOMPLETE, RD_FAILED) and not test and not live:
prevsize = os.path.getsize(encodeFilename(tmpfilename))
self.to_screen('[rtmpdump] %s bytes' % prevsize)
time.sleep(5.0) # This seems to be needed
diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py
new file mode 100644
index 0000000..66caf6a
--- /dev/null
+++ b/youtube_dl/extractor/adn.py
@@ -0,0 +1,136 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import json
+import os
+
+from .common import InfoExtractor
+from ..aes import aes_cbc_decrypt
+from ..compat import compat_ord
+from ..utils import (
+ bytes_to_intlist,
+ ExtractorError,
+ float_or_none,
+ intlist_to_bytes,
+ srt_subtitles_timecode,
+ strip_or_none,
+)
+
+
+class ADNIE(InfoExtractor):
+ IE_DESC = 'Anime Digital Network'
+ _VALID_URL = r'https?://(?:www\.)?animedigitalnetwork\.fr/video/[^/]+/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://animedigitalnetwork.fr/video/blue-exorcist-kyoto-saga/7778-episode-1-debut-des-hostilites',
+ 'md5': 'e497370d847fd79d9d4c74be55575c7a',
+ 'info_dict': {
+ 'id': '7778',
+ 'ext': 'mp4',
+ 'title': 'Blue Exorcist - Kyôto Saga - Épisode 1',
+ 'description': 'md5:2f7b5aa76edbc1a7a92cedcda8a528d5',
+ }
+ }
+
+ def _get_subtitles(self, sub_path, video_id):
+ if not sub_path:
+ return None
+
+ enc_subtitles = self._download_webpage(
+ 'http://animedigitalnetwork.fr/' + sub_path,
+ video_id, fatal=False)
+ if not enc_subtitles:
+ return None
+
+ # http://animedigitalnetwork.fr/components/com_vodvideo/videojs/adn-vjs.min.js
+ dec_subtitles = intlist_to_bytes(aes_cbc_decrypt(
+ bytes_to_intlist(base64.b64decode(enc_subtitles[24:])),
+ bytes_to_intlist(b'\nd\xaf\xd2J\xd0\xfc\xe1\xfc\xdf\xb61\xe8\xe1\xf0\xcc'),
+ bytes_to_intlist(base64.b64decode(enc_subtitles[:24]))
+ ))
+ subtitles_json = self._parse_json(
+ dec_subtitles[:-compat_ord(dec_subtitles[-1])],
+ None, fatal=False)
+ if not subtitles_json:
+ return None
+
+ subtitles = {}
+ for sub_lang, sub in subtitles_json.items():
+ srt = ''
+ for num, current in enumerate(sub):
+ start, end, text = (
+ float_or_none(current.get('startTime')),
+ float_or_none(current.get('endTime')),
+ current.get('text'))
+ if start is None or end is None or text is None:
+ continue
+ srt += os.linesep.join(
+ (
+ '%d' % num,
+ '%s --> %s' % (
+ srt_subtitles_timecode(start),
+ srt_subtitles_timecode(end)),
+ text,
+ os.linesep,
+ ))
+
+ if sub_lang == 'vostf':
+ sub_lang = 'fr'
+ subtitles.setdefault(sub_lang, []).extend([{
+ 'ext': 'json',
+ 'data': json.dumps(sub),
+ }, {
+ 'ext': 'srt',
+ 'data': srt,
+ }])
+ return subtitles
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ player_config = self._parse_json(self._search_regex(
+ r'playerConfig\s*=\s*({.+});', webpage, 'player config'), video_id)
+
+ video_info = {}
+ video_info_str = self._search_regex(
+ r'videoInfo\s*=\s*({.+});', webpage,
+ 'video info', fatal=False)
+ if video_info_str:
+ video_info = self._parse_json(
+ video_info_str, video_id, fatal=False) or {}
+
+ options = player_config.get('options') or {}
+ metas = options.get('metas') or {}
+ title = metas.get('title') or video_info['title']
+ links = player_config.get('links') or {}
+
+ formats = []
+ for format_id, qualities in links.items():
+ for load_balancer_url in qualities.values():
+ load_balancer_data = self._download_json(
+ load_balancer_url, video_id, fatal=False) or {}
+ m3u8_url = load_balancer_data.get('location')
+ if not m3u8_url:
+ continue
+ m3u8_formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id=format_id, fatal=False)
+ if format_id == 'vf':
+ for f in m3u8_formats:
+ f['language'] = 'fr'
+ formats.extend(m3u8_formats)
+ error = options.get('error')
+ if not formats and error:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': strip_or_none(metas.get('summary') or video_info.get('resume')),
+ 'thumbnail': video_info.get('image'),
+ 'formats': formats,
+ 'subtitles': self.extract_subtitles(player_config.get('subtitles'), video_id),
+ 'episode': metas.get('subtitle') or video_info.get('videoTitle'),
+ 'series': video_info.get('playlistTitle'),
+ }
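
The heart of the new ADN extractor is the client-side subtitle
decryption: the payload is base64 text whose first 24 characters carry
the AES-CBC IV, the remainder is the ciphertext, the key is the
constant lifted from adn-vjs.min.js, and the padding length is encoded
in the last plaintext byte. A standalone sketch reusing youtube-dl's
own AES helpers (assumes Python 3, where indexing bytes yields ints;
the extractor itself uses compat_ord to stay 2/3 compatible):

    import base64

    from youtube_dl.aes import aes_cbc_decrypt
    from youtube_dl.utils import bytes_to_intlist, intlist_to_bytes

    KEY = b'\nd\xaf\xd2J\xd0\xfc\xe1\xfc\xdf\xb61\xe8\xe1\xf0\xcc'

    def decrypt_adn_subtitles(enc_subtitles):
        iv = base64.b64decode(enc_subtitles[:24])         # IV block
        ciphertext = base64.b64decode(enc_subtitles[24:])
        dec = intlist_to_bytes(aes_cbc_decrypt(
            bytes_to_intlist(ciphertext),
            bytes_to_intlist(KEY),
            bytes_to_intlist(iv)))
        return dec[:-dec[-1]]                             # strip padding
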
diff --git a/youtube_dl/extractor/adobepass.py b/youtube_dl/extractor/adobepass.py
index 1b2d364..7da96c6 100644
--- a/youtube_dl/extractor/adobepass.py
+++ b/youtube_dl/extractor/adobepass.py
@@ -41,6 +41,11 @@ MSO_INFO = {
'username_field': 'IDToken1',
'password_field': 'IDToken2',
},
+ 'Verizon': {
+ 'name': 'Verizon FiOS',
+ 'username_field': 'IDToken1',
+ 'password_field': 'IDToken2',
+ },
'thr030': {
'name': '3 Rivers Communications'
},
@@ -1303,6 +1308,12 @@ class AdobePassIE(InfoExtractor):
_USER_AGENT = 'Mozilla/5.0 (X11; Linux i686; rv:47.0) Gecko/20100101 Firefox/47.0'
_MVPD_CACHE = 'ap-mvpd'
+ def _download_webpage_handle(self, *args, **kwargs):
+ headers = kwargs.get('headers', {})
+ headers.update(self.geo_verification_headers())
+ kwargs['headers'] = headers
+ return super(AdobePassIE, self)._download_webpage_handle(*args, **kwargs)
+
@staticmethod
def _get_mvpd_resource(provider_id, title, guid, rating):
channel = etree.Element('channel')
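
The new _download_webpage_handle override folds the extractor's
geo-verification headers into every request it makes. Caller-supplied
headers survive, although a same-named geo header wins, and the
caller's dict is updated in place. The merging pattern, standalone,
with an illustrative X-Forwarded-For value (what
geo_verification_headers emits once a bypass IP is configured):

    def with_default_headers(kwargs, extra):
        # Layer `extra` over whatever headers the caller passed,
        # mutating the caller's dict just as the override above does.
        headers = kwargs.get('headers', {})
        headers.update(extra)
        kwargs['headers'] = headers
        return kwargs

    print(with_default_headers(
        {'headers': {'User-Agent': 'UA'}},
        {'X-Forwarded-For': '203.0.113.7'}))
    # {'headers': {'User-Agent': 'UA', 'X-Forwarded-For': '203.0.113.7'}}
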
@@ -1384,40 +1395,72 @@ class AdobePassIE(InfoExtractor):
# Comcast page flow varies by video site and whether you
# are on Comcast's network.
provider_redirect_page, urlh = provider_redirect_page_res
- # Check for Comcast auto login
if 'automatically signing you in' in provider_redirect_page:
oauth_redirect_url = self._html_search_regex(
r'window\.location\s*=\s*[\'"]([^\'"]+)',
provider_redirect_page, 'oauth redirect')
- # Just need to process the request. No useful data comes back
self._download_webpage(
oauth_redirect_url, video_id, 'Confirming auto login')
else:
if '<form name="signin"' in provider_redirect_page:
- # already have the form, just fill it
provider_login_page_res = provider_redirect_page_res
elif 'http-equiv="refresh"' in provider_redirect_page:
- # redirects to the login page
oauth_redirect_url = self._html_search_regex(
r'content="0;\s*url=([^\'"]+)',
provider_redirect_page, 'meta refresh redirect')
provider_login_page_res = self._download_webpage_handle(
- oauth_redirect_url,
- video_id, 'Downloading Provider Login Page')
+ oauth_redirect_url, video_id,
+ 'Downloading Provider Login Page')
else:
provider_login_page_res = post_form(
- provider_redirect_page_res, 'Downloading Provider Login Page')
+ provider_redirect_page_res,
+ 'Downloading Provider Login Page')
- mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', {
- mso_info.get('username_field', 'username'): username,
- mso_info.get('password_field', 'password'): password,
- })
+ mvpd_confirm_page_res = post_form(
+ provider_login_page_res, 'Logging in', {
+ mso_info['username_field']: username,
+ mso_info['password_field']: password,
+ })
mvpd_confirm_page, urlh = mvpd_confirm_page_res
if '<button class="submit" value="Resume">Resume</button>' in mvpd_confirm_page:
post_form(mvpd_confirm_page_res, 'Confirming Login')
-
+ elif mso_id == 'Verizon':
+ # In general, if you're connecting from a Verizon-assigned IP,
+ # you will not actually pass your credentials.
+ provider_redirect_page, urlh = provider_redirect_page_res
+ if 'Please wait ...' in provider_redirect_page:
+ saml_redirect_url = self._html_search_regex(
+ r'self\.parent\.location=(["\'])(?P<url>.+?)\1',
+ provider_redirect_page,
+ 'SAML Redirect URL', group='url')
+ saml_login_page = self._download_webpage(
+ saml_redirect_url, video_id,
+ 'Downloading SAML Login Page')
+ else:
+ saml_login_page_res = post_form(
+ provider_redirect_page_res, 'Logging in', {
+ mso_info['username_field']: username,
+ mso_info['password_field']: password,
+ })
+ saml_login_page, urlh = saml_login_page_res
+ if 'Please try again.' in saml_login_page:
+ raise ExtractorError(
+ 'We\'re sorry, but either the User ID or Password entered is not correct.')
+ saml_login_url = self._search_regex(
+ r'xmlHttp\.open\("POST"\s*,\s*(["\'])(?P<url>.+?)\1',
+ saml_login_page, 'SAML Login URL', group='url')
+ saml_response_json = self._download_json(
+ saml_login_url, video_id, 'Downloading SAML Response',
+ headers={'Content-Type': 'text/xml'})
+ self._download_webpage(
+ saml_response_json['targetValue'], video_id,
+ 'Confirming Login', data=urlencode_postdata({
+ 'SAMLResponse': saml_response_json['SAMLResponse'],
+ 'RelayState': saml_response_json['RelayState']
+ }), headers={
+ 'Content-Type': 'application/x-www-form-urlencoded'
+ })
else:
- # Normal, non-Comcast flow
provider_login_page_res = post_form(
provider_redirect_page_res, 'Downloading Provider Login Page')
mvpd_confirm_page_res = post_form(provider_login_page_res, 'Logging in', {
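
The Verizon branch above covers two entry paths: from a
Verizon-assigned IP the redirect page hands back a SAML URL directly,
otherwise the credentials are posted first. Either way the flow ends by
re-posting the SAML assertion as a form. A hedged sketch of that final
relay step, where relay_saml is a hypothetical helper name:

    from youtube_dl.utils import urlencode_postdata

    def relay_saml(ie, saml_response_json, video_id):
        # The SAML login endpoint returns JSON carrying the target URL
        # plus the SAMLResponse/RelayState pair to re-post as a form.
        return ie._download_webpage(
            saml_response_json['targetValue'], video_id, 'Confirming Login',
            data=urlencode_postdata({
                'SAMLResponse': saml_response_json['SAMLResponse'],
                'RelayState': saml_response_json['RelayState'],
            }),
            headers={'Content-Type': 'application/x-www-form-urlencoded'})
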
diff --git a/youtube_dl/extractor/adultswim.py b/youtube_dl/extractor/adultswim.py
index 989505c..acc4ce3 100644
--- a/youtube_dl/extractor/adultswim.py
+++ b/youtube_dl/extractor/adultswim.py
@@ -5,91 +5,52 @@ import re
from .turner import TurnerBaseIE
from ..utils import (
- ExtractorError,
int_or_none,
+ strip_or_none,
)
class AdultSwimIE(TurnerBaseIE):
- _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<is_playlist>playlists/)?(?P<show_path>[^/]+)/(?P<episode_path>[^/?#]+)/?'
+ _VALID_URL = r'https?://(?:www\.)?adultswim\.com/videos/(?P<show_path>[^/?#]+)(?:/(?P<episode_path>[^/?#]+))?'
_TESTS = [{
'url': 'http://adultswim.com/videos/rick-and-morty/pilot',
- 'playlist': [
- {
- 'md5': '247572debc75c7652f253c8daa51a14d',
- 'info_dict': {
- 'id': 'rQxZvXQ4ROaSOqq-or2Mow-0',
- 'ext': 'flv',
- 'title': 'Rick and Morty - Pilot Part 1',
- 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
- },
- },
- {
- 'md5': '77b0e037a4b20ec6b98671c4c379f48d',
- 'info_dict': {
- 'id': 'rQxZvXQ4ROaSOqq-or2Mow-3',
- 'ext': 'flv',
- 'title': 'Rick and Morty - Pilot Part 4',
- 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
- },
- },
- ],
'info_dict': {
'id': 'rQxZvXQ4ROaSOqq-or2Mow',
+ 'ext': 'mp4',
'title': 'Rick and Morty - Pilot',
- 'description': "Rick moves in with his daughter's family and establishes himself as a bad influence on his grandson, Morty. "
+ 'description': 'Rick moves in with his daughter\'s family and establishes himself as a bad influence on his grandson, Morty.',
+ 'timestamp': 1493267400,
+ 'upload_date': '20170427',
},
- 'skip': 'This video is only available for registered users',
- }, {
- 'url': 'http://www.adultswim.com/videos/playlists/american-parenting/putting-francine-out-of-business/',
- 'playlist': [
- {
- 'md5': '2eb5c06d0f9a1539da3718d897f13ec5',
- 'info_dict': {
- 'id': '-t8CamQlQ2aYZ49ItZCFog-0',
- 'ext': 'flv',
- 'title': 'American Dad - Putting Francine Out of Business',
- 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].'
- },
- }
- ],
- 'info_dict': {
- 'id': '-t8CamQlQ2aYZ49ItZCFog',
- 'title': 'American Dad - Putting Francine Out of Business',
- 'description': 'Stan hatches a plan to get Francine out of the real estate business.Watch more American Dad on [adult swim].'
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
},
+ 'expected_warnings': ['Unable to download f4m manifest'],
}, {
'url': 'http://www.adultswim.com/videos/tim-and-eric-awesome-show-great-job/dr-steve-brule-for-your-wine/',
- 'playlist': [
- {
- 'md5': '3e346a2ab0087d687a05e1e7f3b3e529',
- 'info_dict': {
- 'id': 'sY3cMUR_TbuE4YmdjzbIcQ-0',
- 'ext': 'mp4',
- 'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',
- 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n',
- },
- }
- ],
'info_dict': {
'id': 'sY3cMUR_TbuE4YmdjzbIcQ',
+ 'ext': 'mp4',
'title': 'Tim and Eric Awesome Show Great Job! - Dr. Steve Brule, For Your Wine',
- 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \r\nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.\r\n\r\n',
+ 'description': 'Dr. Brule reports live from Wine Country with a special report on wines. \nWatch Tim and Eric Awesome Show Great Job! episode #20, "Embarrassed" on Adult Swim.',
+ 'upload_date': '20080124',
+ 'timestamp': 1201150800,
},
'params': {
# m3u8 download
'skip_download': True,
- }
+ },
}, {
- # heroMetadata.trailer
'url': 'http://www.adultswim.com/videos/decker/inside-decker-a-new-hero/',
'info_dict': {
'id': 'I0LQFQkaSUaFp8PnAWHhoQ',
'ext': 'mp4',
'title': 'Decker - Inside Decker: A New Hero',
- 'description': 'md5:c916df071d425d62d70c86d4399d3ee0',
- 'duration': 249.008,
+ 'description': 'The guys recap the conclusion of the season. They announce a new hero, take a peek into the Victorville Film Archive and welcome back the talented James Dean.',
+ 'timestamp': 1469480460,
+ 'upload_date': '20160725',
},
'params': {
# m3u8 download
@@ -97,136 +58,102 @@ class AdultSwimIE(TurnerBaseIE):
},
'expected_warnings': ['Unable to download f4m manifest'],
}, {
- 'url': 'http://www.adultswim.com/videos/toonami/friday-october-14th-2016/',
+ 'url': 'http://www.adultswim.com/videos/attack-on-titan',
+ 'info_dict': {
+ 'id': 'b7A69dzfRzuaXIECdxW8XQ',
+ 'title': 'Attack on Titan',
+ 'description': 'md5:6c8e003ea0777b47013e894767f5e114',
+ },
+ 'playlist_mincount': 12,
+ }, {
+ 'url': 'http://www.adultswim.com/videos/streams/williams-stream',
'info_dict': {
- 'id': 'eYiLsKVgQ6qTC6agD67Sig',
- 'title': 'Toonami - Friday, October 14th, 2016',
- 'description': 'md5:99892c96ffc85e159a428de85c30acde',
+ 'id': 'd8DEBj7QRfetLsRgFnGEyg',
+ 'ext': 'mp4',
+ 'title': r're:^Williams Stream \d{4}-\d{2}-\d{2} \d{2}:\d{2}$',
+ 'description': 'original programming',
},
- 'playlist': [{
- 'md5': '',
- 'info_dict': {
- 'id': 'eYiLsKVgQ6qTC6agD67Sig',
- 'ext': 'mp4',
- 'title': 'Toonami - Friday, October 14th, 2016',
- 'description': 'md5:99892c96ffc85e159a428de85c30acde',
- },
- }],
'params': {
# m3u8 download
'skip_download': True,
},
- 'expected_warnings': ['Unable to download f4m manifest'],
}]
- @staticmethod
- def find_video_info(collection, slug):
- for video in collection.get('videos'):
- if video.get('slug') == slug:
- return video
-
- @staticmethod
- def find_collection_by_linkURL(collections, linkURL):
- for collection in collections:
- if collection.get('linkURL') == linkURL:
- return collection
-
- @staticmethod
- def find_collection_containing_video(collections, slug):
- for collection in collections:
- for video in collection.get('videos'):
- if video.get('slug') == slug:
- return collection, video
- return None, None
-
def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- show_path = mobj.group('show_path')
- episode_path = mobj.group('episode_path')
- is_playlist = True if mobj.group('is_playlist') else False
-
- webpage = self._download_webpage(url, episode_path)
-
- # Extract the value of `bootstrappedData` from the Javascript in the page.
- bootstrapped_data = self._parse_json(self._search_regex(
- r'var bootstrappedData = ({.*});', webpage, 'bootstraped data'), episode_path)
-
- # Downloading videos from a /videos/playlist/ URL needs to be handled differently.
- # NOTE: We are only downloading one video (the current one) not the playlist
- if is_playlist:
- collections = bootstrapped_data['playlists']['collections']
- collection = self.find_collection_by_linkURL(collections, show_path)
- video_info = self.find_video_info(collection, episode_path)
-
- show_title = video_info['showTitle']
- segment_ids = [video_info['videoPlaybackID']]
+ show_path, episode_path = re.match(self._VALID_URL, url).groups()
+ display_id = episode_path or show_path
+ webpage = self._download_webpage(url, display_id)
+ initial_data = self._parse_json(self._search_regex(
+ r'AS_INITIAL_DATA(?:__)?\s*=\s*({.+?});',
+ webpage, 'initial data'), display_id)
+
+ is_stream = show_path == 'streams'
+ if is_stream:
+ if not episode_path:
+ episode_path = 'live-stream'
+
+ video_data = next(stream for stream_path, stream in initial_data['streams'].items() if stream_path == episode_path)
+ video_id = video_data.get('stream')
+
+ if not video_id:
+ entries = []
+ for episode in video_data.get('archiveEpisodes', []):
+ episode_url = episode.get('url')
+ if not episode_url:
+ continue
+ entries.append(self.url_result(
+ episode_url, 'AdultSwim', episode.get('id')))
+ return self.playlist_result(
+ entries, video_data.get('id'), video_data.get('title'),
+ strip_or_none(video_data.get('description')))
else:
- collections = bootstrapped_data['show']['collections']
- collection, video_info = self.find_collection_containing_video(collections, episode_path)
- # Video wasn't found in the collections, let's try `slugged_video`.
- if video_info is None:
- if bootstrapped_data.get('slugged_video', {}).get('slug') == episode_path:
- video_info = bootstrapped_data['slugged_video']
- if not video_info:
- video_info = bootstrapped_data.get(
- 'heroMetadata', {}).get('trailer', {}).get('video')
- if not video_info:
- video_info = bootstrapped_data.get('onlineOriginals', [None])[0]
- if not video_info:
- raise ExtractorError('Unable to find video info')
-
- show = bootstrapped_data['show']
- show_title = show['title']
- stream = video_info.get('stream')
- if stream and stream.get('videoPlaybackID'):
- segment_ids = [stream['videoPlaybackID']]
- elif video_info.get('clips'):
- segment_ids = [clip['videoPlaybackID'] for clip in video_info['clips']]
- elif video_info.get('videoPlaybackID'):
- segment_ids = [video_info['videoPlaybackID']]
- elif video_info.get('id'):
- segment_ids = [video_info['id']]
- else:
- if video_info.get('auth') is True:
- raise ExtractorError(
- 'This video is only available via cable service provider subscription that'
- ' is not currently supported. You may want to use --cookies.', expected=True)
- else:
- raise ExtractorError('Unable to find stream or clips')
-
- episode_id = video_info['id']
- episode_title = video_info['title']
- episode_description = video_info.get('description')
- episode_duration = int_or_none(video_info.get('duration'))
- view_count = int_or_none(video_info.get('views'))
+ show_data = initial_data['show']
+
+ if not episode_path:
+ entries = []
+ for video in show_data.get('videos', []):
+ slug = video.get('slug')
+ if not slug:
+ continue
+ entries.append(self.url_result(
+ 'http://adultswim.com/videos/%s/%s' % (show_path, slug),
+ 'AdultSwim', video.get('id')))
+ return self.playlist_result(
+ entries, show_data.get('id'), show_data.get('title'),
+ strip_or_none(show_data.get('metadata', {}).get('description')))
+
+ video_data = show_data['sluggedVideo']
+ video_id = video_data['id']
+
+ info = self._extract_cvp_info(
+ 'http://www.adultswim.com/videos/api/v0/assets?platform=desktop&id=' + video_id,
+ video_id, {
+ 'secure': {
+ 'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big',
+ 'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do',
+ },
+ }, {
+ 'url': url,
+ 'site_name': 'AdultSwim',
+ 'auth_required': video_data.get('auth'),
+ })
- entries = []
- for part_num, segment_id in enumerate(segment_ids):
- segement_info = self._extract_cvp_info(
- 'http://www.adultswim.com/videos/api/v0/assets?id=%s&platform=desktop' % segment_id,
- segment_id, {
- 'secure': {
- 'media_src': 'http://androidhls-secure.cdn.turner.com/adultswim/big',
- 'tokenizer_src': 'http://www.adultswim.com/astv/mvpd/processors/services/token_ipadAdobe.do',
- },
- })
- segment_title = '%s - %s' % (show_title, episode_title)
- if len(segment_ids) > 1:
- segment_title += ' Part %d' % (part_num + 1)
- segement_info.update({
- 'id': segment_id,
- 'title': segment_title,
- 'description': episode_description,
+ info.update({
+ 'id': video_id,
+ 'display_id': display_id,
+ 'description': info.get('description') or strip_or_none(video_data.get('description')),
+ })
+ if not is_stream:
+ info.update({
+ 'duration': info.get('duration') or int_or_none(video_data.get('duration')),
+ 'timestamp': info.get('timestamp') or int_or_none(video_data.get('launch_date')),
+ 'season_number': info.get('season_number') or int_or_none(video_data.get('season_number')),
+ 'episode': info['title'],
+ 'episode_number': info.get('episode_number') or int_or_none(video_data.get('episode_number')),
})
- entries.append(segement_info)
- return {
- '_type': 'playlist',
- 'id': episode_id,
- 'display_id': episode_path,
- 'entries': entries,
- 'title': '%s - %s' % (show_title, episode_title),
- 'description': episode_description,
- 'duration': episode_duration,
- 'view_count': view_count,
- }
+ info['series'] = video_data.get('collection_title') or info.get('series')
+ if info['series'] and info['series'] != info['title']:
+ info['title'] = '%s - %s' % (info['series'], info['title'])
+
+ return info
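
The Adult Swim rewrite drops the old bootstrappedData/collections
traversal for a single AS_INITIAL_DATA bootstrap blob, then branches on
streams versus shows and emits url_result playlists for anything that
is not a single episode. A minimal sketch of the bootstrap step, with
the page markup assumed; the non-greedy {.+?} carries the same
nested-brace caveat as the extractor's own pattern:

    import json
    import re

    def parse_initial_data(webpage):
        # The optional (?:__)? tolerates a trailing "__" in the
        # global's name, matching the extractor's regex above.
        mobj = re.search(r'AS_INITIAL_DATA(?:__)?\s*=\s*({.+?});', webpage)
        return json.loads(mobj.group(1)) if mobj else None

    page = '<script>var AS_INITIAL_DATA = {"show": {"id": "x"}};</script>'
    print(parse_initial_data(page))  # {'show': {'id': 'x'}}
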
diff --git a/youtube_dl/extractor/aenetworks.py b/youtube_dl/extractor/aenetworks.py
index dd96a47..2dcdba9 100644
--- a/youtube_dl/extractor/aenetworks.py
+++ b/youtube_dl/extractor/aenetworks.py
@@ -23,7 +23,19 @@ class AENetworksBaseIE(ThePlatformIE):
class AENetworksIE(AENetworksBaseIE):
IE_NAME = 'aenetworks'
IE_DESC = 'A+E Networks: A&E, Lifetime, History.com, FYI Network'
- _VALID_URL = r'https?://(?:www\.)?(?P<domain>(?:history|aetv|mylifetime|lifetimemovieclub)\.com|fyi\.tv)/(?:shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})|movies/(?P<movie_display_id>[^/]+)(?:/full-movie)?)'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:www\.)?
+ (?P<domain>
+ (?:history|aetv|mylifetime|lifetimemovieclub)\.com|
+ fyi\.tv
+ )/
+ (?:
+ shows/(?P<show_path>[^/]+(?:/[^/]+){0,2})|
+ movies/(?P<movie_display_id>[^/]+)(?:/full-movie)?|
+ specials/(?P<special_display_id>[^/]+)/full-special
+ )
+ '''
_TESTS = [{
'url': 'http://www.history.com/shows/mountain-men/season-1/episode-1',
'md5': 'a97a65f7e823ae10e9244bc5433d5fe6',
@@ -65,6 +77,9 @@ class AENetworksIE(AENetworksBaseIE):
}, {
'url': 'https://www.lifetimemovieclub.com/movies/a-killer-among-us',
'only_matching': True
+ }, {
+ 'url': 'http://www.history.com/specials/sniper-into-the-kill-zone/full-special',
+ 'only_matching': True
}]
_DOMAIN_TO_REQUESTOR_ID = {
'history.com': 'HISTORY',
@@ -75,8 +90,8 @@ class AENetworksIE(AENetworksBaseIE):
}
def _real_extract(self, url):
- domain, show_path, movie_display_id = re.match(self._VALID_URL, url).groups()
- display_id = show_path or movie_display_id
+ domain, show_path, movie_display_id, special_display_id = re.match(self._VALID_URL, url).groups()
+ display_id = show_path or movie_display_id or special_display_id
webpage = self._download_webpage(url, display_id)
if show_path:
url_parts = show_path.split('/')
@@ -86,10 +101,14 @@ class AENetworksIE(AENetworksBaseIE):
for season_url_path in re.findall(r'(?s)<li[^>]+data-href="(/shows/%s/season-\d+)"' % url_parts[0], webpage):
entries.append(self.url_result(
compat_urlparse.urljoin(url, season_url_path), 'AENetworks'))
- return self.playlist_result(
- entries, self._html_search_meta('aetn:SeriesId', webpage),
- self._html_search_meta('aetn:SeriesTitle', webpage))
- elif url_parts_len == 2:
+ if entries:
+ return self.playlist_result(
+ entries, self._html_search_meta('aetn:SeriesId', webpage),
+ self._html_search_meta('aetn:SeriesTitle', webpage))
+ else:
+ # single season
+ url_parts_len = 2
+ if url_parts_len == 2:
entries = []
for episode_item in re.findall(r'(?s)<[^>]+class="[^"]*(?:episode|program)-item[^"]*"[^>]*>', webpage):
episode_attributes = extract_attributes(episode_item)
@@ -97,7 +116,7 @@ class AENetworksIE(AENetworksBaseIE):
url, episode_attributes['data-canonical'])
entries.append(self.url_result(
episode_url, 'AENetworks',
- episode_attributes['data-videoid']))
+ episode_attributes.get('data-videoid') or episode_attributes.get('data-video-id')))
return self.playlist_result(
entries, self._html_search_meta('aetn:SeasonId', webpage))
@@ -107,7 +126,10 @@ class AENetworksIE(AENetworksBaseIE):
}
video_id = self._html_search_meta('aetn:VideoID', webpage)
media_url = self._search_regex(
- r"media_url\s*=\s*'([^']+)'", webpage, 'video url')
+ [r"media_url\s*=\s*'(?P<url>[^']+)'",
+ r'data-media-url=(?P<url>(?:https?:)?//[^\s>]+)',
+ r'data-media-url=(["\'])(?P<url>(?:(?!\1).)+?)\1'],
+ webpage, 'video url', group='url')
theplatform_metadata = self._download_theplatform_metadata(self._search_regex(
r'https?://link.theplatform.com/s/([^?]+)', media_url, 'theplatform_path'), video_id)
info = self._parse_theplatform_metadata(theplatform_metadata)
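
The media-URL lookup above gains two fallback patterns and a named
group. youtube-dl's _search_regex accepts a list of patterns and tries
them in order, returning the requested group from the first match; a
hedged re-implementation of that ordered search, against illustrative
markup:

    import re

    def search_regex(patterns, text, group):
        # Simplified stand-in for InfoExtractor._search_regex.
        for pattern in patterns:
            mobj = re.search(pattern, text)
            if mobj:
                return mobj.group(group)
        return None

    html = '<div data-media-url="https://example.com/video.m3u8">'
    print(search_regex(
        [r"media_url\s*=\s*'(?P<url>[^']+)'",
         r'data-media-url=(["\'])(?P<url>(?:(?!\1).)+?)\1'],
        html, 'url'))  # https://example.com/video.m3u8
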
diff --git a/youtube_dl/extractor/afreecatv.py b/youtube_dl/extractor/afreecatv.py
index b774d6d..c8cb91d 100644
--- a/youtube_dl/extractor/afreecatv.py
+++ b/youtube_dl/extractor/afreecatv.py
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..compat import compat_xpath
from ..utils import (
+ determine_ext,
ExtractorError,
int_or_none,
xpath_text,
@@ -72,13 +73,70 @@ class AfreecaTVIE(InfoExtractor):
'url': 'http://vod.afreecatv.com/PLAYER/STATION/18650793',
'info_dict': {
'id': '18650793',
- 'ext': 'flv',
+ 'ext': 'mp4',
+ 'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!',
+ 'thumbnail': r're:^https?://.*\.jpg$',
'uploader': '윈아디',
'uploader_id': 'badkids',
- 'title': '오늘은 다르다! 쏘님의 우월한 위아래~ 댄스리액션!',
+ 'duration': 107,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://vod.afreecatv.com/PLAYER/STATION/10481652',
+ 'info_dict': {
+ 'id': '10481652',
+ 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!'",
+ 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
+ 'uploader': 'dailyapril',
+ 'uploader_id': 'dailyapril',
+ 'duration': 6492,
+ },
+ 'playlist_count': 2,
+ 'playlist': [{
+ 'md5': 'd8b7c174568da61d774ef0203159bf97',
+ 'info_dict': {
+ 'id': '20160502_c4c62b9d_174361386_1',
+ 'ext': 'mp4',
+ 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 1)",
+ 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
+ 'uploader': 'dailyapril',
+ 'uploader_id': 'dailyapril',
+ 'upload_date': '20160502',
+ 'duration': 3601,
+ },
+ }, {
+ 'md5': '58f2ce7f6044e34439ab2d50612ab02b',
+ 'info_dict': {
+ 'id': '20160502_39e739bb_174361386_2',
+ 'ext': 'mp4',
+ 'title': "BJ유트루와 함께하는 '팅커벨 메이크업!' (part 2)",
+ 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
+ 'uploader': 'dailyapril',
+ 'uploader_id': 'dailyapril',
+ 'upload_date': '20160502',
+ 'duration': 2891,
+ },
+ }],
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+    # non-standard key
+ 'url': 'http://vod.afreecatv.com/PLAYER/STATION/20515605',
+ 'info_dict': {
+ 'id': '20170411_BE689A0E_190960999_1_2_h',
+ 'ext': 'mp4',
+ 'title': '혼자사는여자집',
+ 'thumbnail': 're:^https?://(?:video|st)img.afreecatv.com/.*$',
+ 'uploader': '♥이슬이',
+ 'uploader_id': 'dasl8121',
+ 'upload_date': '20170411',
+ 'duration': 213,
},
'params': {
- 'skip_download': True, # requires rtmpdump
+ 'skip_download': True,
},
}, {
'url': 'http://www.afreecatv.com/player/Player.swf?szType=szBjId=djleegoon&nStationNo=11273158&nBbsNo=13161095&nTitleNo=36327652',
@@ -94,7 +152,7 @@ class AfreecaTVIE(InfoExtractor):
m = re.match(r'^(?P<upload_date>\d{8})_\w+_(?P<part>\d+)$', key)
if m:
video_key['upload_date'] = m.group('upload_date')
- video_key['part'] = m.group('part')
+ video_key['part'] = int(m.group('part'))
return video_key
def _real_extract(self, url):
@@ -109,23 +167,64 @@ class AfreecaTVIE(InfoExtractor):
raise ExtractorError('Specified AfreecaTV video does not exist',
expected=True)
- video_url_raw = video_element.text
-
- app, playpath = video_url_raw.split('mp4:')
+ video_url = video_element.text.strip()
title = xpath_text(video_xml, './track/title', 'title', fatal=True)
+
uploader = xpath_text(video_xml, './track/nickname', 'uploader')
uploader_id = xpath_text(video_xml, './track/bj_id', 'uploader id')
- duration = int_or_none(xpath_text(video_xml, './track/duration',
- 'duration'))
+ duration = int_or_none(xpath_text(
+ video_xml, './track/duration', 'duration'))
thumbnail = xpath_text(video_xml, './track/titleImage', 'thumbnail')
- return {
+ common_entry = {
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'thumbnail': thumbnail,
+ }
+
+ info = common_entry.copy()
+ info.update({
+ 'id': video_id,
+ 'title': title,
+ 'duration': duration,
+ })
+
+ if not video_url:
+ entries = []
+ file_elements = video_element.findall(compat_xpath('./file'))
+ one = len(file_elements) == 1
+ for file_num, file_element in enumerate(file_elements, start=1):
+ file_url = file_element.text
+ if not file_url:
+ continue
+ key = file_element.get('key', '')
+ upload_date = self._search_regex(
+ r'^(\d{8})_', key, 'upload date', default=None)
+ file_duration = int_or_none(file_element.get('duration'))
+ format_id = key if key else '%s_%s' % (video_id, file_num)
+ formats = self._extract_m3u8_formats(
+ file_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls',
+ note='Downloading part %d m3u8 information' % file_num)
+ file_info = common_entry.copy()
+ file_info.update({
+ 'id': format_id,
+ 'title': title if one else '%s (part %d)' % (title, file_num),
+ 'upload_date': upload_date,
+ 'duration': file_duration,
+ 'formats': formats,
+ })
+ entries.append(file_info)
+ entries_info = info.copy()
+ entries_info.update({
+ '_type': 'multi_video',
+ 'entries': entries,
+ })
+ return entries_info
+
+ info = {
'id': video_id,
- 'url': app,
- 'ext': 'flv',
- 'play_path': 'mp4:' + playpath,
- 'rtmp_live': True, # downloading won't end without this
'title': title,
'uploader': uploader,
'uploader_id': uploader_id,
@@ -133,6 +232,21 @@ class AfreecaTVIE(InfoExtractor):
'thumbnail': thumbnail,
}
+ if determine_ext(video_url) == 'm3u8':
+ info['formats'] = self._extract_m3u8_formats(
+ video_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls')
+ else:
+ app, playpath = video_url.split('mp4:')
+ info.update({
+ 'url': app,
+ 'ext': 'flv',
+ 'play_path': 'mp4:' + playpath,
+ 'rtmp_live': True, # downloading won't end without this
+ })
+
+ return info
+
class AfreecaTVGlobalIE(AfreecaTVIE):
IE_NAME = 'afreecatv:global'
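
parse_video_key now returns the part number as an int so that
multi-part titles can be formatted numerically. Restated standalone,
with an illustrative key taken from the tests above:

    import re

    def parse_video_key(key):
        # Upload date and part number are encoded in the key itself.
        video_key = {}
        m = re.match(r'^(?P<upload_date>\d{8})_\w+_(?P<part>\d+)$', key)
        if m:
            video_key['upload_date'] = m.group('upload_date')
            video_key['part'] = int(m.group('part'))
        return video_key

    print(parse_video_key('20160502_c4c62b9d_174361386_1'))
    # {'upload_date': '20160502', 'part': 1}
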
diff --git a/youtube_dl/extractor/airmozilla.py b/youtube_dl/extractor/airmozilla.py
index 0e06918..9e38136 100644
--- a/youtube_dl/extractor/airmozilla.py
+++ b/youtube_dl/extractor/airmozilla.py
@@ -15,12 +15,12 @@ class AirMozillaIE(InfoExtractor):
_VALID_URL = r'https?://air\.mozilla\.org/(?P<id>[0-9a-z-]+)/?'
_TEST = {
'url': 'https://air.mozilla.org/privacy-lab-a-meetup-for-privacy-minded-people-in-san-francisco/',
- 'md5': '2e3e7486ba5d180e829d453875b9b8bf',
+ 'md5': '8d02f53ee39cf006009180e21df1f3ba',
'info_dict': {
'id': '6x4q2w',
'ext': 'mp4',
'title': 'Privacy Lab - a meetup for privacy minded people in San Francisco',
- 'thumbnail': r're:https?://vid\.ly/(?P<id>[0-9a-z-]+)/poster',
+ 'thumbnail': r're:https?://.*/poster\.jpg',
'description': 'Brings together privacy professionals and others interested in privacy at for-profits, non-profits, and NGOs in an effort to contribute to the state of the ecosystem...',
'timestamp': 1422487800,
'upload_date': '20150128',
@@ -34,21 +34,13 @@ class AirMozillaIE(InfoExtractor):
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- video_id = self._html_search_regex(r'//vid.ly/(.*?)/embed', webpage, 'id')
+ video_id = self._html_search_regex(r'//vid\.ly/(.*?)/embed', webpage, 'id')
embed_script = self._download_webpage('https://vid.ly/{0}/embed'.format(video_id), video_id)
- jwconfig = self._search_regex(r'\svar jwconfig = (\{.*?\});\s', embed_script, 'metadata')
- metadata = self._parse_json(jwconfig, video_id)
-
- formats = [{
- 'url': source['file'],
- 'ext': source['type'],
- 'format_id': self._search_regex(r'&format=(.*)$', source['file'], 'video format'),
- 'format': source['label'],
- 'height': int(source['label'].rstrip('p')),
- } for source in metadata['playlist'][0]['sources']]
- self._sort_formats(formats)
+ jwconfig = self._parse_json(self._search_regex(
+ r'initCallback\((.*)\);', embed_script, 'metadata'), video_id)['config']
+ info_dict = self._parse_jwplayer_data(jwconfig, video_id)
view_count = int_or_none(self._html_search_regex(
r'Views since archived: ([0-9]+)',
webpage, 'view count', fatal=False))
@@ -58,17 +50,17 @@ class AirMozillaIE(InfoExtractor):
r'Duration:\s*(\d+\s*hours?\s*\d+\s*minutes?)',
webpage, 'duration', fatal=False))
- return {
+ info_dict.update({
'id': video_id,
'title': self._og_search_title(webpage),
- 'formats': formats,
'url': self._og_search_url(webpage),
'display_id': display_id,
- 'thumbnail': metadata['playlist'][0].get('image'),
'description': self._og_search_description(webpage),
'timestamp': timestamp,
'location': self._html_search_regex(r'Location: (.*)', webpage, 'location', default=None),
'duration': duration,
'view_count': view_count,
'categories': re.findall(r'<a href=".*?" class="channel">(.*?)</a>', webpage),
- }
+ })
+
+ return info_dict
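
The vid.ly embed script no longer assigns `var jwconfig = ...`; the
player config now arrives wrapped in an initCallback(...) invocation,
and the resulting dict is handed to the generic _parse_jwplayer_data
helper instead of being mapped to formats by hand. A sketch of the new
extraction step, assuming a single initCallback call whose argument is
valid JSON with a 'config' key:

    import json
    import re

    def extract_jwconfig(embed_script):
        payload = re.search(r'initCallback\((.*)\);', embed_script).group(1)
        return json.loads(payload)['config']
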
diff --git a/youtube_dl/extractor/aljazeera.py b/youtube_dl/extractor/aljazeera.py
index 388e578..c68be31 100644
--- a/youtube_dl/extractor/aljazeera.py
+++ b/youtube_dl/extractor/aljazeera.py
@@ -4,9 +4,9 @@ from .common import InfoExtractor
class AlJazeeraIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/programmes/.*?/(?P<id>[^/]+)\.html'
+ _VALID_URL = r'https?://(?:www\.)?aljazeera\.com/(?:programmes|video)/.*?/(?P<id>[^/]+)\.html'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.aljazeera.com/programmes/the-slum/2014/08/deliverance-201482883754237240.html',
'info_dict': {
'id': '3792260579001',
@@ -19,7 +19,10 @@ class AlJazeeraIE(InfoExtractor):
},
'add_ie': ['BrightcoveNew'],
'skip': 'Not accessible from Travis CI server',
- }
+ }, {
+ 'url': 'http://www.aljazeera.com/video/news/2017/05/sierra-leone-709-carat-diamond-auctioned-170511100111930.html',
+ 'only_matching': True,
+ }]
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/665003303001/default_default/index.html?videoId=%s'
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/allocine.py b/youtube_dl/extractor/allocine.py
index 90f11d3..cd533ac 100644
--- a/youtube_dl/extractor/allocine.py
+++ b/youtube_dl/extractor/allocine.py
@@ -2,9 +2,13 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
- remove_end,
+ int_or_none,
qualities,
+ remove_end,
+ try_get,
+ unified_timestamp,
url_basename,
)
@@ -22,6 +26,10 @@ class AllocineIE(InfoExtractor):
'title': 'Astérix - Le Domaine des Dieux Teaser VF',
'description': 'md5:4a754271d9c6f16c72629a8a993ee884',
'thumbnail': r're:http://.*\.jpg',
+ 'duration': 39,
+ 'timestamp': 1404273600,
+ 'upload_date': '20140702',
+ 'view_count': int,
},
}, {
'url': 'http://www.allocine.fr/video/player_gen_cmedia=19540403&cfilm=222257.html',
@@ -33,6 +41,10 @@ class AllocineIE(InfoExtractor):
'title': 'Planes 2 Bande-annonce VF',
'description': 'Regardez la bande annonce du film Planes 2 (Planes 2 Bande-annonce VF). Planes 2, un film de Roberts Gannaway',
'thumbnail': r're:http://.*\.jpg',
+ 'duration': 69,
+ 'timestamp': 1385659800,
+ 'upload_date': '20131128',
+ 'view_count': int,
},
}, {
'url': 'http://www.allocine.fr/video/player_gen_cmedia=19544709&cfilm=181290.html',
@@ -44,6 +56,10 @@ class AllocineIE(InfoExtractor):
'title': 'Dragons 2 - Bande annonce finale VF',
'description': 'md5:6cdd2d7c2687d4c6aafe80a35e17267a',
'thumbnail': r're:http://.*\.jpg',
+ 'duration': 144,
+ 'timestamp': 1397589900,
+ 'upload_date': '20140415',
+ 'view_count': int,
},
}, {
'url': 'http://www.allocine.fr/video/video-19550147/',
@@ -69,34 +85,37 @@ class AllocineIE(InfoExtractor):
r'data-model="([^"]+)"', webpage, 'data model', default=None)
if model:
model_data = self._parse_json(model, display_id)
-
- for video_url in model_data['sources'].values():
+ video = model_data['videos'][0]
+ title = video['title']
+ for video_url in video['sources'].values():
video_id, format_id = url_basename(video_url).split('_')[:2]
formats.append({
'format_id': format_id,
'quality': quality(format_id),
'url': video_url,
})
-
- title = model_data['title']
+ duration = int_or_none(video.get('duration'))
+ view_count = int_or_none(video.get('view_count'))
+ timestamp = unified_timestamp(try_get(
+ video, lambda x: x['added_at']['date'], compat_str))
else:
video_id = display_id
media_data = self._download_json(
'http://www.allocine.fr/ws/AcVisiondataV5.ashx?media=%s' % video_id, display_id)
+ title = remove_end(
+ self._html_search_regex(
+ r'(?s)<title>(.+?)</title>', webpage, 'title').strip(),
+ ' - AlloCiné')
for key, value in media_data['video'].items():
if not key.endswith('Path'):
continue
-
format_id = key[:-len('Path')]
formats.append({
'format_id': format_id,
'quality': quality(format_id),
'url': value,
})
-
- title = remove_end(self._html_search_regex(
- r'(?s)<title>(.+?)</title>', webpage, 'title'
- ).strip(), ' - AlloCiné')
+ duration, view_count, timestamp = [None] * 3
self._sort_formats(formats)
@@ -104,7 +123,10 @@ class AllocineIE(InfoExtractor):
'id': video_id,
'display_id': display_id,
'title': title,
+ 'description': self._og_search_description(webpage),
'thumbnail': self._og_search_thumbnail(webpage),
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'view_count': view_count,
'formats': formats,
- 'description': self._og_search_description(webpage),
}
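
In the model-data branch the timestamp is now read defensively: try_get
swallows a missing or mis-typed 'added_at' and type-checks the result
before unified_timestamp parses it. Standalone, with an illustrative
payload chosen to match the first test's timestamp:

    from youtube_dl.compat import compat_str
    from youtube_dl.utils import try_get, unified_timestamp

    video = {'added_at': {'date': '2014-07-02 04:00:00'}}  # illustrative
    print(unified_timestamp(try_get(
        video, lambda x: x['added_at']['date'], compat_str)))  # 1404273600

    # A malformed payload degrades to None instead of raising:
    print(unified_timestamp(try_get(
        {}, lambda x: x['added_at']['date'], compat_str)))  # None
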
diff --git a/youtube_dl/extractor/amp.py b/youtube_dl/extractor/amp.py
index e8e4012..fde1a8f 100644
--- a/youtube_dl/extractor/amp.py
+++ b/youtube_dl/extractor/amp.py
@@ -7,15 +7,19 @@ from ..utils import (
parse_iso8601,
mimetype2ext,
determine_ext,
+ ExtractorError,
)
class AMPIE(InfoExtractor):
# parse Akamai Adaptive Media Player feed
def _extract_feed_info(self, url):
- item = self._download_json(
+ feed = self._download_json(
url, None, 'Downloading Akamai AMP feed',
- 'Unable to download Akamai AMP feed')['channel']['item']
+ 'Unable to download Akamai AMP feed')
+ item = feed.get('channel', {}).get('item')
+ if not item:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, feed['error']))
video_id = item['guid']
@@ -30,9 +34,12 @@ class AMPIE(InfoExtractor):
if isinstance(media_thumbnail, dict):
media_thumbnail = [media_thumbnail]
for thumbnail_data in media_thumbnail:
- thumbnail = thumbnail_data['@attributes']
+ thumbnail = thumbnail_data.get('@attributes', {})
+ thumbnail_url = thumbnail.get('url')
+ if not thumbnail_url:
+ continue
thumbnails.append({
- 'url': self._proto_relative_url(thumbnail['url'], 'http:'),
+ 'url': self._proto_relative_url(thumbnail_url, 'http:'),
'width': int_or_none(thumbnail.get('width')),
'height': int_or_none(thumbnail.get('height')),
})
@@ -43,9 +50,14 @@ class AMPIE(InfoExtractor):
if isinstance(media_subtitle, dict):
media_subtitle = [media_subtitle]
for subtitle_data in media_subtitle:
- subtitle = subtitle_data['@attributes']
- lang = subtitle.get('lang') or 'en'
- subtitles[lang] = [{'url': subtitle['href']}]
+ subtitle = subtitle_data.get('@attributes', {})
+ subtitle_href = subtitle.get('href')
+ if not subtitle_href:
+ continue
+ subtitles.setdefault(subtitle.get('lang') or 'en', []).append({
+ 'url': subtitle_href,
+ 'ext': mimetype2ext(subtitle.get('type')) or determine_ext(subtitle_href),
+ })
formats = []
media_content = get_media_node('content')
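
Both loops above switch from hard ['@attributes'] indexing to .get()
with explicit skips, so a malformed feed entry no longer raises
KeyError, and subtitle tracks now accumulate per language via
setdefault instead of the old assignment, which silently overwrote
earlier tracks for the same language. The accumulation pattern,
standalone:

    subtitles = {}
    for lang, href in [('en', 'a.srt'), ('en', 'b.vtt')]:
        subtitles.setdefault(lang, []).append({'url': href})
    print(subtitles)  # {'en': [{'url': 'a.srt'}, {'url': 'b.vtt'}]}
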
diff --git a/youtube_dl/extractor/anvato.py b/youtube_dl/extractor/anvato.py
index 623f44d..8023da7 100644
--- a/youtube_dl/extractor/anvato.py
+++ b/youtube_dl/extractor/anvato.py
@@ -5,6 +5,7 @@ import base64
import hashlib
import json
import random
+import re
import time
from .common import InfoExtractor
@@ -16,6 +17,7 @@ from ..utils import (
intlist_to_bytes,
int_or_none,
strip_jsonp,
+ unescapeHTML,
)
@@ -26,6 +28,8 @@ def md5_text(s):
class AnvatoIE(InfoExtractor):
+ _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)'
+
# Copied from anvplayer.min.js
_ANVACK_TABLE = {
'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ',
@@ -114,6 +118,22 @@ class AnvatoIE(InfoExtractor):
'nbcu_nbcd_desktop_web_prod_93d8ead38ce2024f8f544b78306fbd15895ae5e6_secure': 'NNemUkySjxLyPTKvZRiGntBIjEyK8uqicjMakIaQ'
}
+ _MCP_TO_ACCESS_KEY_TABLE = {
+ 'qa': 'anvato_mcpqa_demo_web_stage_18b55e00db5a13faa8d03ae6e41f6f5bcb15b922',
+ 'lin': 'anvato_mcp_lin_web_prod_4c36fbfd4d8d8ecae6488656e21ac6d1ac972749',
+ 'univison': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
+ 'uni': 'anvato_mcp_univision_web_prod_37fe34850c99a3b5cdb71dab10a417dd5cdecafa',
+ 'dev': 'anvato_mcp_fs2go_web_prod_c7b90a93e171469cdca00a931211a2f556370d0a',
+ 'sps': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
+ 'spsstg': 'anvato_mcp_sps_web_prod_54bdc90dd6ba21710e9f7074338365bba28da336',
+ 'anv': 'anvato_mcp_anv_web_prod_791407490f4c1ef2a4bcb21103e0cb1bcb3352b3',
+ 'gray': 'anvato_mcp_gray_web_prod_4c10f067c393ed8fc453d3930f8ab2b159973900',
+ 'hearst': 'anvato_mcp_hearst_web_prod_5356c3de0fc7c90a3727b4863ca7fec3a4524a99',
+ 'cbs': 'anvato_mcp_cbs_web_prod_02f26581ff80e5bda7aad28226a8d369037f2cbe',
+ 'telemundo': 'anvato_mcp_telemundo_web_prod_c5278d51ad46fda4b6ca3d0ea44a7846a054f582'
+ }
+
+ _ANVP_RE = r'<script[^>]+\bdata-anvp\s*=\s*(["\'])(?P<anvp>(?:(?!\1).)+)\1'
_AUTH_KEY = b'\x31\xc2\x42\x84\x9e\x73\xa0\xce'
def __init__(self, *args, **kwargs):
@@ -178,12 +198,7 @@ class AnvatoIE(InfoExtractor):
}
if ext == 'm3u8' or media_format in ('m3u8', 'm3u8-variant'):
- # Not using _extract_m3u8_formats here as individual media
- # playlists are also included in published_urls.
- if tbr is None:
- formats.append(self._m3u8_meta_format(video_url, ext='mp4', m3u8_id='hls'))
- continue
- else:
+ if tbr is not None:
a_format.update({
'format_id': '-'.join(filter(None, ['hls', compat_str(tbr)])),
'ext': 'mp4',
@@ -222,9 +237,42 @@ class AnvatoIE(InfoExtractor):
'subtitles': subtitles,
}
+ @staticmethod
+ def _extract_urls(ie, webpage, video_id):
+ entries = []
+ for mobj in re.finditer(AnvatoIE._ANVP_RE, webpage):
+ anvplayer_data = ie._parse_json(
+ mobj.group('anvp'), video_id, transform_source=unescapeHTML,
+ fatal=False)
+ if not anvplayer_data:
+ continue
+ video = anvplayer_data.get('video')
+ if not isinstance(video, compat_str) or not video.isdigit():
+ continue
+ access_key = anvplayer_data.get('accessKey')
+ if not access_key:
+ mcp = anvplayer_data.get('mcp')
+ if mcp:
+ access_key = AnvatoIE._MCP_TO_ACCESS_KEY_TABLE.get(
+ mcp.lower())
+ if not access_key:
+ continue
+ entries.append(ie.url_result(
+ 'anvato:%s:%s' % (access_key, video), ie=AnvatoIE.ie_key(),
+ video_id=video))
+ return entries
+
def _extract_anvato_videos(self, webpage, video_id):
- anvplayer_data = self._parse_json(self._html_search_regex(
- r'<script[^>]+data-anvp=\'([^\']+)\'', webpage,
- 'Anvato player data'), video_id)
+ anvplayer_data = self._parse_json(
+ self._html_search_regex(
+ self._ANVP_RE, webpage, 'Anvato player data', group='anvp'),
+ video_id)
return self._get_anvato_videos(
anvplayer_data['accessKey'], anvplayer_data['video'])
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ access_key, video_id = mobj.group('access_key_or_mcp', 'id')
+ if access_key not in self._ANVACK_TABLE:
+ access_key = self._MCP_TO_ACCESS_KEY_TABLE[access_key]
+ return self._get_anvato_videos(access_key, video_id)
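
The extractor becomes directly addressable through the new
anvato:<access_key_or_mcp>:<id> pseudo-URL, with short MCP aliases
resolved through _MCP_TO_ACCESS_KEY_TABLE. The dispatch, restated
standalone with a truncated illustrative table:

    import re

    _VALID_URL = r'anvato:(?P<access_key_or_mcp>[^:]+):(?P<id>\d+)'

    def resolve(url, anvack_table, mcp_table):
        access_key, video_id = re.match(_VALID_URL, url).group(
            'access_key_or_mcp', 'id')
        if access_key not in anvack_table:
            # Short alias such as 'cbs' or 'hearst' -> full access key.
            access_key = mcp_table[access_key]
        return access_key, video_id

    print(resolve('anvato:cbs:12345', {},
                  {'cbs': 'anvato_mcp_cbs_web_prod_02f2...'}))  # key truncated
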
diff --git a/youtube_dl/extractor/appleconnect.py b/youtube_dl/extractor/appleconnect.py
index ea7a703..a84b8b1 100644
--- a/youtube_dl/extractor/appleconnect.py
+++ b/youtube_dl/extractor/appleconnect.py
@@ -12,13 +12,13 @@ class AppleConnectIE(InfoExtractor):
_VALID_URL = r'https?://itunes\.apple\.com/\w{0,2}/?post/idsa\.(?P<id>[\w-]+)'
_TEST = {
'url': 'https://itunes.apple.com/us/post/idsa.4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
- 'md5': '10d0f2799111df4cb1c924520ca78f98',
+ 'md5': 'e7c38568a01ea45402570e6029206723',
'info_dict': {
'id': '4ab17a39-2720-11e5-96c5-a5b38f6c42d3',
'ext': 'm4v',
'title': 'Energy',
'uploader': 'Drake',
- 'thumbnail': 'http://is5.mzstatic.com/image/thumb/Video5/v4/78/61/c5/7861c5fa-ad6d-294b-1464-cf7605b911d6/source/1920x1080sr.jpg',
+ 'thumbnail': r're:^https?://.*\.jpg$',
'upload_date': '20150710',
'timestamp': 1436545535,
},
diff --git a/youtube_dl/extractor/appletrailers.py b/youtube_dl/extractor/appletrailers.py
index a6801f3..b45b431 100644
--- a/youtube_dl/extractor/appletrailers.py
+++ b/youtube_dl/extractor/appletrailers.py
@@ -70,7 +70,8 @@ class AppleTrailersIE(InfoExtractor):
}, {
'url': 'http://trailers.apple.com/trailers/magnolia/blackthorn/',
'info_dict': {
- 'id': 'blackthorn',
+ 'id': '4489',
+ 'title': 'Blackthorn',
},
'playlist_mincount': 2,
'expected_warnings': ['Unable to download JSON metadata'],
@@ -261,7 +262,7 @@ class AppleTrailersSectionIE(InfoExtractor):
'title': 'Most Popular',
'id': 'mostpopular',
},
- 'playlist_mincount': 80,
+ 'playlist_mincount': 30,
}, {
'url': 'http://trailers.apple.com/#section=moviestudios',
'info_dict': {
diff --git a/youtube_dl/extractor/archiveorg.py b/youtube_dl/extractor/archiveorg.py
index e21045b..3c7d725 100644
--- a/youtube_dl/extractor/archiveorg.py
+++ b/youtube_dl/extractor/archiveorg.py
@@ -24,12 +24,12 @@ class ArchiveOrgIE(InfoExtractor):
}
}, {
'url': 'https://archive.org/details/Cops1922',
- 'md5': 'bc73c8ab3838b5a8fc6c6651fa7b58ba',
+ 'md5': '0869000b4ce265e8ca62738b336b268a',
'info_dict': {
'id': 'Cops1922',
'ext': 'mp4',
'title': 'Buster Keaton\'s "Cops" (1922)',
- 'description': 'md5:b4544662605877edd99df22f9620d858',
+ 'description': 'md5:89e7c77bf5d965dd5c0372cfb49470f6',
}
}, {
'url': 'http://archive.org/embed/XD300-23_68HighlightsAResearchCntAugHumanIntellect',
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 69a23e8..56baef2 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -180,7 +180,7 @@ class ArteTVBaseIE(InfoExtractor):
class ArteTVPlus7IE(ArteTVBaseIE):
IE_NAME = 'arte.tv:+7'
- _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/[^/]+/(?P<lang>fr|de|en|es)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:(?:www|sites)\.)?arte\.tv/(?:[^/]+/)?(?P<lang>fr|de|en|es)/(?:videos/)?(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://www.arte.tv/guide/de/sendungen/XEN/xenius/?vid=055918-015_PLUS7-D',
@@ -188,6 +188,9 @@ class ArteTVPlus7IE(ArteTVBaseIE):
}, {
'url': 'http://sites.arte.tv/karambolage/de/video/karambolage-22',
'only_matching': True,
+ }, {
+ 'url': 'http://www.arte.tv/de/videos/048696-000-A/der-kluge-bauch-unser-zweites-gehirn',
+ 'only_matching': True,
}]
@classmethod
diff --git a/youtube_dl/extractor/atresplayer.py b/youtube_dl/extractor/atresplayer.py
index 99af6dc..01fa308 100644
--- a/youtube_dl/extractor/atresplayer.py
+++ b/youtube_dl/extractor/atresplayer.py
@@ -36,7 +36,7 @@ class AtresPlayerIE(InfoExtractor):
},
{
'url': 'http://www.atresplayer.com/television/especial/videoencuentros/temporada-1/capitulo-112-david-bustamante_2014121600375.html',
- 'md5': '0d0e918533bbd4b263f2de4d197d4aac',
+ 'md5': '6e52cbb513c405e403dbacb7aacf8747',
'info_dict': {
'id': 'capitulo-112-david-bustamante',
'ext': 'flv',
diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py
index 8fc5f65..e48bb89 100644
--- a/youtube_dl/extractor/audioboom.py
+++ b/youtube_dl/extractor/audioboom.py
@@ -16,7 +16,7 @@ class AudioBoomIE(InfoExtractor):
'title': '3/09/2016 Czaban Hour 3',
'description': 'Guest: Nate Davis - NFL free agency, Guest: Stan Gans',
'duration': 2245.72,
- 'uploader': 'Steve Czaban',
+ 'uploader': 'SB Nation A.M.',
'uploader_url': r're:https?://(?:www\.)?audioboom\.com/channel/steveczabanyahoosportsradio',
}
}, {
diff --git a/youtube_dl/extractor/azubu.py b/youtube_dl/extractor/azubu.py
deleted file mode 100644
index 3ba2f00..0000000
--- a/youtube_dl/extractor/azubu.py
+++ /dev/null
@@ -1,140 +0,0 @@
-from __future__ import unicode_literals
-
-import json
-
-from .common import InfoExtractor
-from ..utils import (
- ExtractorError,
- float_or_none,
- sanitized_Request,
-)
-
-
-class AzubuIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?azubu\.(?:tv|uol.com.br)/[^/]+#!/play/(?P<id>\d+)'
- _TESTS = [
- {
- 'url': 'http://www.azubu.tv/GSL#!/play/15575/2014-hot6-cup-last-big-match-ro8-day-1',
- 'md5': 'a88b42fcf844f29ad6035054bd9ecaf4',
- 'info_dict': {
- 'id': '15575',
- 'ext': 'mp4',
- 'title': '2014 HOT6 CUP LAST BIG MATCH Ro8 Day 1',
- 'description': 'md5:d06bdea27b8cc4388a90ad35b5c66c01',
- 'thumbnail': r're:^https?://.*\.jpe?g',
- 'timestamp': 1417523507.334,
- 'upload_date': '20141202',
- 'duration': 9988.7,
- 'uploader': 'GSL',
- 'uploader_id': 414310,
- 'view_count': int,
- },
- },
- {
- 'url': 'http://www.azubu.tv/FnaticTV#!/play/9344/-fnatic-at-worlds-2014:-toyz---%22i-love-rekkles,-he-has-amazing-mechanics%22-',
- 'md5': 'b72a871fe1d9f70bd7673769cdb3b925',
- 'info_dict': {
- 'id': '9344',
- 'ext': 'mp4',
- 'title': 'Fnatic at Worlds 2014: Toyz - "I love Rekkles, he has amazing mechanics"',
- 'description': 'md5:4a649737b5f6c8b5c5be543e88dc62af',
- 'thumbnail': r're:^https?://.*\.jpe?g',
- 'timestamp': 1410530893.320,
- 'upload_date': '20140912',
- 'duration': 172.385,
- 'uploader': 'FnaticTV',
- 'uploader_id': 272749,
- 'view_count': int,
- },
- 'skip': 'Channel offline',
- },
- ]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- data = self._download_json(
- 'http://www.azubu.tv/api/video/%s' % video_id, video_id)['data']
-
- title = data['title'].strip()
- description = data.get('description')
- thumbnail = data.get('thumbnail')
- view_count = data.get('view_count')
- user = data.get('user', {})
- uploader = user.get('username')
- uploader_id = user.get('id')
-
- stream_params = json.loads(data['stream_params'])
-
- timestamp = float_or_none(stream_params.get('creationDate'), 1000)
- duration = float_or_none(stream_params.get('length'), 1000)
-
- renditions = stream_params.get('renditions') or []
- video = stream_params.get('FLVFullLength') or stream_params.get('videoFullLength')
- if video:
- renditions.append(video)
-
- if not renditions and not user.get('channel', {}).get('is_live', True):
- raise ExtractorError('%s said: channel is offline.' % self.IE_NAME, expected=True)
-
- formats = [{
- 'url': fmt['url'],
- 'width': fmt['frameWidth'],
- 'height': fmt['frameHeight'],
- 'vbr': float_or_none(fmt['encodingRate'], 1000),
- 'filesize': fmt['size'],
- 'vcodec': fmt['videoCodec'],
- 'container': fmt['videoContainer'],
- } for fmt in renditions if fmt['url']]
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'thumbnail': thumbnail,
- 'timestamp': timestamp,
- 'duration': duration,
- 'uploader': uploader,
- 'uploader_id': uploader_id,
- 'view_count': view_count,
- 'formats': formats,
- }
-
-
-class AzubuLiveIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?azubu\.(?:tv|uol.com.br)/(?P<id>[^/]+)$'
-
- _TESTS = [{
- 'url': 'http://www.azubu.tv/MarsTVMDLen',
- 'only_matching': True,
- }, {
- 'url': 'http://azubu.uol.com.br/adolfz',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- user = self._match_id(url)
-
- info = self._download_json(
- 'http://api.azubu.tv/public/modules/last-video/{0}/info'.format(user),
- user)['data']
- if info['type'] != 'STREAM':
- raise ExtractorError('{0} is not streaming live'.format(user), expected=True)
-
- req = sanitized_Request(
- 'https://edge-elb.api.brightcove.com/playback/v1/accounts/3361910549001/videos/ref:' + info['reference_id'])
- req.add_header('Accept', 'application/json;pk=BCpkADawqM1gvI0oGWg8dxQHlgT8HkdE2LnAlWAZkOlznO39bSZX726u4JqnDsK3MDXcO01JxXK2tZtJbgQChxgaFzEVdHRjaDoxaOu8hHOO8NYhwdxw9BzvgkvLUlpbDNUuDoc4E4wxDToV')
- bc_info = self._download_json(req, user)
- m3u8_url = next(source['src'] for source in bc_info['sources'] if source['container'] == 'M2TS')
- formats = self._extract_m3u8_formats(m3u8_url, user, ext='mp4')
- self._sort_formats(formats)
-
- return {
- 'id': info['id'],
- 'title': self._live_title(info['title']),
- 'uploader_id': user,
- 'formats': formats,
- 'is_live': True,
- 'thumbnail': bc_info['poster'],
- }
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index 056e063..489d0ba 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -34,12 +34,12 @@ class BandcampIE(InfoExtractor):
'_skip': 'There is a limit of 200 free downloads / month for the test song'
}, {
'url': 'http://benprunty.bandcamp.com/track/lanius-battle',
- 'md5': '73d0b3171568232574e45652f8720b5c',
+ 'md5': '0369ace6b939f0927e62c67a1a8d9fa7',
'info_dict': {
'id': '2650410135',
- 'ext': 'mp3',
- 'title': 'Lanius (Battle)',
- 'uploader': 'Ben Prunty Music',
+ 'ext': 'aiff',
+ 'title': 'Ben Prunty - Lanius (Battle)',
+ 'uploader': 'Ben Prunty',
},
}]
@@ -47,6 +47,7 @@ class BandcampIE(InfoExtractor):
mobj = re.match(self._VALID_URL, url)
title = mobj.group('title')
webpage = self._download_webpage(url, title)
+ thumbnail = self._html_search_meta('og:image', webpage, default=None)
m_download = re.search(r'freeDownloadPage: "(.*?)"', webpage)
if not m_download:
m_trackinfo = re.search(r'trackinfo: (.+),\s*?\n', webpage)
@@ -75,6 +76,7 @@ class BandcampIE(InfoExtractor):
return {
'id': track_id,
'title': data['title'],
+ 'thumbnail': thumbnail,
'formats': formats,
'duration': float_or_none(data.get('duration')),
}
@@ -143,7 +145,7 @@ class BandcampIE(InfoExtractor):
return {
'id': video_id,
'title': title,
- 'thumbnail': info.get('thumb_url'),
+ 'thumbnail': info.get('thumb_url') or thumbnail,
'uploader': info.get('artist'),
'artist': artist,
'track': track,
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index 8a2ed0a..dd65b8d 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -361,7 +361,7 @@ class BBCCoUkIE(InfoExtractor):
fmt.update({
'width': width,
'height': height,
- 'vbr': bitrate,
+ 'tbr': bitrate,
'vcodec': encoding,
})
else:
@@ -370,7 +370,7 @@ class BBCCoUkIE(InfoExtractor):
'acodec': encoding,
'vcodec': 'none',
})
- if protocol == 'http':
+ if protocol in ('http', 'https'):
# Direct link
fmt.update({
'url': href,
@@ -389,6 +389,8 @@ class BBCCoUkIE(InfoExtractor):
'rtmp_live': False,
'ext': 'flv',
})
+ else:
+ continue
formats.append(fmt)
elif kind == 'captions':
subtitles = self.extract_subtitles(media, programme_id)
@@ -407,7 +409,7 @@ class BBCCoUkIE(InfoExtractor):
description = smp_config['summary']
for item in smp_config['items']:
kind = item['kind']
- if kind != 'programme' and kind != 'radioProgramme':
+ if kind not in ('programme', 'radioProgramme'):
continue
programme_id = item.get('vpid')
duration = int_or_none(item.get('duration'))
@@ -448,7 +450,7 @@ class BBCCoUkIE(InfoExtractor):
for item in self._extract_items(playlist):
kind = item.get('kind')
- if kind != 'programme' and kind != 'radioProgramme':
+ if kind not in ('programme', 'radioProgramme'):
continue
title = playlist.find('./{%s}title' % self._EMP_PLAYLIST_NS).text
description_el = playlist.find('./{%s}summary' % self._EMP_PLAYLIST_NS)
diff --git a/youtube_dl/extractor/beeg.py b/youtube_dl/extractor/beeg.py
index b0b7914..d5c5822 100644
--- a/youtube_dl/extractor/beeg.py
+++ b/youtube_dl/extractor/beeg.py
@@ -16,7 +16,7 @@ class BeegIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?beeg\.com/(?P<id>\d+)'
_TEST = {
'url': 'http://beeg.com/5416503',
- 'md5': '46c384def73b33dbc581262e5ee67cef',
+ 'md5': 'a1a1b1a8bc70a89e49ccfd113aed0820',
'info_dict': {
'id': '5416503',
'ext': 'mp4',
diff --git a/youtube_dl/extractor/bilibili.py b/youtube_dl/extractor/bilibili.py
index 80dd838..1e3f255 100644
--- a/youtube_dl/extractor/bilibili.py
+++ b/youtube_dl/extractor/bilibili.py
@@ -122,6 +122,11 @@ class BiliBiliIE(InfoExtractor):
'preference': -2 if 'hd.mp4' in backup_url else -3,
})
+ for a_format in formats:
+ a_format.setdefault('http_headers', {}).update({
+ 'Referer': url,
+ })
+
self._sort_formats(formats)
entries.append({
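
The loop above attaches a Referer header, pointing back at the page
URL, to every extracted format, presumably because Bilibili's CDN
checks it; setdefault preserves any headers a format already carries.
Standalone, with illustrative URLs:

    url = 'https://www.bilibili.com/video/av1074402/'
    formats = [{'url': 'https://example.com/seg-1.flv'},
               {'url': 'https://example.com/seg-2.flv',
                'http_headers': {'User-Agent': 'UA'}}]
    for a_format in formats:
        a_format.setdefault('http_headers', {}).update({'Referer': url})
    # Every format now carries Referer; the second keeps its User-Agent.
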
diff --git a/youtube_dl/extractor/bleacherreport.py b/youtube_dl/extractor/bleacherreport.py
index 7a8e1f6..e829974 100644
--- a/youtube_dl/extractor/bleacherreport.py
+++ b/youtube_dl/extractor/bleacherreport.py
@@ -35,7 +35,7 @@ class BleacherReportIE(InfoExtractor):
'title': 'Aussie Golfers Get Fright of Their Lives After Being Chased by Angry Kangaroo',
'timestamp': 1446839961,
'uploader': 'Sean Fay',
- 'description': 'md5:825e94e0f3521df52fa83b2ed198fa20',
+ 'description': 'md5:b1601e2314c4d8eec23b6eafe086a757',
'uploader_id': 6466954,
'upload_date': '20151011',
},
@@ -90,17 +90,13 @@ class BleacherReportCMSIE(AMPIE):
_VALID_URL = r'https?://(?:www\.)?bleacherreport\.com/video_embed\?id=(?P<id>[0-9a-f-]{36})'
_TESTS = [{
'url': 'http://bleacherreport.com/video_embed?id=8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
- 'md5': '8c2c12e3af7805152675446c905d159b',
+ 'md5': '2e4b0a997f9228ffa31fada5c53d1ed1',
'info_dict': {
'id': '8fd44c2f-3dc5-4821-9118-2c825a98c0e1',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Cena vs. Rollins Would Expose the Heavyweight Division',
'description': 'md5:984afb4ade2f9c0db35f3267ed88b36e',
},
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/br.py b/youtube_dl/extractor/br.py
index ff0aa11..2c32b6a 100644
--- a/youtube_dl/extractor/br.py
+++ b/youtube_dl/extractor/br.py
@@ -77,7 +77,7 @@ class BRIE(InfoExtractor):
'description': 'md5:bb659990e9e59905c3d41e369db1fbe3',
'duration': 893,
'uploader': 'Eva Maria Steimle',
- 'upload_date': '20140117',
+ 'upload_date': '20170208',
}
},
]
diff --git a/youtube_dl/extractor/brightcove.py b/youtube_dl/extractor/brightcove.py
index 46ef8e6..0ed59bc 100644
--- a/youtube_dl/extractor/brightcove.py
+++ b/youtube_dl/extractor/brightcove.py
@@ -5,6 +5,7 @@ import re
import json
from .common import InfoExtractor
+from .adobepass import AdobePassIE
from ..compat import (
compat_etree_fromstring,
compat_parse_qs,
@@ -17,6 +18,7 @@ from ..compat import (
from ..utils import (
determine_ext,
ExtractorError,
+ extract_attributes,
find_xpath_attr,
fix_xml_ampersands,
float_or_none,
@@ -109,6 +111,7 @@ class BrightcoveLegacyIE(InfoExtractor):
'upload_date': '20140827',
'uploader_id': '710858724001',
},
+ 'skip': 'Video gone',
},
{
# playlist with 'videoList'
@@ -129,6 +132,12 @@ class BrightcoveLegacyIE(InfoExtractor):
},
'playlist_mincount': 10,
},
+ {
+ # playerID inferred from bcpid
+ # from http://www.un.org/chinese/News/story.asp?NewsID=27724
+ 'url': 'https://link.brightcove.com/services/player/bcpid1722935254001/?bctid=5360463607001&autoStart=false&secureConnections=true&width=650&height=350',
+ 'only_matching': True, # Tested in GenericIE
+ }
]
FLV_VCODECS = {
1: 'SORENSON',
@@ -264,9 +273,13 @@ class BrightcoveLegacyIE(InfoExtractor):
if matches:
return list(filter(None, [cls._build_brighcove_url(m) for m in matches]))
- return list(filter(None, [
- cls._build_brighcove_url_from_js(custom_bc)
- for custom_bc in re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)]))
+ matches = re.findall(r'(customBC\.createVideo\(.+?\);)', webpage)
+ if matches:
+ return list(filter(None, [
+ cls._build_brighcove_url_from_js(custom_bc)
+ for custom_bc in matches]))
+ return [src for _, src in re.findall(
+ r'<iframe[^>]+src=([\'"])((?:https?:)?//link\.brightcove\.com/services/player/(?!\1).+)\1', webpage)]
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
@@ -283,6 +296,10 @@ class BrightcoveLegacyIE(InfoExtractor):
if videoPlayer:
# We set the original url as the default 'Referer' header
referer = smuggled_data.get('Referer', url)
+ if 'playerID' not in query:
+ mobj = re.search(r'/bcpid(\d+)', url)
+ if mobj is not None:
+ query['playerID'] = [mobj.group(1)]
return self._get_video_info(
videoPlayer[0], query, referer=referer)
elif 'playerKey' in query:
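
A sketch of the playerID fallback introduced above: when the query string carries no playerID, the id is recovered from the /bcpid<digits> path segment. The URL comes from the new test case:

    import re

    def infer_player_id(url):
        # Mirror of the fallback: take the digits after '/bcpid'
        mobj = re.search(r'/bcpid(\d+)', url)
        return mobj.group(1) if mobj else None

    print(infer_player_id(
        'https://link.brightcove.com/services/player/bcpid1722935254001/'
        '?bctid=5360463607001'))
    # -> '1722935254001'
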
@@ -432,7 +449,7 @@ class BrightcoveLegacyIE(InfoExtractor):
return info
-class BrightcoveNewIE(InfoExtractor):
+class BrightcoveNewIE(AdobePassIE):
IE_NAME = 'brightcove:new'
_VALID_URL = r'https?://players\.brightcove\.net/(?P<account_id>\d+)/(?P<player_id>[^/]+)_(?P<embed>[^/]+)/index\.html\?.*videoId=(?P<video_id>\d+|ref:[^&]+)'
_TESTS = [{
@@ -482,17 +499,18 @@ class BrightcoveNewIE(InfoExtractor):
}]
@staticmethod
- def _extract_url(webpage):
- urls = BrightcoveNewIE._extract_urls(webpage)
+ def _extract_url(ie, webpage):
+ urls = BrightcoveNewIE._extract_urls(ie, webpage)
return urls[0] if urls else None
@staticmethod
- def _extract_urls(webpage):
+ def _extract_urls(ie, webpage):
# Reference:
# 1. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideoiniframe
- # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript
- # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/embed-in-page.html
- # 4. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player
+ # 2. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#tag
+ # 3. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/publish-video.html#setvideousingjavascript
+ # 4. http://docs.brightcove.com/en/video-cloud/brightcove-player/guides/in-page-embed-player-implementation.html
+ # 5. https://support.brightcove.com/en/video-cloud/docs/dynamically-assigning-videos-player
entries = []
@@ -501,22 +519,48 @@ class BrightcoveNewIE(InfoExtractor):
r'<iframe[^>]+src=(["\'])((?:https?:)?//players\.brightcove\.net/\d+/[^/]+/index\.html.+?)\1', webpage):
entries.append(url if url.startswith('http') else 'http:' + url)
- # Look for embed_in_page embeds [2]
- for video_id, account_id, player_id, embed in re.findall(
- # According to examples from [3] it's unclear whether video id
- # may be optional and what to do when it is
- # According to [4] data-video-id may be prefixed with ref:
- r'''(?sx)
- <video[^>]+
- data-video-id=["\'](\d+|ref:[^"\']+)["\'][^>]*>.*?
- </video>.*?
- <script[^>]+
- src=["\'](?:https?:)?//players\.brightcove\.net/
- (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js
+ # Look for <video> tags [2] and embed_in_page embeds [3]
+ # [2] looks like a bare <video data-video-id=...> tag (matched below):
+ for video, script_tag, account_id, player_id, embed in re.findall(
+ r'''(?isx)
+ (<video\s+[^>]*\bdata-video-id\s*=\s*['"]?[^>]+>)
+ (?:.*?
+ (<script[^>]+
+ src=["\'](?:https?:)?//players\.brightcove\.net/
+ (\d+)/([^/]+)_([^/]+)/index(?:\.min)?\.js
+ )
+ )?
''', webpage):
- entries.append(
- 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
- % (account_id, player_id, embed, video_id))
+ attrs = extract_attributes(video)
+
+ # According to the examples from [4] it is unclear whether the
+ # video id may be omitted and what to do when it is
+ video_id = attrs.get('data-video-id')
+ if not video_id:
+ continue
+
+ account_id = account_id or attrs.get('data-account')
+ if not account_id:
+ continue
+
+ player_id = player_id or attrs.get('data-player') or 'default'
+ embed = embed or attrs.get('data-embed') or 'default'
+
+ bc_url = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (
+ account_id, player_id, embed, video_id)
+
+ # Some Brightcove videos may be embedded with a video tag only,
+ # without a script tag or any mention of Brightcove at all. Such
+ # embeds are considered ambiguous since they are matched based only
+ # on data-video-id and data-account attributes and in the wild may
+ # not be Brightcove embeds at all. For such embeds we check the
+ # reconstructed Brightcove URL and only process valid ones,
+ # thus ensuring there is indeed a Brightcove embed.
+ if not script_tag and not ie._is_valid_url(
+ bc_url, video_id, 'possible brightcove video'):
+ continue
+
+ entries.append(bc_url)
return entries
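
A sketch of how the reworked matcher rebuilds a Brightcove URL from a bare <video> tag using extract_attributes from youtube_dl.utils; the tag below is made up for illustration:

    from youtube_dl.utils import extract_attributes

    video_tag = ('<video data-video-id="ref:some-video" '
                 'data-account="1234567890001" data-player="HkAzSmB0l"></video>')
    attrs = extract_attributes(video_tag)
    bc_url = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (
        attrs['data-account'],
        attrs.get('data-player') or 'default',
        attrs.get('data-embed') or 'default',  # no data-embed -> 'default'
        attrs['data-video-id'])
    print(bc_url)
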
@@ -559,6 +603,20 @@ class BrightcoveNewIE(InfoExtractor):
raise ExtractorError(message, expected=True)
raise
+ errors = json_data.get('errors')
+ if errors and errors[0].get('error_subcode') == 'TVE_AUTH':
+ custom_fields = json_data['custom_fields']
+ tve_token = self._extract_mvpd_auth(
+ smuggled_data['source_url'], video_id,
+ custom_fields['bcadobepassrequestorid'],
+ custom_fields['bcadobepassresourceid'])
+ json_data = self._download_json(
+ api_url, video_id, headers={
+ 'Accept': 'application/json;pk=%s' % policy_key
+ }, query={
+ 'tveToken': tve_token,
+ })
+
title = json_data['name'].strip()
formats = []
@@ -624,7 +682,6 @@ class BrightcoveNewIE(InfoExtractor):
})
formats.append(f)
- errors = json_data.get('errors')
if not formats and errors:
error = errors[0]
raise ExtractorError(
@@ -641,7 +698,7 @@ class BrightcoveNewIE(InfoExtractor):
is_live = False
duration = float_or_none(json_data.get('duration'), 1000)
- if duration and duration < 0:
+ if duration is not None and duration <= 0:
is_live = True
return {
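
The final hunk widens the live heuristic: a zero or negative duration now marks a live stream, while a missing duration still means VOD. A sketch using the same float_or_none helper:

    from youtube_dl.utils import float_or_none

    def looks_live(raw_duration_ms):
        duration = float_or_none(raw_duration_ms, 1000)
        return duration is not None and duration <= 0

    assert looks_live(0) and looks_live(-1000)              # live
    assert not looks_live(None) and not looks_live(74000)   # unknown / VOD
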
diff --git a/youtube_dl/extractor/canalc2.py b/youtube_dl/extractor/canalc2.py
index f1f128c..acd87e3 100644
--- a/youtube_dl/extractor/canalc2.py
+++ b/youtube_dl/extractor/canalc2.py
@@ -16,13 +16,10 @@ class Canalc2IE(InfoExtractor):
'md5': '060158428b650f896c542dfbb3d6487f',
'info_dict': {
'id': '12163',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Terrasses du Numérique',
'duration': 122,
},
- 'params': {
- 'skip_download': True, # Requires rtmpdump
- }
}, {
'url': 'http://archives-canalc2.u-strasbg.fr/video.asp?idVideo=11427&voir=oui',
'only_matching': True,
diff --git a/youtube_dl/extractor/canalplus.py b/youtube_dl/extractor/canalplus.py
index 4b9fa2d..d8bf073 100644
--- a/youtube_dl/extractor/canalplus.py
+++ b/youtube_dl/extractor/canalplus.py
@@ -7,8 +7,8 @@ from .common import InfoExtractor
from ..compat import compat_urllib_parse_urlparse
from ..utils import (
dict_get,
- ExtractorError,
- HEADRequest,
+ # ExtractorError,
+ # HEADRequest,
int_or_none,
qualities,
remove_end,
@@ -45,6 +45,9 @@ class CanalplusIE(InfoExtractor):
'itele': 'itele',
}
+ # Only works for direct mp4 URLs
+ _GEO_COUNTRIES = ['FR']
+
_TESTS = [{
'url': 'http://www.canalplus.fr/c-emissions/pid1830-c-zapping.html?vid=1192814',
'info_dict': {
@@ -56,6 +59,7 @@ class CanalplusIE(InfoExtractor):
'upload_date': '20160702',
},
}, {
+ # geo restricted, bypassed
'url': 'http://www.piwiplus.fr/videos-piwi/pid1405-le-labyrinthe-boing-super-ranger.html?vid=1108190',
'info_dict': {
'id': '1108190',
@@ -65,19 +69,20 @@ class CanalplusIE(InfoExtractor):
'description': 'md5:4cea7a37153be42c1ba2c1d3064376ff',
'upload_date': '20140724',
},
- 'skip': 'Only works from France',
+ 'expected_warnings': ['HTTP Error 403: Forbidden'],
}, {
- 'url': 'http://www.c8.fr/c8-divertissement/ms-touche-pas-a-mon-poste/pid6318-videos-integrales.html',
- 'md5': '4b47b12b4ee43002626b97fad8fb1de5',
+ # geo restricted, bypassed
+ 'url': 'http://www.c8.fr/c8-divertissement/ms-touche-pas-a-mon-poste/pid6318-videos-integrales.html?vid=1443684',
+ 'md5': 'bb6f9f343296ab7ebd88c97b660ecf8d',
'info_dict': {
- 'id': '1420213',
+ 'id': '1443684',
'display_id': 'pid6318-videos-integrales',
'ext': 'mp4',
- 'title': 'TPMP ! Même le matin - Les 35H de Baba - 14/10/2016',
- 'description': 'md5:f96736c1b0ffaa96fd5b9e60ad871799',
- 'upload_date': '20161014',
+ 'title': 'Guess my iep ! - TPMP - 07/04/2017',
+ 'description': 'md5:6f005933f6e06760a9236d9b3b5f17fa',
+ 'upload_date': '20170407',
},
- 'skip': 'Only works from France',
+ 'expected_warnings': ['HTTP Error 403: Forbidden'],
}, {
'url': 'http://www.itele.fr/chroniques/invite-michael-darmon/rachida-dati-nicolas-sarkozy-est-le-plus-en-phase-avec-les-inquietudes-des-francais-171510',
'info_dict': {
@@ -134,15 +139,15 @@ class CanalplusIE(InfoExtractor):
preference = qualities(['MOBILE', 'BAS_DEBIT', 'HAUT_DEBIT', 'HD'])
- fmt_url = next(iter(media.get('VIDEOS')))
- if '/geo' in fmt_url.lower():
- response = self._request_webpage(
- HEADRequest(fmt_url), video_id,
- 'Checking if the video is georestricted')
- if '/blocage' in response.geturl():
- raise ExtractorError(
- 'The video is not available in your country',
- expected=True)
+ # _, fmt_url = next(iter(media['VIDEOS'].items()))
+ # if '/geo' in fmt_url.lower():
+ # response = self._request_webpage(
+ # HEADRequest(fmt_url), video_id,
+ # 'Checking if the video is georestricted')
+ # if '/blocage' in response.geturl():
+ # raise ExtractorError(
+ # 'The video is not available in your country',
+ # expected=True)
formats = []
for format_id, format_url in media['VIDEOS'].items():
diff --git a/youtube_dl/extractor/canvas.py b/youtube_dl/extractor/canvas.py
index 544c665..aada029 100644
--- a/youtube_dl/extractor/canvas.py
+++ b/youtube_dl/extractor/canvas.py
@@ -7,6 +7,7 @@ from ..utils import float_or_none
class CanvasIE(InfoExtractor):
+ IE_DESC = 'canvas.be and een.be'
_VALID_URL = r'https?://(?:www\.)?(?P<site_id>canvas|een)\.be/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://www.canvas.be/video/de-afspraak/najaar-2015/de-afspraak-veilt-voor-de-warmste-week',
diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py
index cf678e7..87ad14e 100644
--- a/youtube_dl/extractor/cbc.py
+++ b/youtube_dl/extractor/cbc.py
@@ -96,6 +96,7 @@ class CBCIE(InfoExtractor):
'info_dict': {
'title': 'Keep Rover active during the deep freeze with doggie pushups and other fun indoor tasks',
'id': 'dog-indoor-exercise-winter-1.3928238',
+ 'description': 'md5:c18552e41726ee95bd75210d1ca9194c',
},
'playlist_mincount': 6,
}]
@@ -165,12 +166,11 @@ class CBCPlayerIE(InfoExtractor):
'uploader': 'CBCC-NEW',
},
}, {
- # available only when we add `formats=MPEG4,FLV,MP3` to theplatform url
'url': 'http://www.cbc.ca/player/play/2164402062',
- 'md5': '17a61eb813539abea40618d6323a7f82',
+ 'md5': '33fcd8f6719b9dd60a5e73adcb83b9f6',
'info_dict': {
'id': '2164402062',
- 'ext': 'flv',
+ 'ext': 'mp4',
'title': 'Cancer survivor four times over',
'description': 'Tim Mayer has beaten three different forms of cancer four times in five years.',
'timestamp': 1320410746,
diff --git a/youtube_dl/extractor/cbslocal.py b/youtube_dl/extractor/cbslocal.py
index 8d5f11d..7d78e3a 100644
--- a/youtube_dl/extractor/cbslocal.py
+++ b/youtube_dl/extractor/cbslocal.py
@@ -60,8 +60,8 @@ class CBSLocalIE(AnvatoIE):
'title': 'A Very Blue Anniversary',
'description': 'CBS2’s Cindy Hsu has more.',
'thumbnail': 're:^https?://.*',
- 'timestamp': 1479962220,
- 'upload_date': '20161124',
+ 'timestamp': int,
+ 'upload_date': r're:^\d{8}$',
'uploader': 'CBS',
'subtitles': {
'en': 'mincount:5',
diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py
index 1ee35b5..78b7a92 100755
--- a/youtube_dl/extractor/cda.py
+++ b/youtube_dl/extractor/cda.py
@@ -9,7 +9,10 @@ from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
+ multipart_encode,
parse_duration,
+ random_birthday,
+ urljoin,
)
@@ -27,7 +30,8 @@ class CDAIE(InfoExtractor):
'description': 'md5:269ccd135d550da90d1662651fcb9772',
'thumbnail': r're:^https?://.*\.jpg$',
'average_rating': float,
- 'duration': 39
+ 'duration': 39,
+ 'age_limit': 0,
}
}, {
'url': 'http://www.cda.pl/video/57413289',
@@ -41,13 +45,41 @@ class CDAIE(InfoExtractor):
'uploader': 'crash404',
'view_count': int,
'average_rating': float,
- 'duration': 137
+ 'duration': 137,
+ 'age_limit': 0,
}
}, {
+ # Age-restricted
+ 'url': 'http://www.cda.pl/video/1273454c4',
+ 'info_dict': {
+ 'id': '1273454c4',
+ 'ext': 'mp4',
+ 'title': 'Bronson (2008) napisy HD 1080p',
+ 'description': 'md5:1b6cb18508daf2dc4e0fa4db77fec24c',
+ 'height': 1080,
+ 'uploader': 'boniek61',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 5554,
+ 'age_limit': 18,
+ 'view_count': int,
+ 'average_rating': float,
+ },
+ }, {
'url': 'http://ebd.cda.pl/0x0/5749950c',
'only_matching': True,
}]
+ def _download_age_confirm_page(self, url, video_id, *args, **kwargs):
+ form_data = random_birthday('rok', 'miesiac', 'dzien')
+ form_data.update({'return': url, 'module': 'video', 'module_id': video_id})
+ data, content_type = multipart_encode(form_data)
+ return self._download_webpage(
+ urljoin(url, '/a/validatebirth'), video_id, *args,
+ data=data, headers={
+ 'Referer': url,
+ 'Content-Type': content_type,
+ }, **kwargs)
+
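
A sketch of the request body _download_age_confirm_page builds; random_birthday, multipart_encode and urljoin are the youtube_dl.utils helpers imported above, and rok/miesiac/dzien are the Polish form fields for year/month/day:

    from youtube_dl.utils import multipart_encode, random_birthday, urljoin

    page_url = 'http://www.cda.pl/video/1273454c4'
    form_data = random_birthday('rok', 'miesiac', 'dzien')
    form_data.update({'return': page_url, 'module': 'video',
                      'module_id': '1273454c4'})
    data, content_type = multipart_encode(form_data)
    # POST target is resolved against the page URL:
    print(urljoin(page_url, '/a/validatebirth'))
    # -> http://www.cda.pl/a/validatebirth
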
def _real_extract(self, url):
video_id = self._match_id(url)
self._set_cookie('cda.pl', 'cda.player', 'html5')
@@ -57,6 +89,13 @@ class CDAIE(InfoExtractor):
if 'Ten film jest dostępny dla użytkowników premium' in webpage:
raise ExtractorError('This video is only available for premium users.', expected=True)
+ need_confirm_age = False
+ if self._html_search_regex(r'(<form[^>]+action="/a/validatebirth")',
+ webpage, 'birthday validate form', default=None):
+ webpage = self._download_age_confirm_page(
+ url, video_id, note='Confirming age')
+ need_confirm_age = True
+
formats = []
uploader = self._search_regex(r'''(?x)
@@ -81,6 +120,7 @@ class CDAIE(InfoExtractor):
'thumbnail': self._og_search_thumbnail(webpage),
'formats': formats,
'duration': None,
+ 'age_limit': 18 if need_confirm_age else 0,
}
def extract_format(page, version):
@@ -121,7 +161,12 @@ class CDAIE(InfoExtractor):
for href, resolution in re.findall(
r'<a[^>]+data-quality="[^"]+"[^>]+href="([^"]+)"[^>]+class="quality-btn"[^>]*>([0-9]+p)',
webpage):
- webpage = self._download_webpage(
+ if need_confirm_age:
+ handler = self._download_age_confirm_page
+ else:
+ handler = self._download_webpage
+
+ webpage = handler(
self._BASE_URL + href, video_id,
'Downloading %s version information' % resolution, fatal=False)
if not webpage:
@@ -129,6 +174,7 @@ class CDAIE(InfoExtractor):
# invalid version is requested.
self.report_warning('Unable to download %s version information' % resolution)
continue
+
extract_format(webpage, resolution)
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/ceskatelevize.py b/youtube_dl/extractor/ceskatelevize.py
index dd2529a..e250de1 100644
--- a/youtube_dl/extractor/ceskatelevize.py
+++ b/youtube_dl/extractor/ceskatelevize.py
@@ -12,13 +12,14 @@ from ..utils import (
ExtractorError,
float_or_none,
sanitized_Request,
+ unescapeHTML,
urlencode_postdata,
USER_AGENTS,
)
class CeskaTelevizeIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/(porady|ivysilani)/(?:[^/]+/)*(?P<id>[^/#?]+)/*(?:[#?].*)?$'
+ _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/ivysilani/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
_TESTS = [{
'url': 'http://www.ceskatelevize.cz/ivysilani/ivysilani/10441294653-hyde-park-civilizace/214411058091220',
'info_dict': {
@@ -62,40 +63,12 @@ class CeskaTelevizeIE(InfoExtractor):
},
'skip': 'Georestricted to Czech Republic',
}, {
- # video with 18+ caution trailer
- 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
- 'info_dict': {
- 'id': '215562210900007-bogotart',
- 'title': 'Queer: Bogotart',
- 'description': 'Alternativní průvodce současným queer světem',
- },
- 'playlist': [{
- 'info_dict': {
- 'id': '61924494876844842',
- 'ext': 'mp4',
- 'title': 'Queer: Bogotart (Varování 18+)',
- 'duration': 10.2,
- },
- }, {
- 'info_dict': {
- 'id': '61924494877068022',
- 'ext': 'mp4',
- 'title': 'Queer: Bogotart (Queer)',
- 'thumbnail': r're:^https?://.*\.jpg',
- 'duration': 1558.3,
- },
- }],
- 'params': {
- # m3u8 download
- 'skip_download': True,
- },
+ 'url': 'http://www.ceskatelevize.cz/ivysilani/embed/iFramePlayer.php?hash=d6a3e1370d2e4fa76296b90bad4dfc19673b641e&IDEC=217 562 22150/0004&channelID=1&width=100%25',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- url = url.replace('/porady/', '/ivysilani/').replace('/video/', '')
-
- mobj = re.match(self._VALID_URL, url)
- playlist_id = mobj.group('id')
+ playlist_id = self._match_id(url)
webpage = self._download_webpage(url, playlist_id)
@@ -103,13 +76,28 @@ class CeskaTelevizeIE(InfoExtractor):
if '%s</p>' % NOT_AVAILABLE_STRING in webpage:
raise ExtractorError(NOT_AVAILABLE_STRING, expected=True)
- typ = self._html_search_regex(
- r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],', webpage, 'type')
- episode_id = self._html_search_regex(
- r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],', webpage, 'episode_id')
+ type_ = None
+ episode_id = None
+
+ playlist = self._parse_json(
+ self._search_regex(
+ r'getPlaylistUrl\(\[({.+?})\]', webpage, 'playlist',
+ default='{}'), playlist_id)
+ if playlist:
+ type_ = playlist.get('type')
+ episode_id = playlist.get('id')
+
+ if not type_:
+ type_ = self._html_search_regex(
+ r'getPlaylistUrl\(\[\{"type":"(.+?)","id":".+?"\}\],',
+ webpage, 'type')
+ if not episode_id:
+ episode_id = self._html_search_regex(
+ r'getPlaylistUrl\(\[\{"type":".+?","id":"(.+?)"\}\],',
+ webpage, 'episode_id')
data = {
- 'playlist[0][type]': typ,
+ 'playlist[0][type]': type_,
'playlist[0][id]': episode_id,
'requestUrl': compat_urllib_parse_urlparse(url).path,
'requestSource': 'iVysilani',
@@ -245,3 +233,47 @@ class CeskaTelevizeIE(InfoExtractor):
yield line
return '\r\n'.join(_fix_subtitle(subtitles))
+
+
+class CeskaTelevizePoradyIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?ceskatelevize\.cz/porady/(?:[^/?#&]+/)*(?P<id>[^/#?]+)'
+ _TESTS = [{
+ # video with 18+ caution trailer
+ 'url': 'http://www.ceskatelevize.cz/porady/10520528904-queer/215562210900007-bogotart/',
+ 'info_dict': {
+ 'id': '215562210900007-bogotart',
+ 'title': 'Queer: Bogotart',
+ 'description': 'Alternativní průvodce současným queer světem',
+ },
+ 'playlist': [{
+ 'info_dict': {
+ 'id': '61924494876844842',
+ 'ext': 'mp4',
+ 'title': 'Queer: Bogotart (Varování 18+)',
+ 'duration': 10.2,
+ },
+ }, {
+ 'info_dict': {
+ 'id': '61924494877068022',
+ 'ext': 'mp4',
+ 'title': 'Queer: Bogotart (Queer)',
+ 'thumbnail': r're:^https?://.*\.jpg',
+ 'duration': 1558.3,
+ },
+ }],
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ data_url = unescapeHTML(self._search_regex(
+ r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
+ webpage, 'iframe player url', group='url'))
+
+ return self.url_result(data_url, ie=CeskaTelevizeIE.ie_key())
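
A sketch of the data-url extraction on a hand-made snippet; the attribute value is HTML-escaped in the page source, hence unescapeHTML:

    import re
    from youtube_dl.utils import unescapeHTML

    webpage = ('<span class="player" data-url="/ivysilani/embed/'
               'iFramePlayer.php?hash=abc&amp;IDEC=215%20562%2021000"></span>')
    data_url = unescapeHTML(re.search(
        r'<span[^>]*\bdata-url=(["\'])(?P<url>(?:(?!\1).)+)\1',
        webpage).group('url'))
    print(data_url)
    # -> /ivysilani/embed/iFramePlayer.php?hash=abc&IDEC=215%20562%2021000
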
diff --git a/youtube_dl/extractor/chaturbate.py b/youtube_dl/extractor/chaturbate.py
index 8fbc91c..e3eba4b 100644
--- a/youtube_dl/extractor/chaturbate.py
+++ b/youtube_dl/extractor/chaturbate.py
@@ -33,10 +33,17 @@ class ChaturbateIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- m3u8_formats = [(m.group('id').lower(), m.group('url')) for m in re.finditer(
- r'hlsSource(?P<id>.+?)\s*=\s*(?P<q>["\'])(?P<url>http.+?)(?P=q)', webpage)]
+ m3u8_urls = []
- if not m3u8_formats:
+ for m in re.finditer(
+ r'(["\'])(?P<url>http.+?\.m3u8.*?)\1', webpage):
+ m3u8_fast_url, m3u8_no_fast_url = m.group('url'), m.group(
+ 'url').replace('_fast', '')
+ for m3u8_url in (m3u8_fast_url, m3u8_no_fast_url):
+ if m3u8_url not in m3u8_urls:
+ m3u8_urls.append(m3u8_url)
+
+ if not m3u8_urls:
error = self._search_regex(
[r'<span[^>]+class=(["\'])desc_span\1[^>]*>(?P<error>[^<]+)</span>',
r'<div[^>]+id=(["\'])defchat\1[^>]*>\s*<p><strong>(?P<error>[^<]+)<'],
@@ -50,7 +57,8 @@ class ChaturbateIE(InfoExtractor):
raise ExtractorError('Unable to find stream URL')
formats = []
- for m3u8_id, m3u8_url in m3u8_formats:
+ for m3u8_url in m3u8_urls:
+ m3u8_id = 'fast' if '_fast' in m3u8_url else 'slow'
formats.extend(self._extract_m3u8_formats(
m3u8_url, video_id, ext='mp4',
# ffmpeg skips segments for fast m3u8
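
A sketch of the fast/slow pairing above: each m3u8 URL found in the page is also tried without the '_fast' infix, with duplicates dropped (the sample URL is hypothetical):

    def expand_m3u8_urls(found_urls):
        m3u8_urls = []
        for m3u8_fast_url in found_urls:
            for m3u8_url in (m3u8_fast_url, m3u8_fast_url.replace('_fast', '')):
                if m3u8_url not in m3u8_urls:
                    m3u8_urls.append(m3u8_url)
        return m3u8_urls

    print(expand_m3u8_urls(
        ['https://cdn.example.com/live-edge_fast/room/playlist.m3u8']))
    # -> ['.../live-edge_fast/room/playlist.m3u8',
    #     '.../live-edge/room/playlist.m3u8']
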
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
index bb52e0c..0920f62 100644
--- a/youtube_dl/extractor/clipfish.py
+++ b/youtube_dl/extractor/clipfish.py
@@ -12,7 +12,7 @@ class ClipfishIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)'
_TEST = {
'url': 'http://www.clipfish.de/special/ugly-americans/video/4343170/s01-e01-ugly-americans-date-in-der-hoelle/',
- 'md5': '720563e467b86374c194bdead08d207d',
+ 'md5': 'b9a5dc46294154c1193e2d10e0c95693',
'info_dict': {
'id': '4343170',
'ext': 'mp4',
diff --git a/youtube_dl/extractor/collegerama.py b/youtube_dl/extractor/collegerama.py
index 18c7347..6a41db8 100644
--- a/youtube_dl/extractor/collegerama.py
+++ b/youtube_dl/extractor/collegerama.py
@@ -21,7 +21,7 @@ class CollegeRamaIE(InfoExtractor):
'ext': 'mp4',
'title': 'Een nieuwe wereld: waarden, bewustzijn en techniek van de mensheid 2.0.',
'description': '',
- 'thumbnail': r're:^https?://.*\.jpg$',
+ 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$',
'duration': 7713.088,
'timestamp': 1413309600,
'upload_date': '20141014',
@@ -35,6 +35,7 @@ class CollegeRamaIE(InfoExtractor):
'ext': 'wmv',
'title': '64ste Vakantiecursus: Afvalwater',
'description': 'md5:7fd774865cc69d972f542b157c328305',
+ 'thumbnail': r're:^https?://.*\.jpg(?:\?.*?)?$',
'duration': 10853,
'timestamp': 1326446400,
'upload_date': '20120113',
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index 6c3c095..fec39da 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -1,3 +1,4 @@
+# coding: utf-8
from __future__ import unicode_literals
import base64
@@ -244,6 +245,10 @@ class InfoExtractor(object):
specified in the URL.
end_time: Time in seconds where the reproduction should end, as
specified in the URL.
+ chapters: A list of dictionaries, with the following entries:
+ * "start_time" - The start time of the chapter in seconds
+ * "end_time" - The end time of the chapter in seconds
+ * "title" (optional, string)
The following fields should only be used when the video belongs to some logical
chapter or section:
@@ -547,6 +552,34 @@ class InfoExtractor(object):
return encoding
+ def __check_blocked(self, content):
+ first_block = content[:512]
+ if ('<title>Access to this site is blocked</title>' in content and
+ 'Websense' in first_block):
+ msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
+ blocked_iframe = self._html_search_regex(
+ r'<iframe src="([^"]+)"', content,
+ 'Websense information URL', default=None)
+ if blocked_iframe:
+ msg += ' Visit %s for more details' % blocked_iframe
+ raise ExtractorError(msg, expected=True)
+ if '<title>The URL you requested has been blocked</title>' in first_block:
+ msg = (
+ 'Access to this webpage has been blocked by Indian censorship. '
+ 'Use a VPN or proxy server (with --proxy) to route around it.')
+ block_msg = self._html_search_regex(
+ r'</h1><p>(.*?)</p>',
+ content, 'block message', default=None)
+ if block_msg:
+ msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
+ raise ExtractorError(msg, expected=True)
+ if ('<title>TTK :: Доступ к ресурсу ограничен</title>' in content and
+ 'blocklist.rkn.gov.ru' in content):
+ raise ExtractorError(
+ 'Access to this webpage has been blocked by decision of the Russian government. '
+ 'Visit http://blocklist.rkn.gov.ru/ for a block reason.',
+ expected=True)
+
def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None, encoding=None):
content_type = urlh.headers.get('Content-Type', '')
webpage_bytes = urlh.read()
@@ -588,25 +621,7 @@ class InfoExtractor(object):
except LookupError:
content = webpage_bytes.decode('utf-8', 'replace')
- if ('<title>Access to this site is blocked</title>' in content and
- 'Websense' in content[:512]):
- msg = 'Access to this webpage has been blocked by Websense filtering software in your network.'
- blocked_iframe = self._html_search_regex(
- r'<iframe src="([^"]+)"', content,
- 'Websense information URL', default=None)
- if blocked_iframe:
- msg += ' Visit %s for more details' % blocked_iframe
- raise ExtractorError(msg, expected=True)
- if '<title>The URL you requested has been blocked</title>' in content[:512]:
- msg = (
- 'Access to this webpage has been blocked by Indian censorship. '
- 'Use a VPN or proxy server (with --proxy) to route around it.')
- block_msg = self._html_search_regex(
- r'</h1><p>(.*?)</p>',
- content, 'block message', default=None)
- if block_msg:
- msg += ' (Message: "%s")' % block_msg.replace('\n', ' ')
- raise ExtractorError(msg, expected=True)
+ self.__check_blocked(content)
return content
@@ -965,6 +980,23 @@ class InfoExtractor(object):
return info
if isinstance(json_ld, dict):
json_ld = [json_ld]
+
+ def extract_video_object(e):
+ assert e['@type'] == 'VideoObject'
+ info.update({
+ 'url': e.get('contentUrl'),
+ 'title': unescapeHTML(e.get('name')),
+ 'description': unescapeHTML(e.get('description')),
+ 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
+ 'duration': parse_duration(e.get('duration')),
+ 'timestamp': unified_timestamp(e.get('uploadDate')),
+ 'filesize': float_or_none(e.get('contentSize')),
+ 'tbr': int_or_none(e.get('bitrate')),
+ 'width': int_or_none(e.get('width')),
+ 'height': int_or_none(e.get('height')),
+ 'view_count': int_or_none(e.get('interactionCount')),
+ })
+
for e in json_ld:
if e.get('@context') == 'http://schema.org':
item_type = e.get('@type')
@@ -989,18 +1021,11 @@ class InfoExtractor(object):
'description': unescapeHTML(e.get('articleBody')),
})
elif item_type == 'VideoObject':
- info.update({
- 'url': e.get('contentUrl'),
- 'title': unescapeHTML(e.get('name')),
- 'description': unescapeHTML(e.get('description')),
- 'thumbnail': e.get('thumbnailUrl') or e.get('thumbnailURL'),
- 'duration': parse_duration(e.get('duration')),
- 'timestamp': unified_timestamp(e.get('uploadDate')),
- 'filesize': float_or_none(e.get('contentSize')),
- 'tbr': int_or_none(e.get('bitrate')),
- 'width': int_or_none(e.get('width')),
- 'height': int_or_none(e.get('height')),
- })
+ extract_video_object(e)
+ elif item_type == 'WebPage':
+ video = e.get('video')
+ if isinstance(video, dict) and video.get('@type') == 'VideoObject':
+ extract_video_object(video)
break
return dict((k, v) for k, v in info.items() if v is not None)
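
A sketch of the new WebPage branch: a top-level WebPage entity whose video property is a VideoObject now contributes the same fields as a bare VideoObject. The JSON-LD below is made up:

    from youtube_dl.utils import parse_duration, unified_timestamp

    json_ld = {
        '@context': 'http://schema.org',
        '@type': 'WebPage',
        'video': {
            '@type': 'VideoObject',
            'name': 'Sample clip',
            'contentUrl': 'http://example.com/clip.mp4',
            'duration': 'PT1M30S',
            'uploadDate': '2017-05-18T00:00:00Z',
        },
    }
    video = json_ld['video']
    if isinstance(video, dict) and video.get('@type') == 'VideoObject':
        print(parse_duration(video['duration']))       # 90.0
        print(unified_timestamp(video['uploadDate']))  # 1495065600
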
@@ -1292,40 +1317,50 @@ class InfoExtractor(object):
entry_protocol='m3u8', preference=None,
m3u8_id=None, note=None, errnote=None,
fatal=True, live=False):
-
res = self._download_webpage_handle(
m3u8_url, video_id,
note=note or 'Downloading m3u8 information',
errnote=errnote or 'Failed to download m3u8 information',
fatal=fatal)
+
if res is False:
return []
+
m3u8_doc, urlh = res
m3u8_url = urlh.geturl()
+ return self._parse_m3u8_formats(
+ m3u8_doc, m3u8_url, ext=ext, entry_protocol=entry_protocol,
+ preference=preference, m3u8_id=m3u8_id, live=live)
+
+ def _parse_m3u8_formats(self, m3u8_doc, m3u8_url, ext=None,
+ entry_protocol='m3u8', preference=None,
+ m3u8_id=None, live=False):
if '#EXT-X-FAXS-CM:' in m3u8_doc: # Adobe Flash Access
return []
- formats = [self._m3u8_meta_format(m3u8_url, ext, preference, m3u8_id)]
+ formats = []
format_url = lambda u: (
u
if re.match(r'^https?://', u)
else compat_urlparse.urljoin(m3u8_url, u))
- # We should try extracting formats only from master playlists [1], i.e.
- # playlists that describe available qualities. On the other hand media
- # playlists [2] should be returned as is since they contain just the media
- # without qualities renditions.
+ # References:
+ # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-21
+ # 2. https://github.com/rg3/youtube-dl/issues/12211
+
+ # We should try extracting formats only from master playlists [1, 4.3.4],
+ # i.e. playlists that describe available qualities. On the other hand
+ # media playlists [1, 4.3.3] should be returned as is since they contain
+ # just the media without qualities renditions.
# Fortunately, master playlist can be easily distinguished from media
- # playlist based on particular tags availability. As of [1, 2] master
- # playlist tags MUST NOT appear in a media playist and vice versa.
- # As of [3] #EXT-X-TARGETDURATION tag is REQUIRED for every media playlist
- # and MUST NOT appear in master playlist thus we can clearly detect media
- # playlist with this criterion.
- # 1. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.4
- # 2. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3
- # 3. https://tools.ietf.org/html/draft-pantos-http-live-streaming-17#section-4.3.3.1
+ # playlist based on particular tags availability. As of [1, 4.3.3, 4.3.4]
+ # master playlist tags MUST NOT appear in a media playlist and vice versa.
+ # As of [1, 4.3.3.1] #EXT-X-TARGETDURATION tag is REQUIRED for every
+ # media playlist and MUST NOT appear in master playlist thus we can
+ # clearly detect media playlist with this criterion.
+
if '#EXT-X-TARGETDURATION' in m3u8_doc: # media playlist, return as is
return [{
'url': m3u8_url,
@@ -1334,52 +1369,72 @@ class InfoExtractor(object):
'protocol': entry_protocol,
'preference': preference,
}]
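
The criterion above in one line: per the HLS draft, #EXT-X-TARGETDURATION is REQUIRED in media playlists and MUST NOT appear in master playlists. A toy check:

    def is_media_playlist(m3u8_doc):
        return '#EXT-X-TARGETDURATION' in m3u8_doc

    master_doc = '#EXTM3U\n#EXT-X-STREAM-INF:BANDWIDTH=800000\nlow.m3u8\n'
    media_doc = '#EXTM3U\n#EXT-X-TARGETDURATION:10\n#EXTINF:9.9,\nseg0.ts\n'
    assert not is_media_playlist(master_doc)
    assert is_media_playlist(media_doc)
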
- audio_in_video_stream = {}
- last_info = {}
- last_media = {}
+
+ groups = {}
+ last_stream_inf = {}
+
+ def extract_media(x_media_line):
+ media = parse_m3u8_attributes(x_media_line)
+ # As per [1, 4.3.4.1] TYPE, GROUP-ID and NAME are REQUIRED
+ media_type, group_id, name = media.get('TYPE'), media.get('GROUP-ID'), media.get('NAME')
+ if not (media_type and group_id and name):
+ return
+ groups.setdefault(group_id, []).append(media)
+ if media_type not in ('VIDEO', 'AUDIO'):
+ return
+ media_url = media.get('URI')
+ if media_url:
+ format_id = []
+ for v in (group_id, name):
+ if v:
+ format_id.append(v)
+ f = {
+ 'format_id': '-'.join(format_id),
+ 'url': format_url(media_url),
+ 'manifest_url': m3u8_url,
+ 'language': media.get('LANGUAGE'),
+ 'ext': ext,
+ 'protocol': entry_protocol,
+ 'preference': preference,
+ }
+ if media_type == 'AUDIO':
+ f['vcodec'] = 'none'
+ formats.append(f)
+
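
A sketch of what extract_media() receives: parse_m3u8_attributes from youtube_dl.utils turns the tag's attribute list into a dict (the full line can be passed, as the hunk does):

    from youtube_dl.utils import parse_m3u8_attributes

    line = ('#EXT-X-MEDIA:TYPE=AUDIO,GROUP-ID="aac",NAME="English",'
            'LANGUAGE="en",URI="audio/en/prog_index.m3u8"')
    media = parse_m3u8_attributes(line)
    # TYPE, GROUP-ID and NAME are the REQUIRED attributes checked above
    print(media['TYPE'], media['GROUP-ID'], media['NAME'], media.get('URI'))
    # -> AUDIO aac English audio/en/prog_index.m3u8
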
+ def build_stream_name():
+ # Although the specification does not mention a NAME attribute for
+ # the EXT-X-STREAM-INF tag, it may still sometimes be present (see [1]
+ # or vidio test in TestInfoExtractor.test_parse_m3u8_formats)
+ # 1. http://www.vidio.com/watch/165683-dj_ambred-booyah-live-2015
+ stream_name = last_stream_inf.get('NAME')
+ if stream_name:
+ return stream_name
+ # If there is no NAME in EXT-X-STREAM-INF it will be obtained
+ # from corresponding rendition group
+ stream_group_id = last_stream_inf.get('VIDEO')
+ if not stream_group_id:
+ return
+ stream_group = groups.get(stream_group_id)
+ if not stream_group:
+ return stream_group_id
+ rendition = stream_group[0]
+ return rendition.get('NAME') or stream_group_id
+
for line in m3u8_doc.splitlines():
if line.startswith('#EXT-X-STREAM-INF:'):
- last_info = parse_m3u8_attributes(line)
+ last_stream_inf = parse_m3u8_attributes(line)
elif line.startswith('#EXT-X-MEDIA:'):
- media = parse_m3u8_attributes(line)
- media_type = media.get('TYPE')
- if media_type in ('VIDEO', 'AUDIO'):
- group_id = media.get('GROUP-ID')
- media_url = media.get('URI')
- if media_url:
- format_id = []
- for v in (group_id, media.get('NAME')):
- if v:
- format_id.append(v)
- f = {
- 'format_id': '-'.join(format_id),
- 'url': format_url(media_url),
- 'language': media.get('LANGUAGE'),
- 'ext': ext,
- 'protocol': entry_protocol,
- 'preference': preference,
- }
- if media_type == 'AUDIO':
- f['vcodec'] = 'none'
- if group_id and not audio_in_video_stream.get(group_id):
- audio_in_video_stream[group_id] = False
- formats.append(f)
- else:
- # When there is no URI in EXT-X-MEDIA let this tag's
- # data be used by regular URI lines below
- last_media = media
- if media_type == 'AUDIO' and group_id:
- audio_in_video_stream[group_id] = True
+ extract_media(line)
elif line.startswith('#') or not line.strip():
continue
else:
- tbr = int_or_none(last_info.get('AVERAGE-BANDWIDTH') or last_info.get('BANDWIDTH'), scale=1000)
+ tbr = float_or_none(
+ last_stream_inf.get('AVERAGE-BANDWIDTH') or
+ last_stream_inf.get('BANDWIDTH'), scale=1000)
format_id = []
if m3u8_id:
format_id.append(m3u8_id)
- # Despite specification does not mention NAME attribute for
- # EXT-X-STREAM-INF it still sometimes may be present
- stream_name = last_info.get('NAME') or last_media.get('NAME')
+ stream_name = build_stream_name()
# Bandwidth of live streams may differ over time thus making
# format_id unpredictable. So it's better to keep provided
# format_id intact.
@@ -1389,14 +1444,14 @@ class InfoExtractor(object):
f = {
'format_id': '-'.join(format_id),
'url': manifest_url,
- 'manifest_url': manifest_url,
+ 'manifest_url': m3u8_url,
'tbr': tbr,
'ext': ext,
- 'fps': float_or_none(last_info.get('FRAME-RATE')),
+ 'fps': float_or_none(last_stream_inf.get('FRAME-RATE')),
'protocol': entry_protocol,
'preference': preference,
}
- resolution = last_info.get('RESOLUTION')
+ resolution = last_stream_inf.get('RESOLUTION')
if resolution:
mobj = re.search(r'(?P<width>\d+)[xX](?P<height>\d+)', resolution)
if mobj:
@@ -1412,13 +1467,26 @@ class InfoExtractor(object):
'vbr': vbr,
'abr': abr,
})
- f.update(parse_codecs(last_info.get('CODECS')))
- if audio_in_video_stream.get(last_info.get('AUDIO')) is False and f['vcodec'] != 'none':
- # TODO: update acodec for audio only formats with the same GROUP-ID
- f['acodec'] = 'none'
+ codecs = parse_codecs(last_stream_inf.get('CODECS'))
+ f.update(codecs)
+ audio_group_id = last_stream_inf.get('AUDIO')
+ # As per [1, 4.3.4.1.1] any EXT-X-STREAM-INF tag which
+ # references a rendition group MUST have a CODECS attribute.
+ # However, this is not always respected, for example, [2]
+ # contains an EXT-X-STREAM-INF tag which references an AUDIO
+ # rendition group but has no CODECS attribute and, despite
+ # referencing an audio group, represents
+ # a complete (with audio and video) format. So, for such cases
+ # we will ignore references to rendition groups and treat them
+ # as complete formats.
+ if audio_group_id and codecs and f.get('vcodec') != 'none':
+ audio_group = groups.get(audio_group_id)
+ if audio_group and audio_group[0].get('URI'):
+ # TODO: update acodec for audio only formats with
+ # the same GROUP-ID
+ f['acodec'] = 'none'
formats.append(f)
- last_info = {}
- last_media = {}
+ last_stream_inf = {}
return formats
@staticmethod
@@ -1768,7 +1836,7 @@ class InfoExtractor(object):
if content_type == 'text':
# TODO implement WebVTT downloading
pass
- elif content_type == 'video' or content_type == 'audio':
+ elif content_type in ('video', 'audio'):
base_url = ''
for element in (representation, adaptation_set, period, mpd_doc):
base_url_e = element.find(_add_ns('BaseURL'))
@@ -1792,7 +1860,7 @@ class InfoExtractor(object):
'ext': mimetype2ext(mime_type),
'width': int_or_none(representation_attrib.get('width')),
'height': int_or_none(representation_attrib.get('height')),
- 'tbr': int_or_none(bandwidth, 1000),
+ 'tbr': float_or_none(bandwidth, 1000),
'asr': int_or_none(representation_attrib.get('audioSamplingRate')),
'fps': int_or_none(representation_attrib.get('frameRate')),
'language': lang if lang not in ('mul', 'und', 'zxx', 'mis') else None,
@@ -1933,6 +2001,12 @@ class InfoExtractor(object):
compat_etree_fromstring(ism.encode('utf-8')), urlh.geturl(), ism_id)
def _parse_ism_formats(self, ism_doc, ism_url, ism_id=None):
+ """
+ Parse formats from ISM manifest.
+ References:
+ 1. [MS-SSTR]: Smooth Streaming Protocol,
+ https://msdn.microsoft.com/en-us/library/ff469518.aspx
+ """
if ism_doc.get('IsLive') == 'TRUE' or ism_doc.find('Protection') is not None:
return []
@@ -1954,8 +2028,11 @@ class InfoExtractor(object):
self.report_warning('%s is not a supported codec' % fourcc)
continue
tbr = int(track.attrib['Bitrate']) // 1000
- width = int_or_none(track.get('MaxWidth'))
- height = int_or_none(track.get('MaxHeight'))
+ # [1] does not mention Width and Height attributes. However,
+ # they're often present while MaxWidth and MaxHeight are
+ # missing, so they should be used as fallbacks
+ width = int_or_none(track.get('MaxWidth') or track.get('Width'))
+ height = int_or_none(track.get('MaxHeight') or track.get('Height'))
sampling_rate = int_or_none(track.get('SamplingRate'))
track_url_pattern = re.sub(r'{[Bb]itrate}', track.attrib['Bitrate'], url_pattern)
@@ -2106,7 +2183,7 @@ class InfoExtractor(object):
def _extract_akamai_formats(self, manifest_url, video_id, hosts={}):
formats = []
hdcore_sign = 'hdcore=3.7.0'
- f4m_url = re.sub(r'(https?://[^/+])/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
+ f4m_url = re.sub(r'(https?://[^/]+)/i/', r'\1/z/', manifest_url).replace('/master.m3u8', '/manifest.f4m')
hds_host = hosts.get('hds')
if hds_host:
f4m_url = re.sub(r'(https?://)[^/]+', r'\1' + hds_host, f4m_url)
@@ -2128,8 +2205,9 @@ class InfoExtractor(object):
def _extract_wowza_formats(self, url, video_id, m3u8_entry_protocol='m3u8_native', skip_protocols=[]):
url = re.sub(r'/(?:manifest|playlist|jwplayer)\.(?:m3u8|f4m|mpd|smil)', '', url)
- url_base = self._search_regex(r'(?:https?|rtmp|rtsp)(://[^?]+)', url, 'format url')
- http_base_url = 'http' + url_base
+ url_base = self._search_regex(
+ r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url, 'format url')
+ http_base_url = '%s:%s' % ('http', url_base)
formats = []
if 'm3u8' not in skip_protocols:
formats.extend(self._extract_m3u8_formats(
@@ -2163,7 +2241,7 @@ class InfoExtractor(object):
for protocol in ('rtmp', 'rtsp'):
if protocol not in skip_protocols:
formats.append({
- 'url': protocol + url_base,
+ 'url': '%s:%s' % (protocol, url_base),
'format_id': protocol,
'protocol': protocol,
})
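
A sketch of the url_base fix: the regex now captures only the scheme-less //host/path part (and also accepts protocol-relative input, which the old pattern rejected), so any protocol can be prepended uniformly. The sample URL is hypothetical:

    import re

    def base_of(url):
        return re.search(r'(?:(?:https?|rtmp|rtsp):)?(//[^?]+)', url).group(1)

    url_base = base_of('rtmp://media.example.com/vod/sample')
    for protocol in ('http', 'rtmp', 'rtsp'):
        print('%s:%s' % (protocol, url_base))
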
@@ -2171,7 +2249,7 @@ class InfoExtractor(object):
def _find_jwplayer_data(self, webpage, video_id=None, transform_source=js_to_json):
mobj = re.search(
- r'jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)\.setup\s*\((?P<options>[^)]+)\)',
+ r'(?s)jwplayer\((?P<quote>[\'"])[^\'" ]+(?P=quote)\)(?!</script>).*?\.setup\s*\((?P<options>[^)]+)\)',
webpage)
if mobj:
try:
@@ -2247,11 +2325,17 @@ class InfoExtractor(object):
def _parse_jwplayer_formats(self, jwplayer_sources_data, video_id=None,
m3u8_id=None, mpd_id=None, rtmp_params=None, base_url=None):
+ urls = []
formats = []
for source in jwplayer_sources_data:
- source_url = self._proto_relative_url(source['file'])
+ source_url = self._proto_relative_url(source.get('file'))
+ if not source_url:
+ continue
if base_url:
source_url = compat_urlparse.urljoin(base_url, source_url)
+ if source_url in urls:
+ continue
+ urls.append(source_url)
source_type = source.get('type') or ''
ext = mimetype2ext(source_type) or determine_ext(source_url)
if source_type == 'hls' or ext == 'm3u8':
diff --git a/youtube_dl/extractor/condenast.py b/youtube_dl/extractor/condenast.py
index d3463b8..0c3f0c0 100644
--- a/youtube_dl/extractor/condenast.py
+++ b/youtube_dl/extractor/condenast.py
@@ -16,7 +16,6 @@ from ..utils import (
mimetype2ext,
orderedSet,
parse_iso8601,
- remove_end,
)
@@ -50,10 +49,17 @@ class CondeNastIE(InfoExtractor):
'wmagazine': 'W Magazine',
}
- _VALID_URL = r'https?://(?:video|www|player)\.(?P<site>%s)\.com/(?P<type>watch|series|video|embed(?:js)?)/(?P<id>[^/?#]+)' % '|'.join(_SITES.keys())
+ _VALID_URL = r'''(?x)https?://(?:video|www|player(?:-backend)?)\.(?:%s)\.com/
+ (?:
+ (?:
+ embed(?:js)?|
+ (?:script|inline)/video
+ )/(?P<id>[0-9a-f]{24})(?:/(?P<player_id>[0-9a-f]{24}))?(?:.+?\btarget=(?P<target>[^&]+))?|
+ (?P<type>watch|series|video)/(?P<display_id>[^/?#]+)
+ )''' % '|'.join(_SITES.keys())
IE_DESC = 'Condé Nast media group: %s' % ', '.join(sorted(_SITES.values()))
- EMBED_URL = r'(?:https?:)?//player\.(?P<site>%s)\.com/(?P<type>embed(?:js)?)/.+?' % '|'.join(_SITES.keys())
+ EMBED_URL = r'(?:https?:)?//player(?:-backend)?\.(?:%s)\.com/(?:embed(?:js)?|(?:script|inline)/video)/.+?' % '|'.join(_SITES.keys())
_TESTS = [{
'url': 'http://video.wired.com/watch/3d-printed-speakers-lit-with-led',
@@ -89,6 +95,12 @@ class CondeNastIE(InfoExtractor):
'upload_date': '20150916',
'timestamp': 1442434955,
}
+ }, {
+ 'url': 'https://player.cnevids.com/inline/video/59138decb57ac36b83000005.js?target=js-cne-player',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://player-backend.cnevids.com/script/video/59138decb57ac36b83000005.js',
+ 'only_matching': True,
}]
def _extract_series(self, url, webpage):
@@ -104,7 +116,7 @@ class CondeNastIE(InfoExtractor):
entries = [self.url_result(build_url(path), 'CondeNast') for path in paths]
return self.playlist_result(entries, playlist_title=title)
- def _extract_video(self, webpage, url_type):
+ def _extract_video_params(self, webpage):
query = {}
params = self._search_regex(
r'(?s)var params = {(.+?)}[;,]', webpage, 'player params', default=None)
@@ -123,17 +135,30 @@ class CondeNastIE(InfoExtractor):
'playerId': params['data-player'],
'target': params['id'],
})
- video_id = query['videoId']
+ return query
+
+ def _extract_video(self, params):
+ video_id = params['videoId']
+
video_info = None
- info_page = self._download_json(
- 'http://player.cnevids.com/player/video.js',
- video_id, 'Downloading video info', fatal=False, query=query)
- if info_page:
- video_info = info_page.get('video')
- if not video_info:
+ if params.get('playerId'):
+ info_page = self._download_json(
+ 'http://player.cnevids.com/player/video.js',
+ video_id, 'Downloading video info', fatal=False, query=params)
+ if info_page:
+ video_info = info_page.get('video')
+ if not video_info:
+ info_page = self._download_webpage(
+ 'http://player.cnevids.com/player/loader.js',
+ video_id, 'Downloading loader info', query=params)
+ else:
info_page = self._download_webpage(
- 'http://player.cnevids.com/player/loader.js',
- video_id, 'Downloading loader info', query=query)
+ 'https://player.cnevids.com/inline/video/%s.js' % video_id,
+ video_id, 'Downloading inline info', query={
+ 'target': params.get('target', 'embedplayer')
+ })
+
+ if not video_info:
video_info = self._parse_json(
self._search_regex(
r'(?s)var\s+config\s*=\s*({.+?});', info_page, 'config'),
@@ -161,9 +186,7 @@ class CondeNastIE(InfoExtractor):
})
self._sort_formats(formats)
- info = self._search_json_ld(
- webpage, video_id, fatal=False) if url_type != 'embed' else {}
- info.update({
+ return {
'id': video_id,
'formats': formats,
'title': title,
@@ -174,22 +197,26 @@ class CondeNastIE(InfoExtractor):
'series': video_info.get('series_title'),
'season': video_info.get('season_title'),
'timestamp': parse_iso8601(video_info.get('premiere_date')),
- })
- return info
+ 'categories': video_info.get('categories'),
+ }
def _real_extract(self, url):
- site, url_type, item_id = re.match(self._VALID_URL, url).groups()
+ video_id, player_id, target, url_type, display_id = re.match(self._VALID_URL, url).groups()
- # Convert JS embed to regular embed
- if url_type == 'embedjs':
- parsed_url = compat_urlparse.urlparse(url)
- url = compat_urlparse.urlunparse(parsed_url._replace(
- path=remove_end(parsed_url.path, '.js').replace('/embedjs/', '/embed/')))
- url_type = 'embed'
+ if video_id:
+ return self._extract_video({
+ 'videoId': video_id,
+ 'playerId': player_id,
+ 'target': target,
+ })
- webpage = self._download_webpage(url, item_id)
+ webpage = self._download_webpage(url, display_id)
if url_type == 'series':
return self._extract_series(url, webpage)
else:
- return self._extract_video(webpage, url_type)
+ params = self._extract_video_params(webpage)
+ info = self._search_json_ld(
+ webpage, display_id, fatal=False)
+ info.update(self._extract_video(params))
+ return info
diff --git a/youtube_dl/extractor/coub.py b/youtube_dl/extractor/coub.py
index 5fa1f00..6ea03e6 100644
--- a/youtube_dl/extractor/coub.py
+++ b/youtube_dl/extractor/coub.py
@@ -24,12 +24,11 @@ class CoubIE(InfoExtractor):
'duration': 4.6,
'timestamp': 1428527772,
'upload_date': '20150408',
- 'uploader': 'Артём Лоскутников',
+ 'uploader': 'Artyom Loskutnikov',
'uploader_id': 'artyom.loskutnikov',
'view_count': int,
'like_count': int,
'repost_count': int,
- 'comment_count': int,
'age_limit': 0,
},
}, {
@@ -118,7 +117,6 @@ class CoubIE(InfoExtractor):
view_count = int_or_none(coub.get('views_count') or coub.get('views_increase_count'))
like_count = int_or_none(coub.get('likes_count'))
repost_count = int_or_none(coub.get('recoubs_count'))
- comment_count = int_or_none(coub.get('comments_count'))
age_restricted = coub.get('age_restricted', coub.get('age_restricted_by_admin'))
if age_restricted is not None:
@@ -137,7 +135,6 @@ class CoubIE(InfoExtractor):
'view_count': view_count,
'like_count': like_count,
'repost_count': repost_count,
- 'comment_count': comment_count,
'age_limit': age_limit,
'formats': formats,
}
diff --git a/youtube_dl/extractor/crackle.py b/youtube_dl/extractor/crackle.py
index f919ed2..13f425b 100644
--- a/youtube_dl/extractor/crackle.py
+++ b/youtube_dl/extractor/crackle.py
@@ -21,9 +21,10 @@ class CrackleIE(InfoExtractor):
'season_number': 8,
'episode_number': 4,
'subtitles': {
- 'en-US': [{
- 'ext': 'ttml',
- }]
+ 'en-US': [
+ {'ext': 'vtt'},
+ {'ext': 'tt'},
+ ]
},
},
'params': {
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index d15fd37..2ffa4a7 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -171,7 +171,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
'info_dict': {
'id': '727589',
'ext': 'mp4',
- 'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance from this Judicial Injustice!",
+ 'title': "KONOSUBA -God's blessing on this wonderful world! 2 Episode 1 – Give Me Deliverance From This Judicial Injustice!",
'description': 'md5:cbcf05e528124b0f3a0a419fc805ea7d',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': 'Kadokawa Pictures Inc.',
@@ -179,7 +179,7 @@ class CrunchyrollIE(CrunchyrollBaseIE):
'series': "KONOSUBA -God's blessing on this wonderful world!",
'season': "KONOSUBA -God's blessing on this wonderful world! 2",
'season_number': 2,
- 'episode': 'Give Me Deliverance from this Judicial Injustice!',
+ 'episode': 'Give Me Deliverance From This Judicial Injustice!',
'episode_number': 1,
},
'params': {
@@ -390,7 +390,9 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
else:
webpage_url = 'http://www.' + mobj.group('url')
- webpage = self._download_webpage(self._add_skip_wall(webpage_url), video_id, 'Downloading webpage')
+ webpage = self._download_webpage(
+ self._add_skip_wall(webpage_url), video_id,
+ headers=self.geo_verification_headers())
note_m = self._html_search_regex(
r'<div class="showmedia-trailer-notice">(.+?)</div>',
webpage, 'trailer-notice', default='')
@@ -565,7 +567,9 @@ class CrunchyrollShowPlaylistIE(CrunchyrollBaseIE):
def _real_extract(self, url):
show_id = self._match_id(url)
- webpage = self._download_webpage(self._add_skip_wall(url), show_id)
+ webpage = self._download_webpage(
+ self._add_skip_wall(url), show_id,
+ headers=self.geo_verification_headers())
title = self._html_search_regex(
r'(?s)<h1[^>]*>\s*<span itemprop="name">(.*?)</span>',
webpage, 'title')
diff --git a/youtube_dl/extractor/cspan.py b/youtube_dl/extractor/cspan.py
index d457616..171820e 100644
--- a/youtube_dl/extractor/cspan.py
+++ b/youtube_dl/extractor/cspan.py
@@ -10,6 +10,7 @@ from ..utils import (
smuggle_url,
determine_ext,
ExtractorError,
+ extract_attributes,
)
from .senateisvp import SenateISVPIE
from .ustream import UstreamIE
@@ -68,6 +69,7 @@ class CSpanIE(InfoExtractor):
'uploader_id': '12987475',
},
}]
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s'
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -78,6 +80,19 @@ class CSpanIE(InfoExtractor):
if ustream_url:
return self.url_result(ustream_url, UstreamIE.ie_key())
+ if '&vod' not in url:
+ bc = self._search_regex(
+ r"(<[^>]+id='brightcove-player-embed'[^>]+>)",
+ webpage, 'brightcove embed', default=None)
+ if bc:
+ bc_attr = extract_attributes(bc)
+ bc_url = self.BRIGHTCOVE_URL_TEMPLATE % (
+ bc_attr.get('data-bcaccountid', '3162030207001'),
+ bc_attr.get('data-noprebcplayerid', 'SyGGpuJy3g'),
+ bc_attr.get('data-newbcplayerid', 'default'),
+ bc_attr['data-bcid'])
+ return self.url_result(smuggle_url(bc_url, {'source_url': url}))
+
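
A sketch of the new Brightcove hand-off on a hypothetical embed tag; the data-* fallback values mirror the defaults hard-coded in the hunk:

    from youtube_dl.utils import extract_attributes

    bc = ("<div id='brightcove-player-embed' data-bcaccountid='3162030207001' "
          "data-noprebcplayerid='SyGGpuJy3g' data-bcid='5437469753001'>")
    bc_attr = extract_attributes(bc)
    bc_url = 'http://players.brightcove.net/%s/%s_%s/index.html?videoId=%s' % (
        bc_attr.get('data-bcaccountid', '3162030207001'),
        bc_attr.get('data-noprebcplayerid', 'SyGGpuJy3g'),
        bc_attr.get('data-newbcplayerid', 'default'),
        bc_attr['data-bcid'])
    print(bc_url)
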
# We first look for clipid, because clipprog always appears before
patterns = [r'id=\'clip(%s)\'\s*value=\'([0-9]+)\'' % t for t in ('id', 'prog')]
results = list(filter(None, (re.search(p, webpage) for p in patterns)))
diff --git a/youtube_dl/extractor/curiositystream.py b/youtube_dl/extractor/curiositystream.py
index e3c9946..8e45923 100644
--- a/youtube_dl/extractor/curiositystream.py
+++ b/youtube_dl/extractor/curiositystream.py
@@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
int_or_none,
@@ -46,9 +48,50 @@ class CuriosityStreamBaseIE(InfoExtractor):
def _extract_media_info(self, media):
video_id = compat_str(media['id'])
- limelight_media_id = media['limelight_media_id']
title = media['title']
+ formats = []
+ for encoding in media.get('encodings', []):
+ m3u8_url = encoding.get('master_playlist_url')
+ if m3u8_url:
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+ encoding_url = encoding.get('url')
+ file_url = encoding.get('file_url')
+ if not encoding_url and not file_url:
+ continue
+ f = {
+ 'width': int_or_none(encoding.get('width')),
+ 'height': int_or_none(encoding.get('height')),
+ 'vbr': int_or_none(encoding.get('video_bitrate')),
+ 'abr': int_or_none(encoding.get('audio_bitrate')),
+ 'filesize': int_or_none(encoding.get('size_in_bytes')),
+ 'vcodec': encoding.get('video_codec'),
+ 'acodec': encoding.get('audio_codec'),
+ 'container': encoding.get('container_type'),
+ }
+ for f_url in (encoding_url, file_url):
+ if not f_url:
+ continue
+ fmt = f.copy()
+ rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', f_url)
+ if rtmp:
+ fmt.update({
+ 'url': rtmp.group('url'),
+ 'play_path': rtmp.group('playpath'),
+ 'app': rtmp.group('app'),
+ 'ext': 'flv',
+ 'format_id': 'rtmp',
+ })
+ else:
+ fmt.update({
+ 'url': f_url,
+ 'format_id': 'http',
+ })
+ formats.append(fmt)
+ self._sort_formats(formats)
+
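
A sketch of the RTMP split above on a hypothetical encoding URL; greedy matching pushes everything before the mp4:/mp3: playpath into the app group:

    import re

    f_url = 'rtmpe://cdn.example.com/curiosity/mp4:video/clip.mp4'
    rtmp = re.search(
        r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$',
        f_url)
    print(rtmp.group('url'))       # rtmpe://cdn.example.com/curiosity
    print(rtmp.group('app'))       # curiosity
    print(rtmp.group('playpath'))  # mp4:video/clip.mp4
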
subtitles = {}
for closed_caption in media.get('closed_captions', []):
sub_url = closed_caption.get('file')
@@ -60,16 +103,14 @@ class CuriosityStreamBaseIE(InfoExtractor):
})
return {
- '_type': 'url_transparent',
'id': video_id,
- 'url': 'limelight:media:' + limelight_media_id,
+ 'formats': formats,
'title': title,
'description': media.get('description'),
'thumbnail': media.get('image_large') or media.get('image_medium') or media.get('image_small'),
'duration': int_or_none(media.get('duration')),
'tags': media.get('tags'),
'subtitles': subtitles,
- 'ie_key': 'LimelightMedia',
}
@@ -78,14 +119,12 @@ class CuriosityStreamIE(CuriosityStreamBaseIE):
_VALID_URL = r'https?://app\.curiositystream\.com/video/(?P<id>\d+)'
_TEST = {
'url': 'https://app.curiositystream.com/video/2',
- 'md5': 'a0074c190e6cddaf86900b28d3e9ee7a',
+ 'md5': '262bb2f257ff301115f1973540de8983',
'info_dict': {
'id': '2',
'ext': 'mp4',
'title': 'How Did You Develop The Internet?',
'description': 'Vint Cerf, Google\'s Chief Internet Evangelist, describes how he and Bob Kahn created the internet.',
- 'timestamp': 1448388615,
- 'upload_date': '20151124',
}
}
@@ -105,7 +144,7 @@ class CuriosityStreamCollectionIE(CuriosityStreamBaseIE):
'title': 'Curious Minds: The Internet',
'description': 'How is the internet shaping our lives in the 21st Century?',
},
- 'playlist_mincount': 17,
+ 'playlist_mincount': 12,
}
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/cwtv.py b/youtube_dl/extractor/cwtv.py
index 1ab9333..f4cf0f1 100644
--- a/youtube_dl/extractor/cwtv.py
+++ b/youtube_dl/extractor/cwtv.py
@@ -82,6 +82,11 @@ class CWTVIE(InfoExtractor):
'url': quality_url,
'tbr': tbr,
})
+ video_metadata = video_data['assetFields']
+ ism_url = video_metadata.get('smoothStreamingUrl')
+ if ism_url:
+ formats.extend(self._extract_ism_formats(
+ ism_url, video_id, ism_id='mss', fatal=False))
self._sort_formats(formats)
thumbnails = [{
@@ -90,8 +95,6 @@ class CWTVIE(InfoExtractor):
'height': image.get('height'),
} for image_id, image in video_data['images'].items() if image.get('uri')] if video_data.get('images') else None
- video_metadata = video_data['assetFields']
-
subtitles = {
'en': [{
'url': video_metadata['UnicornCcUrl'],
diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py
index 98c835b..538565c 100644
--- a/youtube_dl/extractor/dailymail.py
+++ b/youtube_dl/extractor/dailymail.py
@@ -2,9 +2,11 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
int_or_none,
determine_protocol,
+ try_get,
unescapeHTML,
)
@@ -28,8 +30,14 @@ class DailyMailIE(InfoExtractor):
video_data = self._parse_json(self._search_regex(
r"data-opts='({.+?})'", webpage, 'video data'), video_id)
title = unescapeHTML(video_data['title'])
- video_sources = self._download_json(video_data.get(
- 'sources', {}).get('url') or 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id, video_id)
+
+ sources_url = (try_get(
+ video_data,
+ (lambda x: x['plugins']['sources']['url'],
+ lambda x: x['sources']['url']), compat_str) or
+ 'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id)
+
+ video_sources = self._download_json(sources_url, video_id)
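
A sketch of the try_get fallback chain added above: the first getter that returns a compat_str wins, otherwise the hard-coded API URL is used. The dict is made up:

    from __future__ import unicode_literals

    from youtube_dl.compat import compat_str
    from youtube_dl.utils import try_get

    video_id = '123'
    video_data = {'plugins': {'sources': {'url': 'http://example.com/sources.json'}}}
    sources_url = (try_get(
        video_data,
        (lambda x: x['plugins']['sources']['url'],
         lambda x: x['sources']['url']), compat_str) or
        'http://www.dailymail.co.uk/api/player/%s/video-sources.json' % video_id)
    print(sources_url)  # http://example.com/sources.json
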
formats = []
for rendition in video_sources['renditions']:
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index 246efde..f8db76c 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -38,7 +38,7 @@ class DailymotionBaseInfoExtractor(InfoExtractor):
class DailymotionIE(DailymotionBaseInfoExtractor):
- _VALID_URL = r'(?i)(?:https?://)?(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:embed|swf|#)/)?video/(?P<id>[^/?_]+)'
+ _VALID_URL = r'(?i)https?://(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|#)/)?video|swf)/(?P<id>[^/?_]+)'
IE_NAME = 'dailymotion'
_FORMATS = [
@@ -49,68 +49,82 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
('stream_h264_hd1080_url', 'hd180'),
]
- _TESTS = [
- {
- 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
- 'md5': '2137c41a8e78554bb09225b8eb322406',
- 'info_dict': {
- 'id': 'x2iuewm',
- 'ext': 'mp4',
- 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',
- 'description': 'Several come bundled with the Steam Controller.',
- 'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
- 'duration': 74,
- 'timestamp': 1425657362,
- 'upload_date': '20150306',
- 'uploader': 'IGN',
- 'uploader_id': 'xijv66',
- 'age_limit': 0,
- 'view_count': int,
- }
+ _TESTS = [{
+ 'url': 'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review-jason-bateman-olivia-munn-t-j-miller_news',
+ 'md5': '074b95bdee76b9e3654137aee9c79dfe',
+ 'info_dict': {
+ 'id': 'x5kesuj',
+ 'ext': 'mp4',
+ 'title': 'Office Christmas Party Review – Jason Bateman, Olivia Munn, T.J. Miller',
+ 'description': 'Office Christmas Party Review - Jason Bateman, Olivia Munn, T.J. Miller',
+ 'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
+ 'duration': 187,
+ 'timestamp': 1493651285,
+ 'upload_date': '20170501',
+ 'uploader': 'Deadline',
+ 'uploader_id': 'x1xm8ri',
+ 'age_limit': 0,
+ 'view_count': int,
+ },
+ }, {
+ 'url': 'https://www.dailymotion.com/video/x2iuewm_steam-machine-models-pricing-listed-on-steam-store-ign-news_videogames',
+ 'md5': '2137c41a8e78554bb09225b8eb322406',
+ 'info_dict': {
+ 'id': 'x2iuewm',
+ 'ext': 'mp4',
+ 'title': 'Steam Machine Models, Pricing Listed on Steam Store - IGN News',
+ 'description': 'Several come bundled with the Steam Controller.',
+ 'thumbnail': r're:^https?:.*\.(?:jpg|png)$',
+ 'duration': 74,
+ 'timestamp': 1425657362,
+ 'upload_date': '20150306',
+ 'uploader': 'IGN',
+ 'uploader_id': 'xijv66',
+ 'age_limit': 0,
+ 'view_count': int,
},
+ 'skip': 'video gone',
+ }, {
# Vevo video
- {
- 'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi',
- 'info_dict': {
- 'title': 'Roar (Official)',
- 'id': 'USUV71301934',
- 'ext': 'mp4',
- 'uploader': 'Katy Perry',
- 'upload_date': '20130905',
- },
- 'params': {
- 'skip_download': True,
- },
- 'skip': 'VEVO is only available in some countries',
+ 'url': 'http://www.dailymotion.com/video/x149uew_katy-perry-roar-official_musi',
+ 'info_dict': {
+ 'title': 'Roar (Official)',
+ 'id': 'USUV71301934',
+ 'ext': 'mp4',
+ 'uploader': 'Katy Perry',
+ 'upload_date': '20130905',
+ },
+ 'params': {
+ 'skip_download': True,
},
+ 'skip': 'VEVO is only available in some countries',
+ }, {
# age-restricted video
- {
- 'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
- 'md5': '0d667a7b9cebecc3c89ee93099c4159d',
- 'info_dict': {
- 'id': 'xyh2zz',
- 'ext': 'mp4',
- 'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
- 'uploader': 'HotWaves1012',
- 'age_limit': 18,
- },
- 'skip': 'video gone',
+ 'url': 'http://www.dailymotion.com/video/xyh2zz_leanna-decker-cyber-girl-of-the-year-desires-nude-playboy-plus_redband',
+ 'md5': '0d667a7b9cebecc3c89ee93099c4159d',
+ 'info_dict': {
+ 'id': 'xyh2zz',
+ 'ext': 'mp4',
+ 'title': 'Leanna Decker - Cyber Girl Of The Year Desires Nude [Playboy Plus]',
+ 'uploader': 'HotWaves1012',
+ 'age_limit': 18,
},
+ 'skip': 'video gone',
+ }, {
# geo-restricted, player v5
- {
- 'url': 'http://www.dailymotion.com/video/xhza0o',
- 'only_matching': True,
- },
+ 'url': 'http://www.dailymotion.com/video/xhza0o',
+ 'only_matching': True,
+ }, {
# with subtitles
- {
- 'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news',
- 'only_matching': True,
- },
- {
- 'url': 'http://www.dailymotion.com/swf/video/x3n92nf',
- 'only_matching': True,
- }
- ]
+ 'url': 'http://www.dailymotion.com/video/x20su5f_the-power-of-nightmares-1-the-rise-of-the-politics-of-fear-bbc-2004_news',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.dailymotion.com/swf/video/x3n92nf',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.dailymotion.com/swf/x3ss1m_funny-magic-trick-barry-and-stuart_fun',
+ 'only_matching': True,
+ }]
@staticmethod
def _extract_urls(webpage):
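
The loosened dailymotion _VALID_URL above now also accepts bare /swf/<id> links, which is what the last new test exercises. A quick standalone check of the pattern:

    import re

    _VALID_URL = r'(?i)https?://(?:(www|touch)\.)?dailymotion\.[a-z]{2,3}/(?:(?:(?:embed|swf|#)/)?video|swf)/(?P<id>[^/?_]+)'
    for url in (
        'http://www.dailymotion.com/video/x5kesuj_office-christmas-party-review',
        'http://www.dailymotion.com/swf/video/x3n92nf',
        'http://www.dailymotion.com/swf/x3ss1m_funny-magic-trick',  # newly matched form
    ):
        print(re.match(_VALID_URL, url).group('id'))  # x5kesuj, x3n92nf, x3ss1m
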
diff --git a/youtube_dl/extractor/democracynow.py b/youtube_dl/extractor/democracynow.py
index bdfe638..5c9c0ec 100644
--- a/youtube_dl/extractor/democracynow.py
+++ b/youtube_dl/extractor/democracynow.py
@@ -21,7 +21,8 @@ class DemocracynowIE(InfoExtractor):
'info_dict': {
'id': '2015-0703-001',
'ext': 'mp4',
- 'title': 'Daily Show',
+ 'title': 'Daily Show for July 03, 2015',
+ 'description': 'md5:80eb927244d6749900de6072c7cc2c86',
},
}, {
'url': 'http://www.democracynow.org/2015/7/3/this_flag_comes_down_today_bree',
diff --git a/youtube_dl/extractor/discoveryvr.py b/youtube_dl/extractor/discoveryvr.py
new file mode 100644
index 0000000..cb63c26
--- /dev/null
+++ b/youtube_dl/extractor/discoveryvr.py
@@ -0,0 +1,59 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import parse_duration
+
+
+class DiscoveryVRIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?discoveryvr\.com/watch/(?P<id>[^/?#]+)'
+ _TEST = {
+ 'url': 'http://www.discoveryvr.com/watch/discovery-vr-an-introduction',
+ 'md5': '32b1929798c464a54356378b7912eca4',
+ 'info_dict': {
+ 'id': 'discovery-vr-an-introduction',
+ 'ext': 'mp4',
+ 'title': 'Discovery VR - An Introduction',
+ 'description': 'md5:80d418a10efb8899d9403e61d8790f06',
+ }
+ }
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ bootstrap_data = self._search_regex(
+ r'root\.DVR\.bootstrapData\s+=\s+"({.+?})";',
+ webpage, 'bootstrap data')
+ bootstrap_data = self._parse_json(
+ bootstrap_data.encode('utf-8').decode('unicode_escape'),
+ display_id)
+ videos = self._parse_json(bootstrap_data['videos'], display_id)['allVideos']
+ video_data = next(video for video in videos if video.get('slug') == display_id)
+
+ series = video_data.get('showTitle')
+ title = episode = video_data.get('title') or series
+ if series and series != title:
+ title = '%s - %s' % (series, title)
+
+ formats = []
+ for f, format_id in (('cdnUriM3U8', 'mobi'), ('webVideoUrlSd', 'sd'), ('webVideoUrlHd', 'hd')):
+ f_url = video_data.get(f)
+ if not f_url:
+ continue
+ formats.append({
+ 'format_id': format_id,
+ 'url': f_url,
+ })
+
+ return {
+ 'id': display_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': video_data.get('description'),
+ 'thumbnail': video_data.get('thumbnail'),
+ 'duration': parse_duration(video_data.get('runTime')),
+ 'formats': formats,
+ 'episode': episode,
+ 'series': series,
+ }
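
The new DiscoveryVR extractor deals with a doubly-encoded payload: the page assigns a backslash-escaped JSON string to root.DVR.bootstrapData, and the 'videos' field inside that object is itself another JSON string, hence the unicode_escape decode followed by two _parse_json calls. A toy round-trip of the same shape (the payload here is invented):

    import json

    # As captured between the quotes of: root.DVR.bootstrapData = "...";
    raw = r'{\"videos\": \"[{\\\"slug\\\": \\\"intro\\\"}]\"}'
    bootstrap = json.loads(raw.encode('utf-8').decode('unicode_escape'))
    videos = json.loads(bootstrap['videos'])
    print(videos[0]['slug'])  # intro
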
diff --git a/youtube_dl/extractor/dotsub.py b/youtube_dl/extractor/dotsub.py
index 1f75352..148605c 100644
--- a/youtube_dl/extractor/dotsub.py
+++ b/youtube_dl/extractor/dotsub.py
@@ -35,7 +35,7 @@ class DotsubIE(InfoExtractor):
'thumbnail': 're:^https?://dotsub.com/media/747bcf58-bd59-45b7-8c8c-ac312d084ee6/p',
'duration': 290,
'timestamp': 1476767794.2809999,
- 'upload_date': '20160525',
+ 'upload_date': '20161018',
'uploader': 'parthivi001',
'uploader_id': 'user52596202',
'view_count': int,
diff --git a/youtube_dl/extractor/douyutv.py b/youtube_dl/extractor/douyutv.py
index 82d8a04..9757f44 100644
--- a/youtube_dl/extractor/douyutv.py
+++ b/youtube_dl/extractor/douyutv.py
@@ -3,11 +3,14 @@ from __future__ import unicode_literals
import time
import hashlib
+import re
from .common import InfoExtractor
from ..utils import (
ExtractorError,
unescapeHTML,
+ unified_strdate,
+ urljoin,
)
@@ -20,7 +23,7 @@ class DouyuTVIE(InfoExtractor):
'id': '17732',
'display_id': 'iseven',
'ext': 'flv',
- 'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': r're:.*m7show@163\.com.*',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': '7师傅',
@@ -51,7 +54,7 @@ class DouyuTVIE(InfoExtractor):
'id': '17732',
'display_id': '17732',
'ext': 'flv',
- 'title': 're:^清晨醒脑!T-ARA根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
+ 'title': 're:^清晨醒脑!根本停不下来! [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}$',
'description': r're:.*m7show@163\.com.*',
'thumbnail': r're:^https?://.*\.jpg$',
'uploader': '7师傅',
@@ -117,3 +120,82 @@ class DouyuTVIE(InfoExtractor):
'uploader': uploader,
'is_live': True,
}
+
+
+class DouyuShowIE(InfoExtractor):
+ _VALID_URL = r'https?://v(?:mobile)?\.douyu\.com/show/(?P<id>[0-9a-zA-Z]+)'
+
+ _TESTS = [{
+ 'url': 'https://v.douyu.com/show/rjNBdvnVXNzvE2yw',
+ 'md5': '0c2cfd068ee2afe657801269b2d86214',
+ 'info_dict': {
+ 'id': 'rjNBdvnVXNzvE2yw',
+ 'ext': 'mp4',
+ 'title': '陈一发儿:砒霜 我有个室友系列!04-01 22点场',
+ 'duration': 7150.08,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': '陈一发儿',
+ 'uploader_id': 'XrZwYelr5wbK',
+ 'uploader_url': 'https://v.douyu.com/author/XrZwYelr5wbK',
+ 'upload_date': '20170402',
+ },
+ }, {
+ 'url': 'https://vmobile.douyu.com/show/rjNBdvnVXNzvE2yw',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ url = url.replace('vmobile.', 'v.')
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ room_info = self._parse_json(self._search_regex(
+ r'var\s+\$ROOM\s*=\s*({.+});', webpage, 'room info'), video_id)
+
+ video_info = None
+
+ for trial in range(5):
+ # Sometimes Douyu rejects our request; retry a few times before giving up
+ try:
+ video_info = self._download_json(
+ 'https://vmobile.douyu.com/video/getInfo', video_id,
+ query={'vid': video_id},
+ headers={
+ 'Referer': url,
+ 'x-requested-with': 'XMLHttpRequest',
+ })
+ break
+ except ExtractorError:
+ self._sleep(1, video_id)
+
+ if not video_info:
+ raise ExtractorError('Can\'t fetch video info')
+
+ formats = self._extract_m3u8_formats(
+ video_info['data']['video_url'], video_id,
+ entry_protocol='m3u8_native', ext='mp4')
+
+ upload_date = unified_strdate(self._html_search_regex(
+ r'<em>上传时间:</em><span>([^<]+)</span>', webpage,
+ 'upload date', fatal=False))
+
+ uploader = uploader_id = uploader_url = None
+ mobj = re.search(
+ r'(?m)<a[^>]+href="/author/([0-9a-zA-Z]+)".+?<strong[^>]+title="([^"]+)"',
+ webpage)
+ if mobj:
+ uploader_id, uploader = mobj.groups()
+ uploader_url = urljoin(url, '/author/' + uploader_id)
+
+ return {
+ 'id': video_id,
+ 'title': room_info['name'],
+ 'formats': formats,
+ 'duration': room_info.get('duration'),
+ 'thumbnail': room_info.get('pic'),
+ 'upload_date': upload_date,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'uploader_url': uploader_url,
+ }
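
The getInfo loop above is a fixed-budget retry: up to five attempts with a one-second pause, and a hard error only once every attempt has failed. The same shape in isolation (a sketch; upstream catches ExtractorError and sleeps via self._sleep):

    import time

    def fetch_with_retries(fetch, attempts=5, pause=1.0):
        # Sometimes the endpoint rejects a request; retry a few times.
        for _ in range(attempts):
            try:
                return fetch()
            except IOError:
                time.sleep(pause)
        raise IOError('Can\'t fetch video info')
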
diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py
index e491701..c84624f 100644
--- a/youtube_dl/extractor/drtv.py
+++ b/youtube_dl/extractor/drtv.py
@@ -20,7 +20,7 @@ class DRTVIE(InfoExtractor):
IE_NAME = 'drtv'
_TESTS = [{
'url': 'https://www.dr.dk/tv/se/boern/ultra/klassen-ultra/klassen-darlig-taber-10',
- 'md5': '25e659cccc9a2ed956110a299fdf5983',
+ 'md5': '7ae17b4e18eb5d29212f424a7511c184',
'info_dict': {
'id': 'klassen-darlig-taber-10',
'ext': 'mp4',
@@ -30,21 +30,37 @@ class DRTVIE(InfoExtractor):
'upload_date': '20160823',
'duration': 606.84,
},
- 'params': {
- 'skip_download': True,
- },
}, {
+ # embed
'url': 'https://www.dr.dk/nyheder/indland/live-christianias-rydning-af-pusher-street-er-i-gang',
- 'md5': '2c37175c718155930f939ef59952474a',
'info_dict': {
'id': 'christiania-pusher-street-ryddes-drdkrjpo',
'ext': 'mp4',
'title': 'LIVE Christianias rydning af Pusher Street er i gang',
- 'description': '- Det er det fedeste, der er sket i 20 år, fortæller christianit til DR Nyheder.',
+ 'description': 'md5:2a71898b15057e9b97334f61d04e6eb5',
'timestamp': 1472800279,
'upload_date': '20160902',
'duration': 131.4,
},
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # with SignLanguage formats
+ 'url': 'https://www.dr.dk/tv/se/historien-om-danmark/-/historien-om-danmark-stenalder',
+ 'info_dict': {
+ 'id': 'historien-om-danmark-stenalder',
+ 'ext': 'mp4',
+ 'title': 'Historien om Danmark: Stenalder (1)',
+ 'description': 'md5:8c66dcbc1669bbc6f873879880f37f2a',
+ 'timestamp': 1490401996,
+ 'upload_date': '20170325',
+ 'duration': 3502.04,
+ 'formats': 'mincount:20',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
@@ -88,7 +104,7 @@ class DRTVIE(InfoExtractor):
elif kind in ('VideoResource', 'AudioResource'):
duration = float_or_none(asset.get('DurationInMilliseconds'), 1000)
restricted_to_denmark = asset.get('RestrictedToDenmark')
- spoken_subtitles = asset.get('Target') == 'SpokenSubtitles'
+ asset_target = asset.get('Target')
for link in asset.get('Links', []):
uri = link.get('Uri')
if not uri:
@@ -96,9 +112,9 @@ class DRTVIE(InfoExtractor):
target = link.get('Target')
format_id = target or ''
preference = None
- if spoken_subtitles:
+ if asset_target in ('SpokenSubtitles', 'SignLanguage'):
preference = -1
- format_id += '-spoken-subtitles'
+ format_id += '-%s' % asset_target
if target == 'HDS':
f4m_formats = self._extract_f4m_formats(
uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',
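
The DRTV change generalizes the old spoken-subtitles special case: any accessibility rendition, SpokenSubtitles or SignLanguage, is deprioritized and has its target name appended to the format id. Roughly:

    for asset_target in ('Default', 'SpokenSubtitles', 'SignLanguage'):
        format_id, preference = 'HLS', None
        if asset_target in ('SpokenSubtitles', 'SignLanguage'):
            preference = -1
            format_id += '-%s' % asset_target
        print(format_id, preference)
    # HLS None / HLS-SpokenSubtitles -1 / HLS-SignLanguage -1
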
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index 6a7028a..ed603eb 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -19,6 +19,7 @@ from .acast import (
ACastChannelIE,
)
from .addanime import AddAnimeIE
+from .adn import ADNIE
from .adobetv import (
AdobeTVIE,
AdobeTVShowIE,
@@ -40,6 +41,7 @@ from .alphaporno import AlphaPornoIE
from .amcnetworks import AMCNetworksIE
from .animeondemand import AnimeOnDemandIE
from .anitube import AnitubeIE
+from .anvato import AnvatoIE
from .anysex import AnySexIE
from .aol import AolIE
from .allocine import AllocineIE
@@ -86,7 +88,6 @@ from .azmedien import (
AZMedienPlaylistIE,
AZMedienShowPlaylistIE,
)
-from .azubu import AzubuIE, AzubuLiveIE
from .baidu import BaiduVideoIE
from .bambuser import BambuserIE, BambuserChannelIE
from .bandcamp import BandcampIE, BandcampAlbumIE
@@ -164,7 +165,10 @@ from .ccc import CCCIE
from .ccma import CCMAIE
from .cctv import CCTVIE
from .cda import CDAIE
-from .ceskatelevize import CeskaTelevizeIE
+from .ceskatelevize import (
+ CeskaTelevizeIE,
+ CeskaTelevizePoradyIE,
+)
from .channel9 import Channel9IE
from .charlierose import CharlieRoseIE
from .chaturbate import ChaturbateIE
@@ -247,7 +251,10 @@ from .democracynow import DemocracynowIE
from .dfb import DFBIE
from .dhm import DHMIE
from .dotsub import DotsubIE
-from .douyutv import DouyuTVIE
+from .douyutv import (
+ DouyuShowIE,
+ DouyuTVIE,
+)
from .dplay import (
DPlayIE,
DPlayItIE,
@@ -272,6 +279,7 @@ from .discoverygo import (
DiscoveryGoPlaylistIE,
)
from .discoverynetworks import DiscoveryNetworksDeIE
+from .discoveryvr import DiscoveryVRIE
from .disney import DisneyIE
from .dispeak import DigitallySpeakingIE
from .dropbox import DropboxIE
@@ -345,9 +353,9 @@ from .foxsports import FoxSportsIE
from .franceculture import FranceCultureIE
from .franceinter import FranceInterIE
from .francetv import (
- PluzzIE,
- FranceTvInfoIE,
FranceTVIE,
+ FranceTVEmbedIE,
+ FranceTVInfoIE,
GenerationQuoiIE,
CultureboxIE,
)
@@ -379,6 +387,7 @@ from .globo import (
GloboArticleIE,
)
from .go import GoIE
+from .go90 import Go90IE
from .godtube import GodTubeIE
from .godtv import GodTVIE
from .golem import GolemIE
@@ -536,6 +545,8 @@ from .mangomolo import (
)
from .matchtv import MatchTVIE
from .mdr import MDRIE
+from .mediaset import MediasetIE
+from .medici import MediciIE
from .meipai import MeipaiIE
from .melonvod import MelonVODIE
from .meta import METAIE
@@ -656,6 +667,8 @@ from .nintendo import NintendoIE
from .njpwworld import NJPWWorldIE
from .nobelprize import NobelPrizeIE
from .noco import NocoIE
+from .nonktube import NonkTubeIE
+from .noovo import NoovoIE
from .normalboots import NormalbootsIE
from .nosvideo import NosVideoIE
from .nova import NovaIE
@@ -724,10 +737,14 @@ from .openload import OpenloadIE
from .ora import OraTVIE
from .orf import (
ORFTVthekIE,
- ORFOE1IE,
ORFFM4IE,
+ ORFOE1IE,
ORFIPTVIE,
)
+from .packtpub import (
+ PacktPubIE,
+ PacktPubCourseIE,
+)
from .pandatv import PandaTVIE
from .pandoratv import PandoraTVIE
from .parliamentliveuk import ParliamentLiveUKIE
@@ -797,7 +814,7 @@ from .radiojavan import RadioJavanIE
from .radiobremen import RadioBremenIE
from .radiofrance import RadioFranceIE
from .rai import (
- RaiTVIE,
+ RaiPlayIE,
RaiIE,
)
from .rbmaradio import RBMARadioIE
@@ -828,7 +845,11 @@ from .rozhlas import RozhlasIE
from .rtbf import RTBFIE
from .rte import RteIE, RteRadioIE
from .rtlnl import RtlNlIE
-from .rtl2 import RTL2IE
+from .rtl2 import (
+ RTL2IE,
+ RTL2YouIE,
+ RTL2YouSeriesIE,
+)
from .rtp import RTPIE
from .rts import RTSIE
from .rtve import RTVEALaCartaIE, RTVELiveIE, RTVEInfantilIE, RTVELiveIE, RTVETelevisionIE
@@ -924,6 +945,7 @@ from .srmediathek import SRMediathekIE
from .stanfordoc import StanfordOpenClassroomIE
from .steam import SteamIE
from .streamable import StreamableIE
+from .streamango import StreamangoIE
from .streamcloud import StreamcloudIE
from .streamcz import StreamCZIE
from .streetvoice import StreetVoiceIE
@@ -970,6 +992,7 @@ from .theplatform import (
from .thescene import TheSceneIE
from .thesixtyone import TheSixtyOneIE
from .thestar import TheStarIE
+from .thesun import TheSunIE
from .theweatherchannel import TheWeatherChannelIE
from .thisamericanlife import ThisAmericanLifeIE
from .thisav import ThisAVIE
@@ -1016,8 +1039,10 @@ from .tv2 import (
TV2IE,
TV2ArticleIE,
)
+from .tv2hu import TV2HuIE
from .tv3 import TV3IE
from .tv4 import TV4IE
+from .tv5mondeplus import TV5MondePlusIE
from .tva import TVAIE
from .tvanouvelles import (
TVANouvellesIE,
@@ -1078,6 +1103,10 @@ from .uplynk import (
UplynkIE,
UplynkPreplayIE,
)
+from .upskill import (
+ UpskillIE,
+ UpskillCourseIE,
+)
from .urort import UrortIE
from .urplay import URPlayIE
from .usanetwork import USANetworkIE
@@ -1105,6 +1134,7 @@ from .vgtv import (
from .vh1 import VH1IE
from .vice import (
ViceIE,
+ ViceArticleIE,
ViceShowIE,
)
from .viceland import VicelandIE
@@ -1177,6 +1207,11 @@ from .voxmedia import VoxMediaIE
from .vporn import VpornIE
from .vrt import VRTIE
from .vrak import VrakIE
+from .vrv import (
+ VRVIE,
+ VRVSeriesIE,
+)
+from .vshare import VShareIE
from .medialaan import MedialaanIE
from .vube import VubeIE
from .vuclip import VuClipIE
@@ -1210,7 +1245,10 @@ from .wrzuta import (
WrzutaIE,
WrzutaPlaylistIE,
)
-from .wsj import WSJIE
+from .wsj import (
+ WSJIE,
+ WSJArticleIE,
+)
from .xbef import XBefIE
from .xboxclips import XboxClipsIE
from .xfileshare import XFileShareIE
@@ -1272,5 +1310,6 @@ from .youtube import (
YoutubeWatchLaterIE,
)
from .zapiks import ZapiksIE
+from .zaq1 import Zaq1IE
from .zdf import ZDFIE, ZDFChannelIE
from .zingmp3 import ZingMp3IE
diff --git a/youtube_dl/extractor/foxsports.py b/youtube_dl/extractor/foxsports.py
index a3bb983..9855427 100644
--- a/youtube_dl/extractor/foxsports.py
+++ b/youtube_dl/extractor/foxsports.py
@@ -11,10 +11,10 @@ class FoxSportsIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?foxsports\.com/(?:[^/]+/)*(?P<id>[^/]+)'
_TEST = {
- 'url': 'http://www.foxsports.com/video?vid=432609859715',
+ 'url': 'http://www.foxsports.com/tennessee/video/432609859715',
'md5': 'b49050e955bebe32c301972e4012ac17',
'info_dict': {
- 'id': 'i0qKWsk3qJaM',
+ 'id': 'bwduI3X_TgUB',
'ext': 'mp4',
'title': 'Courtney Lee on going up 2-0 in series vs. Blazers',
'description': 'Courtney Lee talks about Memphis being focused.',
@@ -31,8 +31,9 @@ class FoxSportsIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
config = self._parse_json(
- self._search_regex(
- r"data-player-config='([^']+)'", webpage, 'data player config'),
+ self._html_search_regex(
+ r"""class="[^"]*(?:fs-player|platformPlayer-wrapper)[^"]*".+?data-player-config='([^']+)'""",
+ webpage, 'data player config'),
video_id)
return self.url_result(smuggle_url(update_url_query(
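
The tightened FoxSports regex anchors on a player wrapper class before data-player-config, and _html_search_regex additionally unescapes HTML entities in the match. A standalone check against invented markup:

    import re

    webpage = ('<div class="video platformPlayer-wrapper" '
               'data-player-config=\'{"id": "bwduI3X_TgUB"}\'></div>')
    config = re.search(
        r"""class="[^"]*(?:fs-player|platformPlayer-wrapper)[^"]*".+?data-player-config='([^']+)'""",
        webpage).group(1)
    print(config)  # {"id": "bwduI3X_TgUB"}
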
diff --git a/youtube_dl/extractor/francetv.py b/youtube_dl/extractor/francetv.py
index 48d43ae..546d5ca 100644
--- a/youtube_dl/extractor/francetv.py
+++ b/youtube_dl/extractor/francetv.py
@@ -21,11 +21,13 @@ from .dailymotion import (
class FranceTVBaseInfoExtractor(InfoExtractor):
- def _extract_video(self, video_id, catalogue):
+ def _extract_video(self, video_id, catalogue=None):
info = self._download_json(
- 'http://webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/?idDiffusion=%s&catalogue=%s'
- % (video_id, catalogue),
- video_id, 'Downloading video JSON')
+ 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/',
+ video_id, 'Downloading video JSON', query={
+ 'idDiffusion': video_id,
+ 'catalogue': catalogue or '',
+ })
if info.get('status') == 'NOK':
raise ExtractorError(
@@ -109,27 +111,97 @@ class FranceTVBaseInfoExtractor(InfoExtractor):
}
-class PluzzIE(FranceTVBaseInfoExtractor):
- IE_NAME = 'pluzz.francetv.fr'
- _VALID_URL = r'https?://(?:m\.)?pluzz\.francetv\.fr/videos/(?P<id>.+?)\.html'
+class FranceTVIE(FranceTVBaseInfoExtractor):
+ _VALID_URL = r'https?://(?:(?:www\.)?france\.tv|mobile\.france\.tv)/(?:[^/]+/)+(?P<id>[^/]+)\.html'
- # Can't use tests, videos expire in 7 days
+ _TESTS = [{
+ 'url': 'https://www.france.tv/france-2/13h15-le-dimanche/140921-les-mysteres-de-jesus.html',
+ 'info_dict': {
+ 'id': '157550144',
+ 'ext': 'mp4',
+ 'title': '13h15, le dimanche... - Les mystères de Jésus',
+ 'description': 'md5:75efe8d4c0a8205e5904498ffe1e1a42',
+ 'timestamp': 1494156300,
+ 'upload_date': '20170507',
+ },
+ 'params': {
+ # m3u8 downloads
+ 'skip_download': True,
+ },
+ }, {
+ # france3
+ 'url': 'https://www.france.tv/france-3/des-chiffres-et-des-lettres/139063-emission-du-mardi-9-mai-2017.html',
+ 'only_matching': True,
+ }, {
+ # france4
+ 'url': 'https://www.france.tv/france-4/hero-corp/saison-1/134151-apres-le-calme.html',
+ 'only_matching': True,
+ }, {
+ # france5
+ 'url': 'https://www.france.tv/france-5/c-a-dire/saison-10/137013-c-a-dire.html',
+ 'only_matching': True,
+ }, {
+ # franceo
+ 'url': 'https://www.france.tv/france-o/archipels/132249-mon-ancetre-l-esclave.html',
+ 'only_matching': True,
+ }, {
+ # france2 live
+ 'url': 'https://www.france.tv/france-2/direct.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.france.tv/documentaires/histoire/136517-argentine-les-500-bebes-voles-de-la-dictature.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.france.tv/jeux-et-divertissements/divertissements/133965-le-web-contre-attaque.html',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://mobile.france.tv/france-5/c-dans-l-air/137347-emission-du-vendredi-12-mai-2017.html',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- video_id = self._html_search_meta(
- 'id_video', webpage, 'video id', default=None)
+ catalogue = None
+ video_id = self._search_regex(
+ r'data-main-video=(["\'])(?P<id>(?:(?!\1).)+)\1',
+ webpage, 'video id', default=None, group='id')
+
if not video_id:
- video_id = self._search_regex(
- r'data-diffusion=["\'](\d+)', webpage, 'video id')
+ video_id, catalogue = self._html_search_regex(
+ r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"',
+ webpage, 'video ID').split('@')
+ return self._extract_video(video_id, catalogue)
- return self._extract_video(video_id, 'Pluzz')
+class FranceTVEmbedIE(FranceTVBaseInfoExtractor):
+ _VALID_URL = r'https?://embed\.francetv\.fr/*\?.*?\bue=(?P<id>[^&]+)'
-class FranceTvInfoIE(FranceTVBaseInfoExtractor):
+ _TEST = {
+ 'url': 'http://embed.francetv.fr/?ue=7fd581a2ccf59d2fc5719c5c13cf6961',
+ 'info_dict': {
+ 'id': 'NI_983319',
+ 'ext': 'mp4',
+ 'title': 'Le Pen Reims',
+ 'upload_date': '20170505',
+ 'timestamp': 1493981780,
+ 'duration': 16,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ video = self._download_json(
+ 'http://api-embed.webservices.francetelevisions.fr/key/%s' % video_id,
+ video_id)
+
+ return self._extract_video(video['video_id'], video.get('catalog'))
+
+
+class FranceTVInfoIE(FranceTVBaseInfoExtractor):
IE_NAME = 'francetvinfo.fr'
_VALID_URL = r'https?://(?:www|mobile|france3-regions)\.francetvinfo\.fr/(?:[^/]+/)*(?P<title>[^/?#&.]+)'
@@ -233,124 +305,6 @@ class FranceTvInfoIE(FranceTVBaseInfoExtractor):
return self._extract_video(video_id, catalogue)
-class FranceTVIE(FranceTVBaseInfoExtractor):
- IE_NAME = 'francetv'
- IE_DESC = 'France 2, 3, 4, 5 and Ô'
- _VALID_URL = r'''(?x)
- https?://
- (?:
- (?:www\.)?france[2345o]\.fr/
- (?:
- emissions/[^/]+/(?:videos|diffusions)|
- emission/[^/]+|
- videos|
- jt
- )
- /|
- embed\.francetv\.fr/\?ue=
- )
- (?P<id>[^/?]+)
- '''
-
- _TESTS = [
- # france2
- {
- 'url': 'http://www.france2.fr/emissions/13h15-le-samedi-le-dimanche/videos/75540104',
- 'md5': 'c03fc87cb85429ffd55df32b9fc05523',
- 'info_dict': {
- 'id': '109169362',
- 'ext': 'flv',
- 'title': '13h15, le dimanche...',
- 'description': 'md5:9a0932bb465f22d377a449be9d1a0ff7',
- 'upload_date': '20140914',
- 'timestamp': 1410693600,
- },
- },
- # france3
- {
- 'url': 'http://www.france3.fr/emissions/pieces-a-conviction/diffusions/13-11-2013_145575',
- 'md5': '679bb8f8921f8623bd658fa2f8364da0',
- 'info_dict': {
- 'id': '000702326_CAPP_PicesconvictionExtrait313022013_120220131722_Au',
- 'ext': 'mp4',
- 'title': 'Le scandale du prix des médicaments',
- 'description': 'md5:1384089fbee2f04fc6c9de025ee2e9ce',
- 'upload_date': '20131113',
- 'timestamp': 1384380000,
- },
- },
- # france4
- {
- 'url': 'http://www.france4.fr/emissions/hero-corp/videos/rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4',
- 'md5': 'a182bf8d2c43d88d46ec48fbdd260c1c',
- 'info_dict': {
- 'id': 'rhozet_herocorp_bonus_1_20131106_1923_06112013172108_F4',
- 'ext': 'mp4',
- 'title': 'Hero Corp Making of - Extrait 1',
- 'description': 'md5:c87d54871b1790679aec1197e73d650a',
- 'upload_date': '20131106',
- 'timestamp': 1383766500,
- },
- },
- # france5
- {
- 'url': 'http://www.france5.fr/emissions/c-a-dire/videos/quels_sont_les_enjeux_de_cette_rentree_politique__31-08-2015_908948?onglet=tous&page=1',
- 'md5': 'f6c577df3806e26471b3d21631241fd0',
- 'info_dict': {
- 'id': '123327454',
- 'ext': 'flv',
- 'title': 'C à dire ?! - Quels sont les enjeux de cette rentrée politique ?',
- 'description': 'md5:4a0d5cb5dce89d353522a84462bae5a4',
- 'upload_date': '20150831',
- 'timestamp': 1441035120,
- },
- },
- # franceo
- {
- 'url': 'http://www.franceo.fr/jt/info-soir/18-07-2015',
- 'md5': '47d5816d3b24351cdce512ad7ab31da8',
- 'info_dict': {
- 'id': '125377621',
- 'ext': 'flv',
- 'title': 'Infô soir',
- 'description': 'md5:01b8c6915a3d93d8bbbd692651714309',
- 'upload_date': '20150718',
- 'timestamp': 1437241200,
- 'duration': 414,
- },
- },
- {
- # francetv embed
- 'url': 'http://embed.francetv.fr/?ue=8d7d3da1e3047c42ade5a5d7dfd3fc87',
- 'info_dict': {
- 'id': 'EV_30231',
- 'ext': 'flv',
- 'title': 'Alcaline, le concert avec Calogero',
- 'description': 'md5:61f08036dcc8f47e9cfc33aed08ffaff',
- 'upload_date': '20150226',
- 'timestamp': 1424989860,
- 'duration': 5400,
- },
- },
- {
- 'url': 'http://www.france4.fr/emission/highlander/diffusion-du-17-07-2015-04h05',
- 'only_matching': True,
- },
- {
- 'url': 'http://www.franceo.fr/videos/125377617',
- 'only_matching': True,
- }
- ]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- video_id, catalogue = self._html_search_regex(
- r'(?:href=|player\.setVideo\(\s*)"http://videos?\.francetv\.fr/video/([^@]+@[^"]+)"',
- webpage, 'video ID').split('@')
- return self._extract_video(video_id, catalogue)
-
-
class GenerationQuoiIE(InfoExtractor):
IE_NAME = 'france2.fr:generation-quoi'
_VALID_URL = r'https?://generation-quoi\.france2\.fr/portrait/(?P<id>[^/?#]+)'
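
Switching from manual %-interpolation to the query= keyword lets _download_json build and percent-encode the query string itself. Roughly what that expands to (stdlib-only sketch):

    try:
        from urllib.parse import urlencode  # Python 3
    except ImportError:
        from urllib import urlencode  # Python 2

    base = 'https://sivideo.webservices.francetelevisions.fr/tools/getInfosOeuvre/v2/'
    print('%s?%s' % (base, urlencode({'idDiffusion': '157550144', 'catalogue': ''})))
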
diff --git a/youtube_dl/extractor/funimation.py b/youtube_dl/extractor/funimation.py
index eba00cd..8c37509 100644
--- a/youtube_dl/extractor/funimation.py
+++ b/youtube_dl/extractor/funimation.py
@@ -2,231 +2,148 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import (
- compat_HTTPError,
- compat_urllib_parse_unquote_plus,
-)
+from ..compat import compat_HTTPError
from ..utils import (
- clean_html,
determine_ext,
int_or_none,
- sanitized_Request,
+ js_to_json,
ExtractorError,
urlencode_postdata
)
class FunimationIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?funimation\.com/shows/[^/]+/videos/(?:official|promotional)/(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?funimation(?:\.com|now\.uk)/shows/[^/]+/(?P<id>[^/?#&]+)'
_NETRC_MACHINE = 'funimation'
+ _TOKEN = None
_TESTS = [{
- 'url': 'http://www.funimation.com/shows/air/videos/official/breeze',
+ 'url': 'https://www.funimation.com/shows/hacksign/role-play/',
'info_dict': {
- 'id': '658',
- 'display_id': 'breeze',
- 'ext': 'mp4',
- 'title': 'Air - 1 - Breeze',
- 'description': 'md5:1769f43cd5fc130ace8fd87232207892',
- 'thumbnail': r're:https?://.*\.jpg',
- },
- 'skip': 'Access without user interaction is forbidden by CloudFlare, and video removed',
- }, {
- 'url': 'http://www.funimation.com/shows/hacksign/videos/official/role-play',
- 'info_dict': {
- 'id': '31128',
+ 'id': '91144',
'display_id': 'role-play',
'ext': 'mp4',
- 'title': '.hack//SIGN - 1 - Role Play',
+ 'title': '.hack//SIGN - Role Play',
'description': 'md5:b602bdc15eef4c9bbb201bb6e6a4a2dd',
'thumbnail': r're:https?://.*\.jpg',
},
- 'skip': 'Access without user interaction is forbidden by CloudFlare',
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
}, {
- 'url': 'http://www.funimation.com/shows/attack-on-titan-junior-high/videos/promotional/broadcast-dub-preview',
+ 'url': 'https://www.funimation.com/shows/attack-on-titan-junior-high/broadcast-dub-preview/',
'info_dict': {
- 'id': '9635',
+ 'id': '210051',
'display_id': 'broadcast-dub-preview',
'ext': 'mp4',
'title': 'Attack on Titan: Junior High - Broadcast Dub Preview',
- 'description': 'md5:f8ec49c0aff702a7832cd81b8a44f803',
'thumbnail': r're:https?://.*\.(?:jpg|png)',
},
- 'skip': 'Access without user interaction is forbidden by CloudFlare',
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.funimationnow.uk/shows/puzzle-dragons-x/drop-impact/simulcast/',
+ 'only_matching': True,
}]
- _LOGIN_URL = 'http://www.funimation.com/login'
-
- def _download_webpage(self, *args, **kwargs):
- try:
- return super(FunimationIE, self)._download_webpage(*args, **kwargs)
- except ExtractorError as ee:
- if isinstance(ee.cause, compat_HTTPError) and ee.cause.code == 403:
- response = ee.cause.read()
- if b'>Please complete the security check to access<' in response:
- raise ExtractorError(
- 'Access to funimation.com is blocked by CloudFlare. '
- 'Please browse to http://www.funimation.com/, solve '
- 'the reCAPTCHA, export browser cookies to a text file,'
- ' and then try again with --cookies YOUR_COOKIE_FILE.',
- expected=True)
- raise
-
- def _extract_cloudflare_session_ua(self, url):
- ci_session_cookie = self._get_cookies(url).get('ci_session')
- if ci_session_cookie:
- ci_session = compat_urllib_parse_unquote_plus(ci_session_cookie.value)
- # ci_session is a string serialized by PHP function serialize()
- # This case is simple enough to use regular expressions only
- return self._search_regex(
- r'"user_agent";s:\d+:"([^"]+)"', ci_session, 'user agent',
- default=None)
-
def _login(self):
(username, password) = self._get_login_info()
if username is None:
return
- data = urlencode_postdata({
- 'email_field': username,
- 'password_field': password,
- })
- user_agent = self._extract_cloudflare_session_ua(self._LOGIN_URL)
- if not user_agent:
- user_agent = 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0'
- login_request = sanitized_Request(self._LOGIN_URL, data, headers={
- 'User-Agent': user_agent,
- 'Content-Type': 'application/x-www-form-urlencoded'
- })
- login_page = self._download_webpage(
- login_request, None, 'Logging in as %s' % username)
- if any(p in login_page for p in ('funimation.com/logout', '>Log Out<')):
- return
- error = self._html_search_regex(
- r'(?s)<div[^>]+id=["\']errorMessages["\'][^>]*>(.+?)</div>',
- login_page, 'error messages', default=None)
- if error:
- raise ExtractorError('Unable to login: %s' % error, expected=True)
- raise ExtractorError('Unable to log in')
+ try:
+ data = self._download_json(
+ 'https://prod-api-funimationnow.dadcdigital.com/api/auth/login/',
+ None, 'Logging in as %s' % username, data=urlencode_postdata({
+ 'username': username,
+ 'password': password,
+ }))
+ self._TOKEN = data['token']
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 401:
+ error = self._parse_json(e.cause.read().decode(), None)['error']
+ raise ExtractorError(error, expected=True)
+ raise
def _real_initialize(self):
self._login()
def _real_extract(self, url):
display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
- errors = []
- formats = []
-
- ERRORS_MAP = {
- 'ERROR_MATURE_CONTENT_LOGGED_IN': 'matureContentLoggedIn',
- 'ERROR_MATURE_CONTENT_LOGGED_OUT': 'matureContentLoggedOut',
- 'ERROR_SUBSCRIPTION_LOGGED_OUT': 'subscriptionLoggedOut',
- 'ERROR_VIDEO_EXPIRED': 'videoExpired',
- 'ERROR_TERRITORY_UNAVAILABLE': 'territoryUnavailable',
- 'SVODBASIC_SUBSCRIPTION_IN_PLAYER': 'basicSubscription',
- 'SVODNON_SUBSCRIPTION_IN_PLAYER': 'nonSubscription',
- 'ERROR_PLAYER_NOT_RESPONDING': 'playerNotResponding',
- 'ERROR_UNABLE_TO_CONNECT_TO_CDN': 'unableToConnectToCDN',
- 'ERROR_STREAM_NOT_FOUND': 'streamNotFound',
- }
-
- USER_AGENTS = (
- # PC UA is served with m3u8 that provides some bonus lower quality formats
- ('pc', 'Mozilla/5.0 (Windows NT 5.2; WOW64; rv:42.0) Gecko/20100101 Firefox/42.0'),
- # Mobile UA allows to extract direct links and also does not fail when
- # PC UA fails with hulu error (e.g.
- # http://www.funimation.com/shows/hacksign/videos/official/role-play)
- ('mobile', 'Mozilla/5.0 (Linux; Android 4.4.2; Nexus 4 Build/KOT49H) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.114 Mobile Safari/537.36'),
- )
-
- user_agent = self._extract_cloudflare_session_ua(url)
- if user_agent:
- USER_AGENTS = ((None, user_agent),)
-
- for kind, user_agent in USER_AGENTS:
- request = sanitized_Request(url)
- request.add_header('User-Agent', user_agent)
- webpage = self._download_webpage(
- request, display_id,
- 'Downloading %s webpage' % kind if kind else 'Downloading webpage')
-
- playlist = self._parse_json(
- self._search_regex(
- r'var\s+playersData\s*=\s*(\[.+?\]);\n',
- webpage, 'players data'),
- display_id)[0]['playlist']
-
- items = next(item['items'] for item in playlist if item.get('items'))
- item = next(item for item in items if item.get('itemAK') == display_id)
-
- error_messages = {}
- video_error_messages = self._search_regex(
- r'var\s+videoErrorMessages\s*=\s*({.+?});\n',
- webpage, 'error messages', default=None)
- if video_error_messages:
- error_messages_json = self._parse_json(video_error_messages, display_id, fatal=False)
- if error_messages_json:
- for _, error in error_messages_json.items():
- type_ = error.get('type')
- description = error.get('description')
- content = error.get('content')
- if type_ == 'text' and description and content:
- error_message = ERRORS_MAP.get(description)
- if error_message:
- error_messages[error_message] = content
-
- for video in item.get('videoSet', []):
- auth_token = video.get('authToken')
- if not auth_token:
- continue
- funimation_id = video.get('FUNImationID') or video.get('videoId')
- preference = 1 if video.get('languageMode') == 'dub' else 0
- if not auth_token.startswith('?'):
- auth_token = '?%s' % auth_token
- for quality, height in (('sd', 480), ('hd', 720), ('hd1080', 1080)):
- format_url = video.get('%sUrl' % quality)
- if not format_url:
- continue
- if not format_url.startswith(('http', '//')):
- errors.append(format_url)
- continue
- if determine_ext(format_url) == 'm3u8':
- formats.extend(self._extract_m3u8_formats(
- format_url + auth_token, display_id, 'mp4', entry_protocol='m3u8_native',
- preference=preference, m3u8_id='%s-hls' % funimation_id, fatal=False))
- else:
- tbr = int_or_none(self._search_regex(
- r'-(\d+)[Kk]', format_url, 'tbr', default=None))
- formats.append({
- 'url': format_url + auth_token,
- 'format_id': '%s-http-%dp' % (funimation_id, height),
- 'height': height,
- 'tbr': tbr,
- 'preference': preference,
- })
+ def _search_kane(name):
+ return self._search_regex(
+ r"KANE_customdimensions\.%s\s*=\s*'([^']+)';" % name,
+ webpage, name, default=None)
+
+ title_data = self._parse_json(self._search_regex(
+ r'TITLE_DATA\s*=\s*({[^}]+})',
+ webpage, 'title data', default=''),
+ display_id, js_to_json, fatal=False) or {}
+
+ video_id = title_data.get('id') or self._search_regex([
+ r"KANE_customdimensions.videoID\s*=\s*'(\d+)';",
+ r'<iframe[^>]+src="/player/(\d+)"',
+ ], webpage, 'video_id', default=None)
+ if not video_id:
+ player_url = self._html_search_meta([
+ 'al:web:url',
+ 'og:video:url',
+ 'og:video:secure_url',
+ ], webpage, fatal=True)
+ video_id = self._search_regex(r'/player/(\d+)', player_url, 'video id')
+
+ title = episode = title_data.get('title') or _search_kane('videoTitle') or self._og_search_title(webpage)
+ series = _search_kane('showName')
+ if series:
+ title = '%s - %s' % (series, title)
+ description = self._html_search_meta(['description', 'og:description'], webpage, fatal=True)
- if not formats and errors:
- raise ExtractorError(
- '%s returned error: %s'
- % (self.IE_NAME, clean_html(error_messages.get(errors[0], errors[0]))),
- expected=True)
+ try:
+ headers = {}
+ if self._TOKEN:
+ headers['Authorization'] = 'Token %s' % self._TOKEN
+ sources = self._download_json(
+ 'https://prod-api-funimationnow.dadcdigital.com/api/source/catalog/video/%s/signed/' % video_id,
+ video_id, headers=headers)['items']
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ error = self._parse_json(e.cause.read(), video_id)['errors'][0]
+ raise ExtractorError('%s said: %s' % (
+ self.IE_NAME, error.get('detail') or error.get('title')), expected=True)
+ raise
+ formats = []
+ for source in sources:
+ source_url = source.get('src')
+ if not source_url:
+ continue
+ source_type = source.get('videoType') or determine_ext(source_url)
+ if source_type == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ source_url, video_id, 'mp4',
+ m3u8_id='hls', fatal=False))
+ else:
+ formats.append({
+ 'format_id': source_type,
+ 'url': source_url,
+ })
self._sort_formats(formats)
- title = item['title']
- artist = item.get('artist')
- if artist:
- title = '%s - %s' % (artist, title)
- description = self._og_search_description(webpage) or item.get('description')
- thumbnail = self._og_search_thumbnail(webpage) or item.get('posterUrl')
- video_id = item.get('itemId') or display_id
-
return {
'id': video_id,
'display_id': display_id,
'title': title,
'description': description,
- 'thumbnail': thumbnail,
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'series': series,
+ 'season_number': int_or_none(title_data.get('seasonNum') or _search_kane('season')),
+ 'episode_number': int_or_none(title_data.get('episodeNum')),
+ 'episode': episode,
+ 'season_id': title_data.get('seriesId'),
'formats': formats,
}
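
The rewritten Funimation flow boils down to two JSON calls: POST the credentials to the auth endpoint, keep the returned token, and replay it as an 'Authorization: Token ...' header when requesting signed sources. A bare-bones sketch of the second call (stdlib only; the 403 handling from the hunk above is omitted):

    import json
    try:
        from urllib.request import Request, urlopen  # Python 3
    except ImportError:
        from urllib2 import Request, urlopen  # Python 2

    def get_signed_sources(video_id, token=None):
        url = ('https://prod-api-funimationnow.dadcdigital.com'
               '/api/source/catalog/video/%s/signed/' % video_id)
        headers = {'Authorization': 'Token %s' % token} if token else {}
        return json.loads(urlopen(Request(url, headers=headers)).read().decode())['items']
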
diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py
index 81c0ce9..4940936 100644
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -58,8 +58,7 @@ class FunnyOrDieIE(InfoExtractor):
m3u8_url, video_id, 'mp4', 'm3u8_native',
m3u8_id='hls', fatal=False)
source_formats = list(filter(
- lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
- m3u8_formats))
+ lambda f: f.get('vcodec') != 'none', m3u8_formats))
bitrates = [int(bitrate) for bitrate in re.findall(r'[,/]v(\d+)(?=[,/])', m3u8_url)]
bitrates.sort()
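
This hunk and the GameSpot one below drop the same redundant guard: variant entries whose resolution is reported as 'multiple' are no longer discarded, only audio-only ones (vcodec == 'none') are. Illustrated on made-up format dicts:

    m3u8_formats = [{'vcodec': 'avc1', 'resolution': 'multiple'}, {'vcodec': 'none'}]
    print([f for f in m3u8_formats if f.get('vcodec') != 'none'])
    # the 'multiple'-resolution variant is now kept; the audio-only one is not
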
diff --git a/youtube_dl/extractor/gamespot.py b/youtube_dl/extractor/gamespot.py
index 682c49e..00d3111 100644
--- a/youtube_dl/extractor/gamespot.py
+++ b/youtube_dl/extractor/gamespot.py
@@ -78,8 +78,7 @@ class GameSpotIE(OnceIE):
if m3u8_formats:
self._sort_formats(m3u8_formats)
m3u8_formats = list(filter(
- lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
- m3u8_formats))
+ lambda f: f.get('vcodec') != 'none', m3u8_formats))
if len(qualities) == len(m3u8_formats):
for q, m3u8_format in zip(qualities, m3u8_formats):
f = m3u8_format.copy()
diff --git a/youtube_dl/extractor/gdcvault.py b/youtube_dl/extractor/gdcvault.py
index 3136427..f71d909 100644
--- a/youtube_dl/extractor/gdcvault.py
+++ b/youtube_dl/extractor/gdcvault.py
@@ -75,6 +75,19 @@ class GDCVaultIE(InfoExtractor):
'format': 'jp', # The japanese audio
}
},
+ {
+ # gdc-player.html
+ 'url': 'http://www.gdcvault.com/play/1435/An-American-engine-in-Tokyo',
+ 'info_dict': {
+ 'id': '1435',
+ 'display_id': 'An-American-engine-in-Tokyo',
+ 'ext': 'flv',
+ 'title': 'An American Engine in Tokyo:/nThe collaboration of Epic Games and Square Enix/nFor THE LAST REMINANT',
+ },
+ 'params': {
+ 'skip_download': True, # Requires rtmpdump
+ },
+ },
]
def _login(self, webpage_url, display_id):
@@ -128,7 +141,7 @@ class GDCVaultIE(InfoExtractor):
'title': title,
}
- PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/player.*?\.html.*?".*?</iframe>'
+ PLAYER_REGEX = r'<iframe src="(?P<xml_root>.+?)/(?:gdc-)?player.*?\.html.*?".*?</iframe>'
xml_root = self._html_search_regex(
PLAYER_REGEX, start_page, 'xml root', default=None)
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index 274f817..c108d4a 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -85,6 +85,11 @@ from .ustream import UstreamIE
from .openload import OpenloadIE
from .videopress import VideoPressIE
from .rutube import RutubeIE
+from .limelight import LimelightBaseIE
+from .anvato import AnvatoIE
+from .washingtonpost import WashingtonPostIE
+from .wistia import WistiaIE
+from .mediaset import MediasetIE
class GenericIE(InfoExtractor):
@@ -430,6 +435,22 @@ class GenericIE(InfoExtractor):
},
},
{
+ # Brightcove video in <iframe>
+ 'url': 'http://www.un.org/chinese/News/story.asp?NewsID=27724',
+ 'md5': '36d74ef5e37c8b4a2ce92880d208b968',
+ 'info_dict': {
+ 'id': '5360463607001',
+ 'ext': 'mp4',
+ 'title': '叙利亚失明儿童在废墟上演唱《心跳》 呼吁获得正常童年生活',
+ 'description': '联合国儿童基金会中东和北非区域大使、作曲家扎德·迪拉尼(Zade Dirani)在3月15日叙利亚冲突爆发7周年纪念日之际发布了为叙利亚谱写的歌曲《心跳》(HEARTBEAT),为受到六年冲突影响的叙利亚儿童发出强烈呐喊,呼吁世界做出共同努力,使叙利亚儿童重新获得享有正常童年生活的权利。',
+ 'uploader': 'United Nations',
+ 'uploader_id': '1362235914001',
+ 'timestamp': 1489593889,
+ 'upload_date': '20170315',
+ },
+ 'add_ie': ['BrightcoveLegacy'],
+ },
+ {
# Brightcove with alternative playerID key
'url': 'http://www.nature.com/nmeth/journal/v9/n7/fig_tab/nmeth.2062_SV1.html',
'info_dict': {
@@ -465,6 +486,59 @@ class GenericIE(InfoExtractor):
'params': {
'skip_download': True, # m3u8 download
},
+ 'skip': 'video rotates...weekly?',
+ },
+ {
+ # Brightcove:new type [2].
+ 'url': 'http://www.delawaresportszone.com/video-st-thomas-more-earns-first-trip-to-basketball-semis',
+ 'md5': '2b35148fcf48da41c9fb4591650784f3',
+ 'info_dict': {
+ 'id': '5348741021001',
+ 'ext': 'mp4',
+ 'upload_date': '20170306',
+ 'uploader_id': '4191638492001',
+ 'timestamp': 1488769918,
+ 'title': 'VIDEO: St. Thomas More earns first trip to basketball semis',
+
+ },
+ },
+ {
+ # Alternative brightcove <video> attributes
+ 'url': 'http://www.programme-tv.net/videos/extraits/81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche/',
+ 'info_dict': {
+ 'id': '81095-guillaume-canet-evoque-les-rumeurs-d-infidelite-de-marion-cotillard-avec-brad-pitt-dans-vivement-dimanche',
+ 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche, Extraits : toutes les vidéos avec Télé-Loisirs",
+ },
+ 'playlist': [{
+ 'md5': '732d22ba3d33f2f3fc253c39f8f36523',
+ 'info_dict': {
+ 'id': '5311302538001',
+ 'ext': 'mp4',
+ 'title': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche",
+ 'description': "Guillaume Canet évoque les rumeurs d'infidélité de Marion Cotillard avec Brad Pitt dans Vivement Dimanche (France 2, 5 février 2017)",
+ 'timestamp': 1486321708,
+ 'upload_date': '20170205',
+ 'uploader_id': '800000640001',
+ },
+ 'only_matching': True,
+ }],
+ },
+ {
+ # Brightcove with UUID in videoPlayer
+ 'url': 'http://www8.hp.com/cn/zh/home.html',
+ 'info_dict': {
+ 'id': '5255815316001',
+ 'ext': 'mp4',
+ 'title': 'Sprocket Video - China',
+ 'description': 'Sprocket Video - China',
+ 'uploader': 'HP-Video Gallery',
+ 'timestamp': 1482263210,
+ 'upload_date': '20161220',
+ 'uploader_id': '1107601872001',
+ },
+ 'params': {
+ 'skip_download': True, # m3u8 download
+ },
},
# ooyala video
{
@@ -730,6 +804,21 @@ class GenericIE(InfoExtractor):
'skip_download': True,
}
},
+ # YouTube <object> embed
+ {
+ 'url': 'http://www.improbable.com/2017/04/03/untrained-modern-youths-and-ancient-masters-in-selfie-portraits/',
+ 'md5': '516718101ec834f74318df76259fb3cc',
+ 'info_dict': {
+ 'id': 'msN87y-iEx0',
+ 'ext': 'webm',
+ 'title': 'Feynman: Mirrors FUN TO IMAGINE 6',
+ 'upload_date': '20080526',
+ 'description': 'md5:0ffc78ea3f01b2e2c247d5f8d1d3c18d',
+ 'uploader': 'Christopher Sykes',
+ 'uploader_id': 'ChristopherJSykes',
+ },
+ 'add_ie': ['Youtube'],
+ },
# Camtasia studio
{
'url': 'http://www.ll.mit.edu/workshops/education/videocourses/antennas/lecture1/video/',
@@ -1080,6 +1169,21 @@ class GenericIE(InfoExtractor):
},
'add_ie': ['Kaltura'],
},
+ {
+ # Kaltura iframe embed
+ 'url': 'http://www.gsd.harvard.edu/event/i-m-pei-a-centennial-celebration/',
+ 'md5': 'ae5ace8eb09dc1a35d03b579a9c2cc44',
+ 'info_dict': {
+ 'id': '0_f2cfbpwy',
+ 'ext': 'mp4',
+ 'title': 'I. M. Pei: A Centennial Celebration',
+ 'description': 'md5:1db8f40c69edc46ca180ba30c567f37c',
+ 'upload_date': '20170403',
+ 'uploader_id': 'batchUser',
+ 'timestamp': 1491232186,
+ },
+ 'add_ie': ['Kaltura'],
+ },
# Eagle.Platform embed (generic URL)
{
'url': 'http://lenta.ru/news/2015/03/06/navalny/',
@@ -1327,6 +1431,22 @@ class GenericIE(InfoExtractor):
'skip_download': True,
},
},
+ {
+ # Brightcove embed with whitespace around attribute names
+ 'url': 'http://www.stack.com/video/3167554373001/learn-to-hit-open-three-pointers-with-damian-lillard-s-baseline-drift-drill',
+ 'info_dict': {
+ 'id': '3167554373001',
+ 'ext': 'mp4',
+ 'title': "Learn to Hit Open Three-Pointers With Damian Lillard's Baseline Drift Drill",
+ 'description': 'md5:57bacb0e0f29349de4972bfda3191713',
+ 'uploader_id': '1079349493',
+ 'upload_date': '20140207',
+ 'timestamp': 1391810548,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
# Another form of arte.tv embed
{
'url': 'http://www.tv-replay.fr/redirection/09-04-16/arte-reportage-arte-11508975.html',
@@ -1568,6 +1688,51 @@ class GenericIE(InfoExtractor):
},
'add_ie': [SenateISVPIE.ie_key()],
},
+ {
+ # Limelight embeds (1 channel embed + 4 media embeds)
+ 'url': 'http://www.sedona.com/FacilitatorTraining2017',
+ 'info_dict': {
+ 'id': 'FacilitatorTraining2017',
+ 'title': 'Facilitator Training 2017',
+ },
+ 'playlist_mincount': 5,
+ },
+ {
+ 'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/',
+ 'info_dict': {
+ 'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest',
+ 'title': 'Standoff with Walnut Creek murder suspect ends',
+ 'description': 'md5:3ccc48a60fc9441eeccfc9c469ebf788',
+ },
+ 'playlist_mincount': 4,
+ },
+ {
+ # WashingtonPost embed
+ 'url': 'http://www.vanityfair.com/hollywood/2017/04/donald-trump-tv-pitches',
+ 'info_dict': {
+ 'id': '8caf6e88-d0ec-11e5-90d3-34c2c42653ac',
+ 'ext': 'mp4',
+ 'title': "No one has seen the drama series based on Trump's life \u2014 until now",
+ 'description': 'Donald Trump wanted a weekly TV drama based on his life. It never aired. But The Washington Post recently obtained a scene from the pilot script — and enlisted actors.',
+ 'timestamp': 1455216756,
+ 'uploader': 'The Washington Post',
+ 'upload_date': '20160211',
+ },
+ 'add_ie': [WashingtonPostIE.ie_key()],
+ },
+ {
+ # Mediaset embed
+ 'url': 'http://www.tgcom24.mediaset.it/politica/serracchiani-voglio-vivere-in-una-societa-aperta-reazioni-sproporzionate-_3071354-201702a.shtml',
+ 'info_dict': {
+ 'id': '720642',
+ 'ext': 'mp4',
+ 'title': 'Serracchiani: "Voglio vivere in una società aperta, con tutela del patto di fiducia"',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': [MediasetIE.ie_key()],
+ },
# {
# # TODO: find another test
# # http://schema.org/VideoObject
@@ -1610,7 +1775,7 @@ class GenericIE(InfoExtractor):
continue
entries.append({
- '_type': 'url',
+ '_type': 'url_transparent',
'url': next_url,
'title': it.find('title').text,
})
@@ -1870,7 +2035,6 @@ class GenericIE(InfoExtractor):
# Look for Brightcove Legacy Studio embeds
bc_urls = BrightcoveLegacyIE._extract_brightcove_urls(webpage)
if bc_urls:
- self.to_screen('Brightcove video detected.')
entries = [{
'_type': 'url',
'url': smuggle_url(bc_url, {'Referer': url}),
@@ -1885,7 +2049,7 @@ class GenericIE(InfoExtractor):
}
# Look for Brightcove New Studio embeds
- bc_urls = BrightcoveNewIE._extract_urls(webpage)
+ bc_urls = BrightcoveNewIE._extract_urls(self, webpage)
if bc_urls:
return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')
@@ -1923,6 +2087,7 @@ class GenericIE(InfoExtractor):
data-video-url=|
<embed[^>]+?src=|
embedSWF\(?:\s*|
+ <object[^>]+data=|
new\s+SWFObject\(
)
(["\'])
@@ -1961,57 +2126,20 @@ class GenericIE(InfoExtractor):
playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
# Look for embedded Wistia player
- match = re.search(
- r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
- if match:
- embed_url = self._proto_relative_url(
- unescapeHTML(match.group('url')))
+ wistia_url = WistiaIE._extract_url(webpage)
+ if wistia_url:
return {
'_type': 'url_transparent',
- 'url': embed_url,
- 'ie_key': 'Wistia',
+ 'url': self._proto_relative_url(wistia_url),
+ 'ie_key': WistiaIE.ie_key(),
'uploader': video_uploader,
}
- match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
- if match:
- return {
- '_type': 'url_transparent',
- 'url': 'wistia:%s' % match.group('id'),
- 'ie_key': 'Wistia',
- 'uploader': video_uploader,
- }
-
- match = re.search(
- r'''(?sx)
- <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*?
- <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2
- ''', webpage)
- if match:
- return self.url_result(self._proto_relative_url(
- 'wistia:%s' % match.group('id')), 'Wistia')
-
# Look for SVT player
svt_url = SVTIE._extract_url(webpage)
if svt_url:
return self.url_result(svt_url, 'SVT')
- # Look for embedded condenast player
- matches = re.findall(
- r'<iframe\s+(?:[a-zA-Z-]+="[^"]+"\s+)*?src="(https?://player\.cnevids\.com/embed/[^"]+")',
- webpage)
- if matches:
- return {
- '_type': 'playlist',
- 'entries': [{
- '_type': 'url',
- 'ie_key': 'CondeNast',
- 'url': ma,
- } for ma in matches],
- 'title': video_title,
- 'id': video_id,
- }
-
# Look for Bandcamp pages with custom domain
mobj = re.search(r'<meta property="og:url"[^>]*?content="(.*?bandcamp\.com.*?)"', webpage)
if mobj is not None:
@@ -2400,28 +2528,16 @@ class GenericIE(InfoExtractor):
return self.url_result(piksel_url, PikselIE.ie_key())
# Look for Limelight embeds
- mobj = re.search(r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})', webpage)
- if mobj:
- lm = {
- 'Media': 'media',
- 'Channel': 'channel',
- 'ChannelList': 'channel_list',
- }
- return self.url_result(smuggle_url('limelight:%s:%s' % (
- lm[mobj.group(1)], mobj.group(2)), {'source_url': url}),
- 'Limelight%s' % mobj.group(1), mobj.group(2))
+ limelight_urls = LimelightBaseIE._extract_urls(webpage, url)
+ if limelight_urls:
+ return self.playlist_result(
+ limelight_urls, video_id, video_title, video_description)
- mobj = re.search(
- r'''(?sx)
- <object[^>]+class=(["\'])LimelightEmbeddedPlayerFlash\1[^>]*>.*?
- <param[^>]+
- name=(["\'])flashVars\2[^>]+
- value=(["\'])(?:(?!\3).)*mediaId=(?P<id>[a-z0-9]{32})
- ''', webpage)
- if mobj:
- return self.url_result(smuggle_url(
- 'limelight:media:%s' % mobj.group('id'),
- {'source_url': url}), 'LimelightMedia', mobj.group('id'))
+ # Look for Anvato embeds
+ anvato_urls = AnvatoIE._extract_urls(self, webpage, video_id)
+ if anvato_urls:
+ return self.playlist_result(
+ anvato_urls, video_id, video_title, video_description)
# Look for AdobeTVVideo embeds
mobj = re.search(
@@ -2540,6 +2656,18 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
rutube_urls, ie=RutubeIE.ie_key())
+ # Look for WashingtonPost embeds
+ wapo_urls = WashingtonPostIE._extract_urls(webpage)
+ if wapo_urls:
+ return self.playlist_from_matches(
+ wapo_urls, video_id, video_title, ie=WashingtonPostIE.ie_key())
+
+ # Look for Mediaset embeds
+ mediaset_urls = MediasetIE._extract_urls(webpage)
+ if mediaset_urls:
+ return self.playlist_from_matches(
+ mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key())
+
# Looking for http://schema.org/VideoObject
json_ld = self._search_json_ld(
webpage, video_id, default={}, expected_type='VideoObject')
@@ -2568,7 +2696,7 @@ class GenericIE(InfoExtractor):
webpage, video_id, transform_source=js_to_json)
if jwplayer_data:
info = self._parse_jwplayer_data(
- jwplayer_data, video_id, require_title=False)
+ jwplayer_data, video_id, require_title=False, base_url=url)
if not info.get('title'):
info['title'] = video_title
return info
@@ -2580,7 +2708,7 @@ class GenericIE(InfoExtractor):
return True
vpath = compat_urlparse.urlparse(vurl).path
vext = determine_ext(vpath)
- return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js')
+ return '.' in vpath and vext not in ('swf', 'png', 'jpg', 'srt', 'sbv', 'sub', 'vtt', 'ttml', 'js', 'xml')
def filter_video(urls):
return list(filter(check_video, urls))
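
Several hunks in generic.py apply one refactor: per-site embed regexes move out of GenericIE and into a static _extract_urls (or _extract_url) on the site's own extractor, which GenericIE then simply calls; the Wistia, Limelight, Anvato, WashingtonPost and Mediaset changes above all follow it. The shape of such a hook, with an invented site:

    import re

    class SomeSiteIE(object):  # derives from InfoExtractor upstream; site/pattern invented
        @staticmethod
        def _extract_urls(webpage):
            return [m.group('url') for m in re.finditer(
                r'<iframe[^>]+src=(["\'])(?P<url>(?:https?:)?//player\.somesite\.example/embed/.+?)\1',
                webpage)]
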
diff --git a/youtube_dl/extractor/go.py b/youtube_dl/extractor/go.py
index 4c9be47..9c7b1bd 100644
--- a/youtube_dl/extractor/go.py
+++ b/youtube_dl/extractor/go.py
@@ -36,22 +36,26 @@ class GoIE(AdobePassIE):
'requestor_id': 'DisneyXD',
}
}
- _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:[^/]+/)*(?:vdka(?P<id>\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys())
+ _VALID_URL = r'https?://(?:(?P<sub_domain>%s)\.)?go\.com/(?:(?:[^/]+/)*(?P<id>vdka\w+)|(?:[^/]+/)*(?P<display_id>[^/?#]+))' % '|'.join(_SITE_INFO.keys())
_TESTS = [{
- 'url': 'http://abc.go.com/shows/castle/video/most-recent/vdka0_g86w5onx',
+ 'url': 'http://abc.go.com/shows/designated-survivor/video/most-recent/VDKA3807643',
'info_dict': {
- 'id': '0_g86w5onx',
+ 'id': 'VDKA3807643',
'ext': 'mp4',
- 'title': 'Sneak Peek: Language Arts',
- 'description': 'md5:7dcdab3b2d17e5217c953256af964e9c',
+ 'title': 'The Traitor in the White House',
+ 'description': 'md5:05b009d2d145a1e85d25111bd37222e8',
},
'params': {
# m3u8 download
'skip_download': True,
},
}, {
- 'url': 'http://abc.go.com/shows/after-paradise/video/most-recent/vdka3335601',
- 'only_matching': True,
+ 'url': 'http://watchdisneyxd.go.com/doraemon',
+ 'info_dict': {
+ 'title': 'Doraemon',
+ 'id': 'SH55574025',
+ },
+ 'playlist_mincount': 51,
}, {
'url': 'http://abc.go.com/shows/the-catch/episode-guide/season-01/10-the-wedding',
'only_matching': True,
@@ -60,19 +64,36 @@ class GoIE(AdobePassIE):
'only_matching': True,
}]
+ def _extract_videos(self, brand, video_id='-1', show_id='-1'):
+ display_id = video_id if video_id != '-1' else show_id
+ return self._download_json(
+ 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/%s/-1/%s/-1/-1.json' % (brand, show_id, video_id),
+ display_id)['video']
+
def _real_extract(self, url):
sub_domain, video_id, display_id = re.match(self._VALID_URL, url).groups()
+ site_info = self._SITE_INFO[sub_domain]
+ brand = site_info['brand']
if not video_id:
webpage = self._download_webpage(url, display_id)
video_id = self._search_regex(
# There may be inner quotes, e.g. data-video-id="'VDKA3609139'"
# from http://freeform.go.com/shows/shadowhunters/episodes/season-2/1-this-guilty-blood
- r'data-video-id=["\']*VDKA(\w+)', webpage, 'video id')
- site_info = self._SITE_INFO[sub_domain]
- brand = site_info['brand']
- video_data = self._download_json(
- 'http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/%s/001/-1/-1/-1/%s/-1/-1.json' % (brand, video_id),
- video_id)['video'][0]
+ r'data-video-id=["\']*(VDKA\w+)', webpage, 'video id', default=None)
+ if not video_id:
+ # show extraction works for Disney, DisneyJunior and DisneyXD
+ # ABC and Freeform have a different layout
+ show_id = self._search_regex(r'data-show-id=["\']*(SH\d+)', webpage, 'show id')
+ videos = self._extract_videos(brand, show_id=show_id)
+ show_title = self._search_regex(r'data-show-title="([^"]+)"', webpage, 'show title', fatal=False)
+ entries = []
+ for video in videos:
+ entries.append(self.url_result(
+ video['url'], 'Go', video.get('id'), video.get('title')))
+ entries.reverse()
+ return self.playlist_result(entries, show_id, show_title)
+ video_data = self._extract_videos(brand, video_id)[0]
+ video_id = video_data['id']
title = video_data['title']
formats = []
@@ -105,7 +126,7 @@ class GoIE(AdobePassIE):
self._initialize_geo_bypass(['US'])
entitlement = self._download_json(
'https://api.entitlement.watchabc.go.com/vp2/ws-secure/entitlement/2020/authorize.json',
- video_id, data=urlencode_postdata(data), headers=self.geo_verification_headers())
+ video_id, data=urlencode_postdata(data))
errors = entitlement.get('errors', {}).get('errors', [])
if errors:
for error in errors:
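
_extract_videos above serves both code paths with one contents-API endpoint by treating '-1' as a wildcard: a concrete video id with show id '-1' fetches one video, a concrete show id with video id '-1' lists a whole show. Sketch of the URL it builds (brand codes come from _SITE_INFO; '001' here is illustrative):

    def videos_url(brand, video_id='-1', show_id='-1'):
        return ('http://api.contents.watchabc.go.com/vp2/ws/contents/3000/videos/'
                '%s/001/-1/%s/-1/%s/-1/-1.json' % (brand, show_id, video_id))

    print(videos_url('001', video_id='VDKA3807643'))  # single video
    print(videos_url('001', show_id='SH55574025'))    # whole show
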
diff --git a/youtube_dl/extractor/go90.py b/youtube_dl/extractor/go90.py
new file mode 100644
index 0000000..9b2e1c1
--- /dev/null
+++ b/youtube_dl/extractor/go90.py
@@ -0,0 +1,126 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ parse_iso8601,
+)
+
+
+class Go90IE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?go90\.com/videos/(?P<id>[0-9a-zA-Z]+)'
+ _TEST = {
+ 'url': 'https://www.go90.com/videos/84BUqjLpf9D',
+ 'md5': 'efa7670dbbbf21a7b07b360652b24a32',
+ 'info_dict': {
+ 'id': '84BUqjLpf9D',
+ 'ext': 'mp4',
+ 'title': 'Daily VICE - Inside The Utah Coalition Against Pornography Convention',
+ 'description': 'VICE\'s Karley Sciortino meets with activists who discuss the state\'s strong anti-porn stance. Then, VICE Sports explains NFL contracts.',
+ 'timestamp': 1491868800,
+ 'upload_date': '20170411',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ video_data = self._download_json(
+ 'https://www.go90.com/api/view/items/' + video_id,
+ video_id, headers={
+ 'Content-Type': 'application/json; charset=utf-8',
+ }, data=b'{"client":"web","device_type":"pc"}')
+ main_video_asset = video_data['main_video_asset']
+
+ episode_number = int_or_none(video_data.get('episode_number'))
+ series = None
+ season = None
+ season_id = None
+ season_number = None
+ for metadata in video_data.get('__children', {}).get('Item', {}).values():
+ if metadata.get('type') == 'show':
+ series = metadata.get('title')
+ elif metadata.get('type') == 'season':
+ season = metadata.get('title')
+ season_id = metadata.get('id')
+ season_number = int_or_none(metadata.get('season_number'))
+
+ title = episode = video_data.get('title') or series
+ if series and series != title:
+ title = '%s - %s' % (series, title)
+
+ thumbnails = []
+ formats = []
+ subtitles = {}
+ for asset in video_data.get('assets') or []:
+ if asset.get('id') == main_video_asset:
+ for source in asset.get('sources', []):
+ source_location = source.get('location')
+ if not source_location:
+ continue
+ source_type = source.get('type')
+ if source_type == 'hls':
+ m3u8_formats = self._extract_m3u8_formats(
+ source_location, video_id, 'mp4',
+ 'm3u8_native', m3u8_id='hls', fatal=False)
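+ # the HLS variant URLs encode height and bitrate (e.g. /hls-720-2000K);
+ # recover them when the manifest lacks these attributes, deriving the
+ # width on the assumption of a 16:9 aspect ratio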
+ for f in m3u8_formats:
+ mobj = re.search(r'/hls-(\d+)-(\d+)K', f['url'])
+ if mobj:
+ height, tbr = mobj.groups()
+ height = int_or_none(height)
+ f.update({
+ 'height': f.get('height') or height,
+ 'width': f.get('width') or int_or_none(height / 9.0 * 16.0 if height else None),
+ 'tbr': f.get('tbr') or int_or_none(tbr),
+ })
+ formats.extend(m3u8_formats)
+ elif source_type == 'dash':
+ formats.extend(self._extract_mpd_formats(
+ source_location, video_id, mpd_id='dash', fatal=False))
+ else:
+ formats.append({
+ 'format_id': source.get('name'),
+ 'url': source_location,
+ 'width': int_or_none(source.get('width')),
+ 'height': int_or_none(source.get('height')),
+ 'tbr': int_or_none(source.get('bitrate')),
+ })
+
+ for caption in asset.get('caption_metadata', []):
+ caption_url = caption.get('source_url')
+ if not caption_url:
+ continue
+ subtitles.setdefault(caption.get('language', 'en'), []).append({
+ 'url': caption_url,
+ 'ext': determine_ext(caption_url, 'vtt'),
+ })
+ elif asset.get('type') == 'image':
+ asset_location = asset.get('location')
+ if not asset_location:
+ continue
+ thumbnails.append({
+ 'url': asset_location,
+ 'width': int_or_none(asset.get('width')),
+ 'height': int_or_none(asset.get('height')),
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'thumbnails': thumbnails,
+ 'description': video_data.get('short_description'),
+ 'like_count': int_or_none(video_data.get('like_count')),
+ 'timestamp': parse_iso8601(video_data.get('released_at')),
+ 'series': series,
+ 'episode': episode,
+ 'season': season,
+ 'season_id': season_id,
+ 'season_number': season_number,
+ 'episode_number': episode_number,
+ 'subtitles': subtitles,
+ }
diff --git a/youtube_dl/extractor/hbo.py b/youtube_dl/extractor/hbo.py
index 931f71a..859ad54 100644
--- a/youtube_dl/extractor/hbo.py
+++ b/youtube_dl/extractor/hbo.py
@@ -92,12 +92,14 @@ class HBOBaseIE(InfoExtractor):
video_url.replace('.tar', '/base_index_w8.m3u8'),
video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
elif source.tag == 'hls':
- # #EXT-X-BYTERANGE is not supported by native hls downloader
- # and ffmpeg (#10955)
- # formats.extend(self._extract_m3u8_formats(
- # video_url.replace('.tar', '/base_index.m3u8'),
- # video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False))
- continue
+ m3u8_formats = self._extract_m3u8_formats(
+ video_url.replace('.tar', '/base_index.m3u8'),
+ video_id, 'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
+ for f in m3u8_formats:
+ if f.get('vcodec') == 'none' and not f.get('tbr'):
+ f['tbr'] = int_or_none(self._search_regex(
+ r'-(\d+)k/', f['url'], 'tbr', default=None))
+ formats.extend(m3u8_formats)
elif source.tag == 'dash':
formats.extend(self._extract_mpd_formats(
video_url.replace('.tar', '/manifest.mpd'),
@@ -110,7 +112,7 @@ class HBOBaseIE(InfoExtractor):
'width': format_info.get('width'),
'height': format_info.get('height'),
})
- self._sort_formats(formats, ('width', 'height', 'tbr', 'format_id'))
+ self._sort_formats(formats)
thumbnails = []
card_sizes = xpath_element(video_data, 'titleCardSizes')
diff --git a/youtube_dl/extractor/imdb.py b/youtube_dl/extractor/imdb.py
index f95c00c..3ff672a 100644
--- a/youtube_dl/extractor/imdb.py
+++ b/youtube_dl/extractor/imdb.py
@@ -13,7 +13,7 @@ from ..utils import (
class ImdbIE(InfoExtractor):
IE_NAME = 'imdb'
IE_DESC = 'Internet Movie Database trailers'
- _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video/[^/]+/|title/tt\d+.*?#lb-|videoplayer/)vi(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www|m)\.imdb\.com/(?:video|title).+?[/-]vi(?P<id>\d+)'
_TESTS = [{
'url': 'http://www.imdb.com/video/imdb/vi2524815897',
@@ -35,6 +35,9 @@ class ImdbIE(InfoExtractor):
}, {
'url': 'http://www.imdb.com/videoplayer/vi1562949145',
'only_matching': True,
+ }, {
+ 'url': 'http://www.imdb.com/title/tt4218696/videoplayer/vi2608641561',
+ 'only_matching': True,
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/infoq.py b/youtube_dl/extractor/infoq.py
index 9fb71e8..fe425e7 100644
--- a/youtube_dl/extractor/infoq.py
+++ b/youtube_dl/extractor/infoq.py
@@ -87,8 +87,8 @@ class InfoQIE(BokeCCBaseIE):
def _extract_http_audio(self, webpage, video_id):
fields = self._hidden_inputs(webpage)
- http_audio_url = fields['filename']
- if http_audio_url is None:
+ http_audio_url = fields.get('filename')
+ if not http_audio_url:
return []
cookies_header = {'Cookie': self._extract_cookies(webpage)}
diff --git a/youtube_dl/extractor/instagram.py b/youtube_dl/extractor/instagram.py
index c1921cb..4667335 100644
--- a/youtube_dl/extractor/instagram.py
+++ b/youtube_dl/extractor/instagram.py
@@ -112,7 +112,8 @@ class InstagramIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
(video_url, description, thumbnail, timestamp, uploader,
- uploader_id, like_count, comment_count, height, width) = [None] * 10
+ uploader_id, like_count, comment_count, comments, height,
+ width) = [None] * 11
shared_data = self._parse_json(
self._search_regex(
@@ -121,7 +122,10 @@ class InstagramIE(InfoExtractor):
video_id, fatal=False)
if shared_data:
media = try_get(
- shared_data, lambda x: x['entry_data']['PostPage'][0]['media'], dict)
+ shared_data,
+ (lambda x: x['entry_data']['PostPage'][0]['graphql']['shortcode_media'],
+ lambda x: x['entry_data']['PostPage'][0]['media']),
+ dict)
if media:
video_url = media.get('video_url')
height = int_or_none(media.get('dimensions', {}).get('height'))
diff --git a/youtube_dl/extractor/iqiyi.py b/youtube_dl/extractor/iqiyi.py
index 2af6a6d..fdfa7de 100644
--- a/youtube_dl/extractor/iqiyi.py
+++ b/youtube_dl/extractor/iqiyi.py
@@ -189,7 +189,11 @@ class IqiyiIE(InfoExtractor):
'only_matching': True,
}, {
'url': 'http://yule.iqiyi.com/pcb.html',
- 'only_matching': True,
+ 'info_dict': {
+ 'id': '4a0af228fddb55ec96398a364248ed7f',
+ 'ext': 'mp4',
+ 'title': '第2017-04-21期 女艺人频遭极端粉丝骚扰',
+ },
}, {
# VIP-only video. The first 2 parts (6 minutes) are available without login
# MD5 sums omitted as values are different on Travis CI and my machine
@@ -337,15 +341,18 @@ class IqiyiIE(InfoExtractor):
url, 'temp_id', note='download video page')
 # There's no simple way to determine whether a URL is a playlist or not
- # So detect it
- playlist_result = self._extract_playlist(webpage)
- if playlist_result:
- return playlist_result
-
+ # Individual video pages may also contain playlist links, so try
+ # treating the page as a single video first
tvid = self._search_regex(
- r'data-player-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid')
+ r'data-(?:player|shareplattrigger)-tvid\s*=\s*[\'"](\d+)', webpage, 'tvid', default=None)
+ if tvid is None:
+ playlist_result = self._extract_playlist(webpage)
+ if playlist_result:
+ return playlist_result
+ raise ExtractorError('Can\'t find any video')
+
video_id = self._search_regex(
- r'data-player-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
+ r'data-(?:player|shareplattrigger)-videoid\s*=\s*[\'"]([a-f\d]+)', webpage, 'video_id')
formats = []
for _ in range(5):
@@ -377,7 +384,8 @@ class IqiyiIE(InfoExtractor):
self._sort_formats(formats)
title = (get_element_by_id('widget-videotitle', webpage) or
- clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)))
+ clean_html(get_element_by_attribute('class', 'mod-play-tit', webpage)) or
+ self._html_search_regex(r'<span[^>]+data-videochanged-title="word"[^>]*>([^<]+)</span>', webpage, 'title'))
return {
'id': video_id,
diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py
index 021c6b2..f315680 100644
--- a/youtube_dl/extractor/itv.py
+++ b/youtube_dl/extractor/itv.py
@@ -116,13 +116,25 @@ class ITVIE(InfoExtractor):
if not play_path:
continue
tbr = int_or_none(media_file.get('bitrate'), 1000)
- formats.append({
+ f = {
'format_id': 'rtmp' + ('-%d' % tbr if tbr else ''),
- 'url': rtmp_url,
'play_path': play_path,
+ # Providing this player URL (used as rtmpdump's swfVfy) avoids truncated downloads
+ 'player_url': 'http://www.itv.com/mercury/Mercury_VideoPlayer.swf',
+ 'page_url': url,
'tbr': tbr,
'ext': 'flv',
- })
+ }
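+ # rtmpdump wants the app (path plus any auth query) separated from
+ # the URL; fall back to the full URL if it doesn't match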
+ app = self._search_regex(
+ 'rtmpe?://[^/]+/(.+)$', rtmp_url, 'app', default=None)
+ if app:
+ f.update({
+ 'url': rtmp_url.split('?', 1)[0],
+ 'app': app,
+ })
+ else:
+ f['url'] = rtmp_url
+ formats.append(f)
ios_playlist_url = params.get('data-video-playlist')
hmac = params.get('data-video-hmac')
@@ -172,7 +184,9 @@ class ITVIE(InfoExtractor):
href = ios_base_url + href
ext = determine_ext(href)
if ext == 'm3u8':
- formats.extend(self._extract_m3u8_formats(href, video_id, 'mp4', m3u8_id='hls', fatal=False))
+ formats.extend(self._extract_m3u8_formats(
+ href, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
else:
formats.append({
'url': href,
@@ -189,7 +203,8 @@ class ITVIE(InfoExtractor):
'ext': 'ttml' if ext == 'xml' else ext,
})
- return {
+ info = self._search_json_ld(webpage, video_id, default={})
+ info.update({
'id': video_id,
'title': title,
'formats': formats,
@@ -198,4 +213,5 @@ class ITVIE(InfoExtractor):
'episode_number': int_or_none(xpath_text(playlist, 'EpisodeNumber')),
'series': xpath_text(playlist, 'ProgrammeTitle'),
 'duration': parse_duration(xpath_text(playlist, 'Duration')),
- }
+ })
+ return info
diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py
index 54374ea..41c1f3d 100644
--- a/youtube_dl/extractor/kaltura.py
+++ b/youtube_dl/extractor/kaltura.py
@@ -91,6 +91,7 @@ class KalturaIE(InfoExtractor):
}],
},
},
+ 'skip': 'Gone. Maybe https://www.safaribooksonline.com/library/tutorials/introduction-to-python-anon/3469/',
'params': {
'skip_download': True,
},
@@ -107,27 +108,37 @@ class KalturaIE(InfoExtractor):
@staticmethod
def _extract_url(webpage):
+ # Embed codes: https://knowledge.kaltura.com/embedding-kaltura-media-players-your-site
mobj = (
re.search(
r"""(?xs)
kWidget\.(?:thumb)?[Ee]mbed\(
\{.*?
- (?P<q1>['\"])wid(?P=q1)\s*:\s*
- (?P<q2>['\"])_?(?P<partner_id>(?:(?!(?P=q2)).)+)(?P=q2),.*?
- (?P<q3>['\"])entry_?[Ii]d(?P=q3)\s*:\s*
- (?P<q4>['\"])(?P<id>(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\})
+ (?P<q1>['"])wid(?P=q1)\s*:\s*
+ (?P<q2>['"])_?(?P<partner_id>(?:(?!(?P=q2)).)+)(?P=q2),.*?
+ (?P<q3>['"])entry_?[Ii]d(?P=q3)\s*:\s*
+ (?P<q4>['"])(?P<id>(?:(?!(?P=q4)).)+)(?P=q4)(?:,|\s*\})
""", webpage) or
re.search(
r'''(?xs)
- (?P<q1>["\'])
+ (?P<q1>["'])
(?:https?:)?//cdnapi(?:sec)?\.kaltura\.com(?::\d+)?/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+)(?:(?!(?P=q1)).)*
(?P=q1).*?
(?:
entry_?[Ii]d|
- (?P<q2>["\'])entry_?[Ii]d(?P=q2)
+ (?P<q2>["'])entry_?[Ii]d(?P=q2)
)\s*:\s*
- (?P<q3>["\'])(?P<id>(?:(?!(?P=q3)).)+)(?P=q3)
- ''', webpage))
+ (?P<q3>["'])(?P<id>(?:(?!(?P=q3)).)+)(?P=q3)
+ ''', webpage) or
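+ # iframe embeds pointing at kaltura.com with an entry_id query
+ # parameter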
+ re.search(
+ r'''(?xs)
+ <iframe[^>]+src=(?P<q1>["'])
+ (?:https?:)?//(?:www\.)?kaltura\.com/(?:(?!(?P=q1)).)*\b(?:p|partner_id)/(?P<partner_id>\d+)
+ (?:(?!(?P=q1)).)*
+ [?&]entry_id=(?P<id>(?:(?!(?P=q1))[^&])+)
+ (?P=q1)
+ ''', webpage)
+ )
if mobj:
embed_info = mobj.groupdict()
url = 'kaltura:%(partner_id)s:%(id)s' % embed_info
diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py
index 3190b18..1f91ba0 100644
--- a/youtube_dl/extractor/laola1tv.py
+++ b/youtube_dl/extractor/laola1tv.py
@@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
+import json
+
from .common import InfoExtractor
from ..utils import (
ExtractorError,
@@ -8,15 +10,15 @@ from ..utils import (
urlencode_postdata,
xpath_element,
xpath_text,
- urljoin,
update_url_query,
+ js_to_json,
)
class Laola1TvEmbedIE(InfoExtractor):
IE_NAME = 'laola1tv:embed'
_VALID_URL = r'https?://(?:www\.)?laola1\.tv/titanplayer\.php\?.*?\bvideoid=(?P<id>\d+)'
- _TEST = {
+ _TESTS = [{
# flashvars.premium = "false";
'url': 'https://www.laola1.tv/titanplayer.php?videoid=708065&type=V&lang=en&portal=int&customer=1024',
'info_dict': {
@@ -26,7 +28,30 @@ class Laola1TvEmbedIE(InfoExtractor):
'uploader': 'ITTF - International Table Tennis Federation',
'upload_date': '20161211',
},
- }
+ }]
+
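+ # these helpers are shared with Laola1TvIE, which subclasses this
+ # extractor below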
+ def _extract_token_url(self, stream_access_url, video_id, data):
+ return self._download_json(
+ stream_access_url, video_id, headers={
+ 'Content-Type': 'application/json',
+ }, data=json.dumps(data).encode())['data']['stream-access'][0]
+
+ def _extract_formats(self, token_url, video_id):
+ token_doc = self._download_xml(
+ token_url, video_id, 'Downloading token',
+ headers=self.geo_verification_headers())
+
+ token_attrib = xpath_element(token_doc, './/token').attrib
+
+ if token_attrib['status'] != '0':
+ raise ExtractorError(
+ 'Token error: %s' % token_attrib['comment'], expected=True)
+
+ formats = self._extract_akamai_formats(
+ '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']),
+ video_id)
+ self._sort_formats(formats)
+ return formats
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -68,29 +93,16 @@ class Laola1TvEmbedIE(InfoExtractor):
else:
data_abo = urlencode_postdata(
dict((i, v) for i, v in enumerate(_v('req_liga_abos').split(','))))
- token_url = self._download_json(
- 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access',
- video_id, query={
+ stream_access_url = update_url_query(
+ 'https://club.laola1.tv/sp/laola1/api/v3/user/session/premium/player/stream-access', {
'videoId': _v('id'),
'target': self._search_regex(r'vs_target = (\d+);', webpage, 'vs target'),
'label': _v('label'),
'area': _v('area'),
- }, data=data_abo)['data']['stream-access'][0]
-
- token_doc = self._download_xml(
- token_url, video_id, 'Downloading token',
- headers=self.geo_verification_headers())
-
- token_attrib = xpath_element(token_doc, './/token').attrib
-
- if token_attrib['status'] != '0':
- raise ExtractorError(
- 'Token error: %s' % token_attrib['comment'], expected=True)
+ })
+ token_url = self._extract_token_url(stream_access_url, video_id, data_abo)
- formats = self._extract_akamai_formats(
- '%s?hdnea=%s' % (token_attrib['url'], token_attrib['auth']),
- video_id)
- self._sort_formats(formats)
+ formats = self._extract_formats(token_url, video_id)
categories_str = _v('meta_sports')
categories = categories_str.split(',') if categories_str else []
@@ -107,7 +119,7 @@ class Laola1TvEmbedIE(InfoExtractor):
}
-class Laola1TvIE(InfoExtractor):
+class Laola1TvIE(Laola1TvEmbedIE):
IE_NAME = 'laola1tv'
_VALID_URL = r'https?://(?:www\.)?laola1\.tv/[a-z]+-[a-z]+/[^/]+/(?P<id>[^/?#&]+)'
_TESTS = [{
@@ -164,13 +176,42 @@ class Laola1TvIE(InfoExtractor):
if 'Dieser Livestream ist bereits beendet.' in webpage:
raise ExtractorError('This live stream has already finished.', expected=True)
- iframe_url = urljoin(url, self._search_regex(
- r'<iframe[^>]*?id="videoplayer"[^>]*?src="([^"]+)"',
- webpage, 'iframe url'))
+ conf = self._parse_json(self._search_regex(
+ r'(?s)conf\s*=\s*({.+?});', webpage, 'conf'),
+ display_id, js_to_json)
+
+ video_id = conf['videoid']
+
+ config = self._download_json(conf['configUrl'], video_id, query={
+ 'videoid': video_id,
+ 'partnerid': conf['partnerid'],
+ 'language': conf.get('language', ''),
+ 'portal': conf.get('portalid', ''),
+ })
+ error = config.get('error')
+ if error:
+ raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
+
+ video_data = config['video']
+ title = video_data['title']
+ is_live = video_data.get('isLivestream') and video_data.get('isLive')
+ meta = video_data.get('metaInformation') or {}
+ sports = meta.get('sports')
+ categories = sports.split(',') if sports else []
+
+ token_url = self._extract_token_url(
+ video_data['streamAccess'], video_id,
+ video_data['abo']['required'])
+
+ formats = self._extract_formats(token_url, video_id)
return {
- '_type': 'url',
+ 'id': video_id,
'display_id': display_id,
- 'url': iframe_url,
- 'ie_key': 'Laola1TvEmbed',
+ 'title': self._live_title(title) if is_live else title,
+ 'description': video_data.get('description'),
+ 'thumbnail': video_data.get('image'),
+ 'categories': categories,
+ 'formats': formats,
+ 'is_live': is_live,
}
diff --git a/youtube_dl/extractor/leeco.py b/youtube_dl/extractor/leeco.py
index 9eda956..0a07c13 100644
--- a/youtube_dl/extractor/leeco.py
+++ b/youtube_dl/extractor/leeco.py
@@ -23,7 +23,6 @@ from ..utils import (
str_or_none,
url_basename,
urshift,
- update_url_query,
)
@@ -51,7 +50,7 @@ class LeIE(InfoExtractor):
'id': '1415246',
'ext': 'mp4',
'title': '美人天下01',
- 'description': 'md5:f88573d9d7225ada1359eaf0dbf8bcda',
+ 'description': 'md5:28942e650e82ed4fcc8e4de919ee854d',
},
'params': {
'hls_prefer_native': True,
@@ -69,7 +68,6 @@ class LeIE(InfoExtractor):
'params': {
'hls_prefer_native': True,
},
- 'skip': 'Only available in China',
}, {
'url': 'http://sports.le.com/video/25737697.html',
'only_matching': True,
@@ -81,7 +79,7 @@ class LeIE(InfoExtractor):
'only_matching': True,
}]
- # ror() and calc_time_key() are reversed from a embedded swf file in KLetvPlayer.swf
+ # ror() and calc_time_key() are reverse-engineered from an embedded SWF file in LetvPlayer.swf
def ror(self, param1, param2):
_loc3_ = 0
while _loc3_ < param2:
@@ -90,15 +88,8 @@ class LeIE(InfoExtractor):
return param1
def calc_time_key(self, param1):
- _loc2_ = 773625421
- _loc3_ = self.ror(param1, _loc2_ % 13)
- _loc3_ = _loc3_ ^ _loc2_
- _loc3_ = self.ror(_loc3_, _loc2_ % 17)
- return _loc3_
-
- # reversed from http://jstatic.letvcdn.com/sdk/player.js
- def get_mms_key(self, time):
- return self.ror(time, 8) ^ 185025305
+ _loc2_ = 185025305
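+ # 185025305 % 17 == 8, so this is the same as the removed
+ # get_mms_key(): ror(time, 8) ^ 185025305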
+ return self.ror(param1, _loc2_ % 17) ^ _loc2_
# see M3U8Encryption class in KLetvPlayer.swf
@staticmethod
@@ -122,7 +113,7 @@ class LeIE(InfoExtractor):
def _check_errors(self, play_json):
# Check for errors
- playstatus = play_json['playstatus']
+ playstatus = play_json['msgs']['playstatus']
if playstatus['status'] == 0:
flag = playstatus['flag']
if flag == 1:
@@ -134,58 +125,31 @@ class LeIE(InfoExtractor):
media_id = self._match_id(url)
page = self._download_webpage(url, media_id)
- play_json_h5 = self._download_json(
- 'http://api.le.com/mms/out/video/playJsonH5',
- media_id, 'Downloading html5 playJson data', query={
- 'id': media_id,
- 'platid': 3,
- 'splatid': 304,
- 'format': 1,
- 'tkey': self.get_mms_key(int(time.time())),
- 'domain': 'www.le.com',
- 'tss': 'no',
- },
- headers=self.geo_verification_headers())
- self._check_errors(play_json_h5)
-
play_json_flash = self._download_json(
- 'http://api.le.com/mms/out/video/playJson',
+ 'http://player-pc.le.com/mms/out/video/playJson',
media_id, 'Downloading flash playJson data', query={
'id': media_id,
'platid': 1,
'splatid': 101,
'format': 1,
+ 'source': 1000,
'tkey': self.calc_time_key(int(time.time())),
'domain': 'www.le.com',
+ 'region': 'cn',
},
headers=self.geo_verification_headers())
self._check_errors(play_json_flash)
- def get_h5_urls(media_url, format_id):
- location = self._download_json(
- media_url, media_id,
- 'Download JSON metadata for format %s' % format_id, query={
- 'format': 1,
- 'expect': 3,
- 'tss': 'no',
- })['location']
-
- return {
- 'http': update_url_query(location, {'tss': 'no'}),
- 'hls': update_url_query(location, {'tss': 'ios'}),
- }
-
def get_flash_urls(media_url, format_id):
- media_url += '&' + compat_urllib_parse_urlencode({
- 'm3v': 1,
- 'format': 1,
- 'expect': 3,
- 'rateid': format_id,
- })
-
nodes_data = self._download_json(
media_url, media_id,
- 'Download JSON metadata for format %s' % format_id)
+ 'Download JSON metadata for format %s' % format_id,
+ query={
+ 'm3v': 1,
+ 'format': 1,
+ 'expect': 3,
+ 'tss': 'ios',
+ })
req = self._request_webpage(
nodes_data['nodelist'][0]['location'], media_id,
@@ -199,29 +163,28 @@ class LeIE(InfoExtractor):
extracted_formats = []
formats = []
- for play_json, get_urls in ((play_json_h5, get_h5_urls), (play_json_flash, get_flash_urls)):
- playurl = play_json['playurl']
- play_domain = playurl['domain'][0]
-
- for format_id, format_data in playurl.get('dispatch', []).items():
- if format_id in extracted_formats:
- continue
- extracted_formats.append(format_id)
-
- media_url = play_domain + format_data[0]
- for protocol, format_url in get_urls(media_url, format_id).items():
- f = {
- 'url': format_url,
- 'ext': determine_ext(format_data[1]),
- 'format_id': '%s-%s' % (protocol, format_id),
- 'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
- 'quality': int_or_none(format_id),
- }
-
- if format_id[-1:] == 'p':
- f['height'] = int_or_none(format_id[:-1])
-
- formats.append(f)
+ playurl = play_json_flash['msgs']['playurl']
+ play_domain = playurl['domain'][0]
+
+ for format_id, format_data in playurl.get('dispatch', []).items():
+ if format_id in extracted_formats:
+ continue
+ extracted_formats.append(format_id)
+
+ media_url = play_domain + format_data[0]
+ for protocol, format_url in get_flash_urls(media_url, format_id).items():
+ f = {
+ 'url': format_url,
+ 'ext': determine_ext(format_data[1]),
+ 'format_id': '%s-%s' % (protocol, format_id),
+ 'protocol': 'm3u8_native' if protocol == 'hls' else 'http',
+ 'quality': int_or_none(format_id),
+ }
+
+ if format_id[-1:] == 'p':
+ f['height'] = int_or_none(format_id[:-1])
+
+ formats.append(f)
self._sort_formats(formats, ('height', 'quality', 'format_id'))
publish_time = parse_iso8601(self._html_search_regex(
diff --git a/youtube_dl/extractor/lego.py b/youtube_dl/extractor/lego.py
index d3bca64..b312e77 100644
--- a/youtube_dl/extractor/lego.py
+++ b/youtube_dl/extractor/lego.py
@@ -86,7 +86,7 @@ class LEGOIE(InfoExtractor):
formats = self._extract_akamai_formats(
'%si/s/public/%s_,%s,.mp4.csmil/master.m3u8' % (streaming_base, path, streaming_path), video_id)
m3u8_formats = list(filter(
- lambda f: f.get('protocol') == 'm3u8_native' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ lambda f: f.get('protocol') == 'm3u8_native' and f.get('vcodec') != 'none',
formats))
if len(m3u8_formats) == len(self._BITRATES):
self._sort_formats(m3u8_formats)
diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py
index 422be25..0a5a395 100644
--- a/youtube_dl/extractor/limelight.py
+++ b/youtube_dl/extractor/limelight.py
@@ -9,6 +9,7 @@ from ..utils import (
determine_ext,
float_or_none,
int_or_none,
+ smuggle_url,
unsmuggle_url,
ExtractorError,
)
@@ -18,6 +19,42 @@ class LimelightBaseIE(InfoExtractor):
_PLAYLIST_SERVICE_URL = 'http://production-ps.lvp.llnw.net/r/PlaylistService/%s/%s/%s'
_API_URL = 'http://api.video.limelight.com/rest/organizations/%s/%s/%s/%s.json'
+ @classmethod
+ def _extract_urls(cls, webpage, source_url):
+ lm = {
+ 'Media': 'media',
+ 'Channel': 'channel',
+ 'ChannelList': 'channel_list',
+ }
+ entries = []
+ for kind, video_id in re.findall(
+ r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})',
+ webpage):
+ entries.append(cls.url_result(
+ smuggle_url(
+ 'limelight:%s:%s' % (lm[kind], video_id),
+ {'source_url': source_url}),
+ 'Limelight%s' % kind, video_id))
+ for mobj in re.finditer(
+ # As per [1], the class attribute should be exactly equal to
+ # LimelightEmbeddedPlayerFlash, but numerous examples have been
+ # seen that don't exactly match it (e.g. [2]).
+ # 1. http://support.3playmedia.com/hc/en-us/articles/227732408-Limelight-Embedding-the-Captions-Plugin-with-the-Limelight-Player-on-Your-Webpage
+ # 2. http://www.sedona.com/FacilitatorTraining2017
+ r'''(?sx)
+ <object[^>]+class=(["\'])(?:(?!\1).)*\bLimelightEmbeddedPlayerFlash\b(?:(?!\1).)*\1[^>]*>.*?
+ <param[^>]+
+ name=(["\'])flashVars\2[^>]+
+ value=(["\'])(?:(?!\3).)*(?P<kind>media|channel(?:List)?)Id=(?P<id>[a-z0-9]{32})
+ ''', webpage):
+ kind, video_id = mobj.group('kind'), mobj.group('id')
+ entries.append(cls.url_result(
+ smuggle_url(
+ 'limelight:%s:%s' % (kind, video_id),
+ {'source_url': source_url}),
+ 'Limelight%s' % kind.capitalize(), video_id))
+ return entries
+
def _call_playlist_service(self, item_id, method, fatal=True, referer=None):
headers = {}
if referer:
@@ -62,13 +99,21 @@ class LimelightBaseIE(InfoExtractor):
fmt = {
'url': stream_url,
'abr': float_or_none(stream.get('audioBitRate')),
- 'vbr': float_or_none(stream.get('videoBitRate')),
'fps': float_or_none(stream.get('videoFrameRate')),
- 'width': int_or_none(stream.get('videoWidthInPixels')),
- 'height': int_or_none(stream.get('videoHeightInPixels')),
'ext': ext,
}
- rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp4:.+)$', stream_url)
+ width = int_or_none(stream.get('videoWidthInPixels'))
+ height = int_or_none(stream.get('videoHeightInPixels'))
+ vbr = float_or_none(stream.get('videoBitRate'))
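+ # a stream with no video dimensions and no video bitrate is taken
+ # to be audio-only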
+ if width or height or vbr:
+ fmt.update({
+ 'width': width,
+ 'height': height,
+ 'vbr': vbr,
+ })
+ else:
+ fmt['vcodec'] = 'none'
+ rtmp = re.search(r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+))/(?P<playpath>mp[34]:.+)$', stream_url)
if rtmp:
format_id = 'rtmp'
if stream.get('videoBitRate'):
diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py
index c7de653..c545196 100644
--- a/youtube_dl/extractor/liveleak.py
+++ b/youtube_dl/extractor/liveleak.py
@@ -1,6 +1,5 @@
from __future__ import unicode_literals
-import json
import re
from .common import InfoExtractor
@@ -11,10 +10,10 @@ class LiveLeakIE(InfoExtractor):
_VALID_URL = r'https?://(?:\w+\.)?liveleak\.com/view\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)'
_TESTS = [{
'url': 'http://www.liveleak.com/view?i=757_1364311680',
- 'md5': '50f79e05ba149149c1b4ea961223d5b3',
+ 'md5': '0813c2430bea7a46bf13acf3406992f4',
'info_dict': {
'id': '757_1364311680',
- 'ext': 'flv',
+ 'ext': 'mp4',
'description': 'extremely bad day for this guy..!',
'uploader': 'ljfriel2',
'title': 'Most unlucky car accident',
@@ -22,7 +21,7 @@ class LiveLeakIE(InfoExtractor):
}
}, {
'url': 'http://www.liveleak.com/view?i=f93_1390833151',
- 'md5': 'b13a29626183c9d33944e6a04f41aafc',
+ 'md5': 'd3f1367d14cc3c15bf24fbfbe04b9abf',
'info_dict': {
'id': 'f93_1390833151',
'ext': 'mp4',
@@ -32,6 +31,7 @@ class LiveLeakIE(InfoExtractor):
'thumbnail': r're:^https?://.*\.jpg$'
}
}, {
+ # Prochan embed
'url': 'http://www.liveleak.com/view?i=4f7_1392687779',
'md5': '42c6d97d54f1db107958760788c5f48f',
'info_dict': {
@@ -41,11 +41,13 @@ class LiveLeakIE(InfoExtractor):
'uploader': 'CapObveus',
'title': 'Man is Fatally Struck by Reckless Car While Packing up a Moving Truck',
'age_limit': 18,
- }
+ },
+ 'skip': 'Video is dead',
}, {
# Covers https://github.com/rg3/youtube-dl/pull/5983
+ # Multiple resolutions
'url': 'http://www.liveleak.com/view?i=801_1409392012',
- 'md5': '0b3bec2d888c20728ca2ad3642f0ef15',
+ 'md5': 'c3a449dbaca5c0d1825caecd52a57d7b',
'info_dict': {
'id': '801_1409392012',
'ext': 'mp4',
@@ -93,57 +95,38 @@ class LiveLeakIE(InfoExtractor):
webpage, 'age limit', default=None))
video_thumbnail = self._og_search_thumbnail(webpage)
- sources_raw = self._search_regex(
- r'(?s)sources:\s*(\[.*?\]),', webpage, 'video URLs', default=None)
- if sources_raw is None:
- alt_source = self._search_regex(
- r'(file: ".*?"),', webpage, 'video URL', default=None)
- if alt_source:
- sources_raw = '[{ %s}]' % alt_source
- else:
- # Maybe an embed?
- embed_url = self._search_regex(
- r'<iframe[^>]+src="(https?://(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"',
- webpage, 'embed URL')
- return {
- '_type': 'url_transparent',
- 'url': embed_url,
- 'id': video_id,
- 'title': video_title,
- 'description': video_description,
- 'uploader': video_uploader,
- 'age_limit': age_limit,
- }
+ entries = self._parse_html5_media_entries(url, webpage, video_id)
+ if not entries:
+ # Maybe an embed?
+ embed_url = self._search_regex(
+ r'<iframe[^>]+src="((?:https?:)?//(?:www\.)?(?:prochan|youtube)\.com/embed[^"]+)"',
+ webpage, 'embed URL')
+ return {
+ '_type': 'url_transparent',
+ 'url': embed_url,
+ 'id': video_id,
+ 'title': video_title,
+ 'description': video_description,
+ 'uploader': video_uploader,
+ 'age_limit': age_limit,
+ }
- sources_json = re.sub(r'\s([a-z]+):\s', r'"\1": ', sources_raw)
- sources = json.loads(sources_json)
+ info_dict = entries[0]
- formats = [{
- 'format_id': '%s' % i,
- 'format_note': s.get('label'),
- 'url': s['file'],
- } for i, s in enumerate(sources)]
+ for a_format in info_dict['formats']:
+ if not a_format.get('height'):
+ height = self._search_regex(
+ r'([0-9]+)p\.mp4', a_format['url'], 'height label', default=None)
+ if height:
+ a_format['height'] = int(height)
- for i, s in enumerate(sources):
- # Removing '.h264_*.mp4' gives the raw video, which is essentially
- # the same video without the LiveLeak logo at the top (see
- # https://github.com/rg3/youtube-dl/pull/4768)
- orig_url = re.sub(r'\.h264_.+?\.mp4', '', s['file'])
- if s['file'] != orig_url:
- formats.append({
- 'format_id': 'original-%s' % i,
- 'format_note': s.get('label'),
- 'url': orig_url,
- 'preference': 1,
- })
- self._sort_formats(formats)
+ self._sort_formats(info_dict['formats'])
- return {
+ info_dict.update({
'id': video_id,
'title': video_title,
'description': video_description,
'uploader': video_uploader,
- 'formats': formats,
'age_limit': age_limit,
'thumbnail': video_thumbnail,
- }
+ })
+
+ return info_dict
diff --git a/youtube_dl/extractor/mediaset.py b/youtube_dl/extractor/mediaset.py
new file mode 100644
index 0000000..9760eaf
--- /dev/null
+++ b/youtube_dl/extractor/mediaset.py
@@ -0,0 +1,118 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ determine_ext,
+ parse_duration,
+ try_get,
+ unified_strdate,
+)
+
+
+class MediasetIE(InfoExtractor):
+ _VALID_URL = r'''(?x)
+ (?:
+ mediaset:|
+ https?://
+ (?:www\.)?video\.mediaset\.it/
+ (?:
+ (?:video|on-demand)/(?:[^/]+/)+[^/]+_|
+ player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=
+ )
+ )(?P<id>[0-9]+)
+ '''
+ _TESTS = [{
+ # full episode
+ 'url': 'http://www.video.mediaset.it/video/hello_goodbye/full/quarta-puntata_661824.html',
+ 'md5': '9b75534d42c44ecef7bf1ffeacb7f85d',
+ 'info_dict': {
+ 'id': '661824',
+ 'ext': 'mp4',
+ 'title': 'Quarta puntata',
+ 'description': 'md5:7183696d6df570e3412a5ef74b27c5e2',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 1414,
+ 'creator': 'mediaset',
+ 'upload_date': '20161107',
+ 'series': 'Hello Goodbye',
+ 'categories': ['reality'],
+ },
+ 'expected_warnings': ['is not a supported codec'],
+ }, {
+ # clip
+ 'url': 'http://www.video.mediaset.it/video/gogglebox/clip/un-grande-classico-della-commedia-sexy_661680.html',
+ 'only_matching': True,
+ }, {
+ # iframe simple
+ 'url': 'http://www.video.mediaset.it/player/playerIFrame.shtml?id=665924&autoplay=true',
+ 'only_matching': True,
+ }, {
+ # iframe twitter (from http://www.wittytv.it/se-prima-mi-fidavo-zero/)
+ 'url': 'https://www.video.mediaset.it/player/playerIFrameTwitter.shtml?id=665104&playrelated=false&autoplay=false&related=true&hidesocial=true',
+ 'only_matching': True,
+ }, {
+ 'url': 'mediaset:661824',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return [
+ mobj.group('url')
+ for mobj in re.finditer(
+ r'<iframe\b[^>]+\bsrc=(["\'])(?P<url>https?://(?:www\.)?video\.mediaset\.it/player/playerIFrame(?:Twitter)?\.shtml\?.*?\bid=\d+.*?)\1',
+ webpage)]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ video_list = self._download_json(
+ 'http://cdnsel01.mediaset.net/GetCdn.aspx',
+ video_id, 'Downloading video CDN JSON', query={
+ 'streamid': video_id,
+ 'format': 'json',
+ })['videoList']
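+ # GetCdn returns a flat list of stream URLs; .ism ones yield MSS
+ # formats, the rest are used as direct format URLs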
+
+ formats = []
+ for format_url in video_list:
+ if '.ism' in format_url:
+ formats.extend(self._extract_ism_formats(
+ format_url, video_id, ism_id='mss', fatal=False))
+ else:
+ formats.append({
+ 'url': format_url,
+ 'format_id': determine_ext(format_url),
+ })
+ self._sort_formats(formats)
+
+ mediainfo = self._download_json(
+ 'http://plr.video.mediaset.it/html/metainfo.sjson',
+ video_id, 'Downloading video info JSON', query={
+ 'id': video_id,
+ })['video']
+
+ title = mediainfo['title']
+
+ creator = try_get(
+ mediainfo, lambda x: x['brand-info']['publisher'], compat_str)
+ category = try_get(
+ mediainfo, lambda x: x['brand-info']['category'], compat_str)
+ categories = [category] if category else None
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': mediainfo.get('short-description'),
+ 'thumbnail': mediainfo.get('thumbnail'),
+ 'duration': parse_duration(mediainfo.get('duration')),
+ 'creator': creator,
+ 'upload_date': unified_strdate(mediainfo.get('production-date')),
+ 'webpage_url': mediainfo.get('url'),
+ 'series': mediainfo.get('brand-value'),
+ 'categories': categories,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/medici.py b/youtube_dl/extractor/medici.py
new file mode 100644
index 0000000..cd91023
--- /dev/null
+++ b/youtube_dl/extractor/medici.py
@@ -0,0 +1,70 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ unified_strdate,
+ update_url_query,
+ urlencode_postdata,
+)
+
+
+class MediciIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?medici\.tv/#!/(?P<id>[^?#&]+)'
+ _TEST = {
+ 'url': 'http://www.medici.tv/#!/daniel-harding-frans-helmerson-verbier-festival-music-camp',
+ 'md5': '004c21bb0a57248085b6ff3fec72719d',
+ 'info_dict': {
+ 'id': '3059',
+ 'ext': 'flv',
+ 'title': 'Daniel Harding conducts the Verbier Festival Music Camp \u2013 With Frans Helmerson',
+ 'description': 'md5:322a1e952bafb725174fd8c1a8212f58',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'upload_date': '20170408',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ # Sets csrftoken cookie
+ self._download_webpage(url, video_id)
+
+ MEDICI_URL = 'http://www.medici.tv/'
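+ # the site is a single-page app: POSTing the page path with
+ # json=true returns the page data as JSON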
+
+ data = self._download_json(
+ MEDICI_URL, video_id,
+ data=urlencode_postdata({
+ 'json': 'true',
+ 'page': '/%s' % video_id,
+ 'timezone_offset': -420,
+ }), headers={
+ 'X-CSRFToken': self._get_cookies(url)['csrftoken'].value,
+ 'X-Requested-With': 'XMLHttpRequest',
+ 'Referer': MEDICI_URL,
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ })
+
+ video = data['video']['videos']['video1']
+
+ title = video.get('nom') or data['title']
+
+ video_id = video.get('id') or video_id
+ formats = self._extract_f4m_formats(
+ update_url_query(video['url_akamai'], {
+ 'hdcore': '3.1.0',
+ 'plugin=aasp': '3.1.0.43.124',
+ }), video_id, f4m_id='hds')
+
+ description = data.get('meta_description')
+ thumbnail = video.get('url_thumbnail') or data.get('main_image')
+ upload_date = unified_strdate(data['video'].get('date'))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index a24b316..0efbe66 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -97,7 +97,7 @@ class MixcloudIE(InfoExtractor):
view_count = str_to_int(self._search_regex(
[r'<meta itemprop="interactionCount" content="UserPlays:([0-9]+)"',
r'/listeners/?">([0-9,.]+)</a>',
- r'm-tooltip=["\']([\d,.]+) plays'],
+ r'(?:m|data)-tooltip=["\']([\d,.]+) plays'],
webpage, 'play count', default=None))
return {
@@ -138,12 +138,12 @@ class MixcloudPlaylistBaseIE(InfoExtractor):
def _get_user_description(self, page_content):
return self._html_search_regex(
- r'<div[^>]+class="description-text"[^>]*>(.+?)</div>',
+ r'<div[^>]+class="profile-bio"[^>]*>(.+?)</div>',
page_content, 'user description', fatal=False)
class MixcloudUserIE(MixcloudPlaylistBaseIE):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/(?P<type>uploads|favorites|listens)?/?$'
+ _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/(?P<type>uploads|favorites|listens)?/?$'
IE_NAME = 'mixcloud:user'
_TESTS = [{
@@ -151,7 +151,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
'info_dict': {
'id': 'dholbach_uploads',
'title': 'Daniel Holbach (uploads)',
- 'description': 'md5:327af72d1efeb404a8216c27240d1370',
+ 'description': 'md5:def36060ac8747b3aabca54924897e47',
},
'playlist_mincount': 11,
}, {
@@ -159,7 +159,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
'info_dict': {
'id': 'dholbach_uploads',
'title': 'Daniel Holbach (uploads)',
- 'description': 'md5:327af72d1efeb404a8216c27240d1370',
+ 'description': 'md5:def36060ac8747b3aabca54924897e47',
},
'playlist_mincount': 11,
}, {
@@ -167,7 +167,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
'info_dict': {
'id': 'dholbach_favorites',
'title': 'Daniel Holbach (favorites)',
- 'description': 'md5:327af72d1efeb404a8216c27240d1370',
+ 'description': 'md5:def36060ac8747b3aabca54924897e47',
},
'params': {
'playlist_items': '1-100',
@@ -178,7 +178,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
'info_dict': {
'id': 'dholbach_listens',
'title': 'Daniel Holbach (listens)',
- 'description': 'md5:327af72d1efeb404a8216c27240d1370',
+ 'description': 'md5:def36060ac8747b3aabca54924897e47',
},
'params': {
'playlist_items': '1-100',
@@ -216,7 +216,7 @@ class MixcloudUserIE(MixcloudPlaylistBaseIE):
class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/playlists/(?P<playlist>[^/]+)/?$'
+ _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<user>[^/]+)/playlists/(?P<playlist>[^/]+)/?$'
IE_NAME = 'mixcloud:playlist'
_TESTS = [{
@@ -229,12 +229,7 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
'playlist_mincount': 16,
}, {
'url': 'https://www.mixcloud.com/maxvibes/playlists/jazzcat-on-ness-radio/',
- 'info_dict': {
- 'id': 'maxvibes_jazzcat-on-ness-radio',
- 'title': 'Jazzcat on Ness Radio',
- 'description': 'md5:7bbbf0d6359a0b8cda85224be0f8f263',
- },
- 'playlist_mincount': 23
+ 'only_matching': True,
}]
def _real_extract(self, url):
@@ -243,15 +238,16 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
playlist_id = mobj.group('playlist')
video_id = '%s_%s' % (user_id, playlist_id)
- profile = self._download_webpage(
+ webpage = self._download_webpage(
url, user_id,
note='Downloading playlist page',
errnote='Unable to download playlist page')
- description = self._get_user_description(profile)
- playlist_title = self._html_search_regex(
- r'<span[^>]+class="[^"]*list-playlist-title[^"]*"[^>]*>(.*?)</span>',
- profile, 'playlist title')
+ title = self._html_search_regex(
+ r'<a[^>]+class="parent active"[^>]*><b>\d+</b><span[^>]*>([^<]+)',
+ webpage, 'playlist title',
+ default=None) or self._og_search_title(webpage, fatal=False)
+ description = self._get_user_description(webpage)
entries = OnDemandPagedList(
functools.partial(
@@ -259,11 +255,11 @@ class MixcloudPlaylistIE(MixcloudPlaylistBaseIE):
'%s/playlists/%s' % (user_id, playlist_id), video_id, 'tracklist'),
self._PAGE_SIZE)
- return self.playlist_result(entries, video_id, playlist_title, description)
+ return self.playlist_result(entries, video_id, title, description)
class MixcloudStreamIE(MixcloudPlaylistBaseIE):
- _VALID_URL = r'^(?:https?://)?(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/stream/?$'
+ _VALID_URL = r'https?://(?:www\.)?mixcloud\.com/(?P<id>[^/]+)/stream/?$'
IE_NAME = 'mixcloud:stream'
_TEST = {
diff --git a/youtube_dl/extractor/myspace.py b/youtube_dl/extractor/myspace.py
index f281238..e164d59 100644
--- a/youtube_dl/extractor/myspace.py
+++ b/youtube_dl/extractor/myspace.py
@@ -12,64 +12,62 @@ from ..utils import (
class MySpaceIE(InfoExtractor):
- _VALID_URL = r'https?://myspace\.com/([^/]+)/(?P<mediatype>video/[^/]+/|music/song/.*?)(?P<id>\d+)'
+ _VALID_URL = r'''(?x)
+ https?://
+ myspace\.com/[^/]+/
+ (?P<mediatype>
+ video/[^/]+/(?P<video_id>\d+)|
+ music/song/[^/?#&]+-(?P<song_id>\d+)-\d+(?:[/?#&]|$)
+ )
+ '''
- _TESTS = [
- {
- 'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919',
- 'md5': '9c1483c106f4a695c47d2911feed50a7',
- 'info_dict': {
- 'id': '109594919',
- 'ext': 'mp4',
- 'title': 'Little Big Town',
- 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.',
- 'uploader': 'Five Minutes to the Stage',
- 'uploader_id': 'fiveminutestothestage',
- 'timestamp': 1414108751,
- 'upload_date': '20141023',
- },
+ _TESTS = [{
+ 'url': 'https://myspace.com/fiveminutestothestage/video/little-big-town/109594919',
+ 'md5': '9c1483c106f4a695c47d2911feed50a7',
+ 'info_dict': {
+ 'id': '109594919',
+ 'ext': 'mp4',
+ 'title': 'Little Big Town',
+ 'description': 'This country quartet was all smiles while playing a sold out show at the Pacific Amphitheatre in Orange County, California.',
+ 'uploader': 'Five Minutes to the Stage',
+ 'uploader_id': 'fiveminutestothestage',
+ 'timestamp': 1414108751,
+ 'upload_date': '20141023',
},
+ }, {
# songs
- {
- 'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681',
- 'md5': '1d7ee4604a3da226dd69a123f748b262',
- 'info_dict': {
- 'id': '93388656',
- 'ext': 'm4a',
- 'title': 'Of weakened soul...',
- 'uploader': 'Killsorrow',
- 'uploader_id': 'killsorrow',
- },
- }, {
- 'add_ie': ['Youtube'],
- 'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041',
- 'info_dict': {
- 'id': 'xqds0B_meys',
- 'ext': 'webm',
- 'title': 'Three Days Grace - Animal I Have Become',
- 'description': 'md5:8bd86b3693e72a077cf863a8530c54bb',
- 'uploader': 'ThreeDaysGraceVEVO',
- 'uploader_id': 'ThreeDaysGraceVEVO',
- 'upload_date': '20091002',
- },
- }, {
- 'add_ie': ['Youtube'],
- 'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426',
- 'info_dict': {
- 'id': 'ypWvQgnJrSU',
- 'ext': 'mp4',
- 'title': 'Starset - First Light',
- 'description': 'md5:2d5db6c9d11d527683bcda818d332414',
- 'uploader': 'Yumi K',
- 'uploader_id': 'SorenPromotions',
- 'upload_date': '20140725',
- }
+ 'url': 'https://myspace.com/killsorrow/music/song/of-weakened-soul...-93388656-103880681',
+ 'md5': '1d7ee4604a3da226dd69a123f748b262',
+ 'info_dict': {
+ 'id': '93388656',
+ 'ext': 'm4a',
+ 'title': 'Of weakened soul...',
+ 'uploader': 'Killsorrow',
+ 'uploader_id': 'killsorrow',
},
- ]
+ }, {
+ 'add_ie': ['Youtube'],
+ 'url': 'https://myspace.com/threedaysgrace/music/song/animal-i-have-become-28400208-28218041',
+ 'info_dict': {
+ 'id': 'xqds0B_meys',
+ 'ext': 'webm',
+ 'title': 'Three Days Grace - Animal I Have Become',
+ 'description': 'md5:8bd86b3693e72a077cf863a8530c54bb',
+ 'uploader': 'ThreeDaysGraceVEVO',
+ 'uploader_id': 'ThreeDaysGraceVEVO',
+ 'upload_date': '20091002',
+ },
+ }, {
+ 'url': 'https://myspace.com/starset2/music/song/first-light-95799905-106964426',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://myspace.com/thelargemouthbassband/music/song/02-pure-eyes.mp3-94422330-105113388',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
- video_id = mobj.group('id')
+ video_id = mobj.group('video_id') or mobj.group('song_id')
is_song = mobj.group('mediatype').startswith('music/song')
webpage = self._download_webpage(url, video_id)
player_url = self._search_regex(
diff --git a/youtube_dl/extractor/nbc.py b/youtube_dl/extractor/nbc.py
index d2a44d0..62db70b 100644
--- a/youtube_dl/extractor/nbc.py
+++ b/youtube_dl/extractor/nbc.py
@@ -5,10 +5,8 @@ import re
from .common import InfoExtractor
from .theplatform import ThePlatformIE
from .adobepass import AdobePassIE
-from ..compat import compat_urllib_parse_urlparse
from ..utils import (
find_xpath_attr,
- lowercase_escape,
smuggle_url,
unescapeHTML,
update_url_query,
@@ -17,7 +15,7 @@ from ..utils import (
class NBCIE(AdobePassIE):
- _VALID_URL = r'https?://(?:www\.)?nbc\.com/(?:[^/]+/)+(?P<id>n?\d+)'
+ _VALID_URL = r'(?P<permalink>https?://(?:www\.)?nbc\.com/[^/]+/video/[^/]+/(?P<id>n?\d+))'
_TESTS = [
{
@@ -37,16 +35,6 @@ class NBCIE(AdobePassIE):
},
},
{
- 'url': 'http://www.nbc.com/the-tonight-show/episodes/176',
- 'info_dict': {
- 'id': '176',
- 'ext': 'flv',
- 'title': 'Ricky Gervais, Steven Van Zandt, ILoveMakonnen',
- 'description': 'A brand new episode of The Tonight Show welcomes Ricky Gervais, Steven Van Zandt and ILoveMakonnen.',
- },
- 'skip': '404 Not Found',
- },
- {
'url': 'http://www.nbc.com/saturday-night-live/video/star-wars-teaser/2832821',
'info_dict': {
'id': '2832821',
@@ -64,11 +52,6 @@ class NBCIE(AdobePassIE):
'skip': 'Only works from US',
},
{
- # This video has expired but with an escaped embedURL
- 'url': 'http://www.nbc.com/parenthood/episode-guide/season-5/just-like-at-home/515',
- 'only_matching': True,
- },
- {
# HLS streams requires the 'hdnea3' cookie
'url': 'http://www.nbc.com/Kings/video/goliath/n1806',
'info_dict': {
@@ -88,59 +71,38 @@ class NBCIE(AdobePassIE):
]
def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- info = {
+ permalink, video_id = re.match(self._VALID_URL, url).groups()
+ video_data = self._download_json(
+ 'https://api.nbc.com/v3/videos', video_id, query={
+ 'filter[permalink]': permalink,
+ })['data'][0]['attributes']
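+ # the v3 API is queried with the full page permalink captured by
+ # _VALID_URL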
+ query = {
+ 'mbr': 'true',
+ 'manifest': 'm3u',
+ }
+ video_id = video_data['guid']
+ title = video_data['title']
+ if video_data.get('entitlement') == 'auth':
+ resource = self._get_mvpd_resource(
+ 'nbcentertainment', title, video_id,
+ video_data.get('vChipRating'))
+ query['auth'] = self._extract_mvpd_auth(
+ url, video_id, 'nbcentertainment', resource)
+ theplatform_url = smuggle_url(update_url_query(
+ 'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id,
+ query), {'force_smil_url': True})
+ return {
'_type': 'url_transparent',
- 'ie_key': 'ThePlatform',
'id': video_id,
+ 'title': title,
+ 'url': theplatform_url,
+ 'description': video_data.get('description'),
+ 'keywords': video_data.get('keywords'),
+ 'season_number': int_or_none(video_data.get('seasonNumber')),
+ 'episode_number': int_or_none(video_data.get('episodeNumber')),
+ 'series': video_data.get('showName'),
+ 'ie_key': 'ThePlatform',
}
- video_data = None
- preload = self._search_regex(
- r'PRELOAD\s*=\s*({.+})', webpage, 'preload data', default=None)
- if preload:
- preload_data = self._parse_json(preload, video_id)
- path = compat_urllib_parse_urlparse(url).path.rstrip('/')
- entity_id = preload_data.get('xref', {}).get(path)
- video_data = preload_data.get('entities', {}).get(entity_id)
- if video_data:
- query = {
- 'mbr': 'true',
- 'manifest': 'm3u',
- }
- video_id = video_data['guid']
- title = video_data['title']
- if video_data.get('entitlement') == 'auth':
- resource = self._get_mvpd_resource(
- 'nbcentertainment', title, video_id,
- video_data.get('vChipRating'))
- query['auth'] = self._extract_mvpd_auth(
- url, video_id, 'nbcentertainment', resource)
- theplatform_url = smuggle_url(update_url_query(
- 'http://link.theplatform.com/s/NnzsPC/media/guid/2410887629/' + video_id,
- query), {'force_smil_url': True})
- info.update({
- 'id': video_id,
- 'title': title,
- 'url': theplatform_url,
- 'description': video_data.get('description'),
- 'keywords': video_data.get('keywords'),
- 'season_number': int_or_none(video_data.get('seasonNumber')),
- 'episode_number': int_or_none(video_data.get('episodeNumber')),
- 'series': video_data.get('showName'),
- })
- else:
- theplatform_url = unescapeHTML(lowercase_escape(self._html_search_regex(
- [
- r'(?:class="video-player video-player-full" data-mpx-url|class="player" src)="(.*?)"',
- r'<iframe[^>]+src="((?:https?:)?//player\.theplatform\.com/[^"]+)"',
- r'"embedURL"\s*:\s*"([^"]+)"'
- ],
- webpage, 'theplatform url').replace('_no_endcard', '').replace('\\/', '/')))
- if theplatform_url.startswith('//'):
- theplatform_url = 'http:' + theplatform_url
- info['url'] = smuggle_url(theplatform_url, {'source_url': url})
- return info
class NBCSportsVPlayerIE(InfoExtractor):
diff --git a/youtube_dl/extractor/nonktube.py b/youtube_dl/extractor/nonktube.py
new file mode 100644
index 0000000..63e58aa
--- /dev/null
+++ b/youtube_dl/extractor/nonktube.py
@@ -0,0 +1,33 @@
+from __future__ import unicode_literals
+
+from .nuevo import NuevoBaseIE
+
+
+class NonkTubeIE(NuevoBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?nonktube\.com/(?:(?:video|embed)/|media/nuevo/embed\.php\?.*?\bid=)(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://www.nonktube.com/video/118636/sensual-wife-uncensored-fucked-in-hairy-pussy-and-facialized',
+ 'info_dict': {
+ 'id': '118636',
+ 'ext': 'mp4',
+ 'title': 'Sensual Wife Uncensored Fucked In Hairy Pussy And Facialized',
+ 'age_limit': 18,
+ 'duration': 1150.98,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ }, {
+ 'url': 'https://www.nonktube.com/embed/118636',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ info = self._extract_nuevo(
+ 'https://www.nonktube.com/media/nuevo/econfig.php?key=%s'
+ % video_id, video_id)
+
+ info['age_limit'] = 18
+ return info
diff --git a/youtube_dl/extractor/noovo.py b/youtube_dl/extractor/noovo.py
new file mode 100644
index 0000000..f7fa098
--- /dev/null
+++ b/youtube_dl/extractor/noovo.py
@@ -0,0 +1,97 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .brightcove import BrightcoveNewIE
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ smuggle_url,
+ try_get,
+)
+
+
+class NoovoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:[^/]+\.)?noovo\.ca/videos/(?P<id>[^/]+/[^/?#&]+)'
+ _TESTS = [{
+ # clip
+ 'url': 'http://noovo.ca/videos/rpm-plus/chrysler-imperial',
+ 'info_dict': {
+ 'id': '5386045029001',
+ 'ext': 'mp4',
+ 'title': 'Chrysler Imperial',
+ 'description': 'md5:de3c898d1eb810f3e6243e08c8b4a056',
+ 'timestamp': 1491399228,
+ 'upload_date': '20170405',
+ 'uploader_id': '618566855001',
+ 'creator': 'vtele',
+ 'view_count': int,
+ 'series': 'RPM+',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # episode
+ 'url': 'http://noovo.ca/videos/l-amour-est-dans-le-pre/episode-13-8',
+ 'info_dict': {
+ 'id': '5395865725001',
+ 'title': 'Épisode 13 : Les retrouvailles',
+ 'description': 'md5:336d5ebc5436534e61d16e63ddfca327',
+ 'ext': 'mp4',
+ 'timestamp': 1492019320,
+ 'upload_date': '20170412',
+ 'uploader_id': '618566855001',
+ 'creator': 'vtele',
+ 'view_count': int,
+ 'series': "L'amour est dans le pré",
+ 'season_number': 5,
+ 'episode': 'Épisode 13',
+ 'episode_number': 13,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
+ BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/618566855001/default_default/index.html?videoId=%s'
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ data = self._download_json(
+ 'http://api.noovo.ca/api/v1/pages/single-episode/%s' % video_id,
+ video_id)['data']
+
+ content = try_get(data, lambda x: x['contents'][0])
+
+ brightcove_id = data.get('brightcoveId') or content['brightcoveId']
+
+ series = try_get(
+ data, (
+ lambda x: x['show']['title'],
+ lambda x: x['season']['show']['title']),
+ compat_str)
+
+ episode = None
+ og = data.get('og')
+ if isinstance(og, dict) and og.get('type') == 'video.episode':
+ episode = og.get('title')
+
+ video = content or data
+
+ return {
+ '_type': 'url_transparent',
+ 'ie_key': BrightcoveNewIE.ie_key(),
+ 'url': smuggle_url(
+ self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id,
+ {'geo_countries': ['CA']}),
+ 'id': brightcove_id,
+ 'title': video.get('title'),
+ 'creator': video.get('source'),
+ 'view_count': int_or_none(video.get('viewsCount')),
+ 'series': series,
+ 'season_number': int_or_none(try_get(
+ data, lambda x: x['season']['seasonNumber'])),
+ 'episode': episode,
+ 'episode_number': int_or_none(data.get('episodeNumber')),
+ }
diff --git a/youtube_dl/extractor/nowness.py b/youtube_dl/extractor/nowness.py
index b6c5ee6..f26dafb 100644
--- a/youtube_dl/extractor/nowness.py
+++ b/youtube_dl/extractor/nowness.py
@@ -28,7 +28,7 @@ class NownessBaseIE(InfoExtractor):
bc_url = BrightcoveLegacyIE._extract_brightcove_url(player_code)
if bc_url:
return self.url_result(bc_url, BrightcoveLegacyIE.ie_key())
- bc_url = BrightcoveNewIE._extract_url(player_code)
+ bc_url = BrightcoveNewIE._extract_url(self, player_code)
if bc_url:
return self.url_result(bc_url, BrightcoveNewIE.ie_key())
raise ExtractorError('Could not find player definition')
diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py
index 38fefe4..79296f0 100644
--- a/youtube_dl/extractor/npo.py
+++ b/youtube_dl/extractor/npo.py
@@ -313,9 +313,9 @@ class NPOIE(NPOBaseIE):
class NPOLiveIE(NPOBaseIE):
IE_NAME = 'npo.nl:live'
- _VALID_URL = r'https?://(?:www\.)?npo\.nl/live/(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?npo\.nl/live(?:/(?P<id>[^/?#&]+))?'
- _TEST = {
+ _TESTS = [{
'url': 'http://www.npo.nl/live/npo-1',
'info_dict': {
'id': 'LI_NL1_4188102',
@@ -327,10 +327,13 @@ class NPOLiveIE(NPOBaseIE):
'params': {
'skip_download': True,
}
- }
+ }, {
+ 'url': 'http://www.npo.nl/live',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
- display_id = self._match_id(url)
+ display_id = self._match_id(url) or 'npo-1'
webpage = self._download_webpage(url, display_id)
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
index 7fe79cb..3b4f51f 100644
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -148,13 +148,34 @@ class NRKBaseIE(InfoExtractor):
vcodec = 'none' if data.get('mediaType') == 'Audio' else None
- # TODO: extract chapters when https://github.com/rg3/youtube-dl/pull/9409 is merged
-
for entry in entries:
entry.update(common_info)
for f in entry['formats']:
f['vcodec'] = vcodec
+ points = data.get('shortIndexPoints')
+ if isinstance(points, list):
+ chapters = []
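+ # each index point marks a chapter start; a chapter ends where the
+ # next one begins, the last one at the total duration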
+ for next_num, point in enumerate(points, start=1):
+ if not isinstance(point, dict):
+ continue
+ start_time = parse_duration(point.get('startPoint'))
+ if start_time is None:
+ continue
+ end_time = parse_duration(
+ data.get('duration')
+ if next_num == len(points)
+ else points[next_num].get('startPoint'))
+ if end_time is None:
+ continue
+ chapters.append({
+ 'start_time': start_time,
+ 'end_time': end_time,
+ 'title': point.get('title'),
+ })
+ if chapters and len(entries) == 1:
+ entries[0]['chapters'] = chapters
+
return self.playlist_result(entries, video_id, title, description)
diff --git a/youtube_dl/extractor/nuevo.py b/youtube_dl/extractor/nuevo.py
index 87fb94d..be1e09d 100644
--- a/youtube_dl/extractor/nuevo.py
+++ b/youtube_dl/extractor/nuevo.py
@@ -10,9 +10,10 @@ from ..utils import (
class NuevoBaseIE(InfoExtractor):
- def _extract_nuevo(self, config_url, video_id):
+ def _extract_nuevo(self, config_url, video_id, headers={}):
config = self._download_xml(
- config_url, video_id, transform_source=lambda s: s.strip())
+ config_url, video_id, transform_source=lambda s: s.strip(),
+ headers=headers)
title = xpath_text(config, './title', 'title', fatal=True).strip()
video_id = xpath_text(config, './mediaid', default=video_id)
diff --git a/youtube_dl/extractor/odnoklassniki.py b/youtube_dl/extractor/odnoklassniki.py
index 986708e..854b680 100644
--- a/youtube_dl/extractor/odnoklassniki.py
+++ b/youtube_dl/extractor/odnoklassniki.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import (
+ compat_etree_fromstring,
compat_parse_qs,
compat_urllib_parse_unquote,
compat_urllib_parse_urlparse,
@@ -37,7 +38,7 @@ class OdnoklassnikiIE(InfoExtractor):
}, {
# metadataUrl
'url': 'http://ok.ru/video/63567059965189-0?fromTime=5',
- 'md5': '9676cf86eff5391d35dea675d224e131',
+ 'md5': '6ff470ea2dd51d5d18c295a355b0b6bc',
'info_dict': {
'id': '63567059965189-0',
'ext': 'mp4',
@@ -53,7 +54,7 @@ class OdnoklassnikiIE(InfoExtractor):
}, {
# YouTube embed (metadataUrl, provider == USER_YOUTUBE)
'url': 'http://ok.ru/video/64211978996595-1',
- 'md5': '5d7475d428845cd2e13bae6f1a992278',
+ 'md5': '2f206894ffb5dbfcce2c5a14b909eea5',
'info_dict': {
'id': '64211978996595-1',
'ext': 'mp4',
@@ -61,8 +62,8 @@ class OdnoklassnikiIE(InfoExtractor):
'description': 'md5:848eb8b85e5e3471a3a803dae1343ed0',
'duration': 440,
'upload_date': '20150826',
- 'uploader_id': '750099571',
- 'uploader': 'Алина П',
+ 'uploader_id': 'tvroscosmos',
+ 'uploader': 'Телестудия Роскосмоса',
'age_limit': 0,
},
}, {
@@ -81,6 +82,7 @@ class OdnoklassnikiIE(InfoExtractor):
'params': {
'skip_download': True,
},
+ 'skip': 'Video has not been found',
}, {
'url': 'http://ok.ru/web-api/video/moviePlayer/20079905452',
'only_matching': True,
@@ -176,14 +178,32 @@ class OdnoklassnikiIE(InfoExtractor):
})
return info
- quality = qualities(('mobile', 'lowest', 'low', 'sd', 'hd'))
+ quality = qualities(('4', '0', '1', '2', '3', '5'))
formats = [{
'url': f['url'],
'ext': 'mp4',
'format_id': f['name'],
- 'quality': quality(f['name']),
} for f in metadata['videos']]
+
+ m3u8_url = metadata.get('hlsManifestUrl')
+ if m3u8_url:
+ formats.extend(self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+
+ dash_manifest = metadata.get('metadataEmbedded')
+ if dash_manifest:
+ formats.extend(self._parse_mpd_formats(
+ compat_etree_fromstring(dash_manifest), 'mpd'))
+
+ for fmt in formats:
+ fmt_type = self._search_regex(
+ r'\btype[/=](\d)', fmt['url'],
+ 'format type', default=None)
+ if fmt_type:
+ fmt['quality'] = quality(fmt_type)
+
self._sort_formats(formats)
info['formats'] = formats
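
The odnoklassniki.py change above stops ranking formats by name and instead ranks them by the numeric type marker embedded in each format URL. A standalone illustration, with qualities() mirroring youtube_dl.utils.qualities (a later position in the preference tuple means a higher preference) and made-up URLs:

import re

def qualities(quality_ids):
    def q(qid):
        try:
            return quality_ids.index(qid)
        except ValueError:
            return -1
    return q

quality = qualities(('4', '0', '1', '2', '3', '5'))

for url in ('https://vd.example.net/?expires=1&type=5',
            'https://vd.example.net/video/type/0/abc'):
    fmt_type = re.search(r'\btype[/=](\d)', url)
    if fmt_type:
        print(url, '->', quality(fmt_type.group(1)))
# type=5 gets the highest preference (5); type=0 sits near the bottom (1)
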
diff --git a/youtube_dl/extractor/openload.py b/youtube_dl/extractor/openload.py
index 58ffde5..d8036b5 100644
--- a/youtube_dl/extractor/openload.py
+++ b/youtube_dl/extractor/openload.py
@@ -75,51 +75,38 @@ class OpenloadIE(InfoExtractor):
'<span[^>]+id="[^"]+"[^>]*>([0-9A-Za-z]+)</span>',
webpage, 'openload ID')
- video_url_chars = []
-
- first_char = ord(ol_id[0])
- key = first_char - 55
- maxKey = max(2, key)
- key = min(maxKey, len(ol_id) - 38)
- t = ol_id[key:key + 36]
-
- hashMap = {}
- v = ol_id.replace(t, '')
- h = 0
-
- while h < len(t):
- f = t[h:h + 3]
- i = int(f, 8)
- hashMap[h / 3] = i
- h += 3
-
- h = 0
- H = 0
- while h < len(v):
- B = ''
- C = ''
- if len(v) >= h + 2:
- B = v[h:h + 2]
- if len(v) >= h + 3:
- C = v[h:h + 3]
- i = int(B, 16)
- h += 2
- if H % 3 == 0:
- i = int(C, 8)
- h += 1
- elif H % 2 == 0 and H != 0 and ord(v[H - 1]) < 60:
- i = int(C, 10)
- h += 1
- index = H % 7
-
- A = hashMap[index]
- i ^= 213
- i ^= A
- video_url_chars.append(compat_chr(i))
- H += 1
+        decoded = ''
+        a = ol_id[0:24]
+        b = []
+        for i in range(0, len(a), 8):
+            b.append(int(a[i:i + 8] or '0', 16))
+        ol_id = ol_id[24:]
+        j = 0
+        k = 0
+        while j < len(ol_id):
+            c = 128
+            d = 0
+            e = 0
+            f = 0
+            _more = True
+            while _more:
+                if j + 1 >= len(ol_id):
+                    c = 143
+                f = int(ol_id[j:j + 2] or '0', 16)
+                j += 2
+                d += (f & 127) << e
+                e += 7
+                _more = f >= c
+            g = d ^ b[k % 3]
+            for i in range(4):
+                char_dec = (g >> 8 * i) & (c + 127)
+                char = compat_chr(char_dec)
+                if char != '#':
+                    decoded += char
+            k += 1
video_url = 'https://openload.co/stream/%s?mime=true'
- video_url = video_url % (''.join(video_url_chars))
+ video_url = video_url % decoded
title = self._og_search_title(webpage, default=None) or self._search_regex(
r'<span[^>]+class=["\']title["\'][^>]*>([^<]+)', webpage,
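
Rewritten as a self-contained function with comments, the new openload routine above is a little-endian base-128 varint decoder: the first 24 hex characters yield three 32-bit XOR keys, the remainder is a stream of varints, and each decoded word is XORed with the keys in rotation and emitted byte by byte, with '#' bytes discarded as padding (this reading is inferred from the code itself, not from any openload documentation):

def decode_openload_id(ol_id):
    # three 32-bit keys from the 24-char hex prefix
    keys = [int(ol_id[i:i + 8] or '0', 16) for i in range(0, 24, 8)]
    data = ol_id[24:]
    decoded = ''
    j = 0
    k = 0
    while j < len(data):
        c = 128
        d = 0
        e = 0
        _more = True
        while _more:
            if j + 1 >= len(data):
                c = 143  # relaxed continuation threshold on the final byte
            f = int(data[j:j + 2] or '0', 16)
            j += 2
            d += (f & 127) << e  # accumulate 7 payload bits per byte
            e += 7
            _more = f >= c  # high bit set means another byte follows
        g = d ^ keys[k % 3]
        for i in range(4):
            # c + 127 is 255 except on the tail, mirroring the original
            char = chr((g >> 8 * i) & (c + 127))
            if char != '#':
                decoded += char
        k += 1
    return decoded
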
diff --git a/youtube_dl/extractor/orf.py b/youtube_dl/extractor/orf.py
index 1e2c54e..cc296ea 100644
--- a/youtube_dl/extractor/orf.py
+++ b/youtube_dl/extractor/orf.py
@@ -2,8 +2,6 @@
from __future__ import unicode_literals
import re
-import calendar
-import datetime
from .common import InfoExtractor
from ..compat import compat_str
@@ -144,77 +142,25 @@ class ORFTVthekIE(InfoExtractor):
}
-class ORFOE1IE(InfoExtractor):
- IE_NAME = 'orf:oe1'
- IE_DESC = 'Radio Österreich 1'
- _VALID_URL = r'https?://oe1\.orf\.at/(?:programm/|konsole\?.*?\btrack_id=)(?P<id>[0-9]+)'
-
- # Audios on ORF radio are only available for 7 days, so we can't add tests.
- _TESTS = [{
- 'url': 'http://oe1.orf.at/konsole?show=on_demand#?track_id=394211',
- 'only_matching': True,
- }, {
- 'url': 'http://oe1.orf.at/konsole?show=ondemand&track_id=443608&load_day=/programm/konsole/tag/20160726',
- 'only_matching': True,
- }]
-
- def _real_extract(self, url):
- show_id = self._match_id(url)
- data = self._download_json(
- 'http://oe1.orf.at/programm/%s/konsole' % show_id,
- show_id
- )
-
- timestamp = datetime.datetime.strptime('%s %s' % (
- data['item']['day_label'],
- data['item']['time']
- ), '%d.%m.%Y %H:%M')
- unix_timestamp = calendar.timegm(timestamp.utctimetuple())
-
- return {
- 'id': show_id,
- 'title': data['item']['title'],
- 'url': data['item']['url_stream'],
- 'ext': 'mp3',
- 'description': data['item'].get('info'),
- 'timestamp': unix_timestamp
- }
-
-
-class ORFFM4IE(InfoExtractor):
- IE_NAME = 'orf:fm4'
- IE_DESC = 'radio FM4'
- _VALID_URL = r'https?://fm4\.orf\.at/(?:7tage/?#|player/)(?P<date>[0-9]+)/(?P<show>\w+)'
-
- _TEST = {
- 'url': 'http://fm4.orf.at/player/20160110/IS/',
- 'md5': '01e736e8f1cef7e13246e880a59ad298',
- 'info_dict': {
- 'id': '2016-01-10_2100_tl_54_7DaysSun13_11244',
- 'ext': 'mp3',
- 'title': 'Im Sumpf',
- 'description': 'md5:384c543f866c4e422a55f66a62d669cd',
- 'duration': 7173,
- 'timestamp': 1452456073,
- 'upload_date': '20160110',
- },
- 'skip': 'Live streams on FM4 got deleted soon',
- }
-
+class ORFRadioIE(InfoExtractor):
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
+ station = mobj.group('station')
show_date = mobj.group('date')
show_id = mobj.group('show')
+ if station == 'fm4':
+ show_id = '4%s' % show_id
+
data = self._download_json(
- 'http://audioapi.orf.at/fm4/json/2.0/broadcasts/%s/4%s' % (show_date, show_id),
+ 'http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s' % (station, show_id, show_date),
show_id
)
def extract_entry_dict(info, title, subtitle):
return {
'id': info['loopStreamId'].replace('.mp3', ''),
- 'url': 'http://loopstream01.apa.at/?channel=fm4&id=%s' % info['loopStreamId'],
+ 'url': 'http://loopstream01.apa.at/?channel=%s&id=%s' % (station, info['loopStreamId']),
'title': title,
'description': subtitle,
'duration': (info['end'] - info['start']) / 1000,
@@ -233,6 +179,47 @@ class ORFFM4IE(InfoExtractor):
}
+class ORFFM4IE(ORFRadioIE):
+ IE_NAME = 'orf:fm4'
+ IE_DESC = 'radio FM4'
+ _VALID_URL = r'https?://(?P<station>fm4)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
+
+ _TEST = {
+ 'url': 'http://fm4.orf.at/player/20170107/CC',
+ 'md5': '2b0be47375432a7ef104453432a19212',
+ 'info_dict': {
+ 'id': '2017-01-07_2100_tl_54_7DaysSat18_31295',
+ 'ext': 'mp3',
+ 'title': 'Solid Steel Radioshow',
+ 'description': 'Die Mixshow von Coldcut und Ninja Tune.',
+ 'duration': 3599,
+ 'timestamp': 1483819257,
+ 'upload_date': '20170107',
+ },
+ 'skip': 'Shows from ORF radios are only available for 7 days.'
+ }
+
+
+class ORFOE1IE(ORFRadioIE):
+ IE_NAME = 'orf:oe1'
+ IE_DESC = 'Radio Österreich 1'
+ _VALID_URL = r'https?://(?P<station>oe1)\.orf\.at/player/(?P<date>[0-9]+)/(?P<show>\w+)'
+
+ _TEST = {
+ 'url': 'http://oe1.orf.at/player/20170108/456544',
+ 'md5': '34d8a6e67ea888293741c86a099b745b',
+ 'info_dict': {
+ 'id': '2017-01-08_0759_tl_51_7DaysSun6_256141',
+ 'ext': 'mp3',
+ 'title': 'Morgenjournal',
+ 'duration': 609,
+ 'timestamp': 1483858796,
+ 'upload_date': '20170108',
+ },
+ 'skip': 'Shows from ORF radios are only available for 7 days.'
+ }
+
+
class ORFIPTVIE(InfoExtractor):
IE_NAME = 'orf:iptv'
IE_DESC = 'iptv.ORF.at'
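
The net effect of the orf.py refactor above is that ORFFM4IE and ORFOE1IE differ only in their URL patterns and the '4' prefix on FM4 show ids; everything else lives in the shared ORFRadioIE. A sketch of the two URL templates the shared extractor derives (the show data here is a made-up example):

def orf_radio_urls(station, show_id, show_date, loop_stream_id):
    if station == 'fm4':
        show_id = '4%s' % show_id  # FM4 show ids carry a '4' prefix
    api_url = ('http://audioapi.orf.at/%s/api/json/current/broadcast/%s/%s'
               % (station, show_id, show_date))
    # each broadcast part is served via the loopstream endpoint
    stream_url = ('http://loopstream01.apa.at/?channel=%s&id=%s'
                  % (station, loop_stream_id))
    return api_url, stream_url

print(orf_radio_urls(
    'fm4', 'CC', '20170107', '2017-01-07_2100_tl_54_7DaysSat18_31295.mp3'))
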
diff --git a/youtube_dl/extractor/packtpub.py b/youtube_dl/extractor/packtpub.py
new file mode 100644
index 0000000..bb668c9
--- /dev/null
+++ b/youtube_dl/extractor/packtpub.py
@@ -0,0 +1,171 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_str,
+ compat_HTTPError,
+)
+from ..utils import (
+ clean_html,
+ ExtractorError,
+ remove_end,
+ strip_or_none,
+ unified_timestamp,
+ urljoin,
+ urlencode_postdata,
+)
+
+
+class PacktPubBaseIE(InfoExtractor):
+ _PACKT_BASE = 'https://www.packtpub.com'
+ _MAPT_REST = '%s/mapt-rest' % _PACKT_BASE
+
+
+class PacktPubIE(PacktPubBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?packtpub\.com/mapt/video/[^/]+/(?P<course_id>\d+)/(?P<chapter_id>\d+)/(?P<id>\d+)'
+
+ _TEST = {
+ 'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215/20528/20530/Project+Intro',
+ 'md5': '1e74bd6cfd45d7d07666f4684ef58f70',
+ 'info_dict': {
+ 'id': '20530',
+ 'ext': 'mp4',
+ 'title': 'Project Intro',
+ 'thumbnail': r're:(?i)^https?://.*\.jpg',
+ 'timestamp': 1490918400,
+ 'upload_date': '20170331',
+ },
+ }
+ _NETRC_MACHINE = 'packtpub'
+ _TOKEN = None
+
+ def _real_initialize(self):
+ (username, password) = self._get_login_info()
+ if username is None:
+ return
+ webpage = self._download_webpage(self._PACKT_BASE, None)
+ login_form = self._form_hidden_inputs(
+ 'packt-user-login-form', webpage)
+ login_form.update({
+ 'email': username,
+ 'password': password,
+ })
+ self._download_webpage(
+ self._PACKT_BASE, None, 'Logging in as %s' % username,
+ data=urlencode_postdata(login_form))
+ try:
+ self._TOKEN = self._download_json(
+ '%s/users/tokens/sessions' % self._MAPT_REST, None,
+ 'Downloading Authorization Token')['data']['token']
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code in (401, 404):
+ message = self._parse_json(e.cause.read().decode(), None)['message']
+ raise ExtractorError(message, expected=True)
+ raise
+
+ def _handle_error(self, response):
+ if response.get('status') != 'success':
+ raise ExtractorError(
+ '%s said: %s' % (self.IE_NAME, response['message']),
+ expected=True)
+
+ def _download_json(self, *args, **kwargs):
+ response = super(PacktPubIE, self)._download_json(*args, **kwargs)
+ self._handle_error(response)
+ return response
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ course_id, chapter_id, video_id = mobj.group(
+ 'course_id', 'chapter_id', 'id')
+
+ headers = {}
+ if self._TOKEN:
+ headers['Authorization'] = self._TOKEN
+ video = self._download_json(
+ '%s/users/me/products/%s/chapters/%s/sections/%s'
+ % (self._MAPT_REST, course_id, chapter_id, video_id), video_id,
+ 'Downloading JSON video', headers=headers)['data']
+
+ content = video.get('content')
+ if not content:
+ self.raise_login_required('This video is locked')
+
+ video_url = content['file']
+
+ metadata = self._download_json(
+ '%s/products/%s/chapters/%s/sections/%s/metadata'
+ % (self._MAPT_REST, course_id, chapter_id, video_id),
+ video_id)['data']
+
+ title = metadata['pageTitle']
+ course_title = metadata.get('title')
+ if course_title:
+ title = remove_end(title, ' - %s' % course_title)
+ timestamp = unified_timestamp(metadata.get('publicationDate'))
+ thumbnail = urljoin(self._PACKT_BASE, metadata.get('filepath'))
+
+ return {
+ 'id': video_id,
+ 'url': video_url,
+ 'title': title,
+ 'thumbnail': thumbnail,
+ 'timestamp': timestamp,
+ }
+
+
+class PacktPubCourseIE(PacktPubBaseIE):
+ _VALID_URL = r'(?P<url>https?://(?:www\.)?packtpub\.com/mapt/video/[^/]+/(?P<id>\d+))'
+ _TEST = {
+ 'url': 'https://www.packtpub.com/mapt/video/web-development/9781787122215',
+ 'info_dict': {
+ 'id': '9781787122215',
+ 'title': 'Learn Nodejs by building 12 projects [Video]',
+ },
+ 'playlist_count': 90,
+ }
+
+ @classmethod
+ def suitable(cls, url):
+ return False if PacktPubIE.suitable(url) else super(
+ PacktPubCourseIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ url, course_id = mobj.group('url', 'id')
+
+ course = self._download_json(
+ '%s/products/%s/metadata' % (self._MAPT_REST, course_id),
+ course_id)['data']
+
+ entries = []
+ for chapter_num, chapter in enumerate(course['tableOfContents'], 1):
+ if chapter.get('type') != 'chapter':
+ continue
+ children = chapter.get('children')
+ if not isinstance(children, list):
+ continue
+ chapter_info = {
+ 'chapter': chapter.get('title'),
+ 'chapter_number': chapter_num,
+ 'chapter_id': chapter.get('id'),
+ }
+ for section in children:
+ if section.get('type') != 'section':
+ continue
+ section_url = section.get('seoUrl')
+ if not isinstance(section_url, compat_str):
+ continue
+ entry = {
+ '_type': 'url_transparent',
+ 'url': urljoin(url + '/', section_url),
+ 'title': strip_or_none(section.get('title')),
+ 'description': clean_html(section.get('summary')),
+ 'ie_key': PacktPubIE.ie_key(),
+ }
+ entry.update(chapter_info)
+ entries.append(entry)
+
+ return self.playlist_result(entries, course_id, course.get('title'))
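
A hedged sketch of the request flow the new PacktPubIE implements, written against the requests library rather than the extractor plumbing; the endpoint paths and the status/message envelope mirror the diff, while the ids and token are placeholders:

import requests  # assumption: plain requests instead of InfoExtractor helpers

MAPT_REST = 'https://www.packtpub.com/mapt-rest'

def fetch_video_data(course_id, chapter_id, video_id, token=None):
    # Authorization is only sent once a session token has been obtained,
    # as _real_initialize does via POST /users/tokens/sessions
    headers = {'Authorization': token} if token else {}
    response = requests.get(
        '%s/users/me/products/%s/chapters/%s/sections/%s'
        % (MAPT_REST, course_id, chapter_id, video_id),
        headers=headers).json()
    if response.get('status') != 'success':  # mirrors _handle_error
        raise RuntimeError(
            '%s said: %s' % ('packtpub', response.get('message')))
    return response['data']  # data['content']['file'] holds the video URL
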
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index 3e51b4d..16cc667 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -8,7 +8,9 @@ from ..utils import (
ExtractorError,
determine_ext,
int_or_none,
+ float_or_none,
js_to_json,
+ orderedSet,
strip_jsonp,
strip_or_none,
unified_strdate,
@@ -264,6 +266,13 @@ class PBSIE(InfoExtractor):
'playlist_count': 2,
},
{
+ 'url': 'http://www.pbs.org/wgbh/americanexperience/films/great-war/',
+ 'info_dict': {
+ 'id': 'great-war',
+ },
+ 'playlist_count': 3,
+ },
+ {
'url': 'http://www.pbs.org/wgbh/americanexperience/films/death/player/',
'info_dict': {
'id': '2276541483',
@@ -381,10 +390,10 @@ class PBSIE(InfoExtractor):
# tabbed frontline videos
MULTI_PART_REGEXES = (
r'<div[^>]+class="videotab[^"]*"[^>]+vid="(\d+)"',
- r'<a[^>]+href=["\']#video-\d+["\'][^>]+data-coveid=["\'](\d+)',
+ r'<a[^>]+href=["\']#(?:video-|part)\d+["\'][^>]+data-cove[Ii]d=["\'](\d+)',
)
for p in MULTI_PART_REGEXES:
- tabbed_videos = re.findall(p, webpage)
+ tabbed_videos = orderedSet(re.findall(p, webpage))
if tabbed_videos:
return tabbed_videos, presumptive_id, upload_date, description
@@ -464,6 +473,7 @@ class PBSIE(InfoExtractor):
redirects.append(redirect)
redirect_urls.add(redirect_url)
+ chapters = []
# Player pages may also serve different qualities
for page in ('widget/partnerplayer', 'portalplayer'):
player = self._download_webpage(
@@ -479,6 +489,20 @@ class PBSIE(InfoExtractor):
extract_redirect_urls(video_info)
if not info:
info = video_info
+ if not chapters:
+ for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player):
+ chapter = self._parse_json(chapter_data, video_id, js_to_json, fatal=False)
+ if not chapter:
+ continue
+ start_time = float_or_none(chapter.get('start_time'), 1000)
+ duration = float_or_none(chapter.get('duration'), 1000)
+ if start_time is None or duration is None:
+ continue
+ chapters.append({
+ 'start_time': start_time,
+ 'end_time': start_time + duration,
+ 'title': chapter.get('title'),
+ })
formats = []
http_url = None
@@ -515,7 +539,7 @@ class PBSIE(InfoExtractor):
http_url = format_url
self._remove_duplicate_formats(formats)
m3u8_formats = list(filter(
- lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
formats))
if http_url:
for m3u8_format in m3u8_formats:
@@ -588,4 +612,5 @@ class PBSIE(InfoExtractor):
'upload_date': upload_date,
'formats': formats,
'subtitles': subtitles,
+ 'chapters': chapters,
}
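
An isolated illustration of the chapter scraping this pbs.py hunk adds: chapters.push({...}) calls are lifted out of the player JavaScript, the JS object literals are converted to JSON, and the millisecond fields become start/end seconds. The player snippet below is fabricated, and the bare-key quoting is a toy substitute for youtube_dl's js_to_json:

import json
import re

player = '''
chapters.push({ start_time: 0, duration: 90000, title: "Part 1" });
chapters.push({ start_time: 90000, duration: 45000, title: "Part 2" });
'''

chapters = []
for chapter_data in re.findall(r'(?s)chapters\.push\(({.*?})\)', player):
    chapter = json.loads(re.sub(r'(\w+)\s*:', r'"\1":', chapter_data))
    start_time = chapter['start_time'] / 1000.0
    chapters.append({
        'start_time': start_time,
        'end_time': start_time + chapter['duration'] / 1000.0,
        'title': chapter.get('title'),
    })
print(chapters)
# [{'start_time': 0.0, 'end_time': 90.0, 'title': 'Part 1'},
#  {'start_time': 90.0, 'end_time': 135.0, 'title': 'Part 2'}]
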
diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py
index 0e36230..1add6b8 100644
--- a/youtube_dl/extractor/periscope.py
+++ b/youtube_dl/extractor/periscope.py
@@ -20,7 +20,7 @@ class PeriscopeBaseIE(InfoExtractor):
class PeriscopeIE(PeriscopeBaseIE):
IE_DESC = 'Periscope'
IE_NAME = 'periscope'
- _VALID_URL = r'https?://(?:www\.)?periscope\.tv/[^/]+/(?P<id>[^/?#]+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/[^/]+/(?P<id>[^/?#]+)'
# Alive example URLs can be found here http://onperiscope.com/
_TESTS = [{
'url': 'https://www.periscope.tv/w/aJUQnjY3MjA3ODF8NTYxMDIyMDl2zCg2pECBgwTqRpQuQD352EMPTKQjT4uqlM3cgWFA-g==',
@@ -41,6 +41,9 @@ class PeriscopeIE(PeriscopeBaseIE):
}, {
'url': 'https://www.periscope.tv/bastaakanoggano/1OdKrlkZZjOJX',
'only_matching': True,
+ }, {
+ 'url': 'https://www.periscope.tv/w/1ZkKzPbMVggJv',
+ 'only_matching': True,
}]
@staticmethod
@@ -103,7 +106,7 @@ class PeriscopeIE(PeriscopeBaseIE):
class PeriscopeUserIE(PeriscopeBaseIE):
- _VALID_URL = r'https?://(?:www\.)?periscope\.tv/(?P<id>[^/]+)/?$'
+ _VALID_URL = r'https?://(?:www\.)?(?:periscope|pscp)\.tv/(?P<id>[^/]+)/?$'
IE_DESC = 'Periscope user videos'
IE_NAME = 'periscope:user'
diff --git a/youtube_dl/extractor/porn91.py b/youtube_dl/extractor/porn91.py
index 073fc3e..24c3600 100644
--- a/youtube_dl/extractor/porn91.py
+++ b/youtube_dl/extractor/porn91.py
@@ -1,10 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-from ..compat import (
- compat_urllib_parse_unquote,
- compat_urllib_parse_urlencode,
-)
from .common import InfoExtractor
from ..utils import (
parse_duration,
@@ -19,7 +15,7 @@ class Porn91IE(InfoExtractor):
_TEST = {
'url': 'http://91porn.com/view_video.php?viewkey=7e42283b4f5ab36da134',
- 'md5': '6df8f6d028bc8b14f5dbd73af742fb20',
+ 'md5': '7fcdb5349354f40d41689bd0fa8db05a',
'info_dict': {
'id': '7e42283b4f5ab36da134',
'title': '18岁大一漂亮学妹,水嫩性感,再爽一次!',
@@ -43,24 +39,7 @@ class Porn91IE(InfoExtractor):
r'<div id="viewvideo-title">([^<]+)</div>', webpage, 'title')
title = title.replace('\n', '')
- # get real url
- file_id = self._search_regex(
- r'so.addVariable\(\'file\',\'(\d+)\'', webpage, 'file id')
- sec_code = self._search_regex(
- r'so.addVariable\(\'seccode\',\'([^\']+)\'', webpage, 'sec code')
- max_vid = self._search_regex(
- r'so.addVariable\(\'max_vid\',\'(\d+)\'', webpage, 'max vid')
- url_params = compat_urllib_parse_urlencode({
- 'VID': file_id,
- 'mp4': '1',
- 'seccode': sec_code,
- 'max_vid': max_vid,
- })
- info_cn = self._download_webpage(
- 'http://91porn.com/getfile.php?' + url_params, video_id,
- 'Downloading real video url')
- video_url = compat_urllib_parse_unquote(self._search_regex(
- r'file=([^&]+)&', info_cn, 'url'))
+ info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0]
duration = parse_duration(self._search_regex(
r'时长:\s*</span>\s*(\d+:\d+)', webpage, 'duration', fatal=False))
@@ -68,11 +47,12 @@ class Porn91IE(InfoExtractor):
comment_count = int_or_none(self._search_regex(
r'留言:\s*</span>\s*(\d+)', webpage, 'comment count', fatal=False))
- return {
+ info_dict.update({
'id': video_id,
'title': title,
- 'url': video_url,
'duration': duration,
'comment_count': comment_count,
'age_limit': self._rta_search(webpage),
- }
+ })
+
+ return info_dict
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index b25f1f1..1dcc8df 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -33,7 +33,7 @@ class PornHubIE(InfoExtractor):
_VALID_URL = r'''(?x)
https?://
(?:
- (?:[a-z]+\.)?pornhub\.com/(?:view_video\.php\?viewkey=|embed/)|
+ (?:[a-z]+\.)?pornhub\.com/(?:(?:view_video\.php|video/show)\?viewkey=|embed/)|
(?:www\.)?thumbzilla\.com/video/
)
(?P<id>[\da-z]+)
@@ -97,6 +97,9 @@ class PornHubIE(InfoExtractor):
}, {
'url': 'https://www.thumbzilla.com/video/ph56c6114abd99a/horny-girlfriend-sex',
'only_matching': True,
+ }, {
+ 'url': 'http://www.pornhub.com/video/show?viewkey=648719015',
+ 'only_matching': True,
}]
@staticmethod
diff --git a/youtube_dl/extractor/r7.py b/youtube_dl/extractor/r7.py
index ed38c77..e2202d6 100644
--- a/youtube_dl/extractor/r7.py
+++ b/youtube_dl/extractor/r7.py
@@ -62,8 +62,7 @@ class R7IE(InfoExtractor):
# m3u8 format always matches the http format, let's copy metadata from
# one to another
m3u8_formats = list(filter(
- lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
- formats))
+ lambda f: f.get('vcodec') != 'none', formats))
if len(m3u8_formats) == 1:
f_copy = m3u8_formats[0].copy()
f_copy.update(f)
diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py
index 41afbd9..81eb9db 100644
--- a/youtube_dl/extractor/rai.py
+++ b/youtube_dl/extractor/rai.py
@@ -1,23 +1,40 @@
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
-from ..compat import compat_urlparse
+from ..compat import (
+ compat_urlparse,
+ compat_str,
+)
from ..utils import (
- determine_ext,
ExtractorError,
+ determine_ext,
find_xpath_attr,
fix_xml_ampersands,
+ GeoRestrictedError,
int_or_none,
parse_duration,
+ strip_or_none,
+ try_get,
unified_strdate,
+ unified_timestamp,
update_url_query,
+ urljoin,
xpath_text,
)
class RaiBaseIE(InfoExtractor):
- def _extract_relinker_formats(self, relinker_url, video_id):
+ _UUID_RE = r'[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
+ _GEO_COUNTRIES = ['IT']
+ _GEO_BYPASS = False
+
+ def _extract_relinker_info(self, relinker_url, video_id):
formats = []
+ geoprotection = None
+ is_live = None
+ duration = None
for platform in ('mon', 'flash', 'native'):
relinker = self._download_xml(
@@ -27,9 +44,27 @@ class RaiBaseIE(InfoExtractor):
query={'output': 45, 'pl': platform},
headers=self.geo_verification_headers())
- media_url = find_xpath_attr(relinker, './url', 'type', 'content').text
+ if not geoprotection:
+ geoprotection = xpath_text(
+ relinker, './geoprotection', default=None) == 'Y'
+
+ if not is_live:
+ is_live = xpath_text(
+ relinker, './is_live', default=None) == 'Y'
+ if not duration:
+ duration = parse_duration(xpath_text(
+ relinker, './duration', default=None))
+
+ url_elem = find_xpath_attr(relinker, './url', 'type', 'content')
+ if url_elem is None:
+ continue
+
+ media_url = url_elem.text
+
+ # This does not imply geo restriction (e.g.
+ # http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html)
if media_url == 'http://download.rai.it/video_no_available.mp4':
- self.raise_geo_restricted()
+ continue
ext = determine_ext(media_url)
if (ext == 'm3u8' and platform != 'mon') or (ext == 'f4m' and platform != 'flash'):
@@ -53,215 +88,333 @@ class RaiBaseIE(InfoExtractor):
'format_id': 'http-%d' % bitrate if bitrate > 0 else 'http',
})
- return formats
+ if not formats and geoprotection is True:
+ self.raise_geo_restricted(countries=self._GEO_COUNTRIES)
+
+ return dict((k, v) for k, v in {
+ 'is_live': is_live,
+ 'duration': duration,
+ 'formats': formats,
+ }.items() if v is not None)
+
+ @staticmethod
+ def _extract_subtitles(url, subtitle_url):
+ subtitles = {}
+ if subtitle_url and isinstance(subtitle_url, compat_str):
+ subtitle_url = urljoin(url, subtitle_url)
+ STL_EXT = '.stl'
+ SRT_EXT = '.srt'
+ subtitles['it'] = [{
+ 'ext': 'stl',
+ 'url': subtitle_url,
+ }]
+ if subtitle_url.endswith(STL_EXT):
+ srt_url = subtitle_url[:-len(STL_EXT)] + SRT_EXT
+ subtitles['it'].append({
+ 'ext': 'srt',
+ 'url': srt_url,
+ })
+ return subtitles
+
+
+class RaiPlayIE(RaiBaseIE):
+ _VALID_URL = r'(?P<url>https?://(?:www\.)?raiplay\.it/.+?-(?P<id>%s)\.html)' % RaiBaseIE._UUID_RE
+ _TESTS = [{
+ 'url': 'http://www.raiplay.it/video/2016/10/La-Casa-Bianca-e06118bb-59a9-4636-b914-498e4cfd2c66.html?source=twitter',
+ 'md5': '340aa3b7afb54bfd14a8c11786450d76',
+ 'info_dict': {
+ 'id': 'e06118bb-59a9-4636-b914-498e4cfd2c66',
+ 'ext': 'mp4',
+ 'title': 'La Casa Bianca',
+ 'alt_title': 'S2016 - Puntata del 23/10/2016',
+ 'description': 'md5:a09d45890850458077d1f68bb036e0a5',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Rai 3',
+ 'creator': 'Rai 3',
+ 'duration': 3278,
+ 'timestamp': 1477764300,
+ 'upload_date': '20161029',
+ 'series': 'La Casa Bianca',
+ 'season': '2016',
+ },
+ }, {
+ 'url': 'http://www.raiplay.it/video/2014/04/Report-del-07042014-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
+ 'md5': '8970abf8caf8aef4696e7b1f2adfc696',
+ 'info_dict': {
+ 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',
+ 'ext': 'mp4',
+ 'title': 'Report del 07/04/2014',
+ 'alt_title': 'S2013/14 - Puntata del 07/04/2014',
+ 'description': 'md5:f27c544694cacb46a078db84ec35d2d9',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'uploader': 'Rai 5',
+ 'creator': 'Rai 5',
+ 'duration': 6160,
+ 'series': 'Report',
+ 'season_number': 5,
+ 'season': '2013/14',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.raiplay.it/video/2016/11/gazebotraindesi-efebe701-969c-4593-92f3-285f0d1ce750.html?',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ url, video_id = mobj.group('url', 'id')
- def _extract_from_content_id(self, content_id, base_url):
+ media = self._download_json(
+ '%s?json' % url, video_id, 'Downloading video JSON')
+
+ title = media['name']
+
+ video = media['video']
+
+ relinker_info = self._extract_relinker_info(video['contentUrl'], video_id)
+ self._sort_formats(relinker_info['formats'])
+
+ thumbnails = []
+ if 'images' in media:
+ for _, value in media.get('images').items():
+ if value:
+ thumbnails.append({
+ 'url': value.replace('[RESOLUTION]', '600x400')
+ })
+
+ timestamp = unified_timestamp(try_get(
+ media, lambda x: x['availabilities'][0]['start'], compat_str))
+
+ subtitles = self._extract_subtitles(url, video.get('subtitles'))
+
+ info = {
+ 'id': video_id,
+ 'title': title,
+ 'alt_title': media.get('subtitle'),
+ 'description': media.get('description'),
+ 'uploader': media.get('channel'),
+ 'creator': media.get('editor'),
+ 'duration': parse_duration(video.get('duration')),
+ 'timestamp': timestamp,
+ 'thumbnails': thumbnails,
+ 'series': try_get(
+ media, lambda x: x['isPartOf']['name'], compat_str),
+ 'season_number': int_or_none(try_get(
+ media, lambda x: x['isPartOf']['numeroStagioni'])),
+ 'season': media.get('stagione') or None,
+ 'subtitles': subtitles,
+ }
+
+ info.update(relinker_info)
+
+ return info
+
+
+class RaiIE(RaiBaseIE):
+ _VALID_URL = r'https?://[^/]+\.(?:rai\.(?:it|tv)|rainews\.it)/dl/.+?-(?P<id>%s)(?:-.+?)?\.html' % RaiBaseIE._UUID_RE
+ _TESTS = [{
+ # var uniquename = "ContentItem-..."
+ # data-id="ContentItem-..."
+ 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html',
+ 'info_dict': {
+ 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9',
+ 'ext': 'mp4',
+ 'title': 'TG PRIMO TEMPO',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 1758,
+ 'upload_date': '20140612',
+ }
+ }, {
+ # with ContentItem in many metas
+ 'url': 'http://www.rainews.it/dl/rainews/media/Weekend-al-cinema-da-Hollywood-arriva-il-thriller-di-Tate-Taylor-La-ragazza-del-treno-1632c009-c843-4836-bb65-80c33084a64b.html',
+ 'info_dict': {
+ 'id': '1632c009-c843-4836-bb65-80c33084a64b',
+ 'ext': 'mp4',
+ 'title': 'Weekend al cinema, da Hollywood arriva il thriller di Tate Taylor "La ragazza del treno"',
+ 'description': 'I film in uscita questa settimana.',
+ 'thumbnail': r're:^https?://.*\.png$',
+ 'duration': 833,
+ 'upload_date': '20161103',
+ }
+ }, {
+ # with ContentItem in og:url
+ 'url': 'http://www.rai.it/dl/RaiTV/programmi/media/ContentItem-efb17665-691c-45d5-a60c-5301333cbb0c.html',
+ 'md5': '11959b4e44fa74de47011b5799490adf',
+ 'info_dict': {
+ 'id': 'efb17665-691c-45d5-a60c-5301333cbb0c',
+ 'ext': 'mp4',
+ 'title': 'TG1 ore 20:00 del 03/11/2016',
+ 'description': 'TG1 edizione integrale ore 20:00 del giorno 03/11/2016',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 2214,
+ 'upload_date': '20161103',
+ }
+ }, {
+ # drawMediaRaiTV(...)
+ 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html',
+ 'md5': '2dd727e61114e1ee9c47f0da6914e178',
+ 'info_dict': {
+ 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af',
+ 'ext': 'mp4',
+ 'title': 'Il pacco',
+ 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'upload_date': '20141221',
+ },
+ }, {
+ # initEdizione('ContentItem-...'
+ 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined',
+ 'info_dict': {
+ 'id': 'c2187016-8484-4e3a-8ac8-35e475b07303',
+ 'ext': 'mp4',
+ 'title': r're:TG1 ore \d{2}:\d{2} del \d{2}/\d{2}/\d{4}',
+ 'duration': 2274,
+ 'upload_date': '20170401',
+ },
+ 'skip': 'Changes daily',
+ }, {
+ # HDS live stream with only relinker URL
+ 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews',
+ 'info_dict': {
+ 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc',
+ 'ext': 'flv',
+ 'title': 'EuroNews',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # HLS live stream with ContentItem in og:url
+ 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html',
+ 'info_dict': {
+ 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9',
+ 'ext': 'mp4',
+ 'title': 'La diretta di Rainews24',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }]
+
+ def _extract_from_content_id(self, content_id, url):
media = self._download_json(
'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-%s.html?json' % content_id,
content_id, 'Downloading video JSON')
- thumbnails = []
- for image_type in ('image', 'image_medium', 'image_300'):
- thumbnail_url = media.get(image_type)
- if thumbnail_url:
- thumbnails.append({
- 'url': compat_urlparse.urljoin(base_url, thumbnail_url),
- })
+ title = media['name'].strip()
- formats = []
media_type = media['type']
if 'Audio' in media_type:
- formats.append({
- 'format_id': media.get('formatoAudio'),
- 'url': media['audioUrl'],
- 'ext': media.get('formatoAudio'),
- })
+ relinker_info = {
+ 'formats': [{
+ 'format_id': media.get('formatoAudio'),
+ 'url': media['audioUrl'],
+ 'ext': media.get('formatoAudio'),
+ }]
+ }
elif 'Video' in media_type:
- formats.extend(self._extract_relinker_formats(media['mediaUri'], content_id))
- self._sort_formats(formats)
+ relinker_info = self._extract_relinker_info(media['mediaUri'], content_id)
else:
raise ExtractorError('not a media file')
- subtitles = {}
- captions = media.get('subtitlesUrl')
- if captions:
- STL_EXT = '.stl'
- SRT_EXT = '.srt'
- if captions.endswith(STL_EXT):
- captions = captions[:-len(STL_EXT)] + SRT_EXT
- subtitles['it'] = [{
- 'ext': 'srt',
- 'url': captions,
- }]
+ self._sort_formats(relinker_info['formats'])
+
+ thumbnails = []
+ for image_type in ('image', 'image_medium', 'image_300'):
+ thumbnail_url = media.get(image_type)
+ if thumbnail_url:
+ thumbnails.append({
+ 'url': compat_urlparse.urljoin(url, thumbnail_url),
+ })
- return {
+ subtitles = self._extract_subtitles(url, media.get('subtitlesUrl'))
+
+ info = {
'id': content_id,
- 'title': media['name'],
- 'description': media.get('desc'),
+ 'title': title,
+ 'description': strip_or_none(media.get('desc')),
'thumbnails': thumbnails,
'uploader': media.get('author'),
'upload_date': unified_strdate(media.get('date')),
'duration': parse_duration(media.get('length')),
- 'formats': formats,
'subtitles': subtitles,
}
+ info.update(relinker_info)
-class RaiTVIE(RaiBaseIE):
- _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/(?:[^/]+/)+(?:media|ondemand)/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
- _TESTS = [
- {
- 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-cb27157f-9dd0-4aee-b788-b1f67643a391.html',
- 'md5': '8970abf8caf8aef4696e7b1f2adfc696',
- 'info_dict': {
- 'id': 'cb27157f-9dd0-4aee-b788-b1f67643a391',
- 'ext': 'mp4',
- 'title': 'Report del 07/04/2014',
- 'description': 'md5:f27c544694cacb46a078db84ec35d2d9',
- 'upload_date': '20140407',
- 'duration': 6160,
- 'thumbnail': r're:^https?://.*\.jpg$',
- }
- },
- {
- # no m3u8 stream
- 'url': 'http://www.raisport.rai.it/dl/raiSport/media/rassegna-stampa-04a9f4bd-b563-40cf-82a6-aad3529cb4a9.html',
- # HDS download, MD5 is unstable
- 'info_dict': {
- 'id': '04a9f4bd-b563-40cf-82a6-aad3529cb4a9',
- 'ext': 'flv',
- 'title': 'TG PRIMO TEMPO',
- 'upload_date': '20140612',
- 'duration': 1758,
- 'thumbnail': r're:^https?://.*\.jpg$',
- },
- 'skip': 'Geo-restricted to Italy',
- },
- {
- 'url': 'http://www.rainews.it/dl/rainews/media/state-of-the-net-Antonella-La-Carpia-regole-virali-7aafdea9-0e5d-49d5-88a6-7e65da67ae13.html',
- 'md5': '35cf7c229f22eeef43e48b5cf923bef0',
- 'info_dict': {
- 'id': '7aafdea9-0e5d-49d5-88a6-7e65da67ae13',
- 'ext': 'mp4',
- 'title': 'State of the Net, Antonella La Carpia: regole virali',
- 'description': 'md5:b0ba04a324126903e3da7763272ae63c',
- 'upload_date': '20140613',
- },
- 'skip': 'Error 404',
- },
- {
- 'url': 'http://www.rai.tv/dl/RaiTV/programmi/media/ContentItem-b4a49761-e0cc-4b14-8736-2729f6f73132-tg2.html',
- 'info_dict': {
- 'id': 'b4a49761-e0cc-4b14-8736-2729f6f73132',
- 'ext': 'mp4',
- 'title': 'Alluvione in Sardegna e dissesto idrogeologico',
- 'description': 'Edizione delle ore 20:30 ',
- },
- 'skip': 'invalid urls',
- },
- {
- 'url': 'http://www.ilcandidato.rai.it/dl/ray/media/Il-Candidato---Primo-episodio-Le-Primarie-28e5525a-b495-45e8-a7c3-bc48ba45d2b6.html',
- 'md5': 'e57493e1cb8bc7c564663f363b171847',
- 'info_dict': {
- 'id': '28e5525a-b495-45e8-a7c3-bc48ba45d2b6',
- 'ext': 'mp4',
- 'title': 'Il Candidato - Primo episodio: "Le Primarie"',
- 'description': 'md5:364b604f7db50594678f483353164fb8',
- 'upload_date': '20140923',
- 'duration': 386,
- 'thumbnail': r're:^https?://.*\.jpg$',
- }
- },
- ]
+ return info
def _real_extract(self, url):
video_id = self._match_id(url)
- return self._extract_from_content_id(video_id, url)
+ webpage = self._download_webpage(url, video_id)
+ content_item_id = None
-class RaiIE(RaiBaseIE):
- _VALID_URL = r'https?://(?:.+?\.)?(?:rai\.it|rai\.tv|rainews\.it)/dl/.+?-(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})(?:-.+?)?\.html'
- _TESTS = [
- {
- 'url': 'http://www.report.rai.it/dl/Report/puntata/ContentItem-0c7a664b-d0f4-4b2c-8835-3f82e46f433e.html',
- 'md5': '2dd727e61114e1ee9c47f0da6914e178',
- 'info_dict': {
- 'id': '59d69d28-6bb6-409d-a4b5-ed44096560af',
- 'ext': 'mp4',
- 'title': 'Il pacco',
- 'description': 'md5:4b1afae1364115ce5d78ed83cd2e5b3a',
- 'upload_date': '20141221',
- },
- },
- {
- # Direct relinker URL
- 'url': 'http://www.rai.tv/dl/RaiTV/dirette/PublishingBlock-1912dbbf-3f96-44c3-b4cf-523681fbacbc.html?channel=EuroNews',
- # HDS live stream, MD5 is unstable
- 'info_dict': {
- 'id': '1912dbbf-3f96-44c3-b4cf-523681fbacbc',
- 'ext': 'flv',
- 'title': 'EuroNews',
- },
- 'skip': 'Geo-restricted to Italy',
- },
- {
- # Embedded content item ID
- 'url': 'http://www.tg1.rai.it/dl/tg1/2010/edizioni/ContentSet-9b6e0cba-4bef-4aef-8cf0-9f7f665b7dfb-tg1.html?item=undefined',
- 'md5': '84c1135ce960e8822ae63cec34441d63',
- 'info_dict': {
- 'id': '0960e765-62c8-474a-ac4b-7eb3e2be39c8',
- 'ext': 'mp4',
- 'title': 'TG1 ore 20:00 del 02/07/2016',
- 'upload_date': '20160702',
- },
- },
- {
- 'url': 'http://www.rainews.it/dl/rainews/live/ContentItem-3156f2f2-dc70-4953-8e2f-70d7489d4ce9.html',
- # HDS live stream, MD5 is unstable
- 'info_dict': {
- 'id': '3156f2f2-dc70-4953-8e2f-70d7489d4ce9',
- 'ext': 'flv',
- 'title': 'La diretta di Rainews24',
- },
- },
- ]
+ content_item_url = self._html_search_meta(
+ ('og:url', 'og:video', 'og:video:secure_url', 'twitter:url',
+ 'twitter:player', 'jsonlink'), webpage, default=None)
+ if content_item_url:
+ content_item_id = self._search_regex(
+ r'ContentItem-(%s)' % self._UUID_RE, content_item_url,
+ 'content item id', default=None)
- @classmethod
- def suitable(cls, url):
- return False if RaiTVIE.suitable(url) else super(RaiIE, cls).suitable(url)
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
+ if not content_item_id:
+ content_item_id = self._search_regex(
+ r'''(?x)
+ (?:
+ (?:initEdizione|drawMediaRaiTV)\(|
+ <(?:[^>]+\bdata-id|var\s+uniquename)=
+ )
+ (["\'])
+ (?:(?!\1).)*\bContentItem-(?P<id>%s)
+ ''' % self._UUID_RE,
+ webpage, 'content item id', default=None, group='id')
- iframe_url = self._search_regex(
- [r'<iframe[^>]+src="([^"]*/dl/[^"]+\?iframe\b[^"]*)"',
- r'drawMediaRaiTV\(["\'](.+?)["\']'],
- webpage, 'iframe', default=None)
- if iframe_url:
- if not iframe_url.startswith('http'):
- iframe_url = compat_urlparse.urljoin(url, iframe_url)
- return self.url_result(iframe_url)
-
- content_item_id = self._search_regex(
- r'initEdizione\((?P<q1>[\'"])ContentItem-(?P<content_id>[^\'"]+)(?P=q1)',
- webpage, 'content item ID', group='content_id', default=None)
+ content_item_ids = set()
if content_item_id:
- return self._extract_from_content_id(content_item_id, url)
+ content_item_ids.add(content_item_id)
+ if video_id not in content_item_ids:
+ content_item_ids.add(video_id)
+
+ for content_item_id in content_item_ids:
+ try:
+ return self._extract_from_content_id(content_item_id, url)
+ except GeoRestrictedError:
+ raise
+ except ExtractorError:
+ pass
+
+ relinker_url = self._search_regex(
+ r'''(?x)
+ (?:
+ var\s+videoURL|
+ mediaInfo\.mediaUri
+ )\s*=\s*
+ ([\'"])
+ (?P<url>
+ (?:https?:)?
+ //mediapolis(?:vod)?\.rai\.it/relinker/relinkerServlet\.htm\?
+ (?:(?!\1).)*\bcont=(?:(?!\1).)+)\1
+ ''',
+ webpage, 'relinker URL', group='url')
- relinker_url = compat_urlparse.urljoin(url, self._search_regex(
- r'(?:var\s+videoURL|mediaInfo\.mediaUri)\s*=\s*(?P<q1>[\'"])(?P<url>(https?:)?//mediapolis\.rai\.it/relinker/relinkerServlet\.htm\?cont=\d+)(?P=q1)',
- webpage, 'relinker URL', group='url'))
- formats = self._extract_relinker_formats(relinker_url, video_id)
- self._sort_formats(formats)
+ relinker_info = self._extract_relinker_info(
+ urljoin(url, relinker_url), video_id)
+ self._sort_formats(relinker_info['formats'])
title = self._search_regex(
r'var\s+videoTitolo\s*=\s*([\'"])(?P<title>[^\'"]+)\1',
- webpage, 'title', group='title', default=None) or self._og_search_title(webpage)
+ webpage, 'title', group='title',
+ default=None) or self._og_search_title(webpage)
- return {
+ info = {
'id': video_id,
'title': title,
- 'formats': formats,
}
+
+ info.update(relinker_info)
+
+ return info
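
One detail of the rai.py refactor worth isolating: _extract_relinker_info returns a dict containing only the keys it actually resolved (is_live, duration, formats), so callers can merge it with info.update() without overwriting known fields with None. A minimal sketch of that filtering idiom with placeholder values:

def relinker_info(formats, is_live=None, duration=None):
    return dict((k, v) for k, v in {
        'is_live': is_live,
        'duration': duration,
        'formats': formats,
    }.items() if v is not None)

info = {'id': 'some-id', 'title': 'Example'}  # placeholder metadata
info.update(relinker_info(
    [{'url': 'https://example.com/v.mp4'}], duration=833))
print(sorted(info))
# ['duration', 'formats', 'id', 'title'] -- is_live stays absent (unknown)
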
diff --git a/youtube_dl/extractor/rbmaradio.py b/youtube_dl/extractor/rbmaradio.py
index 53b82fb..afa7b91 100644
--- a/youtube_dl/extractor/rbmaradio.py
+++ b/youtube_dl/extractor/rbmaradio.py
@@ -13,15 +13,15 @@ from ..utils import (
class RBMARadioIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?rbmaradio\.com/shows/(?P<show_id>[^/]+)/episodes/(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?(?:rbmaradio|redbullradio)\.com/shows/(?P<show_id>[^/]+)/episodes/(?P<id>[^/?#&]+)'
_TEST = {
'url': 'https://www.rbmaradio.com/shows/main-stage/episodes/ford-lopatin-live-at-primavera-sound-2011',
'md5': '6bc6f9bcb18994b4c983bc3bf4384d95',
'info_dict': {
'id': 'ford-lopatin-live-at-primavera-sound-2011',
'ext': 'mp3',
- 'title': 'Main Stage - Ford & Lopatin',
- 'description': 'md5:4f340fb48426423530af5a9d87bd7b91',
+ 'title': 'Main Stage - Ford & Lopatin at Primavera Sound',
+ 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
'thumbnail': r're:^https?://.*\.jpg',
'duration': 2452,
'timestamp': 1307103164,
diff --git a/youtube_dl/extractor/rmcdecouverte.py b/youtube_dl/extractor/rmcdecouverte.py
index 2340dae..e921ca3 100644
--- a/youtube_dl/extractor/rmcdecouverte.py
+++ b/youtube_dl/extractor/rmcdecouverte.py
@@ -13,21 +13,20 @@ class RMCDecouverteIE(InfoExtractor):
_VALID_URL = r'https?://rmcdecouverte\.bfmtv\.com/mediaplayer-replay.*?\bid=(?P<id>\d+)'
_TEST = {
- 'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=1430&title=LES%20HEROS%20DU%2088e%20ETAGE',
+ 'url': 'http://rmcdecouverte.bfmtv.com/mediaplayer-replay/?id=13502&title=AQUAMEN:LES%20ROIS%20DES%20AQUARIUMS%20:UN%20DELICIEUX%20PROJET',
'info_dict': {
- 'id': '5111223049001',
+ 'id': '5419055995001',
'ext': 'mp4',
- 'title': ': LES HEROS DU 88e ETAGE',
- 'description': 'Découvrez comment la bravoure de deux hommes dans la Tour Nord du World Trade Center a sauvé la vie d\'innombrables personnes le 11 septembre 2001.',
+ 'title': 'UN DELICIEUX PROJET',
+ 'description': 'md5:63610df7c8b1fc1698acd4d0d90ba8b5',
'uploader_id': '1969646226001',
- 'upload_date': '20160904',
- 'timestamp': 1472951103,
+ 'upload_date': '20170502',
+ 'timestamp': 1493745308,
},
'params': {
- # rtmp download
'skip_download': True,
},
- 'skip': 'Only works from France',
+ 'skip': 'only available for a week',
}
BRIGHTCOVE_URL_TEMPLATE = 'http://players.brightcove.net/1969646226001/default_default/index.html?videoId=%s'
@@ -35,5 +34,12 @@ class RMCDecouverteIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
brightcove_legacy_url = BrightcoveLegacyIE._extract_brightcove_url(webpage)
- brightcove_id = compat_parse_qs(compat_urlparse.urlparse(brightcove_legacy_url).query)['@videoPlayer'][0]
- return self.url_result(self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew', brightcove_id)
+ if brightcove_legacy_url:
+ brightcove_id = compat_parse_qs(compat_urlparse.urlparse(
+ brightcove_legacy_url).query)['@videoPlayer'][0]
+ else:
+ brightcove_id = self._search_regex(
+ r'data-video-id=["\'](\d+)', webpage, 'brightcove id')
+ return self.url_result(
+ self.BRIGHTCOVE_URL_TEMPLATE % brightcove_id, 'BrightcoveNew',
+ brightcove_id)
diff --git a/youtube_dl/extractor/rtl2.py b/youtube_dl/extractor/rtl2.py
index 721ee73..666e90e 100644
--- a/youtube_dl/extractor/rtl2.py
+++ b/youtube_dl/extractor/rtl2.py
@@ -1,13 +1,26 @@
# coding: utf-8
from __future__ import unicode_literals
+import base64
import re
from .common import InfoExtractor
-from ..utils import int_or_none
+from ..aes import aes_cbc_decrypt
+from ..compat import (
+ compat_ord,
+ compat_str,
+)
+from ..utils import (
+ bytes_to_intlist,
+ ExtractorError,
+ intlist_to_bytes,
+ int_or_none,
+ strip_or_none,
+)
class RTL2IE(InfoExtractor):
+ IE_NAME = 'rtl2'
_VALID_URL = r'https?://(?:www\.)?rtl2\.de/[^?#]*?/(?P<id>[^?#/]*?)(?:$|/(?:$|[?#]))'
_TESTS = [{
'url': 'http://www.rtl2.de/sendung/grip-das-motormagazin/folge/folge-203-0',
@@ -98,3 +111,98 @@ class RTL2IE(InfoExtractor):
'duration': int_or_none(video_info.get('duration')),
'formats': formats,
}
+
+
+class RTL2YouBaseIE(InfoExtractor):
+ _BACKWERK_BASE_URL = 'https://p-you-backwerk.rtl2apps.de/'
+
+
+class RTL2YouIE(RTL2YouBaseIE):
+ IE_NAME = 'rtl2:you'
+ _VALID_URL = r'https?://you\.rtl2\.de/(?:video/\d+/|youplayer/index\.html\?.*?\bvid=)(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://you.rtl2.de/video/3002/15740/MJUNIK%20%E2%80%93%20Home%20of%20YOU/307-hirn-wo-bist-du',
+ 'info_dict': {
+ 'id': '15740',
+ 'ext': 'mp4',
+ 'title': 'MJUNIK – Home of YOU - #307 Hirn, wo bist du?!',
+ 'description': 'md5:ddaa95c61b372b12b66e115b2772fe01',
+ 'age_limit': 12,
+ },
+ }, {
+ 'url': 'http://you.rtl2.de/youplayer/index.html?vid=15712',
+ 'only_matching': True,
+ }]
+ _AES_KEY = b'\xe9W\xe4.<*\xb8\x1a\xd2\xb6\x92\xf3C\xd3\xefL\x1b\x03*\xbbbH\xc0\x03\xffo\xc2\xf2(\xaa\xaa!'
+ _GEO_COUNTRIES = ['DE']
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ stream_data = self._download_json(
+ self._BACKWERK_BASE_URL + 'stream/video/' + video_id, video_id)
+
+ data, iv = base64.b64decode(stream_data['streamUrl']).decode().split(':')
+ stream_url = intlist_to_bytes(aes_cbc_decrypt(
+ bytes_to_intlist(base64.b64decode(data)),
+ bytes_to_intlist(self._AES_KEY),
+ bytes_to_intlist(base64.b64decode(iv))
+ ))
+ if b'rtl2_you_video_not_found' in stream_url:
+ raise ExtractorError('video not found', expected=True)
+
+ formats = self._extract_m3u8_formats(
+ stream_url[:-compat_ord(stream_url[-1])].decode(),
+ video_id, 'mp4', 'm3u8_native')
+ self._sort_formats(formats)
+
+ video_data = self._download_json(
+ self._BACKWERK_BASE_URL + 'video/' + video_id, video_id)
+
+ series = video_data.get('formatTitle')
+ title = episode = video_data.get('title') or series
+ if series and series != title:
+ title = '%s - %s' % (series, title)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'description': strip_or_none(video_data.get('description')),
+ 'thumbnail': video_data.get('image'),
+ 'duration': int_or_none(stream_data.get('duration') or video_data.get('duration'), 1000),
+ 'series': series,
+ 'episode': episode,
+ 'age_limit': int_or_none(video_data.get('minimumAge')),
+ }
+
+
+class RTL2YouSeriesIE(RTL2YouBaseIE):
+ IE_NAME = 'rtl2:you:series'
+ _VALID_URL = r'https?://you\.rtl2\.de/videos/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://you.rtl2.de/videos/115/dragon-ball',
+ 'info_dict': {
+ 'id': '115',
+ },
+ 'playlist_mincount': 5,
+ }
+
+ def _real_extract(self, url):
+ series_id = self._match_id(url)
+ stream_data = self._download_json(
+ self._BACKWERK_BASE_URL + 'videos',
+ series_id, query={
+ 'formatId': series_id,
+ 'limit': 1000000000,
+ })
+
+ entries = []
+ for video in stream_data.get('videos', []):
+ video_id = compat_str(video['videoId'])
+ if not video_id:
+ continue
+ entries.append(self.url_result(
+ 'http://you.rtl2.de/video/%s/%s' % (series_id, video_id),
+ 'RTL2You', video_id))
+ return self.playlist_result(entries, series_id)
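
A sketch of the stream-URL decryption the new RTL2YouIE performs, written against pycryptodome for brevity (the diff itself uses youtube_dl's bundled aes_cbc_decrypt): streamUrl arrives as base64(ciphertext) + ':' + base64(iv), the plaintext is PKCS#7-padded, and slicing off stream_url[-1] bytes strips that padding. The key below is a zero-filled placeholder, not the one from the diff, and the demo round-trips a made-up URL:

import base64
from Crypto.Cipher import AES  # pip install pycryptodome

AES_KEY = b'\x00' * 32  # placeholder for the 32-byte key in the diff

def decrypt_stream_url(stream_url_field):
    data_b64, iv_b64 = stream_url_field.split(':')
    cipher = AES.new(AES_KEY, AES.MODE_CBC, base64.b64decode(iv_b64))
    padded = cipher.decrypt(base64.b64decode(data_b64))
    return padded[:-padded[-1]].decode()  # strip PKCS#7 padding (Python 3)

# round trip with fabricated data
url = b'https://example.com/stream.m3u8'
pad = 16 - len(url) % 16
iv = b'\x01' * 16
enc = AES.new(AES_KEY, AES.MODE_CBC, iv).encrypt(url + bytes([pad]) * pad)
print(decrypt_stream_url(
    base64.b64encode(enc).decode() + ':' + base64.b64encode(iv).decode()))
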
diff --git a/youtube_dl/extractor/rudo.py b/youtube_dl/extractor/rudo.py
index 5164401..f036f67 100644
--- a/youtube_dl/extractor/rudo.py
+++ b/youtube_dl/extractor/rudo.py
@@ -26,7 +26,7 @@ class RudoIE(InfoExtractor):
}
@classmethod
- def _extract_url(self, webpage):
+ def _extract_url(cls, webpage):
mobj = re.search(
r'<iframe[^>]+src=(?P<q1>[\'"])(?P<url>(?:https?:)?//rudo\.video/vod/[0-9a-zA-Z]+)(?P=q1)',
webpage)
diff --git a/youtube_dl/extractor/streamable.py b/youtube_dl/extractor/streamable.py
index 9f5c237..3472527 100644
--- a/youtube_dl/extractor/streamable.py
+++ b/youtube_dl/extractor/streamable.py
@@ -12,7 +12,7 @@ from ..utils import (
class StreamableIE(InfoExtractor):
- _VALID_URL = r'https?://streamable\.com/(?:e/)?(?P<id>\w+)'
+ _VALID_URL = r'https?://streamable\.com/(?:[es]/)?(?P<id>\w+)'
_TESTS = [
{
'url': 'https://streamable.com/dnd1',
@@ -47,6 +47,10 @@ class StreamableIE(InfoExtractor):
{
'url': 'https://streamable.com/e/dnd1',
'only_matching': True,
+ },
+ {
+ 'url': 'https://streamable.com/s/okkqk/drxjds',
+ 'only_matching': True,
}
]
diff --git a/youtube_dl/extractor/streamango.py b/youtube_dl/extractor/streamango.py
new file mode 100644
index 0000000..aa4fad1
--- /dev/null
+++ b/youtube_dl/extractor/streamango.py
@@ -0,0 +1,64 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ js_to_json,
+)
+
+
+class StreamangoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?streamango\.com/(?:f|embed)/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://streamango.com/f/clapasobsptpkdfe/20170315_150006_mp4',
+ 'md5': 'e992787515a182f55e38fc97588d802a',
+ 'info_dict': {
+ 'id': 'clapasobsptpkdfe',
+ 'ext': 'mp4',
+ 'title': '20170315_150006.mp4',
+ }
+ }, {
+ 'url': 'https://streamango.com/embed/clapasobsptpkdfe/20170315_150006_mp4',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._og_search_title(webpage)
+
+ formats = []
+ for format_ in re.findall(r'({[^}]*\bsrc\s*:\s*[^}]*})', webpage):
+ video = self._parse_json(
+ format_, video_id, transform_source=js_to_json, fatal=False)
+ if not video:
+ continue
+ src = video.get('src')
+ if not src:
+ continue
+ ext = determine_ext(src, default_ext=None)
+ if video.get('type') == 'application/dash+xml' or ext == 'mpd':
+ formats.extend(self._extract_mpd_formats(
+ src, video_id, mpd_id='dash', fatal=False))
+ else:
+ formats.append({
+ 'url': src,
+ 'ext': ext or 'mp4',
+ 'width': int_or_none(video.get('width')),
+ 'height': int_or_none(video.get('height')),
+ 'tbr': int_or_none(video.get('bitrate')),
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'url': url,
+ 'title': title,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index 1b1afab..3f3c681 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -210,7 +210,7 @@ class TEDIE(InfoExtractor):
resources.get('stream'), video_name, 'mp4', m3u8_id=format_id, fatal=False))
m3u8_formats = list(filter(
- lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
+ lambda f: f.get('protocol') == 'm3u8' and f.get('vcodec') != 'none',
formats))
if http_url:
for m3u8_format in m3u8_formats:
diff --git a/youtube_dl/extractor/theplatform.py b/youtube_dl/extractor/theplatform.py
index 9a424b1..de236bb 100644
--- a/youtube_dl/extractor/theplatform.py
+++ b/youtube_dl/extractor/theplatform.py
@@ -80,14 +80,33 @@ class ThePlatformBaseIE(OnceIE):
'url': src,
})
+ duration = info.get('duration')
+ tp_chapters = info.get('chapters', [])
+ chapters = []
+ if tp_chapters:
+ def _add_chapter(start_time, end_time):
+ start_time = float_or_none(start_time, 1000)
+ end_time = float_or_none(end_time, 1000)
+ if start_time is None or end_time is None:
+ return
+ chapters.append({
+ 'start_time': start_time,
+ 'end_time': end_time,
+ })
+
+ for chapter in tp_chapters[:-1]:
+ _add_chapter(chapter.get('startTime'), chapter.get('endTime'))
+ _add_chapter(tp_chapters[-1].get('startTime'), tp_chapters[-1].get('endTime') or duration)
+
return {
'title': info['title'],
'subtitles': subtitles,
'description': info['description'],
'thumbnail': info['defaultThumbnailUrl'],
- 'duration': int_or_none(info.get('duration'), 1000),
+ 'duration': float_or_none(duration, 1000),
'timestamp': int_or_none(info.get('pubDate'), 1000) or None,
'uploader': info.get('billingCode'),
+ 'chapters': chapters,
}
def _extract_theplatform_metadata(self, path, video_id):
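
The chapter handling added to theplatform.py above has one asymmetry worth spelling out: every chapter except the last uses its own endTime, and the last may omit endTime, in which case the media duration (also in milliseconds) serves as the fallback. An isolated sketch:

def build_chapters(tp_chapters, duration_ms):
    chapters = []

    def add_chapter(start_ms, end_ms):
        if start_ms is None or end_ms is None:
            return
        chapters.append({
            'start_time': start_ms / 1000.0,
            'end_time': end_ms / 1000.0,
        })

    if tp_chapters:
        for chapter in tp_chapters[:-1]:
            add_chapter(chapter.get('startTime'), chapter.get('endTime'))
        last = tp_chapters[-1]
        # fall back to the full duration when the last endTime is missing
        add_chapter(last.get('startTime'), last.get('endTime') or duration_ms)
    return chapters

print(build_chapters(
    [{'startTime': 0, 'endTime': 60000}, {'startTime': 60000}], 150000))
# [{'start_time': 0.0, 'end_time': 60.0},
#  {'start_time': 60.0, 'end_time': 150.0}]
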
diff --git a/youtube_dl/extractor/thescene.py b/youtube_dl/extractor/thescene.py
index b8504f0..cd64235 100644
--- a/youtube_dl/extractor/thescene.py
+++ b/youtube_dl/extractor/thescene.py
@@ -3,10 +3,6 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..compat import compat_urlparse
-from ..utils import (
- int_or_none,
- qualities,
-)
class TheSceneIE(InfoExtractor):
@@ -24,6 +20,9 @@ class TheSceneIE(InfoExtractor):
'season': 'Ready To Wear Spring 2013',
'tags': list,
'categories': list,
+ 'upload_date': '20120913',
+ 'timestamp': 1347512400,
+ 'uploader': 'vogue',
},
}
@@ -37,32 +36,9 @@ class TheSceneIE(InfoExtractor):
self._html_search_regex(
r'id=\'js-player-script\'[^>]+src=\'(.+?)\'', webpage, 'player url'))
- player = self._download_webpage(player_url, display_id)
- info = self._parse_json(
- self._search_regex(
- r'(?m)video\s*:\s*({.+?}),$', player, 'info json'),
- display_id)
-
- video_id = info['id']
- title = info['title']
-
- qualities_order = qualities(('low', 'high'))
- formats = [{
- 'format_id': '{0}-{1}'.format(f['type'].split('/')[0], f['quality']),
- 'url': f['src'],
- 'quality': qualities_order(f['quality']),
- } for f in info['sources']]
- self._sort_formats(formats)
-
return {
- 'id': video_id,
+ '_type': 'url_transparent',
'display_id': display_id,
- 'title': title,
- 'formats': formats,
- 'thumbnail': info.get('poster_frame'),
- 'duration': int_or_none(info.get('duration')),
- 'series': info.get('series_title'),
- 'season': info.get('season_title'),
- 'tags': info.get('tags'),
- 'categories': info.get('categories'),
+ 'url': player_url,
+ 'ie_key': 'CondeNast',
}
diff --git a/youtube_dl/extractor/thesun.py b/youtube_dl/extractor/thesun.py
new file mode 100644
index 0000000..22d0037
--- /dev/null
+++ b/youtube_dl/extractor/thesun.py
@@ -0,0 +1,32 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .ooyala import OoyalaIE
+
+
+class TheSunIE(InfoExtractor):
+ _VALID_URL = r'https://(?:www\.)?thesun\.co\.uk/[^/]+/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://www.thesun.co.uk/tvandshowbiz/2261604/orlando-bloom-and-katy-perry-post-adorable-instagram-video-together-celebrating-thanksgiving-after-split-rumours/',
+ 'info_dict': {
+ 'id': '2261604',
+ 'title': 'md5:cba22f48bad9218b64d5bbe0e16afddf',
+ },
+ 'playlist_count': 2,
+ }
+
+ def _real_extract(self, url):
+ article_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, article_id)
+
+ entries = []
+ for ooyala_id in re.findall(
+ r'<[^>]+\b(?:id\s*=\s*"thesun-ooyala-player-|data-content-id\s*=\s*")([^"]+)',
+ webpage):
+ entries.append(OoyalaIE._build_url_result(ooyala_id))
+
+ return self.playlist_result(
+ entries, article_id, self._og_search_title(webpage, fatal=False))
diff --git a/youtube_dl/extractor/turner.py b/youtube_dl/extractor/turner.py
index 1c0be9f..efeb677 100644
--- a/youtube_dl/extractor/turner.py
+++ b/youtube_dl/extractor/turner.py
@@ -13,6 +13,7 @@ from ..utils import (
xpath_attr,
update_url_query,
ExtractorError,
+ strip_or_none,
)
@@ -163,17 +164,21 @@ class TurnerBaseIE(AdobePassIE):
'height': int_or_none(image.get('height')),
} for image in video_data.findall('images/image')]
+ is_live = xpath_text(video_data, 'isLive') == 'true'
+
return {
'id': video_id,
- 'title': title,
+ 'title': self._live_title(title) if is_live else title,
'formats': formats,
'subtitles': subtitles,
'thumbnails': thumbnails,
- 'description': xpath_text(video_data, 'description'),
+ 'thumbnail': xpath_text(video_data, 'poster'),
+ 'description': strip_or_none(xpath_text(video_data, 'description')),
'duration': parse_duration(xpath_text(video_data, 'length') or xpath_text(video_data, 'trt')),
'timestamp': self._extract_timestamp(video_data),
'upload_date': xpath_attr(video_data, 'metas', 'version'),
'series': xpath_text(video_data, 'showTitle'),
'season_number': int_or_none(xpath_text(video_data, 'seasonNumber')),
'episode_number': int_or_none(xpath_text(video_data, 'episodeNumber')),
+ 'is_live': is_live,
}
diff --git a/youtube_dl/extractor/tv2hu.py b/youtube_dl/extractor/tv2hu.py
new file mode 100644
index 0000000..86017b7
--- /dev/null
+++ b/youtube_dl/extractor/tv2hu.py
@@ -0,0 +1,62 @@
+# encoding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class TV2HuIE(InfoExtractor):
+ IE_NAME = 'tv2.hu'
+ _VALID_URL = r'https?://(?:www\.)?tv2\.hu/(?:[^/]+/)+(?P<id>\d+)_[^/?#]+?\.html'
+ _TESTS = [{
+ 'url': 'http://tv2.hu/ezek_megorultek/217679_ezek-megorultek---1.-adas-1.-resz.html',
+ 'md5': '585e58e2e090f34603804bb2c48e98d8',
+ 'info_dict': {
+ 'id': '217679',
+ 'ext': 'mp4',
+ 'title': 'Ezek megőrültek! - 1. adás 1. rész',
+ 'upload_date': '20160826',
+ 'thumbnail': r're:^https?://.*\.jpg$'
+ }
+ }, {
+ 'url': 'http://tv2.hu/ezek_megorultek/teljes_adasok/217677_ezek-megorultek---1.-adas-2.-resz.html',
+ 'only_matching': True
+ }, {
+ 'url': 'http://tv2.hu/musoraink/aktiv/aktiv_teljes_adas/217963_aktiv-teljes-adas---2016.08.30..html',
+ 'only_matching': True
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ json_url = self._search_regex(
+ r'jsonUrl\s*=\s*"([^"]+)"', webpage, 'json url')
+ json_data = self._download_json(json_url, video_id)
+
+ formats = []
+ for b in ('bitrates', 'backupBitrates'):
+ bitrates = json_data.get(b, {})
+ m3u8_url = bitrates.get('hls')
+ if m3u8_url:
+ formats.extend(self._extract_wowza_formats(
+ m3u8_url, video_id, skip_protocols=['rtmp', 'rtsp']))
+
+ for mp4_url in bitrates.get('mp4', []):
+ height = int_or_none(self._search_regex(
+ r'\.(\d+)p\.mp4', mp4_url, 'height', default=None))
+ formats.append({
+ 'format_id': 'http' + ('-%d' % height if height else ''),
+ 'url': mp4_url,
+ 'height': height,
+ 'width': int_or_none(height / 9.0 * 16.0 if height else None),
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': video_id,
+ 'title': self._og_search_title(webpage).strip(),
+ 'thumbnail': self._og_search_thumbnail(webpage),
+ 'upload_date': self._search_regex(
+ r'/vod/(\d{8})/', json_url, 'upload_date', default=None),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/tv5mondeplus.py b/youtube_dl/extractor/tv5mondeplus.py
new file mode 100644
index 0000000..88b6baa
--- /dev/null
+++ b/youtube_dl/extractor/tv5mondeplus.py
@@ -0,0 +1,79 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ clean_html,
+ determine_ext,
+ extract_attributes,
+ get_element_by_class,
+ int_or_none,
+ parse_duration,
+ parse_iso8601,
+)
+
+
+class TV5MondePlusIE(InfoExtractor):
+ IE_DESC = 'TV5MONDE+'
+ _VALID_URL = r'https?://(?:www\.)?tv5mondeplus\.com/toutes-les-videos/[^/]+/(?P<id>[^/?#]+)'
+ _TEST = {
+ 'url': 'http://www.tv5mondeplus.com/toutes-les-videos/documentaire/tdah-mon-amour-tele-quebec-tdah-mon-amour-ep001-enfants',
+ 'md5': '12130fc199f020673138a83466542ec6',
+ 'info_dict': {
+ 'id': 'tdah-mon-amour-tele-quebec-tdah-mon-amour-ep001-enfants',
+ 'ext': 'mp4',
+ 'title': 'Tdah, mon amour - Enfants',
+ 'description': 'md5:230e3aca23115afcf8006d1bece6df74',
+ 'upload_date': '20170401',
+ 'timestamp': 1491022860,
+ }
+ }
+ _GEO_BYPASS = False
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+ webpage = self._download_webpage(url, display_id)
+
+ if ">Ce programme n'est malheureusement pas disponible pour votre zone géographique.<" in webpage:
+ self.raise_geo_restricted(countries=['FR'])
+
+ series = get_element_by_class('video-detail__title', webpage)
+ title = episode = get_element_by_class(
+ 'video-detail__subtitle', webpage) or series
+ if series and series != title:
+ title = '%s - %s' % (series, title)
+ vpl_data = extract_attributes(self._search_regex(
+ r'(<[^>]+class="video_player_loader"[^>]+>)',
+ webpage, 'video player loader'))
+
+ video_files = self._parse_json(
+ vpl_data['data-broadcast'], display_id).get('files', [])
+ formats = []
+ for video_file in video_files:
+ v_url = video_file.get('url')
+ if not v_url:
+ continue
+ video_format = video_file.get('format') or determine_ext(v_url)
+ if video_format == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ v_url, display_id, 'mp4', 'm3u8_native',
+ m3u8_id='hls', fatal=False))
+ else:
+ formats.append({
+ 'url': v_url,
+ 'format_id': video_format,
+ })
+ self._sort_formats(formats)
+
+ return {
+ 'id': display_id,
+ 'display_id': display_id,
+ 'title': title,
+ 'description': clean_html(get_element_by_class('video-detail__description', webpage)),
+ 'thumbnail': vpl_data.get('data-image'),
+ 'duration': int_or_none(vpl_data.get('data-duration')) or parse_duration(self._html_search_meta('duration', webpage)),
+ 'timestamp': parse_iso8601(self._html_search_meta('uploadDate', webpage)),
+ 'formats': formats,
+ 'episode': episode,
+ 'series': series,
+ }
diff --git a/youtube_dl/extractor/tvp.py b/youtube_dl/extractor/tvp.py
index 06ea2b4..c5b3288 100644
--- a/youtube_dl/extractor/tvp.py
+++ b/youtube_dl/extractor/tvp.py
@@ -150,8 +150,7 @@ class TVPEmbedIE(InfoExtractor):
'mp4', 'm3u8_native', m3u8_id='hls', fatal=False)
self._sort_formats(m3u8_formats)
m3u8_formats = list(filter(
- lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
- m3u8_formats))
+ lambda f: f.get('vcodec') != 'none', m3u8_formats))
formats.extend(m3u8_formats)
for i, m3u8_format in enumerate(m3u8_formats, 2):
http_url = '%s-%d.mp4' % (video_url_base, i)
diff --git a/youtube_dl/extractor/tvplay.py b/youtube_dl/extractor/tvplay.py
index 3eda0a3..99ff82a 100644
--- a/youtube_dl/extractor/tvplay.py
+++ b/youtube_dl/extractor/tvplay.py
@@ -225,7 +225,11 @@ class TVPlayIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
-
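+ # derive the geo-bypass country from the portal's two-letter TLD (e.g. a .lv domain -> LV)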
+ geo_country = self._search_regex(
+ r'https?://[^/]+\.([a-z]{2})', url,
+ 'geo country', default=None)
+ if geo_country:
+ self._initialize_geo_bypass([geo_country.upper()])
video = self._download_json(
'http://playapi.mtgx.tv/v3/videos/%s' % video_id, video_id, 'Downloading video JSON')
diff --git a/youtube_dl/extractor/tvplayer.py b/youtube_dl/extractor/tvplayer.py
index b653714..ebde605 100644
--- a/youtube_dl/extractor/tvplayer.py
+++ b/youtube_dl/extractor/tvplayer.py
@@ -2,9 +2,13 @@
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..compat import (
+ compat_HTTPError,
+ compat_str,
+)
from ..utils import (
extract_attributes,
+ try_get,
urlencode_postdata,
ExtractorError,
)
@@ -34,25 +38,32 @@ class TVPlayerIE(InfoExtractor):
webpage, 'channel element'))
title = current_channel['data-name']
- resource_id = self._search_regex(
- r'resourceId\s*=\s*"(\d+)"', webpage, 'resource id')
- platform = self._search_regex(
- r'platform\s*=\s*"([^"]+)"', webpage, 'platform')
+ resource_id = current_channel['data-id']
+
token = self._search_regex(
- r'token\s*=\s*"([^"]+)"', webpage, 'token', default='null')
- validate = self._search_regex(
- r'validate\s*=\s*"([^"]+)"', webpage, 'validate', default='null')
+ r'data-token=(["\'])(?P<token>(?!\1).+)\1', webpage,
+ 'token', group='token')
+
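+ # the stream API now expects a server-issued 'validate' value: exchange the page's data-token nonce for it via the /watch/context endpoint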
+ context = self._download_json(
+ 'https://tvplayer.com/watch/context', display_id,
+ 'Downloading JSON context', query={
+ 'resource': resource_id,
+ 'nonce': token,
+ })
+
+ validate = context['validate']
+ platform = try_get(
+ context, lambda x: x['platform']['key'], compat_str) or 'firefox'
try:
response = self._download_json(
'http://api.tvplayer.com/api/v2/stream/live',
- resource_id, headers={
+ display_id, 'Downloading JSON stream', headers={
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
}, data=urlencode_postdata({
+ 'id': resource_id,
'service': 1,
'platform': platform,
- 'id': resource_id,
- 'token': token,
'validate': validate,
}))['tvplayer']['response']
except ExtractorError as e:
@@ -63,7 +74,7 @@ class TVPlayerIE(InfoExtractor):
'%s said: %s' % (self.IE_NAME, response['error']), expected=True)
raise
- formats = self._extract_m3u8_formats(response['stream'], resource_id, 'mp4')
+ formats = self._extract_m3u8_formats(response['stream'], display_id, 'mp4')
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py
index cce29c6..dae1aa3 100644
--- a/youtube_dl/extractor/udemy.py
+++ b/youtube_dl/extractor/udemy.py
@@ -212,12 +212,15 @@ class UdemyIE(InfoExtractor):
thumbnail = asset.get('thumbnail_url') or asset.get('thumbnailUrl')
duration = float_or_none(asset.get('data', {}).get('duration'))
+ subtitles = {}
+ automatic_captions = {}
+
formats = []
- def extract_output_format(src):
+ def extract_output_format(src, f_id):
return {
'url': src['url'],
- 'format_id': '%sp' % (src.get('height') or format_id),
+ 'format_id': '%sp' % (src.get('height') or f_id),
'width': int_or_none(src.get('width')),
'height': int_or_none(src.get('height')),
'vbr': int_or_none(src.get('video_bitrate_in_kbps')),
@@ -237,30 +240,33 @@ class UdemyIE(InfoExtractor):
def add_output_format_meta(f, key):
output = outputs.get(key)
if isinstance(output, dict):
- output_format = extract_output_format(output)
+ output_format = extract_output_format(output, key)
output_format.update(f)
return output_format
return f
+ def extract_formats(source_list):
+ if not isinstance(source_list, list):
+ return
+ for source in source_list:
+ video_url = source.get('file') or source.get('src')
+ if not video_url or not isinstance(video_url, compat_str):
+ continue
+ format_id = source.get('label')
+ f = {
+ 'url': video_url,
+ 'format_id': '%sp' % format_id,
+ 'height': int_or_none(format_id),
+ }
+ if format_id:
+ # Some videos contain additional metadata (e.g.
+ # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208)
+ f = add_output_format_meta(f, format_id)
+ formats.append(f)
+
download_urls = asset.get('download_urls')
if isinstance(download_urls, dict):
- video = download_urls.get('Video')
- if isinstance(video, list):
- for format_ in video:
- video_url = format_.get('file')
- if not video_url:
- continue
- format_id = format_.get('label')
- f = {
- 'url': format_['file'],
- 'format_id': '%sp' % format_id,
- 'height': int_or_none(format_id),
- }
- if format_id:
- # Some videos contain additional metadata (e.g.
- # https://www.udemy.com/ios9-swift/learn/#/lecture/3383208)
- f = add_output_format_meta(f, format_id)
- formats.append(f)
+ extract_formats(download_urls.get('Video'))
view_html = lecture.get('view_html')
if view_html:
@@ -294,6 +300,35 @@ class UdemyIE(InfoExtractor):
'height': height,
}, res))
+ # React-based rendition in use since 2017.04.15 (see
+ # https://github.com/rg3/youtube-dl/issues/12744)
+ data = self._parse_json(
+ self._search_regex(
+ r'videojs-setup-data=(["\'])(?P<data>{.+?})\1', view_html,
+ 'setup data', default='{}', group='data'), video_id,
+ transform_source=unescapeHTML, fatal=False)
+ if data and isinstance(data, dict):
+ extract_formats(data.get('sources'))
+ if not duration:
+ duration = int_or_none(data.get('duration'))
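+ # caption tracks ship alongside the sources; autogenerated ones are filed under automatic_captions, the rest under subtitles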
+ tracks = data.get('tracks')
+ if isinstance(tracks, list):
+ for track in tracks:
+ if not isinstance(track, dict):
+ continue
+ if track.get('kind') != 'captions':
+ continue
+ src = track.get('src')
+ if not src or not isinstance(src, compat_str):
+ continue
+ lang = track.get('language') or track.get(
+ 'srclang') or track.get('label')
+ sub_dict = automatic_captions if track.get(
+ 'autogenerated') is True else subtitles
+ sub_dict.setdefault(lang, []).append({
+ 'url': src,
+ })
+
self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id'))
return {
@@ -302,7 +337,9 @@ class UdemyIE(InfoExtractor):
'description': description,
'thumbnail': thumbnail,
'duration': duration,
- 'formats': formats
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'automatic_captions': automatic_captions,
}
diff --git a/youtube_dl/extractor/upskill.py b/youtube_dl/extractor/upskill.py
new file mode 100644
index 0000000..30297b4
--- /dev/null
+++ b/youtube_dl/extractor/upskill.py
@@ -0,0 +1,176 @@
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from .wistia import WistiaIE
+from ..compat import compat_str
+from ..utils import (
+ clean_html,
+ ExtractorError,
+ get_element_by_class,
+ urlencode_postdata,
+ urljoin,
+)
+
+
+class UpskillBaseIE(InfoExtractor):
+ _LOGIN_URL = 'http://upskillcourses.com/sign_in'
+ _NETRC_MACHINE = 'upskill'
+
+ def _real_initialize(self):
+ self._login()
+
+ def _login(self):
+ username, password = self._get_login_info()
+ if username is None:
+ return
+
+ login_page, urlh = self._download_webpage_handle(
+ self._LOGIN_URL, None, 'Downloading login page')
+
+ login_url = compat_str(urlh.geturl())
+
+ login_form = self._hidden_inputs(login_page)
+
+ login_form.update({
+ 'user[email]': username,
+ 'user[password]': password,
+ })
+
+ post_url = self._search_regex(
+ r'<form[^>]+action=(["\'])(?P<url>(?:(?!\1).)+)\1', login_page,
+ 'post url', default=login_url, group='url')
+
+ if not post_url.startswith('http'):
+ post_url = urljoin(login_url, post_url)
+
+ response = self._download_webpage(
+ post_url, None, 'Logging in',
+ data=urlencode_postdata(login_form),
+ headers={
+ 'Content-Type': 'application/x-www-form-urlencoded',
+ 'Referer': login_url,
+ })
+
+ # Successful login
+ if any(re.search(p, response) for p in (
+ r'class=["\']user-signout',
+ r'<a[^>]+\bhref=["\']/sign_out',
+ r'>\s*Log out\s*<')):
+ return
+
+ message = get_element_by_class('alert', response)
+ if message is not None:
+ raise ExtractorError(
+ 'Unable to login: %s' % clean_html(message), expected=True)
+
+ raise ExtractorError('Unable to log in')
+
+
+class UpskillIE(UpskillBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/[^/]+/lectures/(?P<id>\d+)'
+
+ _TESTS = [{
+ 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/lectures/1747100',
+ 'info_dict': {
+ 'id': 'uzw6zw58or',
+ 'ext': 'mp4',
+ 'title': 'Welcome to the Course!',
+ 'description': 'md5:8d66c13403783370af62ca97a7357bdd',
+ 'duration': 138.763,
+ 'timestamp': 1479846621,
+ 'upload_date': '20161122',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://upskillcourses.com/courses/119763/lectures/1747100',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ wistia_url = WistiaIE._extract_url(webpage)
+ if not wistia_url:
+ if any(re.search(p, webpage) for p in (
+ r'class=["\']lecture-contents-locked',
+ r'>\s*Lecture contents locked',
+ r'id=["\']lecture-locked')):
+ self.raise_login_required('Lecture contents locked')
+ # neither a player nor a lock notice was found: fail explicitly instead of returning a result without a URL
+ raise ExtractorError('Unable to find video URL')
+
+ title = self._og_search_title(webpage, default=None)
+
+ return {
+ '_type': 'url_transparent',
+ 'url': wistia_url,
+ 'ie_key': WistiaIE.ie_key(),
+ 'title': title,
+ }
+
+
+class UpskillCourseIE(UpskillBaseIE):
+ _VALID_URL = r'https?://(?:www\.)?upskillcourses\.com/courses/(?:enrolled/)?(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'http://upskillcourses.com/courses/essential-web-developer-course/',
+ 'info_dict': {
+ 'id': '119763',
+ 'title': 'The Essential Web Developer Course (Free)',
+ },
+ 'playlist_count': 192,
+ }, {
+ 'url': 'http://upskillcourses.com/courses/119763/',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://upskillcourses.com/courses/enrolled/119763',
+ 'only_matching': True,
+ }]
+
+ @classmethod
+ def suitable(cls, url):
+ return False if UpskillIE.suitable(url) else super(
+ UpskillCourseIE, cls).suitable(url)
+
+ def _real_extract(self, url):
+ course_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, course_id)
+
+ course_id = self._search_regex(
+ r'data-course-id=["\'](\d+)', webpage, 'course id',
+ default=course_id)
+
+ entries = []
+
+ for mobj in re.finditer(
+ r'(?s)(?P<li><li[^>]+class=(["\'])(?:(?!\2).)*?section-item[^>]+>.+?</li>)',
+ webpage):
+ li = mobj.group('li')
+ if 'fa-youtube-play' not in li:
+ continue
+ lecture_url = self._search_regex(
+ r'<a[^>]+href=(["\'])(?P<url>(?:(?!\1).)+)\1', li,
+ 'lecture url', default=None, group='url')
+ if not lecture_url:
+ continue
+ lecture_id = self._search_regex(
+ r'/lectures/(\d+)', lecture_url, 'lecture id', default=None)
+ title = self._html_search_regex(
+ r'<span[^>]+class=["\']lecture-name[^>]+>([^<]+)', li,
+ 'title', default=None)
+ entries.append(
+ self.url_result(
+ urljoin('http://upskillcourses.com/', lecture_url),
+ ie=UpskillIE.ie_key(), video_id=lecture_id,
+ video_title=clean_html(title)))
+
+ course_title = self._html_search_regex(
+ (r'(?s)<img[^>]+class=["\']course-image[^>]+>\s*<h\d>(.+?)</h',
+ r'(?s)<h\d[^>]+class=["\']course-title[^>]+>(.+?)</h'),
+ webpage, 'course title', fatal=False)
+
+ return self.playlist_result(entries, course_id, course_title)
diff --git a/youtube_dl/extractor/vevo.py b/youtube_dl/extractor/vevo.py
index 9aa38bc..890a149 100644
--- a/youtube_dl/extractor/vevo.py
+++ b/youtube_dl/extractor/vevo.py
@@ -1,6 +1,7 @@
from __future__ import unicode_literals
import re
+import json
from .common import InfoExtractor
from ..compat import (
@@ -11,7 +12,6 @@ from ..compat import (
from ..utils import (
ExtractorError,
int_or_none,
- sanitized_Request,
parse_iso8601,
)
@@ -154,19 +154,24 @@ class VevoIE(VevoBaseIE):
}
def _initialize_api(self, video_id):
- req = sanitized_Request(
- 'http://www.vevo.com/auth', data=b'')
webpage = self._download_webpage(
- req, None,
+ 'https://accounts.vevo.com/token', None,
note='Retrieving oauth token',
- errnote='Unable to retrieve oauth token')
+ errnote='Unable to retrieve oauth token',
+ data=json.dumps({
+ 'client_id': 'SPupX1tvqFEopQ1YS6SS',
+ 'grant_type': 'urn:vevo:params:oauth:grant-type:anonymous',
+ }).encode('utf-8'),
+ headers={
+ 'Content-Type': 'application/json',
+ })
if re.search(r'(?i)THIS PAGE IS CURRENTLY UNAVAILABLE IN YOUR REGION', webpage):
self.raise_geo_restricted(
'%s said: This page is currently unavailable in your region' % self.IE_NAME)
auth_info = self._parse_json(webpage, video_id)
- self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['access_token']
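+ # apiv2 still authenticates with the legacy token, not the OAuth access_token returned alongside it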
+ self._api_url_template = self.http_scheme() + '//apiv2.vevo.com/%s?token=' + auth_info['legacy_token']
def _call_api(self, path, *args, **kwargs):
try:
diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py
index f0a7fd7..54e207b 100644
--- a/youtube_dl/extractor/vice.py
+++ b/youtube_dl/extractor/vice.py
@@ -20,7 +20,7 @@ from ..utils import (
class ViceBaseIE(AdobePassIE):
- def _extract_preplay_video(self, url, webpage):
+ def _extract_preplay_video(self, url, locale, webpage):
watch_hub_data = extract_attributes(self._search_regex(
r'(?s)(<watch-hub\s*.+?</watch-hub>)', webpage, 'watch hub'))
video_id = watch_hub_data['vms-id']
@@ -32,7 +32,8 @@ class ViceBaseIE(AdobePassIE):
resource = self._get_mvpd_resource(
'VICELAND', title, video_id,
watch_hub_data.get('video-rating'))
- query['tvetoken'] = self._extract_mvpd_auth(url, video_id, 'VICELAND', resource)
+ query['tvetoken'] = self._extract_mvpd_auth(
+ url, video_id, 'VICELAND', resource)
# signature generation algorithm is reverse engineered from signatureGenerator in
# webpack:///../shared/~/vice-player/dist/js/vice-player.js in
@@ -45,11 +46,14 @@ class ViceBaseIE(AdobePassIE):
try:
host = 'www.viceland' if is_locked else self._PREPLAY_HOST
- preplay = self._download_json('https://%s.com/en_us/preplay/%s' % (host, video_id), video_id, query=query)
+ preplay = self._download_json(
+ 'https://%s.com/%s/preplay/%s' % (host, locale, video_id),
+ video_id, query=query)
except ExtractorError as e:
if isinstance(e.cause, compat_HTTPError) and e.cause.code == 400:
error = json.loads(e.cause.read().decode())
- raise ExtractorError('%s said: %s' % (self.IE_NAME, error['details']), expected=True)
+ raise ExtractorError('%s said: %s' % (
+ self.IE_NAME, error['details']), expected=True)
raise
video_data = preplay['video']
@@ -88,41 +92,30 @@ class ViceBaseIE(AdobePassIE):
class ViceIE(ViceBaseIE):
- _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?videos?/(?P<id>[^/?#&]+)'
+ IE_NAME = 'vice'
+ _VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:(?P<locale>[^/]+)/)?videos?/(?P<id>[^/?#&]+)'
_TESTS = [{
- 'url': 'http://www.vice.com/video/cowboy-capitalists-part-1',
- 'md5': 'e9d77741f9e42ba583e683cd170660f7',
+ 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
+ 'md5': '7d3ae2f9ba5f196cdd9f9efd43657ac2',
'info_dict': {
- 'id': '43cW1mYzpia9IlestBjVpd23Yu3afAfp',
+ 'id': 'N2bzkydjraWDGwnt8jAttCF6Y0PDv4Zj',
'ext': 'flv',
- 'title': 'VICE_COWBOYCAPITALISTS_PART01_v1_VICE_WM_1080p.mov',
- 'duration': 725.983,
+ 'title': 'Monkey Labs of Holland',
+ 'description': 'md5:92b3c7dcbfe477f772dd4afa496c9149',
},
'add_ie': ['Ooyala'],
}, {
- 'url': 'http://www.vice.com/video/how-to-hack-a-car',
- 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2',
- 'info_dict': {
- 'id': '3jstaBeXgAs',
- 'ext': 'mp4',
- 'title': 'How to Hack a Car: Phreaked Out (Episode 2)',
- 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30',
- 'uploader_id': 'MotherboardTV',
- 'uploader': 'Motherboard',
- 'upload_date': '20140529',
- },
- 'add_ie': ['Youtube'],
- }, {
'url': 'https://video.vice.com/en_us/video/the-signal-from-tolva/5816510690b70e6c5fd39a56',
- 'md5': '',
'info_dict': {
'id': '5816510690b70e6c5fd39a56',
'ext': 'mp4',
'uploader': 'Waypoint',
'title': 'The Signal From Tölva',
+ 'description': 'md5:3927e3c79f9e8094606a2b3c5b5e55d5',
'uploader_id': '57f7d621e05ca860fa9ccaf9',
- 'timestamp': 1477941983938,
+ 'timestamp': 1477941983,
+ 'upload_date': '20161031',
},
'params': {
# m3u8 download
@@ -130,19 +123,31 @@ class ViceIE(ViceBaseIE):
},
'add_ie': ['UplynkPreplay'],
}, {
- 'url': 'https://news.vice.com/video/experimenting-on-animals-inside-the-monkey-lab',
- 'only_matching': True,
- }, {
- 'url': 'http://www.vice.com/ru/video/big-night-out-ibiza-clive-martin-229',
- 'only_matching': True,
+ 'url': 'https://video.vice.com/alps/video/ulfs-wien-beruchtigste-grafitti-crew-part-1/581b12b60a0e1f4c0fb6ea2f',
+ 'info_dict': {
+ 'id': '581b12b60a0e1f4c0fb6ea2f',
+ 'ext': 'mp4',
+ 'title': 'ULFs - Wien berüchtigste Grafitti Crew - Part 1',
+ 'description': '<p>Zwischen Hinterzimmer-Tattoos und U-Bahnschächten erzählen uns die Ulfs, wie es ist, "süchtig nach Sachbeschädigung" zu sein.</p>',
+ 'uploader': 'VICE',
+ 'uploader_id': '57a204088cb727dec794c67b',
+ 'timestamp': 1485368119,
+ 'upload_date': '20170125',
+ 'age_limit': 14,
+ },
+ 'params': {
+ # AES-encrypted m3u8
+ 'skip_download': True,
+ },
+ 'add_ie': ['UplynkPreplay'],
}, {
- 'url': 'https://munchies.vice.com/en/videos/watch-the-trailer-for-our-new-series-the-pizza-show',
+ 'url': 'https://video.vice.com/en_us/video/pizza-show-trailer/56d8c9a54d286ed92f7f30e4',
'only_matching': True,
}]
_PREPLAY_HOST = 'video.vice'
def _real_extract(self, url):
- video_id = self._match_id(url)
+ locale, video_id = re.match(self._VALID_URL, url).groups()
webpage, urlh = self._download_webpage_handle(url, video_id)
embed_code = self._search_regex(
r'embedCode=([^&\'"]+)', webpage,
@@ -153,10 +158,11 @@ class ViceIE(ViceBaseIE):
r'data-youtube-id="([^"]+)"', webpage, 'youtube id', default=None)
if youtube_id:
return self.url_result(youtube_id, 'Youtube')
- return self._extract_preplay_video(urlh.geturl(), webpage)
+ return self._extract_preplay_video(urlh.geturl(), locale, webpage)
class ViceShowIE(InfoExtractor):
+ IE_NAME = 'vice:show'
_VALID_URL = r'https?://(?:.+?\.)?vice\.com/(?:[^/]+/)?show/(?P<id>[^/?#&]+)'
_TEST = {
@@ -183,6 +189,86 @@ class ViceShowIE(InfoExtractor):
r'<title>(.+?)</title>', webpage, 'title', default=None)
if title:
title = re.sub(r'(.+)\s*\|\s*.+$', r'\1', title).strip()
- description = self._html_search_meta('description', webpage, 'description')
+ description = self._html_search_meta(
+ 'description', webpage, 'description')
return self.playlist_result(entries, show_id, title, description)
+
+
+class ViceArticleIE(InfoExtractor):
+ IE_NAME = 'vice:article'
+ _VALID_URL = r'https://www\.vice\.com/[^/]+/article/(?P<id>[^?#]+)'
+
+ _TESTS = [{
+ 'url': 'https://www.vice.com/en_us/article/on-set-with-the-woman-making-mormon-porn-in-utah',
+ 'info_dict': {
+ 'id': '58dc0a3dee202d2a0ccfcbd8',
+ 'ext': 'mp4',
+ 'title': 'Mormon War on Porn ',
+ 'description': 'md5:ad396a2481e7f8afb5ed486878421090',
+ 'uploader': 'VICE',
+ 'uploader_id': '57a204088cb727dec794c693',
+ 'timestamp': 1489160690,
+ 'upload_date': '20170310',
+ },
+ 'params': {
+ # AES-encrypted m3u8
+ 'skip_download': True,
+ },
+ 'add_ie': ['UplynkPreplay'],
+ }, {
+ 'url': 'https://www.vice.com/en_us/article/how-to-hack-a-car',
+ 'md5': 'a7ecf64ee4fa19b916c16f4b56184ae2',
+ 'info_dict': {
+ 'id': '3jstaBeXgAs',
+ 'ext': 'mp4',
+ 'title': 'How to Hack a Car: Phreaked Out (Episode 2)',
+ 'description': 'md5:ee95453f7ff495db8efe14ae8bf56f30',
+ 'uploader_id': 'MotherboardTV',
+ 'uploader': 'Motherboard',
+ 'upload_date': '20140529',
+ },
+ 'add_ie': ['Youtube'],
+ }, {
+ 'url': 'https://www.vice.com/en_us/article/cowboy-capitalists-part-1',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.vice.com/ru/article/big-night-out-ibiza-clive-martin-229',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ display_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, display_id)
+
+ prefetch_data = self._parse_json(self._search_regex(
+ r'window\.__PREFETCH_DATA\s*=\s*({.*});',
+ webpage, 'prefetch data'), display_id)
+ body = prefetch_data['body']
+
+ def _url_res(video_url, ie_key):
+ return {
+ '_type': 'url_transparent',
+ 'url': video_url,
+ 'display_id': display_id,
+ 'ie_key': ie_key,
+ }
+
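+ # an article may embed an Ooyala player, a YouTube iframe or a native Vice player; probe in that order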
+ embed_code = self._search_regex(
+ r'embedCode=([^&\'"]+)', body,
+ 'ooyala embed code', default=None)
+ if embed_code:
+ return _url_res('ooyala:%s' % embed_code, 'Ooyala')
+
+ youtube_url = self._html_search_regex(
+ r'<iframe[^>]+src="(.*youtube\.com/.*)"',
+ body, 'YouTube URL', default=None)
+ if youtube_url:
+ return _url_res(youtube_url, 'Youtube')
+
+ video_url = self._html_search_regex(
+ r'data-video-url="([^"]+)"',
+ prefetch_data['embed_code'], 'video URL')
+
+ return _url_res(video_url, ViceIE.ie_key())
diff --git a/youtube_dl/extractor/viceland.py b/youtube_dl/extractor/viceland.py
index 87f9216..bd60235 100644
--- a/youtube_dl/extractor/viceland.py
+++ b/youtube_dl/extractor/viceland.py
@@ -1,11 +1,13 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .vice import ViceBaseIE
class VicelandIE(ViceBaseIE):
- _VALID_URL = r'https?://(?:www\.)?viceland\.com/[^/]+/video/[^/]+/(?P<id>[a-f0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?viceland\.com/(?P<locale>[^/]+)/video/[^/]+/(?P<id>[a-f0-9]+)'
_TEST = {
'url': 'https://www.viceland.com/en_us/video/trapped/588a70d0dba8a16007de7316',
'info_dict': {
@@ -24,10 +26,13 @@ class VicelandIE(ViceBaseIE):
'skip_download': True,
},
'add_ie': ['UplynkPreplay'],
+ 'skip': '404',
}
_PREPLAY_HOST = 'www.viceland'
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id')
+ locale = mobj.group('locale')
webpage = self._download_webpage(url, video_id)
- return self._extract_preplay_video(url, webpage)
+ return self._extract_preplay_video(url, locale, webpage)
diff --git a/youtube_dl/extractor/videopress.py b/youtube_dl/extractor/videopress.py
index 049db25..e5f964d 100644
--- a/youtube_dl/extractor/videopress.py
+++ b/youtube_dl/extractor/videopress.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import random
import re
from .common import InfoExtractor
@@ -11,6 +10,7 @@ from ..utils import (
float_or_none,
parse_age_limit,
qualities,
+ random_birthday,
try_get,
unified_timestamp,
urljoin,
@@ -47,13 +47,10 @@ class VideoPressIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
+ query = random_birthday('birth_year', 'birth_month', 'birth_day')
video = self._download_json(
'https://public-api.wordpress.com/rest/v1.1/videos/%s' % video_id,
- video_id, query={
- 'birth_month': random.randint(1, 12),
- 'birth_day': random.randint(1, 31),
- 'birth_year': random.randint(1950, 1995),
- })
+ video_id, query=query)
title = video['title']
diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py
index 4e4b4e3..701bb1d 100644
--- a/youtube_dl/extractor/vidio.py
+++ b/youtube_dl/extractor/vidio.py
@@ -49,8 +49,11 @@ class VidioIE(InfoExtractor):
thumbnail = clip.get('image')
m3u8_url = m3u8_url or self._search_regex(
- r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>.+?)\1', webpage, 'hls url')
- formats = self._extract_m3u8_formats(m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native')
+ r'data(?:-vjs)?-clip-hls-url=(["\'])(?P<url>(?!\1).+)\1',
+ webpage, 'hls url')
+ formats = self._extract_m3u8_formats(
+ m3u8_url, display_id, 'mp4', entry_protocol='m3u8_native')
+ self._sort_formats(formats)
duration = int_or_none(duration or self._search_regex(
r'data-video-duration=(["\'])(?P<duration>\d+)\1', webpage, 'duration', group='duration'))
diff --git a/youtube_dl/extractor/vidzi.py b/youtube_dl/extractor/vidzi.py
index d055629..e64873b 100644
--- a/youtube_dl/extractor/vidzi.py
+++ b/youtube_dl/extractor/vidzi.py
@@ -42,14 +42,15 @@ class VidziIE(InfoExtractor):
title = self._html_search_regex(
r'(?s)<h2 class="video-title">(.*?)</h2>', webpage, 'title')
- packed_codes = [mobj.group(0) for mobj in re.finditer(
- PACKED_CODES_RE, webpage)]
- for num, pc in enumerate(packed_codes, 1):
- code = decode_packed_codes(pc).replace('\\\'', '\'')
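+ # the jwplayer setup may sit in the page itself or inside one of the packed (obfuscated) scripts; try the plain page first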
+ codes = [webpage]
+ codes.extend([
+ decode_packed_codes(mobj.group(0)).replace('\\\'', '\'')
+ for mobj in re.finditer(PACKED_CODES_RE, webpage)])
+ for num, code in enumerate(codes, 1):
jwplayer_data = self._parse_json(
self._search_regex(
r'setup\(([^)]+)\)', code, 'jwplayer data',
- default=NO_DEFAULT if num == len(packed_codes) else '{}'),
+ default=NO_DEFAULT if num == len(codes) else '{}'),
video_id, transform_source=js_to_json)
if jwplayer_data:
break
diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py
index 5086f59..3e67eb8 100644
--- a/youtube_dl/extractor/vier.py
+++ b/youtube_dl/extractor/vier.py
@@ -5,23 +5,30 @@ import re
import itertools
from .common import InfoExtractor
+from ..utils import (
+ urlencode_postdata,
+ int_or_none,
+ unified_strdate,
+)
class VierIE(InfoExtractor):
IE_NAME = 'vier'
+ IE_DESC = 'vier.be and vijf.be'
_VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))'
+ _NETRC_MACHINE = 'vier'
_TESTS = [{
'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129',
+ 'md5': 'e4ae2054a6b040ef1e289e20d111b46e',
'info_dict': {
'id': '16129',
'display_id': 'het-wordt-warm-de-moestuin',
'ext': 'mp4',
'title': 'Het wordt warm in De Moestuin',
'description': 'De vele uren werk eisen hun tol. Wim droomt van assistentie...',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
+ 'upload_date': '20121025',
+ 'series': 'Plan B',
+ 'tags': ['De Moestuin', 'Moestuin', 'meisjes', 'Tomaat', 'Wim', 'Droom'],
},
}, {
'url': 'http://www.vijf.be/temptationisland/videos/zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas/2561614',
@@ -29,32 +36,103 @@ class VierIE(InfoExtractor):
'id': '2561614',
'display_id': 'zo-grappig-temptation-island-hosts-moeten-kiezen-tussen-onmogelijke-dilemmas',
'ext': 'mp4',
- 'title': 'ZO grappig: Temptation Island hosts moeten kiezen tussen onmogelijke dilemma\'s',
- 'description': 'Het spel is simpel: Annelien Coorevits en Rick Brandsteder krijgen telkens 2 dilemma\'s voorgeschoteld en ze MOETEN een keuze maken.',
+ 'title': 'md5:84f45fe48b8c1fa296a7f6d208d080a7',
+ 'description': 'md5:0356d4981e58b8cbee19355cbd51a8fe',
+ 'upload_date': '20170228',
+ 'series': 'Temptation Island',
+ 'tags': list,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839',
+ 'info_dict': {
+ 'id': '2674839',
+ 'display_id': 'jani-gaat-naar-tokio-aflevering-4',
+ 'ext': 'mp4',
+ 'title': 'Jani gaat naar Tokio - Aflevering 4',
+ 'description': 'md5:aa8d611541db6ae9e863125704511f88',
+ 'upload_date': '20170501',
+ 'series': 'Jani gaat',
+ 'episode_number': 4,
+ 'tags': ['Jani Gaat', 'Volledige Aflevering'],
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'skip': 'Requires account credentials',
+ }, {
+ # Requires account credentials, but extraction is bypassed via the
+ # v3/embed page, which carries no metadata
+ 'url': 'http://www.vier.be/janigaat/videos/jani-gaat-naar-tokio-aflevering-4/2674839',
+ 'info_dict': {
+ 'id': '2674839',
+ 'display_id': 'jani-gaat-naar-tokio-aflevering-4',
+ 'ext': 'mp4',
+ 'title': 'jani-gaat-naar-tokio-aflevering-4',
},
'params': {
- # m3u8 download
'skip_download': True,
},
+ 'expected_warnings': ['Log in to extract metadata'],
}, {
- 'url': 'http://www.vier.be/planb/videos/mieren-herders-van-de-bladluizen',
+ # Without video id in URL
+ 'url': 'http://www.vier.be/planb/videos/dit-najaar-plan-b',
'only_matching': True,
}, {
'url': 'http://www.vier.be/video/v3/embed/16129',
'only_matching': True,
}]
+ def _real_initialize(self):
+ self._logged_in = False
+
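+ # log in lazily on first extraction and remember the outcome so the form is posted at most once per run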
+ def _login(self, site):
+ username, password = self._get_login_info()
+ if username is None or password is None:
+ return
+
+ login_page = self._download_webpage(
+ 'http://www.%s.be/user/login' % site,
+ None, note='Logging in', errnote='Unable to log in',
+ data=urlencode_postdata({
+ 'form_id': 'user_login',
+ 'name': username,
+ 'pass': password,
+ }),
+ headers={'Content-Type': 'application/x-www-form-urlencoded'})
+
+ login_error = self._html_search_regex(
+ r'(?s)<div class="messages error">\s*<div>\s*<h2.+?</h2>(.+?)<',
+ login_page, 'login error', default=None)
+ if login_error:
+ self.report_warning('Unable to log in: %s' % login_error)
+ else:
+ self._logged_in = True
+
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
embed_id = mobj.group('embed_id')
display_id = mobj.group('display_id') or embed_id
+ video_id = mobj.group('id') or embed_id
site = mobj.group('site')
+ if not self._logged_in:
+ self._login(site)
+
webpage = self._download_webpage(url, display_id)
+ if r'id="user-login"' in webpage:
+ self.report_warning(
+ 'Log in to extract metadata', video_id=display_id)
+ webpage = self._download_webpage(
+ 'http://www.%s.be/video/v3/embed/%s' % (site, video_id),
+ display_id)
+
video_id = self._search_regex(
[r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'],
- webpage, 'video id')
+ webpage, 'video id', default=video_id or display_id)
application = self._search_regex(
[r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'],
webpage, 'application', default=site + '_vod')
@@ -63,12 +141,25 @@ class VierIE(InfoExtractor):
webpage, 'filename')
playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename)
- formats = self._extract_wowza_formats(playlist_url, display_id, skip_protocols=['dash'])
+ formats = self._extract_wowza_formats(
+ playlist_url, display_id, skip_protocols=['dash'])
self._sort_formats(formats)
title = self._og_search_title(webpage, default=display_id)
- description = self._og_search_description(webpage, default=None)
+ description = self._html_search_regex(
+ r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-type-text-with-summary\b[^>]*?\1[^>]*>.*?<p>(?P<value>.+?)</p>',
+ webpage, 'description', default=None, group='value')
thumbnail = self._og_search_thumbnail(webpage, default=None)
+ upload_date = unified_strdate(self._html_search_regex(
+ r'(?s)<div\b[^>]+\bclass=(["\'])[^>]*?\bfield-name-post-date\b[^>]*?\1[^>]*>.*?(?P<value>\d{2}/\d{2}/\d{4})',
+ webpage, 'upload date', default=None, group='value'))
+
+ series = self._search_regex(
+ r'data-program=(["\'])(?P<value>(?:(?!\1).)+)\1', webpage,
+ 'series', default=None, group='value')
+ episode_number = int_or_none(self._search_regex(
+ r'(?i)aflevering (\d+)', title, 'episode number', default=None))
+ tags = re.findall(r'<a\b[^>]+\bhref=["\']/tags/[^>]+>([^<]+)<', webpage)
return {
'id': video_id,
@@ -76,6 +167,10 @@ class VierIE(InfoExtractor):
'title': title,
'description': description,
'thumbnail': thumbnail,
+ 'upload_date': upload_date,
+ 'series': series,
+ 'episode_number': episode_number,
+ 'tags': tags,
'formats': formats,
}
diff --git a/youtube_dl/extractor/viewlift.py b/youtube_dl/extractor/viewlift.py
index 18735cf..1f29c27 100644
--- a/youtube_dl/extractor/viewlift.py
+++ b/youtube_dl/extractor/viewlift.py
@@ -68,7 +68,7 @@ class ViewLiftEmbedIE(ViewLiftBaseIE):
type_ = source.get('type')
ext = determine_ext(file_)
format_id = source.get('label') or ext
- if all(v == 'm3u8' or v == 'hls' for v in (type_, ext)):
+ if all(v in ('m3u8', 'hls') for v in (type_, ext)):
formats.extend(self._extract_m3u8_formats(
file_, video_id, 'mp4', m3u8_id='hls'))
else:
diff --git a/youtube_dl/extractor/viewster.py b/youtube_dl/extractor/viewster.py
index fcf0cb1..d5d5b4c 100644
--- a/youtube_dl/extractor/viewster.py
+++ b/youtube_dl/extractor/viewster.py
@@ -176,8 +176,7 @@ class ViewsterIE(InfoExtractor):
if m3u8_formats:
self._sort_formats(m3u8_formats)
m3u8_formats = list(filter(
- lambda f: f.get('vcodec') != 'none' and f.get('resolution') != 'multiple',
- m3u8_formats))
+ lambda f: f.get('vcodec') != 'none', m3u8_formats))
if len(qualities) == len(m3u8_formats):
for q, m3u8_format in zip(qualities, m3u8_formats):
f = m3u8_format.copy()
diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py
index b971890..e589406 100644
--- a/youtube_dl/extractor/vlive.py
+++ b/youtube_dl/extractor/vlive.py
@@ -70,9 +70,9 @@ class VLiveIE(InfoExtractor):
status, long_video_id, key = params[2], params[5], params[6]
status = remove_start(status, 'PRODUCT_')
- if status == 'LIVE_ON_AIR' or status == 'BIG_EVENT_ON_AIR':
+ if status in ('LIVE_ON_AIR', 'BIG_EVENT_ON_AIR'):
return self._live(video_id, webpage)
- elif status == 'VOD_ON_AIR' or status == 'BIG_EVENT_INTRO':
+ elif status in ('VOD_ON_AIR', 'BIG_EVENT_INTRO'):
if long_video_id and key:
return self._replay(video_id, webpage, long_video_id, key)
else:
diff --git a/youtube_dl/extractor/vrt.py b/youtube_dl/extractor/vrt.py
index 00c72e3..444295d 100644
--- a/youtube_dl/extractor/vrt.py
+++ b/youtube_dl/extractor/vrt.py
@@ -10,6 +10,7 @@ from ..utils import (
class VRTIE(InfoExtractor):
+ IE_DESC = 'deredactie.be, sporza.be, cobra.be and cobra.canvas.be'
_VALID_URL = r'https?://(?:deredactie|sporza|cobra(?:\.canvas)?)\.be/cm/(?:[^/]+/)+(?P<id>[^/]+)/*'
_TESTS = [
# deredactie.be
diff --git a/youtube_dl/extractor/vrv.py b/youtube_dl/extractor/vrv.py
new file mode 100644
index 0000000..9959627
--- /dev/null
+++ b/youtube_dl/extractor/vrv.py
@@ -0,0 +1,212 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import base64
+import json
+import hashlib
+import hmac
+import random
+import string
+import time
+
+from .common import InfoExtractor
+from ..compat import (
+ compat_urllib_parse_urlencode,
+ compat_urlparse,
+)
+from ..utils import (
+ float_or_none,
+ int_or_none,
+)
+
+
+class VRVBaseIE(InfoExtractor):
+ _API_DOMAIN = None
+ _API_PARAMS = {}
+ _CMS_SIGNING = {}
+
+ def _call_api(self, path, video_id, note, data=None):
+ base_url = self._API_DOMAIN + '/core/' + path
+ encoded_query = compat_urllib_parse_urlencode({
+ 'oauth_consumer_key': self._API_PARAMS['oAuthKey'],
+ 'oauth_nonce': ''.join([random.choice(string.ascii_letters) for _ in range(32)]),
+ 'oauth_signature_method': 'HMAC-SHA1',
+ 'oauth_timestamp': int(time.time()),
+ 'oauth_version': '1.0',
+ })
+ headers = self.geo_verification_headers()
+ if data:
+ data = json.dumps(data).encode()
+ headers['Content-Type'] = 'application/json'
+ method = 'POST' if data else 'GET'
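+ # OAuth 1.0 HMAC-SHA1 signing: the base string is "METHOD&<encoded base URL>&<encoded query>", keyed with "<consumer secret>&" (empty token secret)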
+ base_string = '&'.join([method, compat_urlparse.quote(base_url, ''), compat_urlparse.quote(encoded_query, '')])
+ oauth_signature = base64.b64encode(hmac.new(
+ (self._API_PARAMS['oAuthSecret'] + '&').encode('ascii'),
+ base_string.encode(), hashlib.sha1).digest()).decode()
+ encoded_query += '&oauth_signature=' + compat_urlparse.quote(oauth_signature, '')
+ return self._download_json(
+ '?'.join([base_url, encoded_query]), video_id,
+ note='Downloading %s JSON metadata' % note, headers=headers, data=data)
+
+ def _call_cms(self, path, video_id, note):
+ if not self._CMS_SIGNING:
+ self._CMS_SIGNING = self._call_api('index', video_id, 'CMS Signing')['cms_signing']
+ return self._download_json(
+ self._API_DOMAIN + path, video_id, query=self._CMS_SIGNING,
+ note='Downloading %s JSON metadata' % note, headers=self.geo_verification_headers())
+
+ def _set_api_params(self, webpage, video_id):
+ if not self._API_PARAMS:
+ self._API_PARAMS = self._parse_json(self._search_regex(
+ r'window\.__APP_CONFIG__\s*=\s*({.+?})</script>',
+ webpage, 'api config'), video_id)['cxApiParams']
+ self._API_DOMAIN = self._API_PARAMS.get('apiDomain', 'https://api.vrv.co')
+
+ def _get_cms_resource(self, resource_key, video_id):
+ return self._call_api(
+ 'cms_resource', video_id, 'resource path', data={
+ 'resource_key': resource_key,
+ })['__links__']['cms_resource']['href']
+
+
+class VRVIE(VRVBaseIE):
+ IE_NAME = 'vrv'
+ _VALID_URL = r'https?://(?:www\.)?vrv\.co/watch/(?P<id>[A-Z0-9]+)'
+ _TEST = {
+ 'url': 'https://vrv.co/watch/GR9PNZ396/Hidden-America-with-Jonah-Ray:BOSTON-WHERE-THE-PAST-IS-THE-PRESENT',
+ 'info_dict': {
+ 'id': 'GR9PNZ396',
+ 'ext': 'mp4',
+ 'title': 'BOSTON: WHERE THE PAST IS THE PRESENT',
+ 'description': 'md5:4ec8844ac262ca2df9e67c0983c6b83f',
+ 'uploader_id': 'seeso',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(
+ url, video_id,
+ headers=self.geo_verification_headers())
+ media_resource = self._parse_json(self._search_regex(
+ r'window\.__INITIAL_STATE__\s*=\s*({.+?})</script>',
+ webpage, 'initial state'), video_id).get('watch', {}).get('mediaResource') or {}
+
+ video_data = media_resource.get('json')
+ if not video_data:
+ self._set_api_params(webpage, video_id)
+ episode_path = self._get_cms_resource(
+ 'cms:/episodes/' + video_id, video_id)
+ video_data = self._call_cms(episode_path, video_id, 'video')
+ title = video_data['title']
+
+ streams_json = media_resource.get('streams', {}).get('json', {})
+ if not streams_json:
+ self._set_api_params(webpage, video_id)
+ streams_path = video_data['__links__']['streams']['href']
+ streams_json = self._call_cms(streams_path, video_id, 'streams')
+
+ audio_locale = streams_json.get('audio_locale')
+ formats = []
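+ # renditions are grouped by protocol (adaptive_hls/adaptive_dash) and keyed by hardsub locale; collect both and tag the audio language when known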
+ for stream_type, streams in streams_json.get('streams', {}).items():
+ if stream_type in ('adaptive_hls', 'adaptive_dash'):
+ for stream in streams.values():
+ stream_url = stream.get('url')
+ if not stream_url:
+ continue
+ stream_id = stream.get('hardsub_locale') or audio_locale
+ format_id = '%s-%s' % (stream_type.split('_')[1], stream_id)
+ if stream_type == 'adaptive_hls':
+ adaptive_formats = self._extract_m3u8_formats(
+ stream_url, video_id, 'mp4', m3u8_id=format_id,
+ note='Downloading %s m3u8 information' % stream_id,
+ fatal=False)
+ else:
+ adaptive_formats = self._extract_mpd_formats(
+ stream_url, video_id, mpd_id=format_id,
+ note='Downloading %s MPD information' % stream_id,
+ fatal=False)
+ if audio_locale:
+ for f in adaptive_formats:
+ if f.get('acodec') != 'none':
+ f['language'] = audio_locale
+ formats.extend(adaptive_formats)
+ self._sort_formats(formats)
+
+ subtitles = {}
+ for subtitle in streams_json.get('subtitles', {}).values():
+ subtitle_url = subtitle.get('url')
+ if not subtitle_url:
+ continue
+ subtitles.setdefault(subtitle.get('locale', 'en-US'), []).append({
+ 'url': subtitle_url,
+ 'ext': subtitle.get('format', 'ass'),
+ })
+
+ thumbnails = []
+ for thumbnail in video_data.get('images', {}).get('thumbnails', []):
+ thumbnail_url = thumbnail.get('source')
+ if not thumbnail_url:
+ continue
+ thumbnails.append({
+ 'url': thumbnail_url,
+ 'width': int_or_none(thumbnail.get('width')),
+ 'height': int_or_none(thumbnail.get('height')),
+ })
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'subtitles': subtitles,
+ 'thumbnails': thumbnails,
+ 'description': video_data.get('description'),
+ 'duration': float_or_none(video_data.get('duration_ms'), 1000),
+ 'uploader_id': video_data.get('channel_id'),
+ 'series': video_data.get('series_title'),
+ 'season': video_data.get('season_title'),
+ 'season_number': int_or_none(video_data.get('season_number')),
+ 'season_id': video_data.get('season_id'),
+ 'episode': title,
+ 'episode_number': int_or_none(video_data.get('episode_number')),
+ 'episode_id': video_data.get('production_episode_id'),
+ }
+
+
+class VRVSeriesIE(VRVBaseIE):
+ IE_NAME = 'vrv:series'
+ _VALID_URL = r'https?://(?:www\.)?vrv\.co/series/(?P<id>[A-Z0-9]+)'
+ _TEST = {
+ 'url': 'https://vrv.co/series/G68VXG3G6/The-Perfect-Insider',
+ 'info_dict': {
+ 'id': 'G68VXG3G6',
+ },
+ 'playlist_mincount': 11,
+ }
+
+ def _real_extract(self, url):
+ series_id = self._match_id(url)
+ webpage = self._download_webpage(
+ url, series_id,
+ headers=self.geo_verification_headers())
+
+ self._set_api_params(webpage, series_id)
+ seasons_path = self._get_cms_resource(
+ 'cms:/seasons?series_id=' + series_id, series_id)
+ seasons_data = self._call_cms(seasons_path, series_id, 'seasons')
+
+ entries = []
+ for season in seasons_data.get('items', []):
+ episodes_path = season['__links__']['season/episodes']['href']
+ episodes = self._call_cms(episodes_path, series_id, 'episodes')
+ for episode in episodes.get('items', []):
+ episode_id = episode['id']
+ entries.append(self.url_result(
+ 'https://vrv.co/watch/' + episode_id,
+ 'VRV', episode_id, episode.get('title')))
+
+ return self.playlist_result(entries, series_id)
diff --git a/youtube_dl/extractor/vshare.py b/youtube_dl/extractor/vshare.py
new file mode 100644
index 0000000..5addbc2
--- /dev/null
+++ b/youtube_dl/extractor/vshare.py
@@ -0,0 +1,38 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+
+
+class VShareIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?vshare\.io/[dv]/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'https://vshare.io/d/0f64ce6',
+ 'md5': '16d7b8fef58846db47419199ff1ab3e7',
+ 'info_dict': {
+ 'id': '0f64ce6',
+ 'title': 'vl14062007715967',
+ 'ext': 'mp4',
+ }
+ }, {
+ 'url': 'https://vshare.io/v/0f64ce6/width-650/height-430/1',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(
+ 'https://vshare.io/d/%s' % video_id, video_id)
+
+ title = self._html_search_regex(
+ r'(?s)<div id="root-container">(.+?)<br/>', webpage, 'title')
+ video_url = self._search_regex(
+ r'<a[^>]+href=(["\'])(?P<url>(?:https?:)?//.+?)\1[^>]*>[Cc]lick\s+here',
+ webpage, 'video url', group='url')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'url': video_url,
+ }
diff --git a/youtube_dl/extractor/washingtonpost.py b/youtube_dl/extractor/washingtonpost.py
index 839cad9..625d0a1 100644
--- a/youtube_dl/extractor/washingtonpost.py
+++ b/youtube_dl/extractor/washingtonpost.py
@@ -13,6 +13,7 @@ from ..utils import (
class WashingtonPostIE(InfoExtractor):
IE_NAME = 'washingtonpost'
_VALID_URL = r'(?:washingtonpost:|https?://(?:www\.)?washingtonpost\.com/video/(?:[^/]+/)*)(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+ _EMBED_URL = r'https?://(?:www\.)?washingtonpost\.com/video/c/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}'
_TEST = {
'url': 'https://www.washingtonpost.com/video/c/video/480ba4ee-1ec7-11e6-82c2-a7dcb313287d',
'md5': '6f537e1334b714eb15f9563bd4b9cdfa',
@@ -27,6 +28,11 @@ class WashingtonPostIE(InfoExtractor):
},
}
+ @classmethod
+ def _extract_urls(cls, webpage):
+ return re.findall(
+ r'<iframe[^>]+\bsrc=["\'](%s)' % cls._EMBED_URL, webpage)
+
def _real_extract(self, url):
video_id = self._match_id(url)
video_data = self._download_json(
diff --git a/youtube_dl/extractor/wistia.py b/youtube_dl/extractor/wistia.py
index c634b8d..2182d6f 100644
--- a/youtube_dl/extractor/wistia.py
+++ b/youtube_dl/extractor/wistia.py
@@ -1,10 +1,13 @@
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
ExtractorError,
int_or_none,
float_or_none,
+ unescapeHTML,
)
@@ -34,6 +37,25 @@ class WistiaIE(InfoExtractor):
'only_matching': True,
}]
+ @staticmethod
+ def _extract_url(webpage):
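+ # embeds come in three flavors: iframe/meta URL embeds, JS API embeds (Wistia.embed, data-wistia-id) and async <div class="wistia_async_..."> placeholders; try each in turn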
+ match = re.search(
+ r'<(?:meta[^>]+?content|iframe[^>]+?src)=(["\'])(?P<url>(?:https?:)?//(?:fast\.)?wistia\.net/embed/iframe/.+?)\1', webpage)
+ if match:
+ return unescapeHTML(match.group('url'))
+
+ match = re.search(r'(?:id=["\']wistia_|data-wistia-?id=["\']|Wistia\.embed\(["\'])(?P<id>[^"\']+)', webpage)
+ if match:
+ return 'wistia:%s' % match.group('id')
+
+ match = re.search(
+ r'''(?sx)
+ <script[^>]+src=(["'])(?:https?:)?//fast\.wistia\.com/assets/external/E-v1\.js\1[^>]*>.*?
+ <div[^>]+class=(["']).*?\bwistia_async_(?P<id>[a-z0-9]+)\b.*?\2
+ ''', webpage)
+ if match:
+ return 'wistia:%s' % match.group('id')
+
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/worldstarhiphop.py b/youtube_dl/extractor/worldstarhiphop.py
index 09415b5..82587b4 100644
--- a/youtube_dl/extractor/worldstarhiphop.py
+++ b/youtube_dl/extractor/worldstarhiphop.py
@@ -1,12 +1,10 @@
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
class WorldStarHipHopIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/(?:videos|android)/video\.php\?v=(?P<id>.*)'
+ _VALID_URL = r'https?://(?:www|m)\.worldstar(?:candy|hiphop)\.com/(?:videos|android)/video\.php\?.*?\bv=(?P<id>[^&]+)'
_TESTS = [{
'url': 'http://www.worldstarhiphop.com/videos/video.php?v=wshh6a7q1ny0G34ZwuIO',
'md5': '9d04de741161603bf7071bbf4e883186',
@@ -17,48 +15,26 @@ class WorldStarHipHopIE(InfoExtractor):
}
}, {
'url': 'http://m.worldstarhiphop.com/android/video.php?v=wshh6a7q1ny0G34ZwuIO',
- 'md5': 'dc1c76c83ecc4190bb1eb143899b87d3',
- 'info_dict': {
- 'id': 'wshh6a7q1ny0G34ZwuIO',
- 'ext': 'mp4',
- 'title': 'KO Of The Week: MMA Fighter Gets Knocked Out By Swift Head Kick!'
- }
+ 'only_matching': True,
}]
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
- m_vevo_id = re.search(r'videoId=(.*?)&amp?', webpage)
- if m_vevo_id is not None:
- return self.url_result('vevo:%s' % m_vevo_id.group(1), ie='Vevo')
-
- video_url = self._search_regex(
- [r'so\.addVariable\("file","(.*?)"\)',
- r'<div class="artlist">\s*<a[^>]+href="([^"]+)">'],
- webpage, 'video URL')
+ entries = self._parse_html5_media_entries(url, webpage, video_id)
- if 'youtube' in video_url:
- return self.url_result(video_url, ie='Youtube')
+ if not entries:
+ return self.url_result(url, 'Generic')
- video_title = self._html_search_regex(
+ title = self._html_search_regex(
[r'(?s)<div class="content-heading">\s*<h1>(.*?)</h1>',
r'<span[^>]+class="tc-sp-pinned-title">(.*)</span>'],
webpage, 'title')
- # Getting thumbnail and if not thumbnail sets correct title for WSHH candy video.
- thumbnail = self._html_search_regex(
- r'rel="image_src" href="(.*)" />', webpage, 'thumbnail',
- default=None)
- if not thumbnail:
- _title = r'candytitles.*>(.*)</span>'
- mobj = re.search(_title, webpage)
- if mobj is not None:
- video_title = mobj.group(1)
-
- return {
+ info = entries[0]
+ info.update({
'id': video_id,
- 'url': video_url,
- 'title': video_title,
- 'thumbnail': thumbnail,
- }
+ 'title': title,
+ })
+ return info
diff --git a/youtube_dl/extractor/wsj.py b/youtube_dl/extractor/wsj.py
index deb7483..45cfca7 100644
--- a/youtube_dl/extractor/wsj.py
+++ b/youtube_dl/extractor/wsj.py
@@ -10,12 +10,14 @@ from ..utils import (
class WSJIE(InfoExtractor):
- _VALID_URL = r'''(?x)https?://
- (?:
- video-api\.wsj\.com/api-video/player/iframe\.html\?guid=|
- (?:www\.)?wsj\.com/video/[^/]+/
- )
- (?P<id>[a-zA-Z0-9-]+)'''
+ _VALID_URL = r'''(?x)
+ (?:
+ https?://video-api\.wsj\.com/api-video/player/iframe\.html\?.*?\bguid=|
+ https?://(?:www\.)?wsj\.com/video/[^/]+/|
+ wsj:
+ )
+ (?P<id>[a-fA-F0-9-]{36})
+ '''
IE_DESC = 'Wall Street Journal'
_TESTS = [{
'url': 'http://video-api.wsj.com/api-video/player/iframe.html?guid=1BD01A4C-BFE8-40A5-A42F-8A8AF9898B1A',
@@ -38,12 +40,17 @@ class WSJIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
- api_url = (
- 'http://video-api.wsj.com/api-video/find_all_videos.asp?'
- 'type=guid&count=1&query=%s&fields=type,hls,videoMP4List,'
- 'thumbnailList,author,description,name,duration,videoURL,'
- 'titletag,formattedCreationDate,keywords,editor' % video_id)
- info = self._download_json(api_url, video_id)['items'][0]
+ info = self._download_json(
+ 'http://video-api.wsj.com/api-video/find_all_videos.asp', video_id,
+ query={
+ 'type': 'guid',
+ 'count': 1,
+ 'query': video_id,
+ 'fields': ','.join((
+ 'type', 'hls', 'videoMP4List', 'thumbnailList', 'author',
+ 'description', 'name', 'duration', 'videoURL', 'titletag',
+ 'formattedCreationDate', 'keywords', 'editor')),
+ })['items'][0]
title = info.get('name', info.get('titletag'))
formats = []
@@ -87,3 +94,24 @@ class WSJIE(InfoExtractor):
'title': title,
'categories': info.get('keywords'),
}
+
+
+class WSJArticleIE(InfoExtractor):
+ _VALID_URL = r'(?i)https?://(?:www\.)?wsj\.com/articles/(?P<id>[^/?#&]+)'
+ _TEST = {
+ 'url': 'https://www.wsj.com/articles/dont-like-china-no-pandas-for-you-1490366939?',
+ 'info_dict': {
+ 'id': '4B13FA62-1D8C-45DB-8EA1-4105CB20B362',
+ 'ext': 'mp4',
+ 'upload_date': '20170221',
+ 'uploader_id': 'ralcaraz',
+ 'title': 'Bao Bao the Panda Leaves for China',
+ }
+ }
+
+ def _real_extract(self, url):
+ article_id = self._match_id(url)
+ webpage = self._download_webpage(url, article_id)
+ video_id = self._search_regex(
+ r'data-src=["\']([a-fA-F0-9-]{36})', webpage, 'video id')
+ return self.url_result('wsj:%s' % video_id, WSJIE.ie_key(), video_id)
diff --git a/youtube_dl/extractor/xfileshare.py b/youtube_dl/extractor/xfileshare.py
index e616adc..13f8be6 100644
--- a/youtube_dl/extractor/xfileshare.py
+++ b/youtube_dl/extractor/xfileshare.py
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..utils import (
decode_packed_codes,
+ determine_ext,
ExtractorError,
int_or_none,
NO_DEFAULT,
@@ -16,21 +17,24 @@ from ..utils import (
class XFileShareIE(InfoExtractor):
_SITES = (
- ('daclips.in', 'DaClips'),
- ('filehoot.com', 'FileHoot'),
- ('gorillavid.in', 'GorillaVid'),
- ('movpod.in', 'MovPod'),
- ('powerwatch.pw', 'PowerWatch'),
- ('rapidvideo.ws', 'Rapidvideo.ws'),
- ('thevideobee.to', 'TheVideoBee'),
- ('vidto.me', 'Vidto'),
- ('streamin.to', 'Streamin.To'),
- ('xvidstage.com', 'XVIDSTAGE'),
+ (r'daclips\.(?:in|com)', 'DaClips'),
+ (r'filehoot\.com', 'FileHoot'),
+ (r'gorillavid\.(?:in|com)', 'GorillaVid'),
+ (r'movpod\.in', 'MovPod'),
+ (r'powerwatch\.pw', 'PowerWatch'),
+ (r'rapidvideo\.ws', 'Rapidvideo.ws'),
+ (r'thevideobee\.to', 'TheVideoBee'),
+ (r'vidto\.me', 'Vidto'),
+ (r'streamin\.to', 'Streamin.To'),
+ (r'xvidstage\.com', 'XVIDSTAGE'),
+ (r'vidabc\.com', 'Vid ABC'),
+ (r'vidbom\.com', 'VidBom'),
+ (r'vidlo\.us', 'vidlo'),
)
IE_DESC = 'XFileShare based sites: %s' % ', '.join(list(zip(*_SITES))[1])
_VALID_URL = (r'https?://(?P<host>(?:www\.)?(?:%s))/(?:embed-)?(?P<id>[0-9a-zA-Z]+)'
- % '|'.join(re.escape(site) for site in list(zip(*_SITES))[0]))
+ % '|'.join(site for site in list(zip(*_SITES))[0]))
_FILE_NOT_FOUND_REGEXES = (
r'>(?:404 - )?File Not Found<',
@@ -95,6 +99,16 @@ class XFileShareIE(InfoExtractor):
# removed by administrator
'url': 'http://xvidstage.com/amfy7atlkx25',
'only_matching': True,
+ }, {
+ 'url': 'http://vidabc.com/i8ybqscrphfv',
+ 'info_dict': {
+ 'id': 'i8ybqscrphfv',
+ 'ext': 'mp4',
+ 'title': 're:Beauty and the Beast 2017',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
@@ -133,31 +147,45 @@ class XFileShareIE(InfoExtractor):
webpage, 'title', default=None) or self._og_search_title(
webpage, default=None) or video_id).strip()
- def extract_video_url(default=NO_DEFAULT):
- return self._search_regex(
- (r'file\s*:\s*(["\'])(?P<url>http.+?)\1,',
- r'file_link\s*=\s*(["\'])(?P<url>http.+?)\1',
- r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http.+?)\2\)',
- r'<embed[^>]+src=(["\'])(?P<url>http.+?)\1'),
- webpage, 'file url', default=default, group='url')
-
- video_url = extract_video_url(default=None)
-
- if not video_url:
+ def extract_formats(default=NO_DEFAULT):
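+ # gather every distinct media URL advertised on the page, then expand m3u8 manifests into individual HLS formats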
+ urls = []
+ for regex in (
+ r'file\s*:\s*(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1',
+ r'file_link\s*=\s*(["\'])(?P<url>http(?:(?!\1).)+)\1',
+ r'addVariable\((\\?["\'])file\1\s*,\s*(\\?["\'])(?P<url>http(?:(?!\2).)+)\2\)',
+ r'<embed[^>]+src=(["\'])(?P<url>http(?:(?!\1).)+\.(?:m3u8|mp4|flv)(?:(?!\1).)*)\1'):
+ for mobj in re.finditer(regex, webpage):
+ video_url = mobj.group('url')
+ if video_url not in urls:
+ urls.append(video_url)
+ formats = []
+ for video_url in urls:
+ if determine_ext(video_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, video_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls',
+ fatal=False))
+ else:
+ formats.append({
+ 'url': video_url,
+ 'format_id': 'sd',
+ })
+ if not formats and default is not NO_DEFAULT:
+ return default
+ self._sort_formats(formats)
+ return formats
+
+ formats = extract_formats(default=None)
+
+ if not formats:
webpage = decode_packed_codes(self._search_regex(
r"(}\('(.+)',(\d+),(\d+),'[^']*\b(?:file|embed)\b[^']*'\.split\('\|'\))",
webpage, 'packed code'))
- video_url = extract_video_url()
+ formats = extract_formats()
thumbnail = self._search_regex(
r'image\s*:\s*["\'](http[^"\']+)["\'],', webpage, 'thumbnail', default=None)
- formats = [{
- 'format_id': 'sd',
- 'url': video_url,
- 'quality': 1,
- }]
-
return {
'id': video_id,
'title': title,
diff --git a/youtube_dl/extractor/xtube.py b/youtube_dl/extractor/xtube.py
index 5584674..bea9b87 100644
--- a/youtube_dl/extractor/xtube.py
+++ b/youtube_dl/extractor/xtube.py
@@ -6,6 +6,7 @@ import re
from .common import InfoExtractor
from ..utils import (
int_or_none,
+ js_to_json,
orderedSet,
parse_duration,
sanitized_Request,
@@ -38,6 +39,22 @@ class XTubeIE(InfoExtractor):
'age_limit': 18,
}
}, {
+ # FLV videos with duplicated formats
+ 'url': 'http://www.xtube.com/video-watch/A-Super-Run-Part-1-YT-9299752',
+ 'md5': 'a406963eb349dd43692ec54631efd88b',
+ 'info_dict': {
+ 'id': '9299752',
+ 'display_id': 'A-Super-Run-Part-1-YT',
+ 'ext': 'flv',
+ 'title': 'A Super Run - Part 1 (YT)',
+ 'description': 'md5:ca0d47afff4a9b2942e4b41aa970fd93',
+ 'uploader': 'tshirtguy59',
+ 'duration': 579,
+ 'view_count': int,
+ 'comment_count': int,
+ 'age_limit': 18,
+ },
+ }, {
# new URL schema
'url': 'http://www.xtube.com/video-watch/strange-erotica-625837',
'only_matching': True,
@@ -68,8 +85,9 @@ class XTubeIE(InfoExtractor):
})
sources = self._parse_json(self._search_regex(
- r'(["\'])sources\1\s*:\s*(?P<sources>{.+?}),',
- webpage, 'sources', group='sources'), video_id)
+ r'(["\'])?sources\1?\s*:\s*(?P<sources>{.+?}),',
+ webpage, 'sources', group='sources'), video_id,
+ transform_source=js_to_json)
formats = []
for format_id, format_url in sources.items():
@@ -78,6 +96,7 @@ class XTubeIE(InfoExtractor):
'format_id': format_id,
'height': int_or_none(format_id),
})
+ self._remove_duplicate_formats(formats)
self._sort_formats(formats)
title = self._search_regex(
diff --git a/youtube_dl/extractor/xvideos.py b/youtube_dl/extractor/xvideos.py
index 30825da..eca6030 100644
--- a/youtube_dl/extractor/xvideos.py
+++ b/youtube_dl/extractor/xvideos.py
@@ -6,8 +6,10 @@ from .common import InfoExtractor
from ..compat import compat_urllib_parse_unquote
from ..utils import (
clean_html,
- ExtractorError,
determine_ext,
+ ExtractorError,
+ int_or_none,
+ parse_duration,
)
@@ -20,6 +22,7 @@ class XVideosIE(InfoExtractor):
'id': '4588838',
'ext': 'mp4',
'title': 'Biker Takes his Girl',
+ 'duration': 108,
'age_limit': 18,
}
}
@@ -36,6 +39,11 @@ class XVideosIE(InfoExtractor):
r'<title>(.*?)\s+-\s+XVID', webpage, 'title')
video_thumbnail = self._search_regex(
r'url_bigthumb=(.+?)&amp', webpage, 'thumbnail', fatal=False)
+ video_duration = int_or_none(self._og_search_property(
+ 'duration', webpage, default=None)) or parse_duration(
+ self._search_regex(
+ r'<span[^>]+class=["\']duration["\'][^>]*>.*?(\d[^<]+)',
+ webpage, 'duration', fatal=False))
formats = []
@@ -67,6 +75,7 @@ class XVideosIE(InfoExtractor):
'id': video_id,
'formats': formats,
'title': video_title,
+ 'duration': video_duration,
'thumbnail': video_thumbnail,
'age_limit': 18,
}
diff --git a/youtube_dl/extractor/yahoo.py b/youtube_dl/extractor/yahoo.py
index 4951414..38f82bf 100644
--- a/youtube_dl/extractor/yahoo.py
+++ b/youtube_dl/extractor/yahoo.py
@@ -258,7 +258,7 @@ class YahooIE(InfoExtractor):
return self.url_result(bc_url, BrightcoveLegacyIE.ie_key())
# Look for Brightcove New Studio embeds
- bc_url = BrightcoveNewIE._extract_url(webpage)
+ bc_url = BrightcoveNewIE._extract_url(self, webpage)
if bc_url:
return self.url_result(bc_url, BrightcoveNewIE.ie_key())
diff --git a/youtube_dl/extractor/yandexmusic.py b/youtube_dl/extractor/yandexmusic.py
index fd6268b..eb10621 100644
--- a/youtube_dl/extractor/yandexmusic.py
+++ b/youtube_dl/extractor/yandexmusic.py
@@ -234,7 +234,8 @@ class YandexMusicPlaylistIE(YandexMusicPlaylistBaseIE):
'overembed': 'false',
})['playlist']
- tracks, track_ids = playlist['tracks'], map(compat_str, playlist['trackIds'])
+ tracks = playlist['tracks']
+ track_ids = [compat_str(track_id) for track_id in playlist['trackIds']]
# The tracks dictionary shipped with the playlist.jsx API is limited to 150
# tracks; missing tracks should be retrieved manually.
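
The `map()` replacement above matters on Python 3, where `map()` returns a one-shot iterator; `track_ids` is reused later when the missing tracks are fetched, so it has to be materialized. A two-line demonstration:

    track_ids = map(str, [1, 2, 3])
    list(track_ids)  # ['1', '2', '3']
    list(track_ids)  # [] on Python 3 - the iterator is already exhausted
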
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
index e37f237..73ebe57 100644
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -10,12 +10,14 @@ import time
from .common import InfoExtractor
from ..compat import (
- compat_urllib_parse_urlencode,
compat_ord,
+ compat_str,
+ compat_urllib_parse_urlencode,
)
from ..utils import (
ExtractorError,
get_element_by_attribute,
+ try_get,
)
@@ -105,7 +107,9 @@ class YoukuIE(InfoExtractor):
if stream.get('channel_type') == 'tail':
continue
format = stream.get('stream_type')
- fileid = stream['stream_fileid']
+ fileid = try_get(
+ stream, lambda x: x['segs'][0]['fileid'],
+ compat_str) or stream['stream_fileid']
fileid_dict[format] = fileid
def get_fileid(format, n):
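
`try_get()` keeps the fallible lookup above compact: it swallows the usual Attribute/Key/Type/IndexError and type-checks the result. With stream data shaped like the fabricated dict below, the per-segment file id wins and the legacy field is the fallback:

    from youtube_dl.compat import compat_str
    from youtube_dl.utils import try_get

    stream = {'segs': [{'fileid': 'abc123'}], 'stream_fileid': 'legacy'}
    fileid = try_get(
        stream, lambda x: x['segs'][0]['fileid'], compat_str) or stream['stream_fileid']
    # -> 'abc123'; if 'segs' is absent or malformed, -> 'legacy'
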
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index ca40de5..44a3928 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -38,7 +38,6 @@ from ..utils import (
parse_duration,
remove_quotes,
remove_start,
- sanitized_Request,
smuggle_url,
str_to_int,
try_get,
@@ -54,7 +53,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
"""Provide base functions for Youtube extractors"""
_LOGIN_URL = 'https://accounts.google.com/ServiceLogin'
_TWOFACTOR_URL = 'https://accounts.google.com/signin/challenge'
- _PASSWORD_CHALLENGE_URL = 'https://accounts.google.com/signin/challenge/sl/password'
+
+ _LOOKUP_URL = 'https://accounts.google.com/_/signin/sl/lookup'
+ _CHALLENGE_URL = 'https://accounts.google.com/_/signin/sl/challenge'
+ _TFA_URL = 'https://accounts.google.com/_/signin/challenge?hl=en&TL={0}'
+
_NETRC_MACHINE = 'youtube'
# If True it will raise an error if no login info is provided
_LOGIN_REQUIRED = False
@@ -96,72 +99,150 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
login_form = self._hidden_inputs(login_page)
- login_form.update({
- 'checkConnection': 'youtube',
- 'Email': username,
- 'Passwd': password,
- })
+ def req(url, f_req, note, errnote):
+ data = login_form.copy()
+ data.update({
+ 'pstMsg': 1,
+ 'checkConnection': 'youtube',
+ 'checkedDomains': 'youtube',
+ 'hl': 'en',
+ 'deviceinfo': '[null,null,null,[],null,"US",null,null,[],"GlifWebSignIn",null,[null,null,[]]]',
+ 'f.req': json.dumps(f_req),
+ 'flowName': 'GlifWebSignIn',
+ 'flowEntry': 'ServiceLogin',
+ })
+ return self._download_json(
+ url, None, note=note, errnote=errnote,
+ transform_source=lambda s: re.sub(r'^[^[]*', '', s),
+ fatal=False,
+ data=urlencode_postdata(data), headers={
+ 'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
+ 'Google-Accounts-XSRF': 1,
+ })
- login_results = self._download_webpage(
- self._PASSWORD_CHALLENGE_URL, None,
- note='Logging in', errnote='unable to log in', fatal=False,
- data=urlencode_postdata(login_form))
- if login_results is False:
- return False
+ def warn(message):
+ self._downloader.report_warning(message)
+
+ lookup_req = [
+ username,
+ None, [], None, 'US', None, None, 2, False, True,
+ [
+ None, None,
+ [2, 1, None, 1,
+ 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn',
+ None, [], 4],
+ 1, [None, None, []], None, None, None, True
+ ],
+ username,
+ ]
- error_msg = self._html_search_regex(
- r'<[^>]+id="errormsg_0_Passwd"[^>]*>([^<]+)<',
- login_results, 'error message', default=None)
- if error_msg:
- raise ExtractorError('Unable to login: %s' % error_msg, expected=True)
+ lookup_results = req(
+ self._LOOKUP_URL, lookup_req,
+ 'Looking up account info', 'Unable to look up account info')
- if re.search(r'id="errormsg_0_Passwd"', login_results) is not None:
- raise ExtractorError('Please use your account password and a two-factor code instead of an application-specific password.', expected=True)
+ if lookup_results is False:
+ return False
- # Two-Factor
- # TODO add SMS and phone call support - these require making a request and then prompting the user
+ user_hash = try_get(lookup_results, lambda x: x[0][2], compat_str)
+ if not user_hash:
+ warn('Unable to extract user hash')
+ return False
- if re.search(r'(?i)<form[^>]+id="challenge"', login_results) is not None:
- tfa_code = self._get_tfa_info('2-step verification code')
+ challenge_req = [
+ user_hash,
+ None, 1, None, [1, None, None, None, [password, None, True]],
+ [
+ None, None, [2, 1, None, 1, 'https://accounts.google.com/ServiceLogin?passive=true&continue=https%3A%2F%2Fwww.youtube.com%2Fsignin%3Fnext%3D%252F%26action_handle_signin%3Dtrue%26hl%3Den%26app%3Ddesktop%26feature%3Dsign_in_button&hl=en&service=youtube&uilel=3&requestPath=%2FServiceLogin&Page=PasswordSeparationSignIn', None, [], 4],
+ 1, [None, None, []], None, None, None, True
+ ]]
- if not tfa_code:
- self._downloader.report_warning(
- 'Two-factor authentication required. Provide it either interactively or with --twofactor <code>'
- '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
- return False
+ challenge_results = req(
+ self._CHALLENGE_URL, challenge_req,
+ 'Logging in', 'Unable to log in')
- tfa_code = remove_start(tfa_code, 'G-')
+ if challenge_results is False:
+ return False
- tfa_form_strs = self._form_hidden_inputs('challenge', login_results)
+ login_res = try_get(challenge_results, lambda x: x[0][5], list)
+ if login_res:
+ login_msg = try_get(login_res, lambda x: x[5], compat_str)
+ warn(
+ 'Unable to login: %s' % ('Invalid password'
+ if login_msg == 'INCORRECT_ANSWER_ENTERED' else login_msg))
+ return False
- tfa_form_strs.update({
- 'Pin': tfa_code,
- 'TrustDevice': 'on',
- })
+ res = try_get(challenge_results, lambda x: x[0][-1], list)
+ if not res:
+ warn('Unable to extract result entry')
+ return False
- tfa_data = urlencode_postdata(tfa_form_strs)
+ tfa = try_get(res, lambda x: x[0][0], list)
+ if tfa:
+ tfa_str = try_get(tfa, lambda x: x[2], compat_str)
+ if tfa_str == 'TWO_STEP_VERIFICATION':
+ # SEND_SUCCESS - TFA code has been successfully sent to phone
+ # QUOTA_EXCEEDED - reached the limit of TFA codes
+ status = try_get(tfa, lambda x: x[5], compat_str)
+ if status == 'QUOTA_EXCEEDED':
+ warn('Exceeded the limit of TFA codes, try later')
+ return False
+
+ tl = try_get(challenge_results, lambda x: x[1][2], compat_str)
+ if not tl:
+ warn('Unable to extract TL')
+ return False
+
+ tfa_code = self._get_tfa_info('2-step verification code')
+
+ if not tfa_code:
+ warn(
+ 'Two-factor authentication required. Provide it either interactively or with --twofactor <code> '
+ '(Note that only TOTP (Google Authenticator App) codes work at this time.)')
+ return False
+
+ tfa_code = remove_start(tfa_code, 'G-')
+
+ tfa_req = [
+ user_hash, None, 2, None,
+ [
+ 9, None, None, None, None, None, None, None,
+ [None, tfa_code, True, 2]
+ ]]
+
+ tfa_results = req(
+ self._TFA_URL.format(tl), tfa_req,
+ 'Submitting TFA code', 'Unable to submit TFA code')
+
+ if tfa_results is False:
+ return False
+
+ tfa_res = try_get(tfa_results, lambda x: x[0][5], list)
+ if tfa_res:
+ tfa_msg = try_get(tfa_res, lambda x: x[5], compat_str)
+ warn(
+ 'Unable to finish TFA: %s' % ('Invalid TFA code'
+ if tfa_msg == 'INCORRECT_ANSWER_ENTERED' else tfa_msg))
+ return False
+
+ check_cookie_url = try_get(
+ tfa_results, lambda x: x[0][-1][2], compat_str)
+ else:
+ check_cookie_url = try_get(res, lambda x: x[2], compat_str)
- tfa_req = sanitized_Request(self._TWOFACTOR_URL, tfa_data)
- tfa_results = self._download_webpage(
- tfa_req, None,
- note='Submitting TFA code', errnote='unable to submit tfa', fatal=False)
+ if not check_cookie_url:
+ warn('Unable to extract CheckCookie URL')
+ return False
- if tfa_results is False:
- return False
+ check_cookie_results = self._download_webpage(
+ check_cookie_url, None, 'Checking cookie', fatal=False)
- if re.search(r'(?i)<form[^>]+id="challenge"', tfa_results) is not None:
- self._downloader.report_warning('Two-factor code expired or invalid. Please try again, or use a one-use backup code instead.')
- return False
- if re.search(r'(?i)<form[^>]+id="gaia_loginform"', tfa_results) is not None:
- self._downloader.report_warning('unable to log in - did the page structure change?')
- return False
- if re.search(r'smsauth-interstitial-reviewsettings', tfa_results) is not None:
- self._downloader.report_warning('Your Google account has a security notice. Please log in on your web browser, resolve the notice, and try again.')
- return False
+ if check_cookie_results is False:
+ return False
- if re.search(r'(?i)<form[^>]+id="gaia_loginform"', login_results) is not None:
- self._downloader.report_warning('unable to log in: bad username or password')
+ if 'https://myaccount.google.com/' not in check_cookie_results:
+ warn('Unable to log in')
return False
+
return True
def _real_initialize(self):
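
The rewritten `_login` above talks to Google's newer JSON endpoints: each response is prefixed with an anti-XSSI guard, which the `transform_source` lambda strips before parsing, and the interesting values sit at fixed positions inside nested arrays. A compressed illustration with a fabricated response body:

    import json
    import re

    raw = ")]}'\n[[null,null,\"user-hash\"]]"
    payload = json.loads(re.sub(r'^[^[]*', '', raw))  # drop the )]}' guard
    user_hash = payload[0][2]  # -> 'user-hash', mirroring try_get(x[0][2]) above
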
@@ -317,60 +398,60 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'151': {'ext': 'mp4', 'height': 72, 'format_note': 'HLS', 'acodec': 'aac', 'abr': 24, 'vcodec': 'h264', 'preference': -10},
# DASH mp4 video
- '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
- '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
- '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
- '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
- '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
- '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
- '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
- '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
- '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
- '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},
- '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60, 'preference': -40},
- '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264', 'preference': -40},
+ '133': {'ext': 'mp4', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '134': {'ext': 'mp4', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '135': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '136': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '137': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '138': {'ext': 'mp4', 'format_note': 'DASH video', 'vcodec': 'h264'}, # Height can vary (https://github.com/rg3/youtube-dl/issues/4559)
+ '160': {'ext': 'mp4', 'height': 144, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '212': {'ext': 'mp4', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '264': {'ext': 'mp4', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'h264'},
+ '298': {'ext': 'mp4', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
+ '299': {'ext': 'mp4', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'h264', 'fps': 60},
+ '266': {'ext': 'mp4', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'h264'},
# Dash mp4 audio
- '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'preference': -50, 'container': 'm4a_dash'},
- '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'preference': -50, 'container': 'm4a_dash'},
- '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'preference': -50, 'container': 'm4a_dash'},
- '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'},
- '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'preference': -50, 'container': 'm4a_dash'},
- '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'preference': -50, 'container': 'm4a_dash'},
- '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'preference': -50, 'container': 'm4a_dash'},
+ '139': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 48, 'container': 'm4a_dash'},
+ '140': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 128, 'container': 'm4a_dash'},
+ '141': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'abr': 256, 'container': 'm4a_dash'},
+ '256': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
+ '258': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'aac', 'container': 'm4a_dash'},
+ '325': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'dtse', 'container': 'm4a_dash'},
+ '328': {'ext': 'm4a', 'format_note': 'DASH audio', 'acodec': 'ec-3', 'container': 'm4a_dash'},
# Dash webm
- '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
- '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
- '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
- '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
- '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
- '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8', 'preference': -40},
- '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9', 'preference': -40},
- '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
- '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
- '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
- '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
- '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
- '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
- '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
- '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
+ '167': {'ext': 'webm', 'height': 360, 'width': 640, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '168': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '169': {'ext': 'webm', 'height': 720, 'width': 1280, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '170': {'ext': 'webm', 'height': 1080, 'width': 1920, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '218': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '219': {'ext': 'webm', 'height': 480, 'width': 854, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp8'},
+ '278': {'ext': 'webm', 'height': 144, 'format_note': 'DASH video', 'container': 'webm', 'vcodec': 'vp9'},
+ '242': {'ext': 'webm', 'height': 240, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '243': {'ext': 'webm', 'height': 360, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '244': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '245': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '246': {'ext': 'webm', 'height': 480, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '247': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '248': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '271': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9'},
# itag 272 videos are either 3840x2160 (e.g. RtoitU2A-3E) or 7680x4320 (sLprVF6d7Ug)
- '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
- '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
- '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
- '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
- '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'preference': -40},
- '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60, 'preference': -40},
+ '272': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '302': {'ext': 'webm', 'height': 720, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+ '303': {'ext': 'webm', 'height': 1080, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+ '308': {'ext': 'webm', 'height': 1440, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
+ '313': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9'},
+ '315': {'ext': 'webm', 'height': 2160, 'format_note': 'DASH video', 'vcodec': 'vp9', 'fps': 60},
# Dash webm audio
- '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128, 'preference': -50},
- '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256, 'preference': -50},
+ '171': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 128},
+ '172': {'ext': 'webm', 'acodec': 'vorbis', 'format_note': 'DASH audio', 'abr': 256},
# Dash webm audio with opus inside
- '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50, 'preference': -50},
- '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70, 'preference': -50},
- '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160, 'preference': -50},
+ '249': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 50},
+ '250': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 70},
+ '251': {'ext': 'webm', 'format_note': 'DASH audio', 'acodec': 'opus', 'abr': 160},
# RTMP (unnamed)
'_rtmp': {'protocol': 'rtmp'},
@@ -963,7 +1044,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
def _extract_signature_function(self, video_id, player_url, example_sig):
id_m = re.match(
- r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|/base)?\.(?P<ext>[a-z]+)$',
+ r'.*?-(?P<id>[a-zA-Z0-9_-]+)(?:/watch_as3|/html5player(?:-new)?|(?:/[a-z]{2}_[A-Z]{2})?/base)?\.(?P<ext>[a-z]+)$',
player_url)
if not id_m:
raise ExtractorError('Cannot identify player %r' % player_url)
@@ -1253,25 +1334,39 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_id = mobj.group(2)
return video_id
- def _extract_from_m3u8(self, manifest_url, video_id):
- url_map = {}
-
- def _get_urls(_manifest):
- lines = _manifest.split('\n')
- urls = filter(lambda l: l and not l.startswith('#'),
- lines)
- return urls
- manifest = self._download_webpage(manifest_url, video_id, 'Downloading formats manifest')
- formats_urls = _get_urls(manifest)
- for format_url in formats_urls:
- itag = self._search_regex(r'itag/(\d+?)/', format_url, 'itag')
- url_map[itag] = format_url
- return url_map
-
def _extract_annotations(self, video_id):
url = 'https://www.youtube.com/annotations_invideo?features=1&legacy=1&video_id=%s' % video_id
return self._download_webpage(url, video_id, note='Searching for annotations.', errnote='Unable to download video annotations.')
+ @staticmethod
+ def _extract_chapters(description, duration):
+ if not description:
+ return None
+ chapter_lines = re.findall(
+ r'(?:^|<br\s*/>)([^<]*<a[^>]+onclick=["\']yt\.www\.watch\.player\.seekTo[^>]+>(\d{1,2}:\d{1,2}(?::\d{1,2})?)</a>[^>]*)(?=$|<br\s*/>)',
+ description)
+ if not chapter_lines:
+ return None
+ chapters = []
+ for next_num, (chapter_line, time_point) in enumerate(
+ chapter_lines, start=1):
+ start_time = parse_duration(time_point)
+ if start_time is None:
+ continue
+ end_time = (duration if next_num == len(chapter_lines)
+ else parse_duration(chapter_lines[next_num][1]))
+ if end_time is None:
+ continue
+ chapter_title = re.sub(
+ r'<a[^>]+>[^<]+</a>', '', chapter_line).strip(' \t-')
+ chapter_title = re.sub(r'\s+', ' ', chapter_title)
+ chapters.append({
+ 'start_time': start_time,
+ 'end_time': end_time,
+ 'title': chapter_title,
+ })
+ return chapters
+
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
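
What `_extract_chapters()` consumes and produces — the description markup below is a simplified imitation of YouTube's seekTo links, and the numbers are made up:

    from youtube_dl.extractor.youtube import YoutubeIE

    description = (
        'Tracklist:<br/>'
        '<a onclick="yt.www.watch.player.seekTo(0*60+00);" href="#">0:00</a> Intro<br/>'
        '<a onclick="yt.www.watch.player.seekTo(1*60+30);" href="#">1:30</a> Verse'
    )
    YoutubeIE._extract_chapters(description, duration=240)
    # -> [{'start_time': 0.0, 'end_time': 90.0, 'title': 'Intro'},
    #     {'start_time': 90.0, 'end_time': 240, 'title': 'Verse'}]
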
@@ -1414,9 +1509,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_title = '_'
# description
- video_description = get_element_by_id("eow-description", video_webpage)
+ description_original = video_description = get_element_by_id("eow-description", video_webpage)
if video_description:
- video_description = re.sub(r'''(?x)
+ description_original = video_description = re.sub(r'''(?x)
<a\s+
(?:[a-zA-Z-]+="[^"]*"\s+)*?
(?:title|href)="([^"]+)"\s+
@@ -1573,18 +1668,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if self._downloader.params.get('writeannotations', False):
video_annotations = self._extract_annotations(video_id)
- def _map_to_format_list(urlmap):
- formats = []
- for itag, video_real_url in urlmap.items():
- dct = {
- 'format_id': itag,
- 'url': video_real_url,
- 'player_url': player_url,
- }
- if itag in self._formats:
- dct.update(self._formats[itag])
- formats.append(dct)
- return formats
+ chapters = self._extract_chapters(description_original, video_duration)
if 'conn' in video_info and video_info['conn'][0].startswith('rtmp'):
self.report_rtmp_download()
@@ -1657,7 +1741,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
player_desc = 'flash player %s' % player_version
else:
player_version = self._search_regex(
- [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js', r'(?:www|player)-([^/]+)/base\.js'],
+ [r'html5player-([^/]+?)(?:/html5player(?:-new)?)?\.js',
+ r'(?:www|player)-([^/]+)(?:/[a-z]{2}_[A-Z]{2})?/base\.js'],
player_url,
'html5 player', fatal=False)
player_desc = 'html5 player %s' % player_version
@@ -1718,11 +1803,22 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
formats.append(dct)
elif video_info.get('hlsvp'):
manifest_url = video_info['hlsvp'][0]
- url_map = self._extract_from_m3u8(manifest_url, video_id)
- formats = _map_to_format_list(url_map)
- # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
- for a_format in formats:
+ formats = []
+ m3u8_formats = self._extract_m3u8_formats(
+ manifest_url, video_id, 'mp4', fatal=False)
+ for a_format in m3u8_formats:
+ itag = self._search_regex(
+ r'/itag/(\d+)/', a_format['url'], 'itag', default=None)
+ if itag:
+ a_format['format_id'] = itag
+ if itag in self._formats:
+ dct = self._formats[itag].copy()
+ dct.update(a_format)
+ a_format = dct
+ a_format['player_url'] = player_url
+ # Accept-Encoding header causes failures in live streams on Youtube and Youtube Gaming
a_format.setdefault('http_headers', {})['Youtubedl-no-compression'] = 'True'
+ formats.append(a_format)
else:
unavailable_message = self._html_search_regex(
r'(?s)<h1[^>]+id="unavailable-message"[^>]*>(.+?)</h1>',
@@ -1806,6 +1902,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
'duration': video_duration,
'age_limit': 18 if age_gate else 0,
'annotations': video_annotations,
+ 'chapters': chapters,
'webpage_url': proto + '://www.youtube.com/watch?v=%s' % video_id,
'view_count': view_count,
'like_count': like_count,
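
The HLS branch above now delegates manifest parsing to `_extract_m3u8_formats` and only re-attaches YouTube's static itag metadata afterwards. The itag round-trip in isolation (the URL is fabricated):

    import re

    fmt_url = 'https://manifest.googlevideo.com/api/manifest/hls_playlist/itag/137/more'
    itag = re.search(r'/itag/(\d+)/', fmt_url).group(1)  # -> '137'
    # self._formats['137'] then supplies {'ext': 'mp4', 'height': 1080, ...}
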
diff --git a/youtube_dl/extractor/zaq1.py b/youtube_dl/extractor/zaq1.py
new file mode 100644
index 0000000..889aff5
--- /dev/null
+++ b/youtube_dl/extractor/zaq1.py
@@ -0,0 +1,101 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ unified_timestamp,
+)
+
+
+class Zaq1IE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?zaq1\.pl/video/(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'http://zaq1.pl/video/xev0e',
+ 'md5': '24a5eb3f052e604ae597c4d0d19b351e',
+ 'info_dict': {
+ 'id': 'xev0e',
+ 'title': 'DJ NA WESELE. TANIEC Z FIGURAMI.węgrów/sokołów podlaski/siedlce/mińsk mazowiecki/warszawa',
+ 'description': 'www.facebook.com/weseledjKontakt: 728 448 199 / 505 419 147',
+ 'ext': 'mp4',
+ 'duration': 511,
+ 'timestamp': 1490896361,
+ 'uploader': 'Anonim',
+ 'upload_date': '20170330',
+ 'view_count': int,
+ }
+ }, {
+ # malformed JSON-LD
+ 'url': 'http://zaq1.pl/video/x81vn',
+ 'info_dict': {
+ 'id': 'x81vn',
+ 'title': 'SEKRETNE ŻYCIE WALTERA MITTY',
+ 'ext': 'mp4',
+ 'duration': 6234,
+ 'timestamp': 1493494860,
+ 'uploader': 'Anonim',
+ 'upload_date': '20170429',
+ 'view_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['Failed to parse JSON'],
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ video_url = self._search_regex(
+ r'data-video-url=(["\'])(?P<url>(?:(?!\1).)+)\1', webpage,
+ 'video url', group='url')
+
+ info = self._search_json_ld(webpage, video_id, fatal=False)
+
+ def extract_data(field, name, fatal=False):
+ return self._search_regex(
+ r'data-%s=(["\'])(?P<field>(?:(?!\1).)+)\1' % field,
+ webpage, name, fatal=fatal, group='field')
+
+ if not info.get('title'):
+ info['title'] = extract_data('file-name', 'title', fatal=True)
+
+ if not info.get('duration'):
+ info['duration'] = int_or_none(extract_data('duration', 'duration'))
+
+ if not info.get('thumbnail'):
+ info['thumbnail'] = extract_data('photo-url', 'thumbnail')
+
+ if not info.get('timestamp'):
+ info['timestamp'] = unified_timestamp(self._html_search_meta(
+ 'uploadDate', webpage, 'timestamp'))
+
+ if not info.get('view_count'):
+ info['view_count'] = int_or_none(self._html_search_meta(
+ 'interactionCount', webpage, 'view count'))
+
+ uploader = self._html_search_regex(
+ r'Wideo dodał:\s*<a[^>]*>([^<]+)</a>', webpage, 'uploader',
+ fatal=False)
+
+ width = int_or_none(self._html_search_meta(
+ 'width', webpage, fatal=False))
+ height = int_or_none(self._html_search_meta(
+ 'height', webpage, fatal=False))
+
+ info.update({
+ 'id': video_id,
+ 'formats': [{
+ 'url': video_url,
+ 'width': width,
+ 'height': height,
+ 'http_headers': {
+ 'Referer': url,
+ },
+ }],
+ 'uploader': uploader,
+ })
+
+ return info
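
The extractor above prefers JSON-LD metadata and backfills anything missing from `data-*` attributes. The fill-in pattern reduced to its essence (the page snippet and helper are illustrative):

    import re

    webpage = '<div data-video-url="http://example.invalid/v.mp4" data-duration="511"></div>'

    def extract_data(field):
        m = re.search(
            r'data-%s=(["\'])(?P<v>(?:(?!\1).)+)\1' % field, webpage)
        return m.group('v') if m else None

    info = {'duration': None}
    if not info.get('duration'):
        info['duration'] = int(extract_data('duration'))  # -> 511
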
diff --git a/youtube_dl/jsinterp.py b/youtube_dl/jsinterp.py
index 24cdec2..7bda596 100644
--- a/youtube_dl/jsinterp.py
+++ b/youtube_dl/jsinterp.py
@@ -6,6 +6,7 @@ import re
from .utils import (
ExtractorError,
+ remove_quotes,
)
_OPERATORS = [
@@ -57,7 +58,6 @@ class JSInterpreter(object):
def interpret_expression(self, expr, local_vars, allow_recursion):
expr = expr.strip()
-
if expr == '': # Empty expression
return None
@@ -121,11 +121,19 @@ class JSInterpreter(object):
pass
m = re.match(
- r'(?P<var>%s)\.(?P<member>[^(]+)(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE,
+ r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr)
+ if m:
+ val = local_vars[m.group('in')]
+ idx = self.interpret_expression(
+ m.group('idx'), local_vars, allow_recursion - 1)
+ return val[idx]
+
+ m = re.match(
+ r'(?P<var>%s)(?:\.(?P<member>[^(]+)|\[(?P<member2>[^]]+)\])\s*(?:\(+(?P<args>[^()]*)\))?$' % _NAME_RE,
expr)
if m:
variable = m.group('var')
- member = m.group('member')
+ member = remove_quotes(m.group('member') or m.group('member2'))
arg_str = m.group('args')
if variable in local_vars:
@@ -173,14 +181,6 @@ class JSInterpreter(object):
return obj[member](argvals)
- m = re.match(
- r'(?P<in>%s)\[(?P<idx>.+)\]$' % _NAME_RE, expr)
- if m:
- val = local_vars[m.group('in')]
- idx = self.interpret_expression(
- m.group('idx'), local_vars, allow_recursion - 1)
- return val[idx]
-
for op, opfunc in _OPERATORS:
m = re.match(r'(?P<x>.+?)%s(?P<y>.+)' % re.escape(op), expr)
if not m:
@@ -211,21 +211,25 @@ class JSInterpreter(object):
raise ExtractorError('Unsupported JS expression %r' % expr)
def extract_object(self, objname):
+ _FUNC_NAME_RE = r'''(?:[a-zA-Z$0-9]+|"[a-zA-Z$0-9]+"|'[a-zA-Z$0-9]+')'''
obj = {}
obj_m = re.search(
- (r'(?<!this\.)%s\s*=\s*\{' % re.escape(objname)) +
- r'\s*(?P<fields>([a-zA-Z$0-9]+\s*:\s*function\(.*?\)\s*\{.*?\}(?:,\s*)?)*)' +
- r'\}\s*;',
+ r'''(?x)
+ (?<!this\.)%s\s*=\s*{\s*
+ (?P<fields>(%s\s*:\s*function\s*\(.*?\)\s*{.*?}(?:,\s*)?)*)
+ }\s*;
+ ''' % (re.escape(objname), _FUNC_NAME_RE),
self.code)
fields = obj_m.group('fields')
# Currently, it only supports function definitions
fields_m = re.finditer(
- r'(?P<key>[a-zA-Z$0-9]+)\s*:\s*function'
- r'\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}',
+ r'''(?x)
+ (?P<key>%s)\s*:\s*function\s*\((?P<args>[a-z,]+)\){(?P<code>[^}]+)}
+ ''' % _FUNC_NAME_RE,
fields)
for f in fields_m:
argnames = f.group('args').split(',')
- obj[f.group('key')] = self.build_function(argnames, f.group('code'))
+ obj[remove_quotes(f.group('key'))] = self.build_function(argnames, f.group('code'))
return obj
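
Together, the bracket-notation branch and `remove_quotes()` let the interpreter cope with player code that addresses helper functions as `obj["name"]`. A self-contained example (the JS is fabricated but uses only constructs jsinterp supports):

    from youtube_dl.jsinterp import JSInterpreter

    code = (
        'var xy={"do":function(a){return a.reverse()}};'
        'function f(a){a=a.split("");a=xy["do"](a);return a.join("")}'
    )
    JSInterpreter(code).call_function('f', 'abc')  # -> 'cba'
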
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 6b81153..3021a6f 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -459,16 +459,20 @@ def parseOpts(overrideArguments=None):
downloader.add_option(
'--fragment-retries',
dest='fragment_retries', metavar='RETRIES', default=10,
- help='Number of retries for a fragment (default is %default), or "infinite" (DASH and hlsnative only)')
+ help='Number of retries for a fragment (default is %default), or "infinite" (DASH, hlsnative and ISM)')
downloader.add_option(
'--skip-unavailable-fragments',
action='store_true', dest='skip_unavailable_fragments', default=True,
- help='Skip unavailable fragments (DASH and hlsnative only)')
+ help='Skip unavailable fragments (DASH, hlsnative and ISM)')
downloader.add_option(
'--abort-on-unavailable-fragment',
action='store_false', dest='skip_unavailable_fragments',
help='Abort downloading when some fragment is not available')
downloader.add_option(
+ '--keep-fragments',
+ action='store_true', dest='keep_fragments', default=False,
+ help='Keep downloaded fragments on disk after downloading is finished; fragments are erased by default')
+ downloader.add_option(
'--buffer-size',
dest='buffersize', metavar='SIZE', default='1024',
help='Size of download buffer (e.g. 1024 or 16K) (default is %default)')
@@ -810,11 +814,12 @@ def parseOpts(overrideArguments=None):
'--metadata-from-title',
metavar='FORMAT', dest='metafromtitle',
help='Parse additional metadata like song title / artist from the video title. '
- 'The format syntax is the same as --output, '
- 'the parsed parameters replace existing values. '
- 'Additional templates: %(album)s, %(artist)s. '
+ 'The format syntax is the same as --output. A regular expression with '
+ 'named capture groups may also be used. '
+ 'The parsed parameters replace existing values. '
'Example: --metadata-from-title "%(artist)s - %(title)s" matches a title like '
- '"Coldplay - Paradise"')
+ '"Coldplay - Paradise". '
+ 'Example (regex): --metadata-from-title "(?P<artist>.+?) - (?P<title>.+)"')
postproc.add_option(
'--xattrs',
action='store_true', dest='xattrs', default=False,
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index 7c162d9..c91ec85 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -4,6 +4,7 @@ import io
import os
import subprocess
import time
+import re
from .common import AudioConversionError, PostProcessor
@@ -22,6 +23,7 @@ from ..utils import (
subtitles_filename,
dfxp2srt,
ISO639Utils,
+ replace_extension,
)
@@ -429,17 +431,40 @@ class FFmpegMetadataPP(FFmpegPostProcessor):
filename = info['filepath']
temp_filename = prepend_extension(filename, 'temp')
+ in_filenames = [filename]
+ options = []
if info['ext'] == 'm4a':
- options = ['-vn', '-acodec', 'copy']
+ options.extend(['-vn', '-acodec', 'copy'])
else:
- options = ['-c', 'copy']
+ options.extend(['-c', 'copy'])
for (name, value) in metadata.items():
options.extend(['-metadata', '%s=%s' % (name, value)])
+ chapters = info.get('chapters', [])
+ if chapters:
+ metadata_filename = encodeFilename(replace_extension(filename, 'meta'))
+ with io.open(metadata_filename, 'wt', encoding='utf-8') as f:
+ def ffmpeg_escape(text):
+ return re.sub(r'(=|;|#|\\|\n)', r'\\\1', text)
+
+ metadata_file_content = ';FFMETADATA1\n'
+ for chapter in chapters:
+ metadata_file_content += '[CHAPTER]\nTIMEBASE=1/1000\n'
+ metadata_file_content += 'START=%d\n' % (chapter['start_time'] * 1000)
+ metadata_file_content += 'END=%d\n' % (chapter['end_time'] * 1000)
+ chapter_title = chapter.get('title')
+ if chapter_title:
+ metadata_file_content += 'title=%s\n' % ffmpeg_escape(chapter_title)
+ f.write(metadata_file_content)
+ in_filenames.append(metadata_filename)
+ options.extend(['-map_metadata', '1'])
+
self._downloader.to_screen('[ffmpeg] Adding metadata to \'%s\'' % filename)
- self.run_ffmpeg(filename, temp_filename, options)
+ self.run_ffmpeg_multiple_files(in_filenames, temp_filename, options)
+ if chapters:
+ os.remove(metadata_filename)
os.remove(encodeFilename(filename))
os.rename(encodeFilename(temp_filename), encodeFilename(filename))
return [], info
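
The side file written above follows ffmpeg's FFMETADATA1 format; for a video with two chapters it would look roughly like this (timestamps illustrative), and `-map_metadata 1` tells ffmpeg to take chapter metadata from that second input:

    ;FFMETADATA1
    [CHAPTER]
    TIMEBASE=1/1000
    START=0
    END=90000
    title=Intro
    [CHAPTER]
    TIMEBASE=1/1000
    START=90000
    END=240000
    title=Verse
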
@@ -552,7 +577,7 @@ class FFmpegSubtitlesConvertorPP(FFmpegPostProcessor):
sub_filenames.append(old_file)
new_file = subtitles_filename(filename, lang, new_ext)
- if ext == 'dfxp' or ext == 'ttml' or ext == 'tt':
+ if ext in ('dfxp', 'ttml', 'tt'):
self._downloader.report_warning(
'You have requested to convert dfxp (TTML) subtitles into another format, '
'which results in style information loss')
diff --git a/youtube_dl/postprocessor/metadatafromtitle.py b/youtube_dl/postprocessor/metadatafromtitle.py
index 164edd3..c73f024 100644
--- a/youtube_dl/postprocessor/metadatafromtitle.py
+++ b/youtube_dl/postprocessor/metadatafromtitle.py
@@ -9,7 +9,9 @@ class MetadataFromTitlePP(PostProcessor):
def __init__(self, downloader, titleformat):
super(MetadataFromTitlePP, self).__init__(downloader)
self._titleformat = titleformat
- self._titleregex = self.format_to_regex(titleformat)
+ self._titleregex = (self.format_to_regex(titleformat)
+ if re.search(r'%\(\w+\)s', titleformat)
+ else titleformat)
def format_to_regex(self, fmt):
r"""
@@ -26,7 +28,7 @@ class MetadataFromTitlePP(PostProcessor):
regex += r'(?P<' + match.group(1) + '>.+)'
lastpos = match.end()
if lastpos < len(fmt):
- regex += re.escape(fmt[lastpos:len(fmt)])
+ regex += re.escape(fmt[lastpos:])
return regex
def run(self, info):
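
With the change above, --metadata-from-title accepts either the `%(field)s` template (compiled by `format_to_regex`) or a raw regex with named groups; both extract the same fields from a title like 'Coldplay - Paradise' (sketch):

    import re

    title = 'Coldplay - Paradise'
    # equivalent to the template '%(artist)s - %(title)s' after format_to_regex():
    m = re.match(r'(?P<artist>.+) \- (?P<title>.+)', title)
    m.group('artist'), m.group('title')  # -> ('Coldplay', 'Paradise')
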
diff --git a/youtube_dl/socks.py b/youtube_dl/socks.py
index 0f5d7bd..5d4adbe 100644
--- a/youtube_dl/socks.py
+++ b/youtube_dl/socks.py
@@ -193,9 +193,10 @@ class sockssocket(socket.socket):
self._check_response_version(SOCKS5_VERSION, version)
- if method == Socks5Auth.AUTH_NO_ACCEPTABLE:
+ if method == Socks5Auth.AUTH_NO_ACCEPTABLE or (
+ method == Socks5Auth.AUTH_USER_PASS and (not self._proxy.username or not self._proxy.password)):
self.close()
- raise Socks5Error(method)
+ raise Socks5Error(Socks5Auth.AUTH_NO_ACCEPTABLE)
if method == Socks5Auth.AUTH_USER_PASS:
username = self._proxy.username.encode('utf-8')
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 2340bc3..4293a77 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -11,6 +11,7 @@ import contextlib
import ctypes
import datetime
import email.utils
+import email.header
import errno
import functools
import gzip
@@ -421,8 +422,8 @@ def clean_html(html):
# Newline vs <br />
html = html.replace('\n', ' ')
- html = re.sub(r'\s*<\s*br\s*/?\s*>\s*', '\n', html)
- html = re.sub(r'<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
+ html = re.sub(r'(?u)\s*<\s*br\s*/?\s*>\s*', '\n', html)
+ html = re.sub(r'(?u)<\s*/\s*p\s*>\s*<\s*p[^>]*>', '\n', html)
# Strip html tags
html = re.sub('<.*?>', '', html)
# Replace html entities
@@ -1194,6 +1195,11 @@ def unified_timestamp(date_str, day_first=True):
# Remove AM/PM + timezone
date_str = re.sub(r'(?i)\s*(?:AM|PM)(?:\s+[A-Z]+)?', '', date_str)
+ # Remove unrecognized timezones from ISO 8601-like timestamps
+ m = re.search(r'\d{1,2}:\d{1,2}(?:\.\d+)?(?P<tz>\s*[A-Z]+)$', date_str)
+ if m:
+ date_str = date_str[:-len(m.group('tz'))]
+
for expression in date_formats(day_first):
try:
dt = datetime.datetime.strptime(date_str, expression) - timezone + datetime.timedelta(hours=pm_delta)
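
Effect of the timezone-stripping hunk above: an ISO-8601-style stamp with a named zone no longer falls through every format. `unified_timestamp` is the real utility; the input string is an example, and the dropped zone means the value is interpreted as UTC:

    from youtube_dl.utils import unified_timestamp

    unified_timestamp('2017-04-29 20:21:00 CEST')
    # 'CEST' is stripped and the rest parses; previously such strings
    # failed every format and the function returned None
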
@@ -2092,6 +2098,58 @@ def update_Request(req, url=None, data=None, headers={}, query={}):
return new_req
+def _multipart_encode_impl(data, boundary):
+ content_type = 'multipart/form-data; boundary=%s' % boundary
+
+ out = b''
+ for k, v in data.items():
+ out += b'--' + boundary.encode('ascii') + b'\r\n'
+ if isinstance(k, compat_str):
+ k = k.encode('utf-8')
+ if isinstance(v, compat_str):
+ v = v.encode('utf-8')
+ # RFC 2047 requires non-ASCII field names to be encoded, while RFC 7578
+ # suggests sending UTF-8 directly. Firefox sends UTF-8, too
+ content = b'Content-Disposition: form-data; name="' + k + b'"\r\n\r\n' + v + b'\r\n'
+ if boundary.encode('ascii') in content:
+ raise ValueError('Boundary overlaps with data')
+ out += content
+
+ out += b'--' + boundary.encode('ascii') + b'--\r\n'
+
+ return out, content_type
+
+
+def multipart_encode(data, boundary=None):
+ '''
+ Encode a dict to RFC 7578-compliant form-data
+
+ data:
+ A dict where keys and values can be either Unicode or bytes-like
+ objects.
+ boundary:
+ If specified, it must be a Unicode object and is used as the boundary.
+ Otherwise a random boundary is generated.
+
+ Reference: https://tools.ietf.org/html/rfc7578
+ '''
+ has_specified_boundary = boundary is not None
+
+ while True:
+ if boundary is None:
+ boundary = '---------------' + str(random.randrange(0x0fffffff, 0xffffffff))
+
+ try:
+ out, content_type = _multipart_encode_impl(data, boundary)
+ break
+ except ValueError:
+ if has_specified_boundary:
+ raise
+ boundary = None
+
+ return out, content_type
+
+
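
Usage sketch for `multipart_encode()`, with the boundary pinned for reproducibility:

    from youtube_dl.utils import multipart_encode

    body, content_type = multipart_encode(
        {'username': 'user', 'password': 'secret'}, boundary='xxx')
    # content_type -> 'multipart/form-data; boundary=xxx'
    # body is the RFC 7578 payload; send it with that Content-Type header
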
def dict_get(d, key_or_keys, default=None, skip_false_values=True):
if isinstance(key_or_keys, (list, tuple)):
for key in key_or_keys:
@@ -2103,13 +2161,16 @@ def dict_get(d, key_or_keys, default=None, skip_false_values=True):
def try_get(src, getter, expected_type=None):
- try:
- v = getter(src)
- except (AttributeError, KeyError, TypeError, IndexError):
- pass
- else:
- if expected_type is None or isinstance(v, expected_type):
- return v
+ if not isinstance(getter, (list, tuple)):
+ getter = [getter]
+ for get in getter:
+ try:
+ v = get(src)
+ except (AttributeError, KeyError, TypeError, IndexError):
+ pass
+ else:
+ if expected_type is None or isinstance(v, expected_type):
+ return v
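
`try_get()` now also accepts a list of getters, tried in order until one survives; handy when a site ships two JSON layouts:

    from youtube_dl.utils import try_get

    data = {'new': {'id': 42}}
    try_get(data, [lambda x: x['old']['id'], lambda x: x['new']['id']], int)  # -> 42
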
def encode_compat_str(string, encoding=preferredencoding(), errors='strict'):
@@ -2270,10 +2331,8 @@ def mimetype2ext(mt):
return {
'3gpp': '3gp',
'smptett+xml': 'tt',
- 'srt': 'srt',
'ttaf+xml': 'dfxp',
'ttml+xml': 'ttml',
- 'vtt': 'vtt',
'x-flv': 'flv',
'x-mp4-fragmented': 'mp4',
'x-ms-wmv': 'wmv',
@@ -2281,11 +2340,11 @@ def mimetype2ext(mt):
'x-mpegurl': 'm3u8',
'vnd.apple.mpegurl': 'm3u8',
'dash+xml': 'mpd',
- 'f4m': 'f4m',
'f4m+xml': 'f4m',
'hds+xml': 'f4m',
'vnd.ms-sstr+xml': 'ism',
'quicktime': 'mov',
+ 'mp2t': 'ts',
}.get(res, res)
@@ -2301,11 +2360,11 @@ def parse_codecs(codecs_str):
if codec in ('avc1', 'avc2', 'avc3', 'avc4', 'vp9', 'vp8', 'hev1', 'hev2', 'h263', 'h264', 'mp4v'):
if not vcodec:
vcodec = full_codec
- elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3'):
+ elif codec in ('mp4a', 'opus', 'vorbis', 'mp3', 'aac', 'ac-3', 'ec-3', 'eac3', 'dtsc', 'dtse', 'dtsh', 'dtsl'):
if not acodec:
acodec = full_codec
else:
- write_string('WARNING: Unknown codec %s' % full_codec, sys.stderr)
+ write_string('WARNING: Unknown codec %s\n' % full_codec, sys.stderr)
if not vcodec and not acodec:
if len(splited_codecs) == 2:
return {
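
The widened audio-codec whitelist above in action (codec strings illustrative):

    from youtube_dl.utils import parse_codecs

    parse_codecs('avc1.64001f,mp4a.40.2')
    # -> {'vcodec': 'avc1.64001f', 'acodec': 'mp4a.40.2'}
    parse_codecs('ec-3')
    # -> {'vcodec': 'none', 'acodec': 'ec-3'} instead of an unknown-codec warning
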
@@ -2508,27 +2567,97 @@ def srt_subtitles_timecode(seconds):
def dfxp2srt(dfxp_data):
+ LEGACY_NAMESPACES = (
+ ('http://www.w3.org/ns/ttml', [
+ 'http://www.w3.org/2004/11/ttaf1',
+ 'http://www.w3.org/2006/04/ttaf1',
+ 'http://www.w3.org/2006/10/ttaf1',
+ ]),
+ ('http://www.w3.org/ns/ttml#styling', [
+ 'http://www.w3.org/ns/ttml#style',
+ ]),
+ )
+
+ SUPPORTED_STYLING = [
+ 'color',
+ 'fontFamily',
+ 'fontSize',
+ 'fontStyle',
+ 'fontWeight',
+ 'textDecoration'
+ ]
+
_x = functools.partial(xpath_with_ns, ns_map={
'ttml': 'http://www.w3.org/ns/ttml',
- 'ttaf1': 'http://www.w3.org/2006/10/ttaf1',
- 'ttaf1_0604': 'http://www.w3.org/2006/04/ttaf1',
+ 'tts': 'http://www.w3.org/ns/ttml#styling',
})
+ styles = {}
+ default_style = {}
+
class TTMLPElementParser(object):
- out = ''
+ def __init__(self):
+ self._out = ''
+ self._unclosed_elements = []
+ self._applied_styles = []
def start(self, tag, attrib):
- if tag in (_x('ttml:br'), _x('ttaf1:br'), 'br'):
- self.out += '\n'
+ if tag in (_x('ttml:br'), 'br'):
+ self._out += '\n'
+ else:
+ unclosed_elements = []
+ style = {}
+ element_style_id = attrib.get('style')
+ if default_style:
+ style.update(default_style)
+ if element_style_id:
+ style.update(styles.get(element_style_id, {}))
+ for prop in SUPPORTED_STYLING:
+ prop_val = attrib.get(_x('tts:' + prop))
+ if prop_val:
+ style[prop] = prop_val
+ if style:
+ font = ''
+ for k, v in sorted(style.items()):
+ if self._applied_styles and self._applied_styles[-1].get(k) == v:
+ continue
+ if k == 'color':
+ font += ' color="%s"' % v
+ elif k == 'fontSize':
+ font += ' size="%s"' % v
+ elif k == 'fontFamily':
+ font += ' face="%s"' % v
+ elif k == 'fontWeight' and v == 'bold':
+ self._out += '<b>'
+ unclosed_elements.append('b')
+ elif k == 'fontStyle' and v == 'italic':
+ self._out += '<i>'
+ unclosed_elements.append('i')
+ elif k == 'textDecoration' and v == 'underline':
+ self._out += '<u>'
+ unclosed_elements.append('u')
+ if font:
+ self._out += '<font' + font + '>'
+ unclosed_elements.append('font')
+ applied_style = {}
+ if self._applied_styles:
+ applied_style.update(self._applied_styles[-1])
+ applied_style.update(style)
+ self._applied_styles.append(applied_style)
+ self._unclosed_elements.append(unclosed_elements)
def end(self, tag):
- pass
+ if tag not in (_x('ttml:br'), 'br'):
+ unclosed_elements = self._unclosed_elements.pop()
+ for element in reversed(unclosed_elements):
+ self._out += '</%s>' % element
+ if unclosed_elements and self._applied_styles:
+ self._applied_styles.pop()
def data(self, data):
- self.out += data
+ self._out += data
def close(self):
- return self.out.strip()
+ return self._out.strip()
def parse_node(node):
target = TTMLPElementParser()
@@ -2536,13 +2665,45 @@ def dfxp2srt(dfxp_data):
parser.feed(xml.etree.ElementTree.tostring(node))
return parser.close()
+ for k, v in LEGACY_NAMESPACES:
+ for ns in v:
+ dfxp_data = dfxp_data.replace(ns, k)
+
dfxp = compat_etree_fromstring(dfxp_data.encode('utf-8'))
out = []
- paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall(_x('.//ttaf1:p')) or dfxp.findall(_x('.//ttaf1_0604:p')) or dfxp.findall('.//p')
+ paras = dfxp.findall(_x('.//ttml:p')) or dfxp.findall('.//p')
if not paras:
raise ValueError('Invalid dfxp/TTML subtitle')
+ repeat = False
+ while True:
+ for style in dfxp.findall(_x('.//ttml:style')):
+ style_id = style.get('id')
+ parent_style_id = style.get('style')
+ if parent_style_id:
+ if parent_style_id not in styles:
+ repeat = True
+ continue
+ styles[style_id] = styles[parent_style_id].copy()
+ for prop in SUPPORTED_STYLING:
+ prop_val = style.get(_x('tts:' + prop))
+ if prop_val:
+ styles.setdefault(style_id, {})[prop] = prop_val
+ if repeat:
+ repeat = False
+ else:
+ break
+
+ for p in ('body', 'div'):
+ ele = xpath_element(dfxp, [_x('.//ttml:' + p), './/' + p])
+ if ele is None:
+ continue
+ style = styles.get(ele.get('style'))
+ if not style:
+ continue
+ default_style.update(style)
+
for para, index in zip(paras, itertools.count(1)):
begin_time = parse_dfxp_time_expr(para.attrib.get('begin'))
end_time = parse_dfxp_time_expr(para.attrib.get('end'))
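
End-to-end effect of the styling support: a minimal (fabricated) TTML document now converts with inline markup preserved:

    from youtube_dl.utils import dfxp2srt

    dfxp = (
        '<tt xmlns="http://www.w3.org/ns/ttml"'
        ' xmlns:tts="http://www.w3.org/ns/ttml#styling">'
        '<body><div>'
        '<p begin="00:00:01.000" end="00:00:02.000" tts:fontStyle="italic">Hello</p>'
        '</div></body></tt>'
    )
    print(dfxp2srt(dfxp))
    # 1
    # 00:00:01,000 --> 00:00:02,000
    # <i>Hello</i>
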
@@ -3652,3 +3813,11 @@ def write_xattr(path, key, value):
"Couldn't find a tool to set the xattrs. "
"Install either the python 'xattr' module, "
"or the 'xattr' binary.")
+
+
+def random_birthday(year_field, month_field, day_field):
+ return {
+ year_field: str(random.randint(1950, 1995)),
+ month_field: str(random.randint(1, 12)),
+ day_field: str(random.randint(1, 31)),
+ }
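
`random_birthday()` generates plausible date-of-birth form fields for age-gated sites; the field names follow whatever the target form expects:

    from youtube_dl.utils import random_birthday

    random_birthday('birth_year', 'birth_month', 'birth_day')
    # e.g. {'birth_year': '1987', 'birth_month': '4', 'birth_day': '12'}
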
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index 94e8198..5e963e7 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals
-__version__ = '2017.03.26'
+__version__ = '2017.05.18.1'