aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorWolfgang Wiedmeyer <wolfgit@wiedmeyer.de>2017-09-12 13:14:44 +0200
committerWolfgang Wiedmeyer <wolfgit@wiedmeyer.de>2017-09-12 13:14:44 +0200
commit6a19bfa390150fde01f5e865605114344ab68409 (patch)
treed87e9c55f031b529d6a6c2ca1a1e680407800f52
parent57e25a9c8a4791a370af77a3529d15c958eae4b3 (diff)
parent2f483758bc6a6661f1215c38161ee626d90ab655 (diff)
downloadyoutube-dl-6a19bfa390150fde01f5e865605114344ab68409.zip
youtube-dl-6a19bfa390150fde01f5e865605114344ab68409.tar.gz
youtube-dl-6a19bfa390150fde01f5e865605114344ab68409.tar.bz2
Merge branch 'upstream'
-rw-r--r--.github/ISSUE_TEMPLATE.md16
-rw-r--r--.github/ISSUE_TEMPLATE_tmpl.md12
-rw-r--r--AUTHORS4
-rw-r--r--CONTRIBUTING.md16
-rw-r--r--ChangeLog292
-rw-r--r--Makefile11
-rw-r--r--README.md58
-rw-r--r--docs/supportedsites.md39
-rw-r--r--test/test_InfoExtractor.py86
-rw-r--r--test/test_YoutubeDL.py37
-rw-r--r--test/test_options.py26
-rw-r--r--test/test_utils.py18
-rw-r--r--test/testdata/mpd/float_duration.mpd18
-rwxr-xr-xyoutube_dl/YoutubeDL.py87
-rw-r--r--youtube_dl/downloader/common.py6
-rw-r--r--youtube_dl/downloader/dash.py14
-rw-r--r--youtube_dl/downloader/hls.py6
-rw-r--r--youtube_dl/downloader/http.py347
-rw-r--r--youtube_dl/downloader/ism.py3
-rw-r--r--youtube_dl/extractor/abc.py17
-rw-r--r--youtube_dl/extractor/abcnews.py7
-rw-r--r--youtube_dl/extractor/adn.py5
-rw-r--r--youtube_dl/extractor/aliexpress.py53
-rw-r--r--youtube_dl/extractor/amcnetworks.py8
-rw-r--r--youtube_dl/extractor/animeondemand.py62
-rw-r--r--youtube_dl/extractor/aparat.py49
-rw-r--r--youtube_dl/extractor/ard.py14
-rw-r--r--youtube_dl/extractor/arte.py12
-rw-r--r--youtube_dl/extractor/audioboom.py2
-rw-r--r--youtube_dl/extractor/bandcamp.py7
-rw-r--r--youtube_dl/extractor/bbc.py8
-rw-r--r--youtube_dl/extractor/bpb.py13
-rw-r--r--youtube_dl/extractor/cbc.py12
-rwxr-xr-xyoutube_dl/extractor/cda.py2
-rw-r--r--youtube_dl/extractor/charlierose.py5
-rw-r--r--youtube_dl/extractor/chilloutzone.py9
-rw-r--r--youtube_dl/extractor/cinchcast.py14
-rw-r--r--youtube_dl/extractor/cjsw.py72
-rw-r--r--youtube_dl/extractor/clipfish.py67
-rw-r--r--youtube_dl/extractor/clippit.py74
-rw-r--r--youtube_dl/extractor/cloudy.py6
-rw-r--r--youtube_dl/extractor/common.py73
-rw-r--r--youtube_dl/extractor/cracked.py7
-rw-r--r--youtube_dl/extractor/crunchyroll.py4
-rw-r--r--youtube_dl/extractor/dailymail.py17
-rw-r--r--youtube_dl/extractor/dailymotion.py6
-rw-r--r--youtube_dl/extractor/dispeak.py6
-rw-r--r--youtube_dl/extractor/dplay.py66
-rw-r--r--youtube_dl/extractor/dramafever.py17
-rw-r--r--youtube_dl/extractor/drtv.py5
-rw-r--r--youtube_dl/extractor/eagleplatform.py61
-rw-r--r--youtube_dl/extractor/egghead.py71
-rw-r--r--youtube_dl/extractor/espn.py48
-rw-r--r--youtube_dl/extractor/extractors.py48
-rw-r--r--youtube_dl/extractor/fivetv.py5
-rw-r--r--youtube_dl/extractor/fourtube.py174
-rw-r--r--youtube_dl/extractor/fox.py125
-rw-r--r--youtube_dl/extractor/funnyordie.py64
-rw-r--r--youtube_dl/extractor/generic.py236
-rw-r--r--youtube_dl/extractor/giantbomb.py14
-rw-r--r--youtube_dl/extractor/googledrive.py239
-rw-r--r--youtube_dl/extractor/itv.py8
-rwxr-xr-xyoutube_dl/extractor/joj.py100
-rw-r--r--youtube_dl/extractor/kaltura.py2
-rw-r--r--youtube_dl/extractor/karrierevideos.py2
-rw-r--r--youtube_dl/extractor/laola1tv.py18
-rw-r--r--youtube_dl/extractor/limelight.py19
-rw-r--r--youtube_dl/extractor/liveleak.py78
-rw-r--r--youtube_dl/extractor/manyvids.py48
-rw-r--r--youtube_dl/extractor/megaphone.py55
-rw-r--r--youtube_dl/extractor/mixcloud.py48
-rw-r--r--youtube_dl/extractor/mlb.py8
-rw-r--r--youtube_dl/extractor/mpora.py62
-rw-r--r--youtube_dl/extractor/mtv.py31
-rw-r--r--youtube_dl/extractor/nexx.py271
-rw-r--r--youtube_dl/extractor/nick.py24
-rw-r--r--youtube_dl/extractor/niconico.py296
-rw-r--r--youtube_dl/extractor/npo.py9
-rw-r--r--youtube_dl/extractor/nrk.py2
-rw-r--r--youtube_dl/extractor/pbs.py20
-rw-r--r--youtube_dl/extractor/pearvideo.py63
-rw-r--r--youtube_dl/extractor/periscope.py22
-rw-r--r--youtube_dl/extractor/pluralsight.py53
-rw-r--r--youtube_dl/extractor/podomatic.py63
-rw-r--r--youtube_dl/extractor/pornhd.py5
-rw-r--r--youtube_dl/extractor/pornhub.py20
-rw-r--r--youtube_dl/extractor/qqmusic.py160
-rw-r--r--youtube_dl/extractor/radiocanada.py53
-rw-r--r--youtube_dl/extractor/rai.py4
-rw-r--r--youtube_dl/extractor/reddit.py114
-rw-r--r--youtube_dl/extractor/redtube.py21
-rw-r--r--youtube_dl/extractor/rutube.py185
-rw-r--r--youtube_dl/extractor/slideshare.py2
-rw-r--r--youtube_dl/extractor/soundcloud.py176
-rw-r--r--youtube_dl/extractor/spiegel.py30
-rw-r--r--youtube_dl/extractor/spiegeltv.py113
-rw-r--r--youtube_dl/extractor/sportbox.py61
-rw-r--r--youtube_dl/extractor/svt.py3
-rw-r--r--youtube_dl/extractor/tbs.py9
-rw-r--r--youtube_dl/extractor/teamfourstar.py48
-rw-r--r--youtube_dl/extractor/ted.py30
-rw-r--r--youtube_dl/extractor/thisoldhouse.py19
-rw-r--r--youtube_dl/extractor/toutv.py5
-rw-r--r--youtube_dl/extractor/twentymin.py2
-rw-r--r--youtube_dl/extractor/twitter.py174
-rw-r--r--youtube_dl/extractor/udemy.py49
-rw-r--r--youtube_dl/extractor/veoh.py73
-rw-r--r--youtube_dl/extractor/vh1.py12
-rw-r--r--youtube_dl/extractor/vice.py7
-rw-r--r--youtube_dl/extractor/vidio.py3
-rw-r--r--youtube_dl/extractor/vidme.py60
-rw-r--r--youtube_dl/extractor/vier.py47
-rw-r--r--youtube_dl/extractor/viidea.py19
-rw-r--r--youtube_dl/extractor/vine.py6
-rw-r--r--youtube_dl/extractor/vk.py7
-rw-r--r--youtube_dl/extractor/vlive.py62
-rw-r--r--youtube_dl/extractor/voot.py98
-rw-r--r--youtube_dl/extractor/vzaar.py8
-rw-r--r--youtube_dl/extractor/watchbox.py151
-rw-r--r--youtube_dl/extractor/xhamster.py96
-rw-r--r--youtube_dl/extractor/xuite.py88
-rw-r--r--youtube_dl/extractor/xxxymovies.py4
-rw-r--r--youtube_dl/extractor/yam.py123
-rw-r--r--youtube_dl/extractor/yandexdisk.py118
-rw-r--r--youtube_dl/extractor/youjizz.py78
-rw-r--r--youtube_dl/extractor/youku.py78
-rw-r--r--youtube_dl/extractor/youtube.py102
-rw-r--r--youtube_dl/options.py38
-rw-r--r--youtube_dl/postprocessor/ffmpeg.py2
-rw-r--r--youtube_dl/utils.py12
-rw-r--r--youtube_dl/version.py2
131 files changed, 5001 insertions, 1778 deletions
diff --git a/.github/ISSUE_TEMPLATE.md b/.github/ISSUE_TEMPLATE.md
index 82bbbda..f40cb2c 100644
--- a/.github/ISSUE_TEMPLATE.md
+++ b/.github/ISSUE_TEMPLATE.md
@@ -1,16 +1,16 @@
## Please follow the guide below
- You will be asked some questions and requested to provide some information, please read them **carefully** and answer honestly
-- Put an `x` into all the boxes [ ] relevant to your *issue* (like that [x])
-- Use *Preview* tab to see how your issue will actually look like
+- Put an `x` into all the boxes [ ] relevant to your *issue* (like this: `[x]`)
+- Use the *Preview* tab to see what your issue will actually look like
---
-### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.06.25*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
-- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.06.25**
+### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *2017.09.11*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
+- [ ] I've **verified** and **I assure** that I'm running youtube-dl **2017.09.11**
### Before submitting an *issue* make sure you have:
-- [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
+- [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
- [ ] [Searched](https://github.com/rg3/youtube-dl/search?type=Issues) the bugtracker for similar issues including closed ones
### What is the purpose of your *issue*?
@@ -28,14 +28,14 @@
### If the purpose of this *issue* is a *bug report*, *site support request* or you are not completely sure provide the full verbose output as follows:
-Add `-v` flag to **your command line** you run youtube-dl with, copy the **whole** output and insert it here. It should look similar to one below (replace it with **your** log inserted between triple ```):
+Add the `-v` flag to the **command line** you run youtube-dl with (`youtube-dl -v <your command line>`), copy the **whole** output and insert it here. It should look similar to the one below (replace it with **your** log inserted between triple ```):
+
```
-$ youtube-dl -v <your command line>
[debug] System config: []
[debug] User config: []
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
-[debug] youtube-dl version 2017.06.25
+[debug] youtube-dl version 2017.09.11
[debug] Python version 2.7.11 - Windows-2003Server-5.2.3790-SP2
[debug] exe versions: ffmpeg N-75573-g1d0487f, ffprobe N-75573-g1d0487f, rtmpdump 2.4
[debug] Proxy map: {}
diff --git a/.github/ISSUE_TEMPLATE_tmpl.md b/.github/ISSUE_TEMPLATE_tmpl.md
index df79503..26f61d3 100644
--- a/.github/ISSUE_TEMPLATE_tmpl.md
+++ b/.github/ISSUE_TEMPLATE_tmpl.md
@@ -1,16 +1,16 @@
## Please follow the guide below
- You will be asked some questions and requested to provide some information, please read them **carefully** and answer honestly
-- Put an `x` into all the boxes [ ] relevant to your *issue* (like that [x])
-- Use *Preview* tab to see how your issue will actually look like
+- Put an `x` into all the boxes [ ] relevant to your *issue* (like this: `[x]`)
+- Use the *Preview* tab to see what your issue will actually look like
---
-### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *%(version)s*. If it's not read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
+### Make sure you are using the *latest* version: run `youtube-dl --version` and ensure your version is *%(version)s*. If it's not, read [this FAQ entry](https://github.com/rg3/youtube-dl/blob/master/README.md#how-do-i-update-youtube-dl) and update. Issues with outdated version will be rejected.
- [ ] I've **verified** and **I assure** that I'm running youtube-dl **%(version)s**
### Before submitting an *issue* make sure you have:
-- [ ] At least skimmed through [README](https://github.com/rg3/youtube-dl/blob/master/README.md) and **most notably** [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
+- [ ] At least skimmed through the [README](https://github.com/rg3/youtube-dl/blob/master/README.md), **most notably** the [FAQ](https://github.com/rg3/youtube-dl#faq) and [BUGS](https://github.com/rg3/youtube-dl#bugs) sections
- [ ] [Searched](https://github.com/rg3/youtube-dl/search?type=Issues) the bugtracker for similar issues including closed ones
### What is the purpose of your *issue*?
@@ -28,9 +28,9 @@
### If the purpose of this *issue* is a *bug report*, *site support request* or you are not completely sure provide the full verbose output as follows:
-Add `-v` flag to **your command line** you run youtube-dl with, copy the **whole** output and insert it here. It should look similar to one below (replace it with **your** log inserted between triple ```):
+Add the `-v` flag to the **command line** you run youtube-dl with (`youtube-dl -v <your command line>`), copy the **whole** output and insert it here. It should look similar to the one below (replace it with **your** log inserted between triple ```):
+
```
-$ youtube-dl -v <your command line>
[debug] System config: []
[debug] User config: []
[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
diff --git a/AUTHORS b/AUTHORS
index e2bdebe..478c787 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -220,3 +220,7 @@ gritstub
Adam Voss
Mike Fährmann
Jan Kundrát
+Giuseppe Fabiano
+Örn Guðjónsson
+Parmjit Virk
+Genki Sky
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index d606eab..333acee 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -3,7 +3,7 @@
$ youtube-dl -v <your command line>
[debug] System config: []
[debug] User config: []
-[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
+[debug] Command-line args: [u'-v', u'https://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
[debug] youtube-dl version 2015.12.06
[debug] Git HEAD: 135392e
@@ -34,7 +34,7 @@ For bug reports, this means that your report should contain the *complete* outpu
If your server has multiple IPs or you suspect censorship, adding `--call-home` may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/).
-**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `http://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `http://www.youtube.com/`) is *not* an example URL.
+**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `https://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `https://www.youtube.com/`) is *not* an example URL.
### Are you using the latest version?
@@ -70,7 +70,7 @@ It may sound strange, but some bug reports we receive are completely unrelated t
# DEVELOPER INSTRUCTIONS
-Most users do not need to build youtube-dl and can [download the builds](http://rg3.github.io/youtube-dl/download.html) or get them from their distribution.
+Most users do not need to build youtube-dl and can [download the builds](https://rg3.github.io/youtube-dl/download.html) or get them from their distribution.
To run youtube-dl as a developer, you don't need to build anything either. Simply execute
@@ -82,6 +82,8 @@ To run the test, simply invoke your favorite test runner, or execute a test file
python test/test_download.py
nosetests
+See item 6 of the [new extractor tutorial](#adding-support-for-a-new-site) for how to run extractor-specific test cases.
+
If you want to create a build of youtube-dl yourself, you'll need
* python
@@ -118,7 +120,7 @@ After you have ensured this site is distributing its content legally, you can fo
class YourExtractorIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'
_TEST = {
- 'url': 'http://yourextractor.com/watch/42',
+ 'url': 'https://yourextractor.com/watch/42',
'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
'info_dict': {
'id': '42',
@@ -149,10 +151,10 @@ After you have ensured this site is distributing its content legally, you can fo
}
```
5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py).
-6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc.
+6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with the `only_matching` key in the test's dict are not counted.
7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want.
-8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+.
-9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
+8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+.
+9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this:
$ git add youtube_dl/extractor/extractors.py
$ git add youtube_dl/extractor/yourextractor.py
diff --git a/ChangeLog b/ChangeLog
index 7b3c6c6..c286da6 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,296 @@
-version <unreleased>
+version 2017.09.11
Extractors
+* [rutube:playlist] Fix suitable (#14166)
+
+
+version 2017.09.10
+
+Core
++ [utils] Introduce bool_or_none
+* [YoutubeDL] Ensure dir existence for each requested format (#14116)
+
+Extractors
+* [fox] Fix extraction (#14147)
+* [rutube] Use bool_or_none
+* [rutube] Rework and generalize playlist extractors (#13565)
++ [rutube:playlist] Add support for playlists (#13534, #13565)
++ [radiocanada] Add fallback for title extraction (#14145)
+* [vk] Use dedicated YouTube embeds extraction routine
+* [vice] Use dedicated YouTube embeds extraction routine
+* [cracked] Use dedicated YouTube embeds extraction routine
+* [chilloutzone] Use dedicated YouTube embeds extraction routine
+* [abcnews] Use dedicated YouTube embeds extraction routine
+* [youtube] Separate methods for embeds extraction
+* [redtube] Fix formats extraction (#14122)
+* [arte] Relax unavailability check (#14112)
++ [manyvids] Add support for preview videos from manyvids.com (#14053, #14059)
+* [vidme:user] Relax URL regular expression (#14054)
+* [bpb] Fix extraction (#14043, #14086)
+* [soundcloud] Fix download URL with private tracks (#14093)
+* [aliexpress:live] Add support for live.aliexpress.com (#13698, #13707)
+* [viidea] Capture and output lecture error message (#14099)
+* [radiocanada] Skip unsupported platforms (#14100)
+
+
+version 2017.09.02
+
+Extractors
+* [youtube] Force old layout for each webpage (#14068, #14072, #14074, #14076,
+ #14077, #14079, #14082, #14083, #14094, #14095, #14096)
+* [youtube] Fix upload date extraction (#14065)
++ [charlierose] Add support for episodes (#14062)
++ [bbccouk] Add support for w-prefixed ids (#14056)
+* [googledrive] Extend URL regular expression (#9785)
++ [googledrive] Add support for source format (#14046)
+* [pornhd] Fix extraction (#14005)
+
+
+version 2017.08.27.1
+
+Extractors
+
+* [youtube] Fix extraction with --youtube-skip-dash-manifest enabled (#14037)
+
+
+version 2017.08.27
+
+Core
++ [extractor/common] Extract height and format id for HTML5 videos (#14034)
+* [downloader/http] Rework HTTP downloader (#506, #809, #2849, #4240, #6023,
+ #8625, #9483)
+ * Simplify code and split into separate routines to facilitate maintaining
+ * Make retry mechanism work on errors during actual download not only
+ during connection establishment phase
+ * Retry on ECONNRESET and ETIMEDOUT during reading data from network
+ * Retry on content too short
+ * Show error description on retry
+
+Extractors
+* [generic] Lower preference for extraction from LD-JSON
+* [rai] Fix audio formats extraction (#14024)
+* [youtube] Fix controversy videos extraction (#14027, #14029)
+* [mixcloud] Fix extraction (#14015, #14020)
+
+
+version 2017.08.23
+
+Core
++ [extractor/common] Introduce _parse_xml
+* [extractor/common] Make HLS and DASH extraction in _parse_html5_media_entries
+ non fatal (#13970)
+* [utils] Fix unescapeHTML for malformed strings like "&a&quot;" (#13935)
+
+Extractors
+* [cbc:watch] Bypass geo restriction (#13993)
+* [toutv] Relax DRM check (#13994)
++ [googledrive] Add support for subtitles (#13619, #13638)
+* [pornhub] Relax uploader regular expression (#13906, #13975)
+* [bandcamp:album] Extract track titles (#13962)
++ [bbccouk] Add support for events URLs (#13893)
++ [liveleak] Support multi-video pages (#6542)
++ [liveleak] Support another liveleak embedding pattern (#13336)
+* [cda] Fix extraction (#13935)
++ [laola1tv] Add support for tv.ittf.com (#13965)
+* [mixcloud] Fix extraction (#13958, #13974, #13980, #14003)
+
+
+version 2017.08.18
+
+Core
+* [YoutubeDL] Sanitize byte string format URLs (#13951)
++ [extractor/common] Add support for float durations in _parse_mpd_formats
+ (#13919)
+
+Extractors
+* [arte] Detect unavailable videos (#13945)
+* [generic] Convert redirect URLs to unicode strings (#13951)
+* [udemy] Fix paid course detection (#13943)
+* [pluralsight] Use RPC API for course extraction (#13937)
++ [clippit] Add support for clippituser.tv
++ [qqmusic] Support new URL schemes (#13805)
+* [periscope] Renew HLS extraction (#13917)
+* [mixcloud] Extract decrypt key
+
+
+version 2017.08.13
+
+Core
+* [YoutubeDL] Make sure format id is not empty
+* [extractor/common] Make _family_friendly_search optional
+* [extractor/common] Respect source's type attribute for HTML5 media (#13892)
+
+Extractors
+* [pornhub:playlistbase] Skip videos from drop-down menu (#12819, #13902)
++ [fourtube] Add support for pornerbros.com (#6022)
++ [fourtube] Add support for porntube.com (#7859, #13901)
++ [fourtube] Add support for fux.com
+* [limelight] Improve embeds detection (#13895)
++ [reddit] Add support for v.redd.it and reddit.com (#13847)
+* [aparat] Extract all formats (#13887)
+* [mixcloud] Fix play info decryption (#13885)
++ [generic] Add support for vzaar embeds (#13876)
+
+
+version 2017.08.09
+
+Core
+* [utils] Skip missing params in cli_bool_option (#13865)
+
+Extractors
+* [xxxymovies] Fix title extraction (#13868)
++ [nick] Add support for nick.com.pl (#13860)
+* [mixcloud] Fix play info decryption (#13867)
+* [20min] Fix embeds extraction (#13852)
+* [dplayit] Fix extraction (#13851)
++ [niconico] Support videos with multiple formats (#13522)
++ [niconico] Support HTML5-only videos (#13806)
+
+
+version 2017.08.06
+
+Core
+* Use relative paths for DASH fragments (#12990)
+
+Extractors
+* [pluralsight] Fix format selection
+- [mpora] Remove extractor (#13826)
++ [voot] Add support for voot.com (#10255, #11644, #11814, #12350, #13218)
+* [vlive:channel] Limit number of videos per page to 100 (#13830)
+* [podomatic] Extend URL regular expression (#13827)
+* [cinchcast] Extend URL regular expression
+* [yandexdisk] Relax URL regular expression (#13824)
+* [vidme] Extract DASH and HLS formats
+- [teamfour] Remove extractor (#13782)
+* [pornhd] Fix extraction (#13783)
+* [udemy] Fix subtitles extraction (#13812)
+* [mlb] Extend URL regular expression (#13740, #13773)
++ [pbs] Add support for new URL schema (#13801)
+* [nrktv] Update API host (#13796)
+
+
+version 2017.07.30.1
+
+Core
+* [downloader/hls] Use redirect URL as manifest base (#13755)
+* [options] Correctly hide login info from debug outputs (#13696)
+
+Extractors
++ [watchbox] Add support for watchbox.de (#13739)
+- [clipfish] Remove extractor
++ [youjizz] Fix extraction (#13744)
++ [generic] Add support for another ooyala embed pattern (#13727)
++ [ard] Add support for lives (#13771)
+* [soundcloud] Update client id
++ [soundcloud:trackstation] Add support for track stations (#13733)
+* [svtplay] Use geo verification proxy for API request
+* [svtplay] Update API URL (#13767)
++ [yandexdisk] Add support for yadi.sk (#13755)
++ [megaphone] Add support for megaphone.fm
+* [amcnetworks] Make rating optional (#12453)
+* [cloudy] Fix extraction (#13737)
++ [nickru] Add support for nickelodeon.ru
+* [mtv] Improve thumbnail extraction
+* [nick] Automate geo-restriction bypass (#13711)
+* [niconico] Improve error reporting (#13696)
+
+
+version 2017.07.23
+
+Core
+* [YoutubeDL] Improve default format specification (#13704)
+* [YoutubeDL] Do not override id, extractor and extractor_key for
+ url_transparent entities
+* [extractor/common] Fix playlist_from_matches
+
+Extractors
+* [itv] Fix production id extraction (#13671, #13703)
+* [vidio] Make duration non fatal and fix typo
+* [mtv] Skip missing video parts (#13690)
+* [sportbox:embed] Fix extraction
++ [npo] Add support for npo3.nl URLs (#13695)
+* [dramafever] Remove video id from title (#13699)
++ [egghead:lesson] Add support for lessons (#6635)
+* [funnyordie] Extract more metadata (#13677)
+* [youku:show] Fix playlist extraction (#13248)
++ [dispeak] Recognize sevt subdomain (#13276)
+* [adn] Improve error reporting (#13663)
+* [crunchyroll] Relax series and season regular expression (#13659)
++ [spiegel:article] Add support for nexx iframe embeds (#13029)
++ [nexx:embed] Add support for iframe embeds
+* [nexx] Improve JS embed extraction
++ [pearvideo] Add support for pearvideo.com (#13031)
+
+
+version 2017.07.15
+
+Core
+* [YoutubeDL] Don't expand environment variables in meta fields (#13637)
+
+Extractors
+* [spiegeltv] Delegate extraction to nexx extractor (#13159)
++ [nexx] Add support for nexx.cloud (#10807, #13465)
+* [generic] Fix rutube embeds extraction (#13641)
+* [karrierevideos] Fix title extraction (#13641)
+* [youtube] Don't capture YouTube Red ad for creator meta field (#13621)
+* [slideshare] Fix extraction (#13617)
++ [5tv] Add another video URL pattern (#13354, #13606)
+* [drtv] Make HLS and HDS extraction non fatal
+* [ted] Fix subtitles extraction (#13628, #13629)
+* [vine] Make sure the title won't be empty
++ [twitter] Support HLS streams in vmap URLs
++ [periscope] Support pscp.tv URLs in embedded frames
+* [twitter] Extract mp4 urls via mobile API (#12726)
+* [niconico] Fix authentication error handling (#12486)
+* [giantbomb] Extract m3u8 formats (#13626)
++ [vlive:playlist] Add support for playlists (#13613)
+
+
+version 2017.07.09
+
+Core
++ [extractor/common] Add support for AMP tags in _parse_html5_media_entries
++ [utils] Support attributes with no values in get_elements_by_attribute
+
+Extractors
++ [dailymail] Add support for embeds
++ [joj] Add support for joj.sk (#13268)
+* [abc.net.au:iview] Extract more formats (#13492, #13489)
+* [egghead:course] Fix extraction (#6635, #13370)
++ [cjsw] Add support for cjsw.com (#13525)
++ [eagleplatform] Add support for referrer protected videos (#13557)
++ [eagleplatform] Add support for another embed pattern (#13557)
+* [veoh] Extend URL regular expression (#13601)
+* [npo:live] Fix live stream id extraction (#13568, #13605)
+* [googledrive] Fix height extraction (#13603)
++ [dailymotion] Add support for new layout (#13580)
+- [yam] Remove extractor
+* [xhamster] Extract all formats and fix duration extraction (#13593)
++ [xhamster] Add support for new URL schema (#13593)
+* [espn] Extend URL regular expression (#13244, #13549)
+* [kaltura] Fix typo in subtitles extraction (#13569)
+* [vier] Adapt extraction to redesign (#13575)
+
+
+version 2017.07.02
+
+Core
+* [extractor/common] Improve _json_ld
+
+Extractors
++ [thisoldhouse] Add more fallbacks for video id
+* [thisoldhouse] Fix video id extraction (#13540, #13541)
+* [xfileshare] Extend format regular expression (#13536)
+* [ted] Fix extraction (#13535)
++ [tastytrade] Add support for tastytrade.com (#13521)
+* [dplayit] Relax video id regular expression (#13524)
++ [generic] Extract more generic metadata (#13527)
++ [bbccouk] Capture and output error message (#13501, #13518)
+* [cbsnews] Relax video info regular expression (#13284, #13503)
++ [facebook] Add support for plugin video embeds and multiple embeds (#13493)
+* [soundcloud] Switch to https for API requests (#13502)
+* [pandatv] Switch to https for API and download URLs
++ [pandatv] Add support for https URLs (#13491)
+ [niconico] Support sp subdomain (#13494)
diff --git a/Makefile b/Makefile
index 84ccce2..c74eea7 100644
--- a/Makefile
+++ b/Makefile
@@ -46,8 +46,15 @@ tar: youtube-dl.tar.gz
pypi-files: youtube-dl.bash-completion README.txt youtube-dl.1 youtube-dl.fish
youtube-dl: youtube_dl/*.py youtube_dl/*/*.py
- zip --quiet youtube-dl youtube_dl/*.py youtube_dl/*/*.py
- zip --quiet --junk-paths youtube-dl youtube_dl/__main__.py
+ mkdir -p zip
+ for d in youtube_dl youtube_dl/downloader youtube_dl/extractor youtube_dl/postprocessor ; do \
+ mkdir -p zip/$$d ;\
+ cp -pPR $$d/*.py zip/$$d/ ;\
+ done
+ touch -t 200001010101 zip/youtube_dl/*.py zip/youtube_dl/*/*.py
+ mv zip/youtube_dl/__main__.py zip/
+ cd zip ; zip -q ../youtube-dl youtube_dl/*.py youtube_dl/*/*.py __main__.py
+ rm -rf zip
echo '#!$(PYTHON)' > youtube-dl
cat youtube-dl.zip >> youtube-dl
rm youtube-dl.zip
diff --git a/README.md b/README.md
index fe2bebc..28ee63f 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ If you do not have curl, you can alternatively use a recent wget:
sudo wget https://yt-dl.org/downloads/latest/youtube-dl -O /usr/local/bin/youtube-dl
sudo chmod a+rx /usr/local/bin/youtube-dl
-Windows users can [download an .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in any location on their [PATH](http://en.wikipedia.org/wiki/PATH_%28variable%29) except for `%SYSTEMROOT%\System32` (e.g. **do not** put in `C:\Windows\System32`).
+Windows users can [download an .exe file](https://yt-dl.org/latest/youtube-dl.exe) and place it in any location on their [PATH](https://en.wikipedia.org/wiki/PATH_%28variable%29) except for `%SYSTEMROOT%\System32` (e.g. **do not** put in `C:\Windows\System32`).
You can also use pip:
@@ -33,7 +33,7 @@ You can also use pip:
This command will update youtube-dl if you have already installed it. See the [pypi page](https://pypi.python.org/pypi/youtube_dl) for more information.
-OS X users can install youtube-dl with [Homebrew](http://brew.sh/):
+OS X users can install youtube-dl with [Homebrew](https://brew.sh/):
brew install youtube-dl
@@ -458,7 +458,7 @@ You can also use `--config-location` if you want to use custom configuration fil
### Authentication with `.netrc` file
-You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](http://stackoverflow.com/tags/.netrc/info) on a per extractor basis. For that you will need to create a `.netrc` file in your `$HOME` and restrict permissions to read/write by only you:
+You may also want to configure automatic credentials storage for extractors that support authentication (by providing login and password with `--username` and `--password`) in order not to pass credentials as command line arguments on every youtube-dl execution and prevent tracking plain text passwords in the shell command history. You can achieve this using a [`.netrc` file](https://stackoverflow.com/tags/.netrc/info) on a per extractor basis. For that you will need to create a `.netrc` file in your `$HOME` and restrict permissions to read/write by only you:
```
touch $HOME/.netrc
chmod a-rwx,u+rw $HOME/.netrc
@@ -485,7 +485,7 @@ The `-o` option allows users to indicate a template for the output file names.
**tl;dr:** [navigate me to examples](#output-template-examples).
-The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "http://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by a formatting operations. Allowed names along with sequence type are:
+The basic usage is not to set any template arguments when downloading a single file, like in `youtube-dl -o funny_video.flv "https://some/video"`. However, it may contain special sequences that will be replaced when downloading each video. The special sequences may be formatted according to [python string formatting operations](https://docs.python.org/2/library/stdtypes.html#string-formatting). For example, `%(NAME)s` or `%(NAME)05d`. To clarify, that is a percent symbol followed by a name in parentheses, followed by formatting operations. Allowed names along with sequence type are:
- `id` (string): Video identifier
- `title` (string): Video title
@@ -584,7 +584,7 @@ If you are using an output template inside a Windows batch file then you must es
#### Output template examples
-Note on Windows you may need to use double quotes instead of single.
+Note that on Windows you may need to use double quotes instead of single.
```bash
$ youtube-dl --get-filename -o '%(title)s.%(ext)s' BaW_jenozKc
@@ -603,7 +603,7 @@ $ youtube-dl -o '%(uploader)s/%(playlist)s/%(playlist_index)s - %(title)s.%(ext)
$ youtube-dl -u user -p password -o '~/MyVideos/%(playlist)s/%(chapter_number)s - %(chapter)s/%(title)s.%(ext)s' https://www.udemy.com/java-tutorial/
# Download entire series season keeping each series and each season in separate directory under C:/MyVideos
-$ youtube-dl -o "C:/MyVideos/%(series)s/%(season_number)s - %(season)s/%(episode_number)s - %(episode)s.%(ext)s" http://videomore.ru/kino_v_detalayah/5_sezon/367617
+$ youtube-dl -o "C:/MyVideos/%(series)s/%(season_number)s - %(season)s/%(episode_number)s - %(episode)s.%(ext)s" https://videomore.ru/kino_v_detalayah/5_sezon/367617
# Stream the video being downloaded to stdout
$ youtube-dl -o - BaW_jenozKc
@@ -671,7 +671,7 @@ If you want to preserve the old format selection behavior (prior to youtube-dl 2
#### Format selection examples
-Note on Windows you may need to use double quotes instead of single.
+Note that on Windows you may need to use double quotes instead of single.
```bash
# Download best mp4 format available or any other best if no mp4 available
@@ -716,17 +716,17 @@ $ youtube-dl --dateafter 20000101 --datebefore 20091231
### How do I update youtube-dl?
-If you've followed [our manual installation instructions](http://rg3.github.io/youtube-dl/download.html), you can simply run `youtube-dl -U` (or, on Linux, `sudo youtube-dl -U`).
+If you've followed [our manual installation instructions](https://rg3.github.io/youtube-dl/download.html), you can simply run `youtube-dl -U` (or, on Linux, `sudo youtube-dl -U`).
If you have used pip, a simple `sudo pip install -U youtube-dl` is sufficient to update.
-If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to http://yt-dl.org/ to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distribution serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum.
+If you have installed youtube-dl using a package manager like *apt-get* or *yum*, use the standard system update mechanism to update. Note that distribution packages are often outdated. As a rule of thumb, youtube-dl releases at least once a month, and often weekly or even daily. Simply go to https://yt-dl.org to find out the current version. Unfortunately, there is nothing we youtube-dl developers can do if your distribution serves a really outdated version. You can (and should) complain to your distribution in their bugtracker or support forum.
As a last resort, you can also uninstall the version installed by your package manager and follow our manual installation instructions. For that, remove the distribution's package, with a line like
sudo apt-get remove -y youtube-dl
-Afterwards, simply follow [our manual installation instructions](http://rg3.github.io/youtube-dl/download.html):
+Afterwards, simply follow [our manual installation instructions](https://rg3.github.io/youtube-dl/download.html):
```
sudo wget https://yt-dl.org/latest/youtube-dl -O /usr/local/bin/youtube-dl
@@ -766,11 +766,11 @@ Apparently YouTube requires you to pass a CAPTCHA test if you download too much.
youtube-dl works fine on its own on most sites. However, if you want to convert video/audio, you'll need [avconv](https://libav.org/) or [ffmpeg](https://www.ffmpeg.org/). On some sites - most notably YouTube - videos can be retrieved in a higher quality format without sound. youtube-dl will detect whether avconv/ffmpeg is present and automatically pick the best option.
-Videos or video formats streamed via RTMP protocol can only be downloaded when [rtmpdump](https://rtmpdump.mplayerhq.hu/) is installed. Downloading MMS and RTSP videos requires either [mplayer](http://mplayerhq.hu/) or [mpv](https://mpv.io/) to be installed.
+Videos or video formats streamed via RTMP protocol can only be downloaded when [rtmpdump](https://rtmpdump.mplayerhq.hu/) is installed. Downloading MMS and RTSP videos requires either [mplayer](https://mplayerhq.hu/) or [mpv](https://mpv.io/) to be installed.
### I have downloaded a video but how can I play it?
-Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](http://www.videolan.org/) or [mplayer](http://www.mplayerhq.hu/).
+Once the video is fully downloaded, use any video player, such as [mpv](https://mpv.io/), [vlc](https://www.videolan.org/) or [mplayer](https://www.mplayerhq.hu/).
### I extracted a video URL with `-g`, but it does not play on another machine / in my web browser.
@@ -845,10 +845,10 @@ Use the `-o` to specify an [output template](#output-template), for example `-o
### How do I download a video starting with a `-`?
-Either prepend `http://www.youtube.com/watch?v=` or separate the ID from the options with `--`:
+Either prepend `https://www.youtube.com/watch?v=` or separate the ID from the options with `--`:
youtube-dl -- -wNyEUrxzFU
- youtube-dl "http://www.youtube.com/watch?v=-wNyEUrxzFU"
+ youtube-dl "https://www.youtube.com/watch?v=-wNyEUrxzFU"
### How do I pass cookies to youtube-dl?
@@ -862,9 +862,9 @@ Passing cookies to youtube-dl is a good way to workaround login when a particula
### How do I stream directly to media player?
-You will first need to tell youtube-dl to stream media to stdout with `-o -`, and also tell your media player to read from stdin (it must be capable of this for streaming) and then pipe former to latter. For example, streaming to [vlc](http://www.videolan.org/) can be achieved with:
+You will first need to tell youtube-dl to stream media to stdout with `-o -`, and also tell your media player to read from stdin (it must be capable of this for streaming) and then pipe the former to the latter. For example, streaming to [vlc](https://www.videolan.org/) can be achieved with:
- youtube-dl -o - "http://www.youtube.com/watch?v=BaW_jenozKcj" | vlc -
+ youtube-dl -o - "https://www.youtube.com/watch?v=BaW_jenozKcj" | vlc -
### How do I download only new videos from a playlist?
@@ -884,7 +884,7 @@ When youtube-dl detects an HLS video, it can download it either with the built-i
When youtube-dl knows that one particular downloader works better for a given website, that downloader will be picked. Otherwise, youtube-dl will pick the best downloader for general compatibility, which at the moment happens to be ffmpeg. This choice may change in future versions of youtube-dl, with improvements of the built-in downloader and/or ffmpeg.
-In particular, the generic extractor (used when your website is not in the [list of supported sites by youtube-dl](http://rg3.github.io/youtube-dl/supportedsites.html) cannot mandate one specific downloader.
+In particular, the generic extractor (used when your website is not in the [list of supported sites by youtube-dl](https://rg3.github.io/youtube-dl/supportedsites.html)) cannot mandate one specific downloader.
If you put either `--hls-prefer-native` or `--hls-prefer-ffmpeg` into your configuration, a different subset of videos will fail to download correctly. Instead, it is much better to [file an issue](https://yt-dl.org/bug) or a pull request which details why the native or the ffmpeg HLS downloader is a better choice for your use case.
@@ -910,7 +910,7 @@ Feel free to bump the issue from time to time by writing a small comment ("Issue
### How can I detect whether a given URL is supported by youtube-dl?
-For one, have a look at the [list of supported sites](docs/supportedsites.md). Note that it can sometimes happen that the site changes its URL scheme (say, from http://example.com/video/1234567 to http://example.com/v/1234567 ) and youtube-dl reports an URL of a service in that list as unsupported. In that case, simply report a bug.
+For one, have a look at the [list of supported sites](docs/supportedsites.md). Note that it can sometimes happen that the site changes its URL scheme (say, from https://example.com/video/1234567 to https://example.com/v/1234567 ) and youtube-dl reports a URL of a service in that list as unsupported. In that case, simply report a bug.
It is *not* possible to detect whether a URL is supported or not. That's because youtube-dl contains a generic extractor which matches **all** URLs. You may be tempted to disable, exclude, or remove the generic extractor, but the generic extractor not only allows users to extract videos from lots of websites that embed a video from another service, but may also be used to extract video from a service that it's hosting itself. Therefore, we neither recommend nor support disabling, excluding, or removing the generic extractor.
@@ -924,7 +924,7 @@ youtube-dl is an open-source project manned by too few volunteers, so we'd rathe
# DEVELOPER INSTRUCTIONS
-Most users do not need to build youtube-dl and can [download the builds](http://rg3.github.io/youtube-dl/download.html) or get them from their distribution.
+Most users do not need to build youtube-dl and can [download the builds](https://rg3.github.io/youtube-dl/download.html) or get them from their distribution.
To run youtube-dl as a developer, you don't need to build anything either. Simply execute
@@ -936,6 +936,8 @@ To run the test, simply invoke your favorite test runner, or execute a test file
python test/test_download.py
nosetests
+See item 6 of the [new extractor tutorial](#adding-support-for-a-new-site) for how to run extractor-specific test cases.
+
If you want to create a build of youtube-dl yourself, you'll need
* python
@@ -972,7 +974,7 @@ After you have ensured this site is distributing its content legally, you can fo
class YourExtractorIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?yourextractor\.com/watch/(?P<id>[0-9]+)'
_TEST = {
- 'url': 'http://yourextractor.com/watch/42',
+ 'url': 'https://yourextractor.com/watch/42',
'md5': 'TODO: md5 sum of the first 10241 bytes of the video file (use --test)',
'info_dict': {
'id': '42',
@@ -1003,10 +1005,10 @@ After you have ensured this site is distributing its content legally, you can fo
}
```
5. Add an import in [`youtube_dl/extractor/extractors.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/extractors.py).
-6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc.
+6. Run `python test/test_download.py TestDownload.test_YourExtractor`. This *should fail* at first, but you can continually re-run it until you're done. If you decide to add more than one test, then rename ``_TEST`` to ``_TESTS`` and make it into a list of dictionaries. The tests will then be named `TestDownload.test_YourExtractor`, `TestDownload.test_YourExtractor_1`, `TestDownload.test_YourExtractor_2`, etc. Note that tests with the `only_matching` key in the test's dict are not counted.
7. Have a look at [`youtube_dl/extractor/common.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py) for possible helper methods and a [detailed description of what your extractor should and may return](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/extractor/common.py#L74-L252). Add tests and code for as many as you want.
-8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](http://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+.
-9. When the tests pass, [add](http://git-scm.com/docs/git-add) the new files and [commit](http://git-scm.com/docs/git-commit) them and [push](http://git-scm.com/docs/git-push) the result, like this:
+8. Make sure your code follows [youtube-dl coding conventions](#youtube-dl-coding-conventions) and check the code with [flake8](https://pypi.python.org/pypi/flake8). Also make sure your code works under all [Python](https://www.python.org/) versions claimed supported by youtube-dl, namely 2.6, 2.7, and 3.2+.
+9. When the tests pass, [add](https://git-scm.com/docs/git-add) the new files and [commit](https://git-scm.com/docs/git-commit) them and [push](https://git-scm.com/docs/git-push) the result, like this:
$ git add youtube_dl/extractor/extractors.py
$ git add youtube_dl/extractor/yourextractor.py
@@ -1162,7 +1164,7 @@ import youtube_dl
ydl_opts = {}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
- ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc'])
+ ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])
```
Most likely, you'll want to use various options. For a list of options available, have a look at [`youtube_dl/YoutubeDL.py`](https://github.com/rg3/youtube-dl/blob/master/youtube_dl/YoutubeDL.py#L129-L279). For a start, if you want to intercept youtube-dl's output, set a `logger` object.
@@ -1201,19 +1203,19 @@ ydl_opts = {
'progress_hooks': [my_hook],
}
with youtube_dl.YoutubeDL(ydl_opts) as ydl:
- ydl.download(['http://www.youtube.com/watch?v=BaW_jenozKc'])
+ ydl.download(['https://www.youtube.com/watch?v=BaW_jenozKc'])
```
# BUGS
-Bugs and suggestions should be reported at: <https://github.com/rg3/youtube-dl/issues>. Unless you were prompted to or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](http://webchat.freenode.net/?randomnick=1&channels=youtube-dl)).
+Bugs and suggestions should be reported at: <https://github.com/rg3/youtube-dl/issues>. Unless you were prompted to or there is another pertinent reason (e.g. GitHub fails to accept the bug report), please do not send bug reports via personal email. For discussions, join us in the IRC channel [#youtube-dl](irc://chat.freenode.net/#youtube-dl) on freenode ([webchat](https://webchat.freenode.net/?randomnick=1&channels=youtube-dl)).
**Please include the full output of youtube-dl when run with `-v`**, i.e. **add** `-v` flag to **your command line**, copy the **whole** output and post it in the issue body wrapped in \`\`\` for better formatting. It should look similar to this:
```
$ youtube-dl -v <your command line>
[debug] System config: []
[debug] User config: []
-[debug] Command-line args: [u'-v', u'http://www.youtube.com/watch?v=BaW_jenozKcj']
+[debug] Command-line args: [u'-v', u'https://www.youtube.com/watch?v=BaW_jenozKcj']
[debug] Encodings: locale cp1251, fs mbcs, out cp866, pref cp1251
[debug] youtube-dl version 2015.12.06
[debug] Git HEAD: 135392e
@@ -1244,7 +1246,7 @@ For bug reports, this means that your report should contain the *complete* outpu
If your server has multiple IPs or you suspect censorship, adding `--call-home` may be a good idea to get more diagnostics. If the error is `ERROR: Unable to extract ...` and you cannot reproduce it from multiple countries, add `--dump-pages` (warning: this will yield a rather large output, redirect it to the file `log.txt` by adding `>log.txt 2>&1` to your command-line) or upload the `.dump` files you get when you add `--write-pages` [somewhere](https://gist.github.com/).
-**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `http://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `http://www.youtube.com/`) is *not* an example URL.
+**Site support requests must contain an example URL**. An example URL is a URL you might want to download, like `https://www.youtube.com/watch?v=BaW_jenozKc`. There should be an obvious video present. Except under very special circumstances, the main page of a video service (e.g. `https://www.youtube.com/`) is *not* an example URL.
### Are you using the latest version?
diff --git a/docs/supportedsites.md b/docs/supportedsites.md
index 010ff76..798a81d 100644
--- a/docs/supportedsites.md
+++ b/docs/supportedsites.md
@@ -38,11 +38,12 @@
- **afreecatv**: afreecatv.com
- **afreecatv:global**: afreecatv.com
- **AirMozilla**
+ - **AliExpressLive**
- **AlJazeera**
- **Allocine**
- **AlphaPorno**
- **AMCNetworks**
- - **anderetijden**: npo.nl and ntr.nl
+ - **anderetijden**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl
- **AnimeOnDemand**
- **anitube.se**
- **Anvato**
@@ -154,8 +155,9 @@
- **chirbit**
- **chirbit:profile**
- **Cinchcast**
- - **Clipfish**
+ - **CJSW**
- **cliphunter**
+ - **Clippit**
- **ClipRs**
- **Clipsyndicate**
- **CloserToTruth**
@@ -237,6 +239,7 @@
- **EbaumsWorld**
- **EchoMsk**
- **egghead:course**: egghead.io course
+ - **egghead:lesson**: egghead.io lesson
- **eHow**
- **Einthusan**
- **eitb.tv**
@@ -293,6 +296,7 @@
- **Funimation**
- **FunnyOrDie**
- **Fusion**
+ - **Fux**
- **FXNetworks**
- **GameInformer**
- **GameOne**
@@ -360,6 +364,7 @@
- **IPrima**
- **iqiyi**: 爱奇艺
- **Ir90Tv**
+ - **ITTF**
- **ITV**
- **ivi**: ivi.ru
- **ivi:compilation**: ivi.ru compilations
@@ -369,6 +374,7 @@
- **Jamendo**
- **JamendoAlbum**
- **JeuxVideo**
+ - **Joj**
- **Jove**
- **jpopsuki.tv**
- **JWPlatform**
@@ -415,6 +421,7 @@
- **limelight:channel_list**
- **LiTV**
- **LiveLeak**
+ - **LiveLeakEmbed**
- **livestream**
- **livestream:original**
- **LnkGo**
@@ -431,12 +438,14 @@
- **MakerTV**
- **mangomolo:live**
- **mangomolo:video**
+ - **ManyVids**
- **MatchTV**
- **MDR**: MDR.DE and KiKA
- **media.ccc.de**
- **Medialaan**
- **Mediaset**
- **Medici**
+ - **megaphone.fm**: megaphone.fm embedded players
- **Meipai**: 美拍
- **MelonVOD**
- **META**
@@ -469,7 +478,6 @@
- **MovieFap**
- **Moviezine**
- **MovingImage**
- - **MPORA**
- **MSN**
- **mtg**: MTG services
- **mtv**
@@ -519,6 +527,8 @@
- **NextMedia**: 蘋果日報
- **NextMediaActionNews**: 蘋果日報 - 動新聞
- **NextTV**: 壹電視
+ - **Nexx**
+ - **NexxEmbed**
- **nfb**: National Film Board of Canada
- **nfl.com**
- **NhkVod**
@@ -528,6 +538,7 @@
- **nhl.com:videocenter:category**: NHL videocenter category
- **nick.com**
- **nick.de**
+ - **nickelodeonru**
- **nicknight**
- **niconico**: ニコニコ動画
- **NiconicoPlaylist**
@@ -549,7 +560,7 @@
- **NowTVList**
- **nowvideo**: NowVideo
- **Noz**
- - **npo**: npo.nl and ntr.nl
+ - **npo**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl
- **npo.nl:live**
- **npo.nl:radio**
- **npo.nl:radio:fragment**
@@ -593,6 +604,7 @@
- **Patreon**
- **pbs**: Public Broadcasting Service (PBS) and member stations: PBS: Public Broadcasting Service, APT - Alabama Public Television (WBIQ), GPB/Georgia Public Broadcasting (WGTV), Mississippi Public Broadcasting (WMPN), Nashville Public Television (WNPT), WFSU-TV (WFSU), WSRE (WSRE), WTCI (WTCI), WPBA/Channel 30 (WPBA), Alaska Public Media (KAKM), Arizona PBS (KAET), KNME-TV/Channel 5 (KNME), Vegas PBS (KLVX), AETN/ARKANSAS ETV NETWORK (KETS), KET (WKLE), WKNO/Channel 10 (WKNO), LPB/LOUISIANA PUBLIC BROADCASTING (WLPB), OETA (KETA), Ozarks Public Television (KOZK), WSIU Public Broadcasting (WSIU), KEET TV (KEET), KIXE/Channel 9 (KIXE), KPBS San Diego (KPBS), KQED (KQED), KVIE Public Television (KVIE), PBS SoCal/KOCE (KOCE), ValleyPBS (KVPT), CONNECTICUT PUBLIC TELEVISION (WEDH), KNPB Channel 5 (KNPB), SOPTV (KSYS), Rocky Mountain PBS (KRMA), KENW-TV3 (KENW), KUED Channel 7 (KUED), Wyoming PBS (KCWC), Colorado Public Television / KBDI 12 (KBDI), KBYU-TV (KBYU), Thirteen/WNET New York (WNET), WGBH/Channel 2 (WGBH), WGBY (WGBY), NJTV Public Media NJ (WNJT), WLIW21 (WLIW), mpt/Maryland Public Television (WMPB), WETA Television and Radio (WETA), WHYY (WHYY), PBS 39 (WLVT), WVPT - Your Source for PBS and More! 
(WVPT), Howard University Television (WHUT), WEDU PBS (WEDU), WGCU Public Media (WGCU), WPBT2 (WPBT), WUCF TV (WUCF), WUFT/Channel 5 (WUFT), WXEL/Channel 42 (WXEL), WLRN/Channel 17 (WLRN), WUSF Public Broadcasting (WUSF), ETV (WRLK), UNC-TV (WUNC), PBS Hawaii - Oceanic Cable Channel 10 (KHET), Idaho Public Television (KAID), KSPS (KSPS), OPB (KOPB), KWSU/Channel 10 & KTNW/Channel 31 (KWSU), WILL-TV (WILL), Network Knowledge - WSEC/Springfield (WSEC), WTTW11 (WTTW), Iowa Public Television/IPTV (KDIN), Nine Network (KETC), PBS39 Fort Wayne (WFWA), WFYI Indianapolis (WFYI), Milwaukee Public Television (WMVS), WNIN (WNIN), WNIT Public Television (WNIT), WPT (WPNE), WVUT/Channel 22 (WVUT), WEIU/Channel 51 (WEIU), WQPT-TV (WQPT), WYCC PBS Chicago (WYCC), WIPB-TV (WIPB), WTIU (WTIU), CET (WCET), ThinkTVNetwork (WPTD), WBGU-TV (WBGU), WGVU TV (WGVU), NET1 (KUON), Pioneer Public Television (KWCM), SDPB Television (KUSD), TPT (KTCA), KSMQ (KSMQ), KPTS/Channel 8 (KPTS), KTWU/Channel 11 (KTWU), East Tennessee PBS (WSJK), WCTE-TV (WCTE), WLJT, Channel 11 (WLJT), WOSU TV (WOSU), WOUB/WOUC (WOUB), WVPB (WVPB), WKYU-PBS (WKYU), KERA 13 (KERA), MPBN (WCBB), Mountain Lake PBS (WCFE), NHPTV (WENH), Vermont PBS (WETK), witf (WITF), WQED Multimedia (WQED), WMHT Educational Telecommunications (WMHT), Q-TV (WDCQ), WTVS Detroit Public TV (WTVS), CMU Public Television (WCMU), WKAR-TV (WKAR), WNMU-TV Public TV 13 (WNMU), WDSE - WRPT (WDSE), WGTE TV (WGTE), Lakeland Public Television (KAWE), KMOS-TV - Channels 6.1, 6.2 and 6.3 (KMOS), MontanaPBS (KUSM), KRWG/Channel 22 (KRWG), KACV (KACV), KCOS/Channel 13 (KCOS), WCNY/Channel 24 (WCNY), WNED (WNED), WPBS (WPBS), WSKG Public TV (WSKG), WXXI (WXXI), WPSU (WPSU), WVIA Public Media Studios (WVIA), WTVI (WTVI), Western Reserve PBS (WNEO), WVIZ/PBS ideastream (WVIZ), KCTS 9 (KCTS), Basin PBS (KPBT), KUHT / Channel 8 (KUHT), KLRN (KLRN), KLRU (KLRU), WTJX Channel 12 (WTJX), WCVE PBS (WCVE), KBTC Public Television (KBTC)
- **pcmag**
+ - **PearVideo**
- **People**
- **periscope**: Periscope
- **periscope:user**: Periscope user videos
@@ -615,6 +627,7 @@
- **PolskieRadio**
- **PolskieRadioCategory**
- **PornCom**
+ - **PornerBros**
- **PornFlip**
- **PornHd**
- **PornHub**: PornHub and Thumbzilla
@@ -623,6 +636,7 @@
- **Pornotube**
- **PornoVoisines**
- **PornoXO**
+ - **PornTube**
- **PressTV**
- **PrimeShareTV**
- **PromptFile**
@@ -648,6 +662,8 @@
- **RBMARadio**
- **RDS**: RDS.ca
- **RedBullTV**
+ - **Reddit**
+ - **RedditR**
- **RedTube**
- **RegioTV**
- **RENTV**
@@ -687,6 +703,7 @@
- **rutube:embed**: Rutube embedded videos
- **rutube:movie**: Rutube movies
- **rutube:person**: Rutube person videos
+ - **rutube:playlist**: Rutube playlists
- **RUTV**: RUTV.RU
- **Ruutu**
- **Ruv**
@@ -728,6 +745,7 @@
- **soundcloud:playlist**
- **soundcloud:search**: Soundcloud search
- **soundcloud:set**
+ - **soundcloud:trackstation**
- **soundcloud:user**
- **soundgasm**
- **soundgasm:profile**
@@ -768,13 +786,13 @@
- **Tagesschau**
- **tagesschau:player**
- **Tass**
- - **TBS**
+ - **TastyTrade**
+ - **TBS** (Currently broken)
- **TDSLifeway**
- **teachertube**: teachertube.com videos
- **teachertube:user:collection**: teachertube.com user and collection videos
- **TeachingChannel**
- **Teamcoco**
- - **TeamFourStar**
- **TechTalks**
- **techtv.mit.edu**
- **ted**
@@ -939,13 +957,15 @@
- **vk:wallpost**
- **vlive**
- **vlive:channel**
+ - **vlive:playlist**
- **Vodlocker**
- **VODPl**
- **VODPlatform**
- **VoiceRepublic**
+ - **Voot**
- **VoxMedia**
- **Vporn**
- - **vpro**: npo.nl and ntr.nl
+ - **vpro**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl
- **Vrak**
- **VRT**: deredactie.be, sporza.be, cobra.be and cobra.canvas.be
- **vrv**
@@ -960,6 +980,7 @@
- **washingtonpost**
- **washingtonpost:article**
- **wat.tv**
+ - **WatchBox**
- **WatchIndianPorn**: Watch Indian Porn
- **WDR**
- **wdr:mobile**
@@ -971,7 +992,7 @@
- **wholecloud**: WholeCloud
- **Wimp**
- **Wistia**
- - **wnl**: npo.nl and ntr.nl
+ - **wnl**: npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl
- **WorldStarHipHop**
- **wrzuta.pl**
- **wrzuta.pl:playlist**
@@ -995,7 +1016,7 @@
- **XVideos**
- **XXXYMovies**
- **Yahoo**: Yahoo screen and movies
- - **Yam**: 蕃薯藤yam天空部落
+ - **YandexDisk**
- **yandexmusic:album**: Яндекс.Музыка - Альбом
- **yandexmusic:playlist**: Яндекс.Музыка - Плейлист
- **yandexmusic:track**: Яндекс.Музыка - Трек
diff --git a/test/test_InfoExtractor.py b/test/test_InfoExtractor.py
index 6f52e11..f18a823 100644
--- a/test/test_InfoExtractor.py
+++ b/test/test_InfoExtractor.py
@@ -10,6 +10,7 @@ import unittest
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from test.helper import FakeYDL, expect_dict, expect_value
+from youtube_dl.compat import compat_etree_fromstring
from youtube_dl.extractor.common import InfoExtractor
from youtube_dl.extractor import YoutubeIE, get_info_extractor
from youtube_dl.utils import encode_data_uri, strip_jsonp, ExtractorError, RegexNotFoundError
@@ -488,6 +489,91 @@ jwplayer("mediaplayer").setup({"abouttext":"Visit Indie DB","aboutlink":"http:\/
self.ie._sort_formats(formats)
expect_value(self, formats, expected_formats, None)
+ def test_parse_mpd_formats(self):
+ _TEST_CASES = [
+ (
+ # https://github.com/rg3/youtube-dl/issues/13919
+ 'float_duration',
+ 'http://unknown/manifest.mpd',
+ [{
+ 'manifest_url': 'http://unknown/manifest.mpd',
+ 'ext': 'mp4',
+ 'format_id': '318597',
+ 'format_note': 'DASH video',
+ 'protocol': 'http_dash_segments',
+ 'acodec': 'none',
+ 'vcodec': 'avc1.42001f',
+ 'tbr': 318.597,
+ 'width': 340,
+ 'height': 192,
+ }, {
+ 'manifest_url': 'http://unknown/manifest.mpd',
+ 'ext': 'mp4',
+ 'format_id': '638590',
+ 'format_note': 'DASH video',
+ 'protocol': 'http_dash_segments',
+ 'acodec': 'none',
+ 'vcodec': 'avc1.42001f',
+ 'tbr': 638.59,
+ 'width': 512,
+ 'height': 288,
+ }, {
+ 'manifest_url': 'http://unknown/manifest.mpd',
+ 'ext': 'mp4',
+ 'format_id': '1022565',
+ 'format_note': 'DASH video',
+ 'protocol': 'http_dash_segments',
+ 'acodec': 'none',
+ 'vcodec': 'avc1.4d001f',
+ 'tbr': 1022.565,
+ 'width': 688,
+ 'height': 384,
+ }, {
+ 'manifest_url': 'http://unknown/manifest.mpd',
+ 'ext': 'mp4',
+ 'format_id': '2046506',
+ 'format_note': 'DASH video',
+ 'protocol': 'http_dash_segments',
+ 'acodec': 'none',
+ 'vcodec': 'avc1.4d001f',
+ 'tbr': 2046.506,
+ 'width': 1024,
+ 'height': 576,
+ }, {
+ 'manifest_url': 'http://unknown/manifest.mpd',
+ 'ext': 'mp4',
+ 'format_id': '3998017',
+ 'format_note': 'DASH video',
+ 'protocol': 'http_dash_segments',
+ 'acodec': 'none',
+ 'vcodec': 'avc1.640029',
+ 'tbr': 3998.017,
+ 'width': 1280,
+ 'height': 720,
+ }, {
+ 'manifest_url': 'http://unknown/manifest.mpd',
+ 'ext': 'mp4',
+ 'format_id': '5997485',
+ 'format_note': 'DASH video',
+ 'protocol': 'http_dash_segments',
+ 'acodec': 'none',
+ 'vcodec': 'avc1.640032',
+ 'tbr': 5997.485,
+ 'width': 1920,
+ 'height': 1080,
+ }]
+ ),
+ ]
+
+ for mpd_file, mpd_url, expected_formats in _TEST_CASES:
+ with io.open('./test/testdata/mpd/%s.mpd' % mpd_file,
+ mode='r', encoding='utf-8') as f:
+ formats = self.ie._parse_mpd_formats(
+ compat_etree_fromstring(f.read().encode('utf-8')),
+ mpd_url=mpd_url)
+ self.ie._sort_formats(formats)
+ expect_value(self, formats, expected_formats, None)
+
if __name__ == '__main__':
unittest.main()
diff --git a/test/test_YoutubeDL.py b/test/test_YoutubeDL.py
index 75945e3..e70cbcd 100644
--- a/test/test_YoutubeDL.py
+++ b/test/test_YoutubeDL.py
@@ -41,6 +41,7 @@ def _make_result(formats, **kwargs):
'id': 'testid',
'title': 'testttitle',
'extractor': 'testex',
+ 'extractor_key': 'TestEx',
}
res.update(**kwargs)
return res
@@ -370,6 +371,19 @@ class TestFormatSelection(unittest.TestCase):
ydl = YDL({'format': 'best[height>360]'})
self.assertRaises(ExtractorError, ydl.process_ie_result, info_dict.copy())
+ def test_format_selection_issue_10083(self):
+ # See https://github.com/rg3/youtube-dl/issues/10083
+ formats = [
+ {'format_id': 'regular', 'height': 360, 'url': TEST_URL},
+ {'format_id': 'video', 'height': 720, 'acodec': 'none', 'url': TEST_URL},
+ {'format_id': 'audio', 'vcodec': 'none', 'url': TEST_URL},
+ ]
+ info_dict = _make_result(formats)
+
+ ydl = YDL({'format': 'best[height>360]/bestvideo[height>360]+bestaudio'})
+ ydl.process_ie_result(info_dict.copy())
+ self.assertEqual(ydl.downloaded_info_dicts[0]['format_id'], 'video+audio')
+
def test_invalid_format_specs(self):
def assert_syntax_error(format_spec):
ydl = YDL({'format': format_spec})
@@ -448,6 +462,17 @@ class TestFormatSelection(unittest.TestCase):
pass
self.assertEqual(ydl.downloaded_info_dicts, [])
+ def test_default_format_spec(self):
+ ydl = YDL({'simulate': True})
+ self.assertEqual(ydl._default_format_spec({}), 'bestvideo+bestaudio/best')
+
+ ydl = YDL({'outtmpl': '-'})
+ self.assertEqual(ydl._default_format_spec({}), 'best')
+
+ ydl = YDL({})
+ self.assertEqual(ydl._default_format_spec({}, download=False), 'bestvideo+bestaudio/best')
+ self.assertEqual(ydl._default_format_spec({'is_live': True}), 'best')
+
class TestYoutubeDL(unittest.TestCase):
def test_subtitles(self):
@@ -527,6 +552,8 @@ class TestYoutubeDL(unittest.TestCase):
'ext': 'mp4',
'width': None,
'height': 1080,
+ 'title1': '$PATH',
+ 'title2': '%PATH%',
}
def fname(templ):
@@ -545,10 +572,14 @@ class TestYoutubeDL(unittest.TestCase):
self.assertEqual(fname('%(height)0 6d.%(ext)s'), ' 01080.mp4')
self.assertEqual(fname('%(height)0 6d.%(ext)s'), ' 01080.mp4')
self.assertEqual(fname('%(height) 0 6d.%(ext)s'), ' 01080.mp4')
+ self.assertEqual(fname('%%'), '%')
+ self.assertEqual(fname('%%%%'), '%%')
self.assertEqual(fname('%%(height)06d.%(ext)s'), '%(height)06d.mp4')
self.assertEqual(fname('%(width)06d.%(ext)s'), 'NA.mp4')
self.assertEqual(fname('%(width)06d.%%(ext)s'), 'NA.%(ext)s')
self.assertEqual(fname('%%(width)06d.%(ext)s'), '%(width)06d.mp4')
+ self.assertEqual(fname('Hello %(title1)s'), 'Hello $PATH')
+ self.assertEqual(fname('Hello %(title2)s'), 'Hello %PATH%')
def test_format_note(self):
ydl = YoutubeDL()
@@ -755,7 +786,8 @@ class TestYoutubeDL(unittest.TestCase):
'_type': 'url_transparent',
'url': 'foo2:',
'ie_key': 'Foo2',
- 'title': 'foo1 title'
+ 'title': 'foo1 title',
+ 'id': 'foo1_id',
}
class Foo2IE(InfoExtractor):
@@ -781,6 +813,9 @@ class TestYoutubeDL(unittest.TestCase):
downloaded = ydl.downloaded_info_dicts[0]
self.assertEqual(downloaded['url'], TEST_URL)
self.assertEqual(downloaded['title'], 'foo1 title')
+ self.assertEqual(downloaded['id'], 'testid')
+ self.assertEqual(downloaded['extractor'], 'testex')
+ self.assertEqual(downloaded['extractor_key'], 'TestEx')
if __name__ == '__main__':
diff --git a/test/test_options.py b/test/test_options.py
new file mode 100644
index 0000000..3a25a6b
--- /dev/null
+++ b/test/test_options.py
@@ -0,0 +1,26 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+# Allow direct execution
+import os
+import sys
+import unittest
+sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
+from youtube_dl.options import _hide_login_info
+
+
+class TestOptions(unittest.TestCase):
+ def test_hide_login_info(self):
+ self.assertEqual(_hide_login_info(['-u', 'foo', '-p', 'bar']),
+ ['-u', 'PRIVATE', '-p', 'PRIVATE'])
+ self.assertEqual(_hide_login_info(['-u']), ['-u'])
+ self.assertEqual(_hide_login_info(['-u', 'foo', '-u', 'bar']),
+ ['-u', 'PRIVATE', '-u', 'PRIVATE'])
+ self.assertEqual(_hide_login_info(['--username=foo']),
+ ['--username=PRIVATE'])
+
+
+if __name__ == '__main__':
+ unittest.main()
diff --git a/test/test_utils.py b/test/test_utils.py
index 2b93b36..e50f376 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -98,6 +98,7 @@ from youtube_dl.compat import (
compat_chr,
compat_etree_fromstring,
compat_getenv,
+ compat_os_name,
compat_setenv,
compat_urlparse,
compat_parse_qs,
@@ -278,6 +279,7 @@ class TestUtil(unittest.TestCase):
self.assertEqual(unescapeHTML('&#47;'), '/')
self.assertEqual(unescapeHTML('&eacute;'), 'é')
self.assertEqual(unescapeHTML('&#2013266066;'), '&#2013266066;')
+ self.assertEqual(unescapeHTML('&a&quot;'), '&a"')
# HTML5 entities
self.assertEqual(unescapeHTML('&period;&apos;'), '.\'')
@@ -448,7 +450,9 @@ class TestUtil(unittest.TestCase):
def test_shell_quote(self):
args = ['ffmpeg', '-i', encodeFilename('ñ€ß\'.mp4')]
- self.assertEqual(shell_quote(args), """ffmpeg -i 'ñ€ß'"'"'.mp4'""")
+ self.assertEqual(
+ shell_quote(args),
+ """ffmpeg -i 'ñ€ß'"'"'.mp4'""" if compat_os_name != 'nt' else '''ffmpeg -i "ñ€ß'.mp4"''')
def test_str_to_int(self):
self.assertEqual(str_to_int('123,456'), 123456)
@@ -932,7 +936,7 @@ class TestUtil(unittest.TestCase):
def test_args_to_str(self):
self.assertEqual(
args_to_str(['foo', 'ba/r', '-baz', '2 be', '']),
- 'foo ba/r -baz \'2 be\' \'\''
+ 'foo ba/r -baz \'2 be\' \'\'' if compat_os_name != 'nt' else 'foo ba/r -baz "2 be" ""'
)
def test_parse_filesize(self):
@@ -1179,6 +1183,10 @@ part 3</font></u>
cli_bool_option(
{'nocheckcertificate': False}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '='),
['--check-certificate=true'])
+ self.assertEqual(
+ cli_bool_option(
+ {}, '--check-certificate', 'nocheckcertificate', 'false', 'true', '='),
+ [])
def test_ohdave_rsa_encrypt(self):
N = 0xab86b6371b5318aaa1d3c9e612a9f1264f372323c8c0f19875b5fc3b3fd3afcc1e5bec527aa94bfa85bffc157e4245aebda05389a5357b75115ac94f074aefcd
@@ -1228,6 +1236,12 @@ part 3</font></u>
self.assertEqual(get_element_by_attribute('class', 'foo', html), None)
self.assertEqual(get_element_by_attribute('class', 'no-such-foo', html), None)
+ html = '''
+ <div itemprop="author" itemscope>foo</div>
+ '''
+
+ self.assertEqual(get_element_by_attribute('itemprop', 'author', html), 'foo')
+
def test_get_elements_by_class(self):
html = '''
<span class="foo bar">nice</span><span class="foo bar">also nice</span>
diff --git a/test/testdata/mpd/float_duration.mpd b/test/testdata/mpd/float_duration.mpd
new file mode 100644
index 0000000..8dc1d2d
--- /dev/null
+++ b/test/testdata/mpd/float_duration.mpd
@@ -0,0 +1,18 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<MPD xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns="urn:mpeg:dash:schema:mpd:2011" type="static" minBufferTime="PT2S" profiles="urn:mpeg:dash:profile:isoff-on-demand:2011" mediaPresentationDuration="PT6014S">
+ <Period bitstreamSwitching="true">
+ <AdaptationSet mimeType="audio/mp4" codecs="mp4a.40.2" startWithSAP="1" segmentAlignment="true">
+ <SegmentTemplate timescale="1000000" presentationTimeOffset="0" initialization="ai_$RepresentationID$.mp4d" media="a_$RepresentationID$_$Number$.mp4d" duration="2000000.0" startNumber="0"></SegmentTemplate>
+ <Representation id="318597" bandwidth="61587"></Representation>
+ </AdaptationSet>
+ <AdaptationSet mimeType="video/mp4" startWithSAP="1" segmentAlignment="true">
+ <SegmentTemplate timescale="1000000" presentationTimeOffset="0" initialization="vi_$RepresentationID$.mp4d" media="v_$RepresentationID$_$Number$.mp4d" duration="2000000.0" startNumber="0"></SegmentTemplate>
+ <Representation id="318597" codecs="avc1.42001f" width="340" height="192" bandwidth="318597"></Representation>
+ <Representation id="638590" codecs="avc1.42001f" width="512" height="288" bandwidth="638590"></Representation>
+ <Representation id="1022565" codecs="avc1.4d001f" width="688" height="384" bandwidth="1022565"></Representation>
+ <Representation id="2046506" codecs="avc1.4d001f" width="1024" height="576" bandwidth="2046506"></Representation>
+ <Representation id="3998017" codecs="avc1.640029" width="1280" height="720" bandwidth="3998017"></Representation>
+ <Representation id="5997485" codecs="avc1.640032" width="1920" height="1080" bandwidth="5997485"></Representation>
+ </AdaptationSet>
+ </Period>
+</MPD> \ No newline at end of file
diff --git a/youtube_dl/YoutubeDL.py b/youtube_dl/YoutubeDL.py
index b3a6d4d..4f208f1 100755
--- a/youtube_dl/YoutubeDL.py
+++ b/youtube_dl/YoutubeDL.py
@@ -26,6 +26,8 @@ import tokenize
import traceback
import random
+from string import ascii_letters
+
from .compat import (
compat_basestring,
compat_cookiejar,
@@ -674,7 +676,19 @@ class YoutubeDL(object):
FORMAT_RE.format(numeric_field),
r'%({0})s'.format(numeric_field), outtmpl)
- filename = expand_path(outtmpl % template_dict)
+ # expand_path translates '%%' into '%' and '$$' into '$'
+ # correspondingly that is not what we want since we need to keep
+ # '%%' intact for template dict substitution step. Working around
+ # with boundary-alike separator hack.
+ sep = ''.join([random.choice(ascii_letters) for _ in range(32)])
+ outtmpl = outtmpl.replace('%%', '%{0}%'.format(sep)).replace('$$', '${0}$'.format(sep))
+
+ # outtmpl should be expand_path'ed before template dict substitution
+ # because meta fields may contain env variables we don't want to
+ # be expanded. For example, for outtmpl "%(title)s.%(ext)s" and
+ # title "Hello $PATH", we don't want `$PATH` to be expanded.
+ filename = expand_path(outtmpl).replace(sep, '') % template_dict
+
# Temporary fix for #4787
# 'Treat' all problem characters by passing filename through preferredencoding
# to workaround encoding issues with subprocess on python2 @ Windows
@@ -846,7 +860,7 @@ class YoutubeDL(object):
force_properties = dict(
(k, v) for k, v in ie_result.items() if v is not None)
- for f in ('_type', 'url', 'ie_key'):
+ for f in ('_type', 'url', 'id', 'extractor', 'extractor_key', 'ie_key'):
if f in force_properties:
del force_properties[f]
new_result = info.copy()
@@ -1050,6 +1064,25 @@ class YoutubeDL(object):
return op(actual_value, comparison_value)
return _filter
+ def _default_format_spec(self, info_dict, download=True):
+ req_format_list = []
+
+ def can_have_partial_formats():
+ if self.params.get('simulate', False):
+ return True
+ if not download:
+ return True
+ if self.params.get('outtmpl', DEFAULT_OUTTMPL) == '-':
+ return False
+ if info_dict.get('is_live'):
+ return False
+ merger = FFmpegMergerPP(self)
+ return merger.available and merger.can_merge()
+ if can_have_partial_formats():
+ req_format_list.append('bestvideo+bestaudio')
+ req_format_list.append('best')
+ return '/'.join(req_format_list)
+
def build_format_selector(self, format_spec):
def syntax_error(note, start):
message = (
@@ -1450,12 +1483,14 @@ class YoutubeDL(object):
def is_wellformed(f):
url = f.get('url')
- valid_url = url and isinstance(url, compat_str)
- if not valid_url:
+ if not url:
self.report_warning(
'"url" field is missing or empty - skipping format, '
'there is an error in extractor')
- return valid_url
+ return False
+ if isinstance(url, bytes):
+ sanitize_string_field(f, 'url')
+ return True
# Filter out malformed formats for better extraction robustness
formats = list(filter(is_wellformed, formats))
@@ -1467,7 +1502,7 @@ class YoutubeDL(object):
sanitize_string_field(format, 'format_id')
sanitize_numeric_fields(format)
format['url'] = sanitize_url(format['url'])
- if format.get('format_id') is None:
+ if not format.get('format_id'):
format['format_id'] = compat_str(i)
else:
# Sanitize format_id from characters used in format selector expression
@@ -1520,14 +1555,10 @@ class YoutubeDL(object):
req_format = self.params.get('format')
if req_format is None:
- req_format_list = []
- if (self.params.get('outtmpl', DEFAULT_OUTTMPL) != '-' and
- not info_dict.get('is_live')):
- merger = FFmpegMergerPP(self)
- if merger.available and merger.can_merge():
- req_format_list.append('bestvideo+bestaudio')
- req_format_list.append('best')
- req_format = '/'.join(req_format_list)
+ req_format = self._default_format_spec(info_dict, download=download)
+ if self.params.get('verbose'):
+ self.to_stdout('[debug] Default format spec: %s' % req_format)
+
format_selector = self.build_format_selector(req_format)
# While in format selection we may need to have an access to the original
@@ -1679,12 +1710,17 @@ class YoutubeDL(object):
if filename is None:
return
- try:
- dn = os.path.dirname(sanitize_path(encodeFilename(filename)))
- if dn and not os.path.exists(dn):
- os.makedirs(dn)
- except (OSError, IOError) as err:
- self.report_error('unable to create directory ' + error_to_compat_str(err))
+ def ensure_dir_exists(path):
+ try:
+ dn = os.path.dirname(path)
+ if dn and not os.path.exists(dn):
+ os.makedirs(dn)
+ return True
+ except (OSError, IOError) as err:
+ self.report_error('unable to create directory ' + error_to_compat_str(err))
+ return False
+
+ if not ensure_dir_exists(sanitize_path(encodeFilename(filename))):
return
if self.params.get('writedescription', False):
@@ -1822,8 +1858,11 @@ class YoutubeDL(object):
for f in requested_formats:
new_info = dict(info_dict)
new_info.update(f)
- fname = self.prepare_filename(new_info)
- fname = prepend_extension(fname, 'f%s' % f['format_id'], new_info['ext'])
+ fname = prepend_extension(
+ self.prepare_filename(new_info),
+ 'f%s' % f['format_id'], new_info['ext'])
+ if not ensure_dir_exists(fname):
+ return
downloaded.append(fname)
partial_success = dl(fname, new_info)
success = success and partial_success
@@ -1890,7 +1929,7 @@ class YoutubeDL(object):
info_dict.get('protocol') == 'm3u8' and
self.params.get('hls_prefer_native')):
if fixup_policy == 'warn':
- self.report_warning('%s: malformated aac bitstream.' % (
+ self.report_warning('%s: malformed AAC bitstream detected.' % (
info_dict['id']))
elif fixup_policy == 'detect_or_warn':
fixup_pp = FFmpegFixupM3u8PP(self)
@@ -1899,7 +1938,7 @@ class YoutubeDL(object):
info_dict['__postprocessors'].append(fixup_pp)
else:
self.report_warning(
- '%s: malformated aac bitstream. %s'
+ '%s: malformed AAC bitstream detected. %s'
% (info_dict['id'], INSTALL_FFMPEG_MESSAGE))
else:
assert fixup_policy in ('ignore', 'never')
diff --git a/youtube_dl/downloader/common.py b/youtube_dl/downloader/common.py
index 77242da..75b8166 100644
--- a/youtube_dl/downloader/common.py
+++ b/youtube_dl/downloader/common.py
@@ -304,11 +304,11 @@ class FileDownloader(object):
"""Report attempt to resume at given byte."""
self.to_screen('[download] Resuming download at byte %s' % resume_len)
- def report_retry(self, count, retries):
+ def report_retry(self, err, count, retries):
"""Report retry in case of HTTP error 5xx"""
self.to_screen(
- '[download] Got server HTTP error. Retrying (attempt %d of %s)...'
- % (count, self.format_retries(retries)))
+ '[download] Got server HTTP error: %s. Retrying (attempt %d of %s)...'
+ % (error_to_compat_str(err), count, self.format_retries(retries)))
def report_file_already_downloaded(self, file_name):
"""Report file has already been fully downloaded."""
diff --git a/youtube_dl/downloader/dash.py b/youtube_dl/downloader/dash.py
index 7491fda..576ece6 100644
--- a/youtube_dl/downloader/dash.py
+++ b/youtube_dl/downloader/dash.py
@@ -2,6 +2,7 @@ from __future__ import unicode_literals
from .fragment import FragmentFD
from ..compat import compat_urllib_error
+from ..utils import urljoin
class DashSegmentsFD(FragmentFD):
@@ -12,12 +13,13 @@ class DashSegmentsFD(FragmentFD):
FD_NAME = 'dashsegments'
def real_download(self, filename, info_dict):
- segments = info_dict['fragments'][:1] if self.params.get(
+ fragment_base_url = info_dict.get('fragment_base_url')
+ fragments = info_dict['fragments'][:1] if self.params.get(
'test', False) else info_dict['fragments']
ctx = {
'filename': filename,
- 'total_frags': len(segments),
+ 'total_frags': len(fragments),
}
self._prepare_and_start_frag_download(ctx)
@@ -26,7 +28,7 @@ class DashSegmentsFD(FragmentFD):
skip_unavailable_fragments = self.params.get('skip_unavailable_fragments', True)
frag_index = 0
- for i, segment in enumerate(segments):
+ for i, fragment in enumerate(fragments):
frag_index += 1
if frag_index <= ctx['fragment_index']:
continue
@@ -36,7 +38,11 @@ class DashSegmentsFD(FragmentFD):
count = 0
while count <= fragment_retries:
try:
- success, frag_content = self._download_fragment(ctx, segment['url'], info_dict)
+ fragment_url = fragment.get('url')
+ if not fragment_url:
+ assert fragment_base_url
+ fragment_url = urljoin(fragment_base_url, fragment['path'])
+ success, frag_content = self._download_fragment(ctx, fragment_url, info_dict)
if not success:
return False
self._append_fragment(ctx, frag_content)
diff --git a/youtube_dl/downloader/hls.py b/youtube_dl/downloader/hls.py
index 0e29c8a..46308cf 100644
--- a/youtube_dl/downloader/hls.py
+++ b/youtube_dl/downloader/hls.py
@@ -59,9 +59,9 @@ class HlsFD(FragmentFD):
man_url = info_dict['url']
self.to_screen('[%s] Downloading m3u8 manifest' % self.FD_NAME)
- manifest = self.ydl.urlopen(self._prepare_url(info_dict, man_url)).read()
-
- s = manifest.decode('utf-8', 'ignore')
+ urlh = self.ydl.urlopen(self._prepare_url(info_dict, man_url))
+ man_url = urlh.geturl()
+ s = urlh.read().decode('utf-8', 'ignore')
if not self.can_download(s, info_dict):
if info_dict.get('extra_param_to_segment_url'):
diff --git a/youtube_dl/downloader/http.py b/youtube_dl/downloader/http.py
index af405b9..8a6638c 100644
--- a/youtube_dl/downloader/http.py
+++ b/youtube_dl/downloader/http.py
@@ -22,8 +22,16 @@ from ..utils import (
class HttpFD(FileDownloader):
def real_download(self, filename, info_dict):
url = info_dict['url']
- tmpfilename = self.temp_name(filename)
- stream = None
+
+ class DownloadContext(dict):
+ __getattr__ = dict.get
+ __setattr__ = dict.__setitem__
+ __delattr__ = dict.__delitem__
+
+ ctx = DownloadContext()
+ ctx.filename = filename
+ ctx.tmpfilename = self.temp_name(filename)
+ ctx.stream = None
# Do not include the Accept-Encoding header
headers = {'Youtubedl-no-compression': 'True'}
@@ -38,46 +46,51 @@ class HttpFD(FileDownloader):
if is_test:
request.add_header('Range', 'bytes=0-%s' % str(self._TEST_FILE_SIZE - 1))
- # Establish possible resume length
- if os.path.isfile(encodeFilename(tmpfilename)):
- resume_len = os.path.getsize(encodeFilename(tmpfilename))
- else:
- resume_len = 0
-
- open_mode = 'wb'
- if resume_len != 0:
- if self.params.get('continuedl', True):
- self.report_resuming_byte(resume_len)
- request.add_header('Range', 'bytes=%d-' % resume_len)
- open_mode = 'ab'
- else:
- resume_len = 0
+ ctx.open_mode = 'wb'
+ ctx.resume_len = 0
+
+ if self.params.get('continuedl', True):
+ # Establish possible resume length
+ if os.path.isfile(encodeFilename(ctx.tmpfilename)):
+ ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename))
count = 0
retries = self.params.get('retries', 0)
- while count <= retries:
+
+ class SucceedDownload(Exception):
+ pass
+
+ class RetryDownload(Exception):
+ def __init__(self, source_error):
+ self.source_error = source_error
+
+ def establish_connection():
+ if ctx.resume_len != 0:
+ self.report_resuming_byte(ctx.resume_len)
+ request.add_header('Range', 'bytes=%d-' % ctx.resume_len)
+ ctx.open_mode = 'ab'
# Establish connection
try:
- data = self.ydl.urlopen(request)
+ ctx.data = self.ydl.urlopen(request)
# When trying to resume, Content-Range HTTP header of response has to be checked
# to match the value of requested Range HTTP header. This is due to a webservers
# that don't support resuming and serve a whole file with no Content-Range
# set in response despite of requested Range (see
# https://github.com/rg3/youtube-dl/issues/6057#issuecomment-126129799)
- if resume_len > 0:
- content_range = data.headers.get('Content-Range')
+ if ctx.resume_len > 0:
+ content_range = ctx.data.headers.get('Content-Range')
if content_range:
content_range_m = re.search(r'bytes (\d+)-', content_range)
# Content-Range is present and matches requested Range, resume is possible
- if content_range_m and resume_len == int(content_range_m.group(1)):
- break
+ if content_range_m and ctx.resume_len == int(content_range_m.group(1)):
+ return
# Content-Range is either not present or invalid. Assuming remote webserver is
# trying to send the whole file, resume is not possible, so wiping the local file
# and performing entire redownload
self.report_unable_to_resume()
- resume_len = 0
- open_mode = 'wb'
- break
+ ctx.resume_len = 0
+ ctx.open_mode = 'wb'
+ return
except (compat_urllib_error.HTTPError, ) as err:
if (err.code < 500 or err.code >= 600) and err.code != 416:
# Unexpected HTTP error
@@ -86,15 +99,15 @@ class HttpFD(FileDownloader):
# Unable to resume (requested range not satisfiable)
try:
# Open the connection again without the range header
- data = self.ydl.urlopen(basic_request)
- content_length = data.info()['Content-Length']
+ ctx.data = self.ydl.urlopen(basic_request)
+ content_length = ctx.data.info()['Content-Length']
except (compat_urllib_error.HTTPError, ) as err:
if err.code < 500 or err.code >= 600:
raise
else:
# Examine the reported length
if (content_length is not None and
- (resume_len - 100 < int(content_length) < resume_len + 100)):
+ (ctx.resume_len - 100 < int(content_length) < ctx.resume_len + 100)):
# The file had already been fully downloaded.
# Explanation to the above condition: in issue #175 it was revealed that
# YouTube sometimes adds or removes a few bytes from the end of the file,
@@ -102,152 +115,184 @@ class HttpFD(FileDownloader):
# I decided to implement a suggested change and consider the file
# completely downloaded if the file size differs less than 100 bytes from
# the one in the hard drive.
- self.report_file_already_downloaded(filename)
- self.try_rename(tmpfilename, filename)
+ self.report_file_already_downloaded(ctx.filename)
+ self.try_rename(ctx.tmpfilename, ctx.filename)
self._hook_progress({
- 'filename': filename,
+ 'filename': ctx.filename,
'status': 'finished',
- 'downloaded_bytes': resume_len,
- 'total_bytes': resume_len,
+ 'downloaded_bytes': ctx.resume_len,
+ 'total_bytes': ctx.resume_len,
})
- return True
+ raise SucceedDownload()
else:
# The length does not match, we start the download over
self.report_unable_to_resume()
- resume_len = 0
- open_mode = 'wb'
- break
- except socket.error as e:
- if e.errno != errno.ECONNRESET:
+ ctx.resume_len = 0
+ ctx.open_mode = 'wb'
+ return
+ raise RetryDownload(err)
+ except socket.error as err:
+ if err.errno != errno.ECONNRESET:
# Connection reset is no problem, just retry
raise
+ raise RetryDownload(err)
+
+ def download():
+ data_len = ctx.data.info().get('Content-length', None)
+
+ # Range HTTP header may be ignored/unsupported by a webserver
+ # (e.g. extractor/scivee.py, extractor/bambuser.py).
+ # However, for a test we still would like to download just a piece of a file.
+ # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control
+ # block size when downloading a file.
+ if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE):
+ data_len = self._TEST_FILE_SIZE
+
+ if data_len is not None:
+ data_len = int(data_len) + ctx.resume_len
+ min_data_len = self.params.get('min_filesize')
+ max_data_len = self.params.get('max_filesize')
+ if min_data_len is not None and data_len < min_data_len:
+ self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len))
+ return False
+ if max_data_len is not None and data_len > max_data_len:
+ self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
+ return False
- # Retry
- count += 1
- if count <= retries:
- self.report_retry(count, retries)
-
- if count > retries:
- self.report_error('giving up after %s retries' % retries)
- return False
-
- data_len = data.info().get('Content-length', None)
-
- # Range HTTP header may be ignored/unsupported by a webserver
- # (e.g. extractor/scivee.py, extractor/bambuser.py).
- # However, for a test we still would like to download just a piece of a file.
- # To achieve this we limit data_len to _TEST_FILE_SIZE and manually control
- # block size when downloading a file.
- if is_test and (data_len is None or int(data_len) > self._TEST_FILE_SIZE):
- data_len = self._TEST_FILE_SIZE
-
- if data_len is not None:
- data_len = int(data_len) + resume_len
- min_data_len = self.params.get('min_filesize')
- max_data_len = self.params.get('max_filesize')
- if min_data_len is not None and data_len < min_data_len:
- self.to_screen('\r[download] File is smaller than min-filesize (%s bytes < %s bytes). Aborting.' % (data_len, min_data_len))
- return False
- if max_data_len is not None and data_len > max_data_len:
- self.to_screen('\r[download] File is larger than max-filesize (%s bytes > %s bytes). Aborting.' % (data_len, max_data_len))
- return False
-
- byte_counter = 0 + resume_len
- block_size = self.params.get('buffersize', 1024)
- start = time.time()
+ byte_counter = 0 + ctx.resume_len
+ block_size = self.params.get('buffersize', 1024)
+ start = time.time()
- # measure time over whole while-loop, so slow_down() and best_block_size() work together properly
- now = None # needed for slow_down() in the first loop run
- before = start # start measuring
- while True:
+ # measure time over whole while-loop, so slow_down() and best_block_size() work together properly
+ now = None # needed for slow_down() in the first loop run
+ before = start # start measuring
- # Download and write
- data_block = data.read(block_size if not is_test else min(block_size, data_len - byte_counter))
- byte_counter += len(data_block)
+ def retry(e):
+ if ctx.tmpfilename != '-':
+ ctx.stream.close()
+ ctx.stream = None
+ ctx.resume_len = os.path.getsize(encodeFilename(ctx.tmpfilename))
+ raise RetryDownload(e)
- # exit loop when download is finished
- if len(data_block) == 0:
- break
+ while True:
+ try:
+ # Download and write
+ data_block = ctx.data.read(block_size if not is_test else min(block_size, data_len - byte_counter))
+ # socket.timeout is a subclass of socket.error but may not have
+ # errno set
+ except socket.timeout as e:
+ retry(e)
+ except socket.error as e:
+ if e.errno not in (errno.ECONNRESET, errno.ETIMEDOUT):
+ raise
+ retry(e)
+
+ byte_counter += len(data_block)
+
+ # exit loop when download is finished
+ if len(data_block) == 0:
+ break
+
+ # Open destination file just in time
+ if ctx.stream is None:
+ try:
+ ctx.stream, ctx.tmpfilename = sanitize_open(
+ ctx.tmpfilename, ctx.open_mode)
+ assert ctx.stream is not None
+ ctx.filename = self.undo_temp_name(ctx.tmpfilename)
+ self.report_destination(ctx.filename)
+ except (OSError, IOError) as err:
+ self.report_error('unable to open for writing: %s' % str(err))
+ return False
+
+ if self.params.get('xattr_set_filesize', False) and data_len is not None:
+ try:
+ write_xattr(ctx.tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8'))
+ except (XAttrUnavailableError, XAttrMetadataError) as err:
+ self.report_error('unable to set filesize xattr: %s' % str(err))
- # Open destination file just in time
- if stream is None:
try:
- (stream, tmpfilename) = sanitize_open(tmpfilename, open_mode)
- assert stream is not None
- filename = self.undo_temp_name(tmpfilename)
- self.report_destination(filename)
- except (OSError, IOError) as err:
- self.report_error('unable to open for writing: %s' % str(err))
+ ctx.stream.write(data_block)
+ except (IOError, OSError) as err:
+ self.to_stderr('\n')
+ self.report_error('unable to write data: %s' % str(err))
return False
- if self.params.get('xattr_set_filesize', False) and data_len is not None:
- try:
- write_xattr(tmpfilename, 'user.ytdl.filesize', str(data_len).encode('utf-8'))
- except (XAttrUnavailableError, XAttrMetadataError) as err:
- self.report_error('unable to set filesize xattr: %s' % str(err))
-
- try:
- stream.write(data_block)
- except (IOError, OSError) as err:
+ # Apply rate limit
+ self.slow_down(start, now, byte_counter - ctx.resume_len)
+
+ # end measuring of one loop run
+ now = time.time()
+ after = now
+
+ # Adjust block size
+ if not self.params.get('noresizebuffer', False):
+ block_size = self.best_block_size(after - before, len(data_block))
+
+ before = after
+
+ # Progress message
+ speed = self.calc_speed(start, now, byte_counter - ctx.resume_len)
+ if data_len is None:
+ eta = None
+ else:
+ eta = self.calc_eta(start, time.time(), data_len - ctx.resume_len, byte_counter - ctx.resume_len)
+
+ self._hook_progress({
+ 'status': 'downloading',
+ 'downloaded_bytes': byte_counter,
+ 'total_bytes': data_len,
+ 'tmpfilename': ctx.tmpfilename,
+ 'filename': ctx.filename,
+ 'eta': eta,
+ 'speed': speed,
+ 'elapsed': now - start,
+ })
+
+ if is_test and byte_counter == data_len:
+ break
+
+ if ctx.stream is None:
self.to_stderr('\n')
- self.report_error('unable to write data: %s' % str(err))
+ self.report_error('Did not get any data blocks')
return False
+ if ctx.tmpfilename != '-':
+ ctx.stream.close()
- # Apply rate limit
- self.slow_down(start, now, byte_counter - resume_len)
+ if data_len is not None and byte_counter != data_len:
+ err = ContentTooShortError(byte_counter, int(data_len))
+ if count <= retries:
+ retry(err)
+ raise err
- # end measuring of one loop run
- now = time.time()
- after = now
+ self.try_rename(ctx.tmpfilename, ctx.filename)
- # Adjust block size
- if not self.params.get('noresizebuffer', False):
- block_size = self.best_block_size(after - before, len(data_block))
-
- before = after
-
- # Progress message
- speed = self.calc_speed(start, now, byte_counter - resume_len)
- if data_len is None:
- eta = None
- else:
- eta = self.calc_eta(start, time.time(), data_len - resume_len, byte_counter - resume_len)
+ # Update file modification time
+ if self.params.get('updatetime', True):
+ info_dict['filetime'] = self.try_utime(ctx.filename, ctx.data.info().get('last-modified', None))
self._hook_progress({
- 'status': 'downloading',
'downloaded_bytes': byte_counter,
- 'total_bytes': data_len,
- 'tmpfilename': tmpfilename,
- 'filename': filename,
- 'eta': eta,
- 'speed': speed,
- 'elapsed': now - start,
+ 'total_bytes': byte_counter,
+ 'filename': ctx.filename,
+ 'status': 'finished',
+ 'elapsed': time.time() - start,
})
- if is_test and byte_counter == data_len:
- break
-
- if stream is None:
- self.to_stderr('\n')
- self.report_error('Did not get any data blocks')
- return False
- if tmpfilename != '-':
- stream.close()
-
- if data_len is not None and byte_counter != data_len:
- raise ContentTooShortError(byte_counter, int(data_len))
- self.try_rename(tmpfilename, filename)
-
- # Update file modification time
- if self.params.get('updatetime', True):
- info_dict['filetime'] = self.try_utime(filename, data.info().get('last-modified', None))
-
- self._hook_progress({
- 'downloaded_bytes': byte_counter,
- 'total_bytes': byte_counter,
- 'filename': filename,
- 'status': 'finished',
- 'elapsed': time.time() - start,
- })
-
- return True
+ return True
+
+ while count <= retries:
+ try:
+ establish_connection()
+ download()
+ return True
+ except RetryDownload as e:
+ count += 1
+ if count <= retries:
+ self.report_retry(e.source_error, count, retries)
+ continue
+ except SucceedDownload:
+ return True
+
+ self.report_error('giving up after %s retries' % retries)
+ return False
diff --git a/youtube_dl/downloader/ism.py b/youtube_dl/downloader/ism.py
index 5f6f9fa..9b001ec 100644
--- a/youtube_dl/downloader/ism.py
+++ b/youtube_dl/downloader/ism.py
@@ -98,7 +98,7 @@ def write_piff_header(stream, params):
if is_audio:
smhd_payload = s88.pack(0) # balance
- smhd_payload = u16.pack(0) # reserved
+ smhd_payload += u16.pack(0) # reserved
media_header_box = full_box(b'smhd', 0, 0, smhd_payload) # Sound Media Header
else:
vmhd_payload = u16.pack(0) # graphics mode
@@ -126,7 +126,6 @@ def write_piff_header(stream, params):
if fourcc == 'AACL':
sample_entry_box = box(b'mp4a', sample_entry_payload)
else:
- sample_entry_payload = sample_entry_payload
sample_entry_payload += u16.pack(0) # pre defined
sample_entry_payload += u16.pack(0) # reserved
sample_entry_payload += u32.pack(0) * 3 # pre defined
diff --git a/youtube_dl/extractor/abc.py b/youtube_dl/extractor/abc.py
index 0247cab..60f753b 100644
--- a/youtube_dl/extractor/abc.py
+++ b/youtube_dl/extractor/abc.py
@@ -3,11 +3,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
ExtractorError,
js_to_json,
int_or_none,
parse_iso8601,
+ try_get,
)
@@ -124,7 +126,20 @@ class ABCIViewIE(InfoExtractor):
title = video_params.get('title') or video_params['seriesTitle']
stream = next(s for s in video_params['playlist'] if s.get('type') == 'program')
- formats = self._extract_akamai_formats(stream['hds-unmetered'], video_id)
+ format_urls = [
+ try_get(stream, lambda x: x['hds-unmetered'], compat_str)]
+
+ # May have higher quality video
+ sd_url = try_get(
+ stream, lambda x: x['streams']['hds']['sd'], compat_str)
+ if sd_url:
+ format_urls.append(sd_url.replace('metered', 'um'))
+
+ formats = []
+ for format_url in format_urls:
+ if format_url:
+ formats.extend(
+ self._extract_akamai_formats(format_url, video_id))
self._sort_formats(formats)
subtitles = {}
diff --git a/youtube_dl/extractor/abcnews.py b/youtube_dl/extractor/abcnews.py
index 74d5456..f770fe9 100644
--- a/youtube_dl/extractor/abcnews.py
+++ b/youtube_dl/extractor/abcnews.py
@@ -7,6 +7,7 @@ import time
from .amp import AMPIE
from .common import InfoExtractor
+from .youtube import YoutubeIE
from ..compat import compat_urlparse
@@ -108,9 +109,7 @@ class AbcNewsIE(InfoExtractor):
r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL')
full_video_url = compat_urlparse.urljoin(url, video_url)
- youtube_url = self._html_search_regex(
- r'<iframe[^>]+src="(https://www\.youtube\.com/embed/[^"]+)"',
- webpage, 'YouTube URL', default=None)
+ youtube_url = YoutubeIE._extract_url(webpage)
timestamp = None
date_str = self._html_search_regex(
@@ -140,7 +139,7 @@ class AbcNewsIE(InfoExtractor):
}
if youtube_url:
- entries = [entry, self.url_result(youtube_url, 'Youtube')]
+ entries = [entry, self.url_result(youtube_url, ie=YoutubeIE.ie_key())]
return self.playlist_result(entries)
return entry
diff --git a/youtube_dl/extractor/adn.py b/youtube_dl/extractor/adn.py
index 39f80b2..cffdab6 100644
--- a/youtube_dl/extractor/adn.py
+++ b/youtube_dl/extractor/adn.py
@@ -107,11 +107,13 @@ class ADNIE(InfoExtractor):
metas = options.get('metas') or {}
title = metas.get('title') or video_info['title']
links = player_config.get('links') or {}
+ error = None
if not links:
links_url = player_config['linksurl']
links_data = self._download_json(urljoin(
self._BASE_URL, links_url), video_id)
links = links_data.get('links') or {}
+ error = links_data.get('error')
formats = []
for format_id, qualities in links.items():
@@ -130,7 +132,8 @@ class ADNIE(InfoExtractor):
for f in m3u8_formats:
f['language'] = 'fr'
formats.extend(m3u8_formats)
- error = options.get('error')
+ if not error:
+ error = options.get('error')
if not formats and error:
raise ExtractorError('%s said: %s' % (self.IE_NAME, error), expected=True)
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/aliexpress.py b/youtube_dl/extractor/aliexpress.py
new file mode 100644
index 0000000..6f241e6
--- /dev/null
+++ b/youtube_dl/extractor/aliexpress.py
@@ -0,0 +1,53 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ float_or_none,
+ try_get,
+)
+
+
+class AliExpressLiveIE(InfoExtractor):
+ _VALID_URL = r'https?://live\.aliexpress\.com/live/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://live.aliexpress.com/live/2800002704436634',
+ 'md5': 'e729e25d47c5e557f2630eaf99b740a5',
+ 'info_dict': {
+ 'id': '2800002704436634',
+ 'ext': 'mp4',
+ 'title': 'CASIMA7.22',
+ 'thumbnail': r're:http://.*\.jpg',
+ 'uploader': 'CASIMA Official Store',
+ 'timestamp': 1500717600,
+ 'upload_date': '20170722',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ data = self._parse_json(
+ self._search_regex(
+ r'(?s)runParams\s*=\s*({.+?})\s*;?\s*var',
+ webpage, 'runParams'),
+ video_id)
+
+ title = data['title']
+
+ formats = self._extract_m3u8_formats(
+ data['replyStreamUrl'], video_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls')
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'thumbnail': data.get('coverUrl'),
+ 'uploader': try_get(
+ data, lambda x: x['followBar']['name'], compat_str),
+ 'timestamp': float_or_none(data.get('startTimeLong'), scale=1000),
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/amcnetworks.py b/youtube_dl/extractor/amcnetworks.py
index 3a0ec67..dd3b18d 100644
--- a/youtube_dl/extractor/amcnetworks.py
+++ b/youtube_dl/extractor/amcnetworks.py
@@ -3,9 +3,10 @@ from __future__ import unicode_literals
from .theplatform import ThePlatformIE
from ..utils import (
- update_url_query,
- parse_age_limit,
int_or_none,
+ parse_age_limit,
+ try_get,
+ update_url_query,
)
@@ -68,7 +69,8 @@ class AMCNetworksIE(ThePlatformIE):
info = self._parse_theplatform_metadata(theplatform_metadata)
video_id = theplatform_metadata['pid']
title = theplatform_metadata['title']
- rating = theplatform_metadata['ratings'][0]['rating']
+ rating = try_get(
+ theplatform_metadata, lambda x: x['ratings'][0]['rating'])
auth_required = self._search_regex(
r'window\.authRequired\s*=\s*(true|false);',
webpage, 'auth required')
diff --git a/youtube_dl/extractor/animeondemand.py b/youtube_dl/extractor/animeondemand.py
index 9e28f25..69d3633 100644
--- a/youtube_dl/extractor/animeondemand.py
+++ b/youtube_dl/extractor/animeondemand.py
@@ -3,16 +3,13 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..compat import (
- compat_urlparse,
- compat_str,
-)
+from ..compat import compat_str
from ..utils import (
determine_ext,
extract_attributes,
ExtractorError,
- sanitized_Request,
urlencode_postdata,
+ urljoin,
)
@@ -21,6 +18,8 @@ class AnimeOnDemandIE(InfoExtractor):
_LOGIN_URL = 'https://www.anime-on-demand.de/users/sign_in'
_APPLY_HTML5_URL = 'https://www.anime-on-demand.de/html5apply'
_NETRC_MACHINE = 'animeondemand'
+ # German-speaking countries of Europe
+ _GEO_COUNTRIES = ['AT', 'CH', 'DE', 'LI', 'LU']
_TESTS = [{
# jap, OmU
'url': 'https://www.anime-on-demand.de/anime/161',
@@ -46,6 +45,10 @@ class AnimeOnDemandIE(InfoExtractor):
# Full length film, non-series, ger/jap, Dub/OmU, account required
'url': 'https://www.anime-on-demand.de/anime/185',
'only_matching': True,
+ }, {
+ # Flash videos
+ 'url': 'https://www.anime-on-demand.de/anime/12',
+ 'only_matching': True,
}]
def _login(self):
@@ -72,14 +75,13 @@ class AnimeOnDemandIE(InfoExtractor):
'post url', default=self._LOGIN_URL, group='url')
if not post_url.startswith('http'):
- post_url = compat_urlparse.urljoin(self._LOGIN_URL, post_url)
-
- request = sanitized_Request(
- post_url, urlencode_postdata(login_form))
- request.add_header('Referer', self._LOGIN_URL)
+ post_url = urljoin(self._LOGIN_URL, post_url)
response = self._download_webpage(
- request, None, 'Logging in as %s' % username)
+ post_url, None, 'Logging in as %s' % username,
+ data=urlencode_postdata(login_form), headers={
+ 'Referer': self._LOGIN_URL,
+ })
if all(p not in response for p in ('>Logout<', 'href="/users/sign_out"')):
error = self._search_regex(
@@ -120,10 +122,11 @@ class AnimeOnDemandIE(InfoExtractor):
formats = []
for input_ in re.findall(
- r'<input[^>]+class=["\'].*?streamstarter_html5[^>]+>', html):
+ r'<input[^>]+class=["\'].*?streamstarter[^>]+>', html):
attributes = extract_attributes(input_)
+ title = attributes.get('data-dialog-header')
playlist_urls = []
- for playlist_key in ('data-playlist', 'data-otherplaylist'):
+ for playlist_key in ('data-playlist', 'data-otherplaylist', 'data-stream'):
playlist_url = attributes.get(playlist_key)
if isinstance(playlist_url, compat_str) and re.match(
r'/?[\da-zA-Z]+', playlist_url):
@@ -147,19 +150,38 @@ class AnimeOnDemandIE(InfoExtractor):
format_id_list.append(compat_str(num))
format_id = '-'.join(format_id_list)
format_note = ', '.join(filter(None, (kind, lang_note)))
- request = sanitized_Request(
- compat_urlparse.urljoin(url, playlist_url),
+ item_id_list = []
+ if format_id:
+ item_id_list.append(format_id)
+ item_id_list.append('videomaterial')
+ playlist = self._download_json(
+ urljoin(url, playlist_url), video_id,
+ 'Downloading %s JSON' % ' '.join(item_id_list),
headers={
'X-Requested-With': 'XMLHttpRequest',
'X-CSRF-Token': csrf_token,
'Referer': url,
'Accept': 'application/json, text/javascript, */*; q=0.01',
- })
- playlist = self._download_json(
- request, video_id, 'Downloading %s playlist JSON' % format_id,
- fatal=False)
+ }, fatal=False)
if not playlist:
continue
+ stream_url = playlist.get('streamurl')
+ if stream_url:
+ rtmp = re.search(
+ r'^(?P<url>rtmpe?://(?P<host>[^/]+)/(?P<app>.+/))(?P<playpath>mp[34]:.+)',
+ stream_url)
+ if rtmp:
+ formats.append({
+ 'url': rtmp.group('url'),
+ 'app': rtmp.group('app'),
+ 'play_path': rtmp.group('playpath'),
+ 'page_url': url,
+ 'player_url': 'https://www.anime-on-demand.de/assets/jwplayer.flash-55abfb34080700304d49125ce9ffb4a6.swf',
+ 'rtmp_real_time': True,
+ 'format_id': 'rtmp',
+ 'ext': 'flv',
+ })
+ continue
start_video = playlist.get('startvideo', 0)
playlist = playlist.get('playlist')
if not playlist or not isinstance(playlist, list):
@@ -222,7 +244,7 @@ class AnimeOnDemandIE(InfoExtractor):
f.update({
'id': '%s-%s' % (f['id'], m.group('kind').lower()),
'title': m.group('title'),
- 'url': compat_urlparse.urljoin(url, m.group('href')),
+ 'url': urljoin(url, m.group('href')),
})
entries.append(f)
diff --git a/youtube_dl/extractor/aparat.py b/youtube_dl/extractor/aparat.py
index 025e29a..e394cb6 100644
--- a/youtube_dl/extractor/aparat.py
+++ b/youtube_dl/extractor/aparat.py
@@ -3,13 +3,13 @@ from __future__ import unicode_literals
from .common import InfoExtractor
from ..utils import (
- ExtractorError,
- HEADRequest,
+ int_or_none,
+ mimetype2ext,
)
class AparatIE(InfoExtractor):
- _VALID_URL = r'^https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'
+ _VALID_URL = r'https?://(?:www\.)?aparat\.com/(?:v/|video/video/embed/videohash/)(?P<id>[a-zA-Z0-9]+)'
_TEST = {
'url': 'http://www.aparat.com/v/wP8On',
@@ -29,30 +29,41 @@ class AparatIE(InfoExtractor):
# Note: There is an easier-to-parse configuration at
# http://www.aparat.com/video/video/config/videohash/%video_id
# but the URL in there does not work
- embed_url = 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id
- webpage = self._download_webpage(embed_url, video_id)
-
- file_list = self._parse_json(self._search_regex(
- r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage, 'file list'), video_id)
- for i, item in enumerate(file_list[0]):
- video_url = item['file']
- req = HEADRequest(video_url)
- res = self._request_webpage(
- req, video_id, note='Testing video URL %d' % i, errnote=False)
- if res:
- break
- else:
- raise ExtractorError('No working video URLs found')
+ webpage = self._download_webpage(
+ 'http://www.aparat.com/video/video/embed/vt/frame/showvideo/yes/videohash/' + video_id,
+ video_id)
title = self._search_regex(r'\s+title:\s*"([^"]+)"', webpage, 'title')
+
+ file_list = self._parse_json(
+ self._search_regex(
+ r'fileList\s*=\s*JSON\.parse\(\'([^\']+)\'\)', webpage,
+ 'file list'),
+ video_id)
+
+ formats = []
+ for item in file_list[0]:
+ file_url = item.get('file')
+ if not file_url:
+ continue
+ ext = mimetype2ext(item.get('type'))
+ label = item.get('label')
+ formats.append({
+ 'url': file_url,
+ 'ext': ext,
+ 'format_id': label or ext,
+ 'height': int_or_none(self._search_regex(
+ r'(\d+)[pP]', label or '', 'height', default=None)),
+ })
+ self._sort_formats(formats)
+
thumbnail = self._search_regex(
r'image:\s*"([^"]+)"', webpage, 'thumbnail', fatal=False)
return {
'id': video_id,
'title': title,
- 'url': video_url,
- 'ext': 'mp4',
'thumbnail': thumbnail,
'age_limit': self._family_friendly_search(webpage),
+ 'formats': formats,
}
diff --git a/youtube_dl/extractor/ard.py b/youtube_dl/extractor/ard.py
index 2d55994..3f248b1 100644
--- a/youtube_dl/extractor/ard.py
+++ b/youtube_dl/extractor/ard.py
@@ -93,6 +93,7 @@ class ARDMediathekIE(InfoExtractor):
duration = int_or_none(media_info.get('_duration'))
thumbnail = media_info.get('_previewImage')
+ is_live = media_info.get('_isLive') is True
subtitles = {}
subtitle_url = media_info.get('_subtitleUrl')
@@ -106,6 +107,7 @@ class ARDMediathekIE(InfoExtractor):
'id': video_id,
'duration': duration,
'thumbnail': thumbnail,
+ 'is_live': is_live,
'formats': formats,
'subtitles': subtitles,
}
@@ -166,9 +168,11 @@ class ARDMediathekIE(InfoExtractor):
# determine video id from url
m = re.match(self._VALID_URL, url)
+ document_id = None
+
numid = re.search(r'documentId=([0-9]+)', url)
if numid:
- video_id = numid.group(1)
+ document_id = video_id = numid.group(1)
else:
video_id = m.group('video_id')
@@ -228,12 +232,16 @@ class ARDMediathekIE(InfoExtractor):
'formats': formats,
}
else: # request JSON file
+ if not document_id:
+ video_id = self._search_regex(
+ r'/play/(?:config|media)/(\d+)', webpage, 'media id')
info = self._extract_media_info(
- 'http://www.ardmediathek.de/play/media/%s' % video_id, webpage, video_id)
+ 'http://www.ardmediathek.de/play/media/%s' % video_id,
+ webpage, video_id)
info.update({
'id': video_id,
- 'title': title,
+ 'title': self._live_title(title) if info.get('is_live') else title,
'description': description,
'thumbnail': thumbnail,
})
diff --git a/youtube_dl/extractor/arte.py b/youtube_dl/extractor/arte.py
index 56baef2..5cde90c 100644
--- a/youtube_dl/extractor/arte.py
+++ b/youtube_dl/extractor/arte.py
@@ -9,12 +9,13 @@ from ..compat import (
compat_urllib_parse_urlparse,
)
from ..utils import (
+ ExtractorError,
find_xpath_attr,
- unified_strdate,
get_element_by_attribute,
int_or_none,
NO_DEFAULT,
qualities,
+ unified_strdate,
)
# There are different sources of video in arte.tv, the extraction process
@@ -79,6 +80,13 @@ class ArteTVBaseIE(InfoExtractor):
info = self._download_json(json_url, video_id)
player_info = info['videoJsonPlayer']
+ vsr = player_info['VSR']
+
+ if not vsr:
+ raise ExtractorError(
+ 'Video %s is not available' % player_info.get('VID') or video_id,
+ expected=True)
+
upload_date_str = player_info.get('shootingDate')
if not upload_date_str:
upload_date_str = (player_info.get('VRA') or player_info.get('VDA') or '').split(' ')[0]
@@ -107,7 +115,7 @@ class ArteTVBaseIE(InfoExtractor):
langcode = LANGS.get(lang, lang)
formats = []
- for format_id, format_dict in player_info['VSR'].items():
+ for format_id, format_dict in vsr.items():
f = dict(format_dict)
versionCode = f.get('versionCode')
l = re.escape(langcode)
diff --git a/youtube_dl/extractor/audioboom.py b/youtube_dl/extractor/audioboom.py
index e48bb89..393f381 100644
--- a/youtube_dl/extractor/audioboom.py
+++ b/youtube_dl/extractor/audioboom.py
@@ -43,7 +43,7 @@ class AudioBoomIE(InfoExtractor):
def from_clip(field):
if clip:
- clip.get(field)
+ return clip.get(field)
audio_url = from_clip('clipURLPriorToLoading') or self._og_search_property(
'audio', webpage, 'audio url')
diff --git a/youtube_dl/extractor/bandcamp.py b/youtube_dl/extractor/bandcamp.py
index 9ddb9af..be41bd5 100644
--- a/youtube_dl/extractor/bandcamp.py
+++ b/youtube_dl/extractor/bandcamp.py
@@ -242,7 +242,12 @@ class BandcampAlbumIE(InfoExtractor):
raise ExtractorError('The page doesn\'t contain any tracks')
# Only tracks with duration info have songs
entries = [
- self.url_result(compat_urlparse.urljoin(url, t_path), ie=BandcampIE.ie_key())
+ self.url_result(
+ compat_urlparse.urljoin(url, t_path),
+ ie=BandcampIE.ie_key(),
+ video_title=self._search_regex(
+ r'<span\b[^>]+\bitemprop=["\']name["\'][^>]*>([^<]+)',
+ elem_content, 'track title', fatal=False))
for elem_content, t_path in track_elements
if self._html_search_meta('duration', elem_content, default=None)]
diff --git a/youtube_dl/extractor/bbc.py b/youtube_dl/extractor/bbc.py
index 79ded6b..8b20c03 100644
--- a/youtube_dl/extractor/bbc.py
+++ b/youtube_dl/extractor/bbc.py
@@ -29,7 +29,7 @@ from ..compat import (
class BBCCoUkIE(InfoExtractor):
IE_NAME = 'bbc.co.uk'
IE_DESC = 'BBC iPlayer'
- _ID_REGEX = r'[pb][\da-z]{7}'
+ _ID_REGEX = r'[pbw][\da-z]{7}'
_VALID_URL = r'''(?x)
https?://
(?:www\.)?bbc\.co\.uk/
@@ -37,7 +37,8 @@ class BBCCoUkIE(InfoExtractor):
programmes/(?!articles/)|
iplayer(?:/[^/]+)?/(?:episode/|playlist/)|
music/(?:clips|audiovideo/popular)[/#]|
- radio/player/
+ radio/player/|
+ events/[^/]+/play/[^/]+/
)
(?P<id>%s)(?!/(?:episodes|broadcasts|clips))
''' % _ID_REGEX
@@ -232,6 +233,9 @@ class BBCCoUkIE(InfoExtractor):
}, {
'url': 'https://www.bbc.co.uk/music/audiovideo/popular#p055bc55',
'only_matching': True,
+ }, {
+ 'url': 'http://www.bbc.co.uk/programmes/w3csv1y9',
+ 'only_matching': True,
}]
_USP_RE = r'/([^/]+?)\.ism(?:\.hlsv2\.ism)?/[^/]+\.m3u8'
diff --git a/youtube_dl/extractor/bpb.py b/youtube_dl/extractor/bpb.py
index 9661ade..0783353 100644
--- a/youtube_dl/extractor/bpb.py
+++ b/youtube_dl/extractor/bpb.py
@@ -33,13 +33,18 @@ class BpbIE(InfoExtractor):
title = self._html_search_regex(
r'<h2 class="white">(.*?)</h2>', webpage, 'title')
video_info_dicts = re.findall(
- r"({\s*src:\s*'http://film\.bpb\.de/[^}]+})", webpage)
+ r"({\s*src\s*:\s*'https?://film\.bpb\.de/[^}]+})", webpage)
formats = []
for video_info in video_info_dicts:
- video_info = self._parse_json(video_info, video_id, transform_source=js_to_json)
- quality = video_info['quality']
- video_url = video_info['src']
+ video_info = self._parse_json(
+ video_info, video_id, transform_source=js_to_json, fatal=False)
+ if not video_info:
+ continue
+ video_url = video_info.get('src')
+ if not video_url:
+ continue
+ quality = 'high' if '_high' in video_url else 'low'
formats.append({
'url': video_url,
'preference': 10 if quality == 'high' else 0,
diff --git a/youtube_dl/extractor/cbc.py b/youtube_dl/extractor/cbc.py
index 87ad14e..9faf402 100644
--- a/youtube_dl/extractor/cbc.py
+++ b/youtube_dl/extractor/cbc.py
@@ -200,6 +200,7 @@ class CBCWatchBaseIE(InfoExtractor):
'media': 'http://search.yahoo.com/mrss/',
'clearleap': 'http://www.clearleap.com/namespace/clearleap/1.0/',
}
+ _GEO_COUNTRIES = ['CA']
def _call_api(self, path, video_id):
url = path if path.startswith('http') else self._API_BASE_URL + path
@@ -287,6 +288,11 @@ class CBCWatchBaseIE(InfoExtractor):
class CBCWatchVideoIE(CBCWatchBaseIE):
IE_NAME = 'cbc.ca:watch:video'
_VALID_URL = r'https?://api-cbc\.cloud\.clearleap\.com/cloffice/client/web/play/?\?.*?\bcontentId=(?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})'
+ _TEST = {
+ # geo-restricted to Canada, bypassable
+ 'url': 'https://api-cbc.cloud.clearleap.com/cloffice/client/web/play/?contentId=3c84472a-1eea-4dee-9267-2655d5055dcf&categoryId=ebc258f5-ee40-4cca-b66b-ba6bd55b7235',
+ 'only_matching': True,
+ }
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -323,9 +329,10 @@ class CBCWatchIE(CBCWatchBaseIE):
IE_NAME = 'cbc.ca:watch'
_VALID_URL = r'https?://watch\.cbc\.ca/(?:[^/]+/)+(?P<id>[0-9a-f-]+)'
_TESTS = [{
+ # geo-restricted to Canada, bypassable
'url': 'http://watch.cbc.ca/doc-zone/season-6/customer-disservice/38e815a-009e3ab12e4',
'info_dict': {
- 'id': '38e815a-009e3ab12e4',
+ 'id': '9673749a-5e77-484c-8b62-a1092a6b5168',
'ext': 'mp4',
'title': 'Customer (Dis)Service',
'description': 'md5:8bdd6913a0fe03d4b2a17ebe169c7c87',
@@ -337,8 +344,8 @@ class CBCWatchIE(CBCWatchBaseIE):
'skip_download': True,
'format': 'bestvideo',
},
- 'skip': 'Geo-restricted to Canada',
}, {
+ # geo-restricted to Canada, bypassable
'url': 'http://watch.cbc.ca/arthur/all/1ed4b385-cd84-49cf-95f0-80f004680057',
'info_dict': {
'id': '1ed4b385-cd84-49cf-95f0-80f004680057',
@@ -346,7 +353,6 @@ class CBCWatchIE(CBCWatchBaseIE):
'description': 'Arthur, the sweetest 8-year-old aardvark, and his pals solve all kinds of problems with humour, kindness and teamwork.',
},
'playlist_mincount': 30,
- 'skip': 'Geo-restricted to Canada',
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/cda.py b/youtube_dl/extractor/cda.py
index 78b7a92..0c3af23 100755
--- a/youtube_dl/extractor/cda.py
+++ b/youtube_dl/extractor/cda.py
@@ -124,7 +124,7 @@ class CDAIE(InfoExtractor):
}
def extract_format(page, version):
- json_str = self._search_regex(
+ json_str = self._html_search_regex(
r'player_data=(\\?["\'])(?P<player_data>.+?)\1', page,
'%s player_json' % version, fatal=False, group='player_data')
if not json_str:
diff --git a/youtube_dl/extractor/charlierose.py b/youtube_dl/extractor/charlierose.py
index 2d517f2..42c9af2 100644
--- a/youtube_dl/extractor/charlierose.py
+++ b/youtube_dl/extractor/charlierose.py
@@ -5,7 +5,7 @@ from ..utils import remove_end
class CharlieRoseIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?charlierose\.com/video(?:s|/player)/(?P<id>\d+)'
+ _VALID_URL = r'https?://(?:www\.)?charlierose\.com/(?:video|episode)(?:s|/player)/(?P<id>\d+)'
_TESTS = [{
'url': 'https://charlierose.com/videos/27996',
'md5': 'fda41d49e67d4ce7c2411fd2c4702e09',
@@ -24,6 +24,9 @@ class CharlieRoseIE(InfoExtractor):
}, {
'url': 'https://charlierose.com/videos/27996',
'only_matching': True,
+ }, {
+ 'url': 'https://charlierose.com/episodes/30887?autoplay=true',
+ 'only_matching': True,
}]
_PLAYER_BASE = 'https://charlierose.com/video/player/%s'
diff --git a/youtube_dl/extractor/chilloutzone.py b/youtube_dl/extractor/chilloutzone.py
index 0206d96..d4769da 100644
--- a/youtube_dl/extractor/chilloutzone.py
+++ b/youtube_dl/extractor/chilloutzone.py
@@ -5,6 +5,7 @@ import base64
import json
from .common import InfoExtractor
+from .youtube import YoutubeIE
from ..utils import (
clean_html,
ExtractorError
@@ -70,11 +71,9 @@ class ChilloutzoneIE(InfoExtractor):
# If nativePlatform is None a fallback mechanism is used (i.e. youtube embed)
if native_platform is None:
- youtube_url = self._html_search_regex(
- r'<iframe.* src="((?:https?:)?//(?:[^.]+\.)?youtube\.com/.+?)"',
- webpage, 'fallback video URL', default=None)
- if youtube_url is not None:
- return self.url_result(youtube_url, ie='Youtube')
+ youtube_url = YoutubeIE._extract_url(webpage)
+ if youtube_url:
+ return self.url_result(youtube_url, ie=YoutubeIE.ie_key())
# Non Fallback: Decide to use native source (e.g. youtube or vimeo) or
# the own CDN
diff --git a/youtube_dl/extractor/cinchcast.py b/youtube_dl/extractor/cinchcast.py
index 562c9bb..b861d54 100644
--- a/youtube_dl/extractor/cinchcast.py
+++ b/youtube_dl/extractor/cinchcast.py
@@ -9,12 +9,20 @@ from ..utils import (
class CinchcastIE(InfoExtractor):
- _VALID_URL = r'https?://player\.cinchcast\.com/.*?assetId=(?P<id>[0-9]+)'
- _TEST = {
+ _VALID_URL = r'https?://player\.cinchcast\.com/.*?(?:assetId|show_id)=(?P<id>[0-9]+)'
+ _TESTS = [{
+ 'url': 'http://player.cinchcast.com/?show_id=5258197&platformId=1&assetType=single',
+ 'info_dict': {
+ 'id': '5258197',
+ 'ext': 'mp3',
+ 'title': 'Train Your Brain to Up Your Game with Coach Mandy',
+ 'upload_date': '20130816',
+ },
+ }, {
# Actual test is run in generic, look for undergroundwellness
'url': 'http://player.cinchcast.com/?platformId=1&#038;assetType=single&#038;assetId=7141703',
'only_matching': True,
- }
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/cjsw.py b/youtube_dl/extractor/cjsw.py
new file mode 100644
index 0000000..505bdbe
--- /dev/null
+++ b/youtube_dl/extractor/cjsw.py
@@ -0,0 +1,72 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ determine_ext,
+ unescapeHTML,
+)
+
+
+class CJSWIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?cjsw\.com/program/(?P<program>[^/]+)/episode/(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'http://cjsw.com/program/freshly-squeezed/episode/20170620',
+ 'md5': 'cee14d40f1e9433632c56e3d14977120',
+ 'info_dict': {
+ 'id': '91d9f016-a2e7-46c5-8dcb-7cbcd7437c41',
+ 'ext': 'mp3',
+ 'title': 'Freshly Squeezed – Episode June 20, 2017',
+ 'description': 'md5:c967d63366c3898a80d0c7b0ff337202',
+ 'series': 'Freshly Squeezed',
+ 'episode_id': '20170620',
+ },
+ }, {
+ # no description
+ 'url': 'http://cjsw.com/program/road-pops/episode/20170707/',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ program, episode_id = mobj.group('program', 'id')
+ audio_id = '%s/%s' % (program, episode_id)
+
+ webpage = self._download_webpage(url, episode_id)
+
+ title = unescapeHTML(self._search_regex(
+ (r'<h1[^>]+class=["\']episode-header__title["\'][^>]*>(?P<title>[^<]+)',
+ r'data-audio-title=(["\'])(?P<title>(?:(?!\1).)+)\1'),
+ webpage, 'title', group='title'))
+
+ audio_url = self._search_regex(
+ r'<button[^>]+data-audio-src=(["\'])(?P<url>(?:(?!\1).)+)\1',
+ webpage, 'audio url', group='url')
+
+ audio_id = self._search_regex(
+ r'/([\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})\.mp3',
+ audio_url, 'audio id', default=audio_id)
+
+ formats = [{
+ 'url': audio_url,
+ 'ext': determine_ext(audio_url, 'mp3'),
+ 'vcodec': 'none',
+ }]
+
+ description = self._html_search_regex(
+ r'<p>(?P<description>.+?)</p>', webpage, 'description',
+ default=None)
+ series = self._search_regex(
+ r'data-showname=(["\'])(?P<name>(?:(?!\1).)+)\1', webpage,
+ 'series', default=program, group='name')
+
+ return {
+ 'id': audio_id,
+ 'title': title,
+ 'description': description,
+ 'formats': formats,
+ 'series': series,
+ 'episode_id': episode_id,
+ }
diff --git a/youtube_dl/extractor/clipfish.py b/youtube_dl/extractor/clipfish.py
deleted file mode 100644
index 0920f62..0000000
--- a/youtube_dl/extractor/clipfish.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import (
- int_or_none,
- unified_strdate,
-)
-
-
-class ClipfishIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?clipfish\.de/(?:[^/]+/)+video/(?P<id>[0-9]+)'
- _TEST = {
- 'url': 'http://www.clipfish.de/special/ugly-americans/video/4343170/s01-e01-ugly-americans-date-in-der-hoelle/',
- 'md5': 'b9a5dc46294154c1193e2d10e0c95693',
- 'info_dict': {
- 'id': '4343170',
- 'ext': 'mp4',
- 'title': 'S01 E01 - Ugly Americans - Date in der Hölle',
- 'description': 'Mark Lilly arbeitet im Sozialdienst der Stadt New York und soll Immigranten bei ihrer Einbürgerung in die USA zur Seite stehen.',
- 'upload_date': '20161005',
- 'duration': 1291,
- 'view_count': int,
- }
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
-
- video_info = self._download_json(
- 'http://www.clipfish.de/devapi/id/%s?format=json&apikey=hbbtv' % video_id,
- video_id)['items'][0]
-
- formats = []
-
- m3u8_url = video_info.get('media_videourl_hls')
- if m3u8_url:
- formats.append({
- 'url': m3u8_url.replace('de.hls.fra.clipfish.de', 'hls.fra.clipfish.de'),
- 'ext': 'mp4',
- 'format_id': 'hls',
- })
-
- mp4_url = video_info.get('media_videourl')
- if mp4_url:
- formats.append({
- 'url': mp4_url,
- 'format_id': 'mp4',
- 'width': int_or_none(video_info.get('width')),
- 'height': int_or_none(video_info.get('height')),
- 'tbr': int_or_none(video_info.get('bitrate')),
- })
-
- descr = video_info.get('descr')
- if descr:
- descr = descr.strip()
-
- return {
- 'id': video_id,
- 'title': video_info['title'],
- 'description': descr,
- 'formats': formats,
- 'thumbnail': video_info.get('media_content_thumbnail_large') or video_info.get('media_thumbnail'),
- 'duration': int_or_none(video_info.get('media_length')),
- 'upload_date': unified_strdate(video_info.get('pubDate')),
- 'view_count': int_or_none(video_info.get('media_views'))
- }
diff --git a/youtube_dl/extractor/clippit.py b/youtube_dl/extractor/clippit.py
new file mode 100644
index 0000000..a1a7a77
--- /dev/null
+++ b/youtube_dl/extractor/clippit.py
@@ -0,0 +1,74 @@
+# coding: utf-8
+
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ parse_iso8601,
+ qualities,
+)
+
+import re
+
+
+class ClippitIE(InfoExtractor):
+
+ _VALID_URL = r'https?://(?:www\.)?clippituser\.tv/c/(?P<id>[a-z]+)'
+ _TEST = {
+ 'url': 'https://www.clippituser.tv/c/evmgm',
+ 'md5': '963ae7a59a2ec4572ab8bf2f2d2c5f09',
+ 'info_dict': {
+ 'id': 'evmgm',
+ 'ext': 'mp4',
+ 'title': 'Bye bye Brutus. #BattleBots - Clippit',
+ 'uploader': 'lizllove',
+ 'uploader_url': 'https://www.clippituser.tv/p/lizllove',
+ 'timestamp': 1472183818,
+ 'upload_date': '20160826',
+ 'description': 'BattleBots | ABC',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+
+ title = self._html_search_regex(r'<title.*>(.+?)</title>', webpage, 'title')
+
+ FORMATS = ('sd', 'hd')
+ quality = qualities(FORMATS)
+ formats = []
+ for format_id in FORMATS:
+ url = self._html_search_regex(r'data-%s-file="(.+?)"' % format_id,
+ webpage, 'url', fatal=False)
+ if not url:
+ continue
+ match = re.search(r'/(?P<height>\d+)\.mp4', url)
+ formats.append({
+ 'url': url,
+ 'format_id': format_id,
+ 'quality': quality(format_id),
+ 'height': int(match.group('height')) if match else None,
+ })
+
+ uploader = self._html_search_regex(r'class="username".*>\s+(.+?)\n',
+ webpage, 'uploader', fatal=False)
+ uploader_url = ('https://www.clippituser.tv/p/' + uploader
+ if uploader else None)
+
+ timestamp = self._html_search_regex(r'datetime="(.+?)"',
+ webpage, 'date', fatal=False)
+ thumbnail = self._html_search_regex(r'data-image="(.+?)"',
+ webpage, 'thumbnail', fatal=False)
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'formats': formats,
+ 'uploader': uploader,
+ 'uploader_url': uploader_url,
+ 'timestamp': parse_iso8601(timestamp),
+ 'description': self._og_search_description(webpage),
+ 'thumbnail': thumbnail,
+ }
diff --git a/youtube_dl/extractor/cloudy.py b/youtube_dl/extractor/cloudy.py
index 9bc8dbe..85ca20e 100644
--- a/youtube_dl/extractor/cloudy.py
+++ b/youtube_dl/extractor/cloudy.py
@@ -30,7 +30,11 @@ class CloudyIE(InfoExtractor):
video_id = self._match_id(url)
webpage = self._download_webpage(
- 'http://www.cloudy.ec/embed.php?id=%s' % video_id, video_id)
+ 'https://www.cloudy.ec/embed.php', video_id, query={
+ 'id': video_id,
+ 'playerPage': 1,
+ 'autoplay': 1,
+ })
info = self._parse_html5_media_entries(url, webpage, video_id)[0]
diff --git a/youtube_dl/extractor/common.py b/youtube_dl/extractor/common.py
index afeb4c5..74d30ec 100644
--- a/youtube_dl/extractor/common.py
+++ b/youtube_dl/extractor/common.py
@@ -27,6 +27,7 @@ from ..compat import (
compat_urllib_parse_urlencode,
compat_urllib_request,
compat_urlparse,
+ compat_xml_parse_error,
)
from ..downloader.f4m import remove_encrypted_media
from ..utils import (
@@ -646,15 +647,29 @@ class InfoExtractor(object):
def _download_xml(self, url_or_request, video_id,
note='Downloading XML', errnote='Unable to download XML',
- transform_source=None, fatal=True, encoding=None, data=None, headers={}, query={}):
+ transform_source=None, fatal=True, encoding=None,
+ data=None, headers={}, query={}):
"""Return the xml as an xml.etree.ElementTree.Element"""
xml_string = self._download_webpage(
- url_or_request, video_id, note, errnote, fatal=fatal, encoding=encoding, data=data, headers=headers, query=query)
+ url_or_request, video_id, note, errnote, fatal=fatal,
+ encoding=encoding, data=data, headers=headers, query=query)
if xml_string is False:
return xml_string
+ return self._parse_xml(
+ xml_string, video_id, transform_source=transform_source,
+ fatal=fatal)
+
+ def _parse_xml(self, xml_string, video_id, transform_source=None, fatal=True):
if transform_source:
xml_string = transform_source(xml_string)
- return compat_etree_fromstring(xml_string.encode('utf-8'))
+ try:
+ return compat_etree_fromstring(xml_string.encode('utf-8'))
+ except compat_xml_parse_error as ve:
+ errmsg = '%s: Failed to parse XML ' % video_id
+ if fatal:
+ raise ExtractorError(errmsg, cause=ve)
+ else:
+ self.report_warning(errmsg + str(ve))
def _download_json(self, url_or_request, video_id,
note='Downloading JSON metadata',
@@ -730,12 +745,12 @@ class InfoExtractor(object):
video_info['title'] = video_title
return video_info
- def playlist_from_matches(self, matches, video_id, video_title, getter=None, ie=None):
- urlrs = orderedSet(
+ def playlist_from_matches(self, matches, playlist_id=None, playlist_title=None, getter=None, ie=None):
+ urls = orderedSet(
self.url_result(self._proto_relative_url(getter(m) if getter else m), ie)
for m in matches)
return self.playlist_result(
- urlrs, playlist_id=video_id, playlist_title=video_title)
+ urls, playlist_id=playlist_id, playlist_title=playlist_title)
@staticmethod
def playlist_result(entries, playlist_id=None, playlist_title=None, playlist_description=None):
@@ -940,7 +955,8 @@ class InfoExtractor(object):
def _family_friendly_search(self, html):
# See http://schema.org/VideoObject
- family_friendly = self._html_search_meta('isFamilyFriendly', html)
+ family_friendly = self._html_search_meta(
+ 'isFamilyFriendly', html, default=None)
if not family_friendly:
return None
@@ -1785,7 +1801,7 @@ class InfoExtractor(object):
ms_info['timescale'] = int(timescale)
segment_duration = source.get('duration')
if segment_duration:
- ms_info['segment_duration'] = int(segment_duration)
+ ms_info['segment_duration'] = float(segment_duration)
def extract_Initialization(source):
initialization = source.find(_add_ns('Initialization'))
@@ -1892,9 +1908,13 @@ class InfoExtractor(object):
'Bandwidth': bandwidth,
}
+ def location_key(location):
+ return 'url' if re.match(r'^https?://', location) else 'path'
+
if 'segment_urls' not in representation_ms_info and 'media' in representation_ms_info:
media_template = prepare_template('media', ('Number', 'Bandwidth', 'Time'))
+ media_location_key = location_key(media_template)
# As per [1, 5.3.9.4.4, Table 16, page 55] $Number$ and $Time$
# can't be used at the same time
@@ -1904,7 +1924,7 @@ class InfoExtractor(object):
segment_duration = float_or_none(representation_ms_info['segment_duration'], representation_ms_info['timescale'])
representation_ms_info['total_number'] = int(math.ceil(float(period_duration) / segment_duration))
representation_ms_info['fragments'] = [{
- 'url': media_template % {
+ media_location_key: media_template % {
'Number': segment_number,
'Bandwidth': bandwidth,
},
@@ -1928,7 +1948,7 @@ class InfoExtractor(object):
'Number': segment_number,
}
representation_ms_info['fragments'].append({
- 'url': segment_url,
+ media_location_key: segment_url,
'duration': float_or_none(segment_d, representation_ms_info['timescale']),
})
@@ -1952,8 +1972,9 @@ class InfoExtractor(object):
for s in representation_ms_info['s']:
duration = float_or_none(s['d'], timescale)
for r in range(s.get('r', 0) + 1):
+ segment_uri = representation_ms_info['segment_urls'][segment_index]
fragments.append({
- 'url': representation_ms_info['segment_urls'][segment_index],
+ location_key(segment_uri): segment_uri,
'duration': duration,
})
segment_index += 1
@@ -1962,6 +1983,7 @@ class InfoExtractor(object):
# No fragments key is present in this case.
if 'fragments' in representation_ms_info:
f.update({
+ 'fragment_base_url': base_url,
'fragments': [],
'protocol': 'http_dash_segments',
})
@@ -1969,10 +1991,8 @@ class InfoExtractor(object):
initialization_url = representation_ms_info['initialization_url']
if not f.get('url'):
f['url'] = initialization_url
- f['fragments'].append({'url': initialization_url})
+ f['fragments'].append({location_key(initialization_url): initialization_url})
f['fragments'].extend(representation_ms_info['fragments'])
- for fragment in f['fragments']:
- fragment['url'] = urljoin(base_url, fragment['url'])
try:
existing_format = next(
fo for fo in formats
@@ -2110,19 +2130,19 @@ class InfoExtractor(object):
return f
return {}
- def _media_formats(src, cur_media_type):
+ def _media_formats(src, cur_media_type, type_info={}):
full_url = absolute_url(src)
- ext = determine_ext(full_url)
+ ext = type_info.get('ext') or determine_ext(full_url)
if ext == 'm3u8':
is_plain_url = False
formats = self._extract_m3u8_formats(
full_url, video_id, ext='mp4',
entry_protocol=m3u8_entry_protocol, m3u8_id=m3u8_id,
- preference=preference)
+ preference=preference, fatal=False)
elif ext == 'mpd':
is_plain_url = False
formats = self._extract_mpd_formats(
- full_url, video_id, mpd_id=mpd_id)
+ full_url, video_id, mpd_id=mpd_id, fatal=False)
else:
is_plain_url = True
formats = [{
@@ -2132,15 +2152,18 @@ class InfoExtractor(object):
return is_plain_url, formats
entries = []
+ # amp-video and amp-audio are very similar to their HTML5 counterparts
+ # so we will include them right here (see
+ # https://www.ampproject.org/docs/reference/components/amp-video)
media_tags = [(media_tag, media_type, '')
for media_tag, media_type
- in re.findall(r'(?s)(<(video|audio)[^>]*/>)', webpage)]
+ in re.findall(r'(?s)(<(?:amp-)?(video|audio)[^>]*/>)', webpage)]
media_tags.extend(re.findall(
# We only allow video|audio followed by a whitespace or '>'.
# Allowing more characters may end up in significant slow down (see
# https://github.com/rg3/youtube-dl/issues/11979, example URL:
# http://www.porntrex.com/maps/videositemap.xml).
- r'(?s)(<(?P<tag>video|audio)(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
+ r'(?s)(<(?P<tag>(?:amp-)?(?:video|audio))(?:\s+[^>]*)?>)(.*?)</(?P=tag)>', webpage))
for media_tag, media_type, media_content in media_tags:
media_info = {
'formats': [],
@@ -2158,9 +2181,15 @@ class InfoExtractor(object):
src = source_attributes.get('src')
if not src:
continue
- is_plain_url, formats = _media_formats(src, media_type)
+ f = parse_content_type(source_attributes.get('type'))
+ is_plain_url, formats = _media_formats(src, media_type, f)
if is_plain_url:
- f = parse_content_type(source_attributes.get('type'))
+ # res attribute is not standard but seen several times
+ # in the wild
+ f.update({
+ 'height': int_or_none(source_attributes.get('res')),
+ 'format_id': source_attributes.get('label'),
+ })
f.update(formats[0])
media_info['formats'].append(f)
else:
diff --git a/youtube_dl/extractor/cracked.py b/youtube_dl/extractor/cracked.py
index 94d03ce..f77a68e 100644
--- a/youtube_dl/extractor/cracked.py
+++ b/youtube_dl/extractor/cracked.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from .youtube import YoutubeIE
from ..utils import (
parse_iso8601,
str_to_int,
@@ -41,11 +42,9 @@ class CrackedIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- youtube_url = self._search_regex(
- r'<iframe[^>]+src="((?:https?:)?//www\.youtube\.com/embed/[^"]+)"',
- webpage, 'youtube url', default=None)
+ youtube_url = YoutubeIE._extract_url(webpage)
if youtube_url:
- return self.url_result(youtube_url, 'Youtube')
+ return self.url_result(youtube_url, ie=YoutubeIE.ie_key())
video_url = self._html_search_regex(
[r'var\s+CK_vidSrc\s*=\s*"([^"]+)"', r'<video\s+src="([^"]+)"'],
diff --git a/youtube_dl/extractor/crunchyroll.py b/youtube_dl/extractor/crunchyroll.py
index 2ffa4a7..8bdaf0c 100644
--- a/youtube_dl/extractor/crunchyroll.py
+++ b/youtube_dl/extractor/crunchyroll.py
@@ -510,7 +510,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
# webpage provide more accurate data than series_title from XML
series = self._html_search_regex(
- r'id=["\']showmedia_about_episode_num[^>]+>\s*<a[^>]+>([^<]+)',
+ r'(?s)<h\d[^>]+\bid=["\']showmedia_about_episode_num[^>]+>(.+?)</h\d',
webpage, 'series', fatal=False)
season = xpath_text(metadata, 'series_title')
@@ -518,7 +518,7 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
episode_number = int_or_none(xpath_text(metadata, 'episode_number'))
season_number = int_or_none(self._search_regex(
- r'(?s)<h4[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h4>\s*<h4>\s*Season (\d+)',
+ r'(?s)<h\d[^>]+id=["\']showmedia_about_episode_num[^>]+>.+?</h\d>\s*<h4>\s*Season (\d+)',
webpage, 'season number', default=None))
return {
diff --git a/youtube_dl/extractor/dailymail.py b/youtube_dl/extractor/dailymail.py
index 538565c..af39780 100644
--- a/youtube_dl/extractor/dailymail.py
+++ b/youtube_dl/extractor/dailymail.py
@@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..compat import compat_str
from ..utils import (
@@ -12,8 +14,8 @@ from ..utils import (
class DailyMailIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/video/[^/]+/video-(?P<id>[0-9]+)'
- _TEST = {
+ _VALID_URL = r'https?://(?:www\.)?dailymail\.co\.uk/(?:video/[^/]+/video-|embed/video/)(?P<id>[0-9]+)'
+ _TESTS = [{
'url': 'http://www.dailymail.co.uk/video/tvshowbiz/video-1295863/The-Mountain-appears-sparkling-water-ad-Heavy-Bubbles.html',
'md5': 'f6129624562251f628296c3a9ffde124',
'info_dict': {
@@ -22,7 +24,16 @@ class DailyMailIE(InfoExtractor):
'title': 'The Mountain appears in sparkling water ad for \'Heavy Bubbles\'',
'description': 'md5:a93d74b6da172dd5dc4d973e0b766a84',
}
- }
+ }, {
+ 'url': 'http://www.dailymail.co.uk/embed/video/1295863.html',
+ 'only_matching': True,
+ }]
+
+ @staticmethod
+ def _extract_urls(webpage):
+ return re.findall(
+ r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//(?:www\.)?dailymail\.co\.uk/embed/video/\d+\.html)',
+ webpage)
def _real_extract(self, url):
video_id = self._match_id(url)
diff --git a/youtube_dl/extractor/dailymotion.py b/youtube_dl/extractor/dailymotion.py
index f8db76c..74e9913 100644
--- a/youtube_dl/extractor/dailymotion.py
+++ b/youtube_dl/extractor/dailymotion.py
@@ -147,7 +147,7 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
view_count_str = self._search_regex(
(r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([\s\d,.]+)"',
r'video_views_count[^>]+>\s+([\s\d\,.]+)'),
- webpage, 'view count', fatal=False)
+ webpage, 'view count', default=None)
if view_count_str:
view_count_str = re.sub(r'\s', '', view_count_str)
view_count = str_to_int(view_count_str)
@@ -159,7 +159,9 @@ class DailymotionIE(DailymotionBaseInfoExtractor):
[r'buildPlayer\(({.+?})\);\n', # See https://github.com/rg3/youtube-dl/issues/7826
r'playerV5\s*=\s*dmp\.create\([^,]+?,\s*({.+?})\);',
r'buildPlayer\(({.+?})\);',
- r'var\s+config\s*=\s*({.+?});'],
+ r'var\s+config\s*=\s*({.+?});',
+ # New layout regex (see https://github.com/rg3/youtube-dl/issues/13580)
+ r'__PLAYER_CONFIG__\s*=\s*({.+?});'],
webpage, 'player v5', default=None)
if player_v5:
player = self._parse_json(player_v5, video_id)
diff --git a/youtube_dl/extractor/dispeak.py b/youtube_dl/extractor/dispeak.py
index a78cb8a..c05f601 100644
--- a/youtube_dl/extractor/dispeak.py
+++ b/youtube_dl/extractor/dispeak.py
@@ -13,7 +13,7 @@ from ..utils import (
class DigitallySpeakingIE(InfoExtractor):
- _VALID_URL = r'https?://(?:evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml'
+ _VALID_URL = r'https?://(?:s?evt\.dispeak|events\.digitallyspeaking)\.com/(?:[^/]+/)+xml/(?P<id>[^.]+)\.xml'
_TESTS = [{
# From http://gdcvault.com/play/1023460/Tenacious-Design-and-The-Interface
@@ -28,6 +28,10 @@ class DigitallySpeakingIE(InfoExtractor):
# From http://www.gdcvault.com/play/1014631/Classic-Game-Postmortem-PAC
'url': 'http://events.digitallyspeaking.com/gdc/sf11/xml/12396_1299111843500GMPX.xml',
'only_matching': True,
+ }, {
+ # From http://www.gdcvault.com/play/1013700/Advanced-Material
+ 'url': 'http://sevt.dispeak.com/ubm/gdc/eur10/xml/11256_1282118587281VNIT.xml',
+ 'only_matching': True,
}]
def _parse_mp4(self, metadata):
diff --git a/youtube_dl/extractor/dplay.py b/youtube_dl/extractor/dplay.py
index 1a41760..76e7841 100644
--- a/youtube_dl/extractor/dplay.py
+++ b/youtube_dl/extractor/dplay.py
@@ -7,16 +7,18 @@ import time
from .common import InfoExtractor
from ..compat import (
- compat_urlparse,
compat_HTTPError,
+ compat_str,
+ compat_urlparse,
)
from ..utils import (
- USER_AGENTS,
ExtractorError,
int_or_none,
- unified_strdate,
remove_end,
+ try_get,
+ unified_strdate,
update_url_query,
+ USER_AGENTS,
)
@@ -183,28 +185,44 @@ class DPlayItIE(InfoExtractor):
webpage = self._download_webpage(url, display_id)
- info_url = self._search_regex(
- r'url\s*[:=]\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)',
- webpage, 'video id')
-
title = remove_end(self._og_search_title(webpage), ' | Dplay')
- try:
- info = self._download_json(
- info_url, display_id, headers={
- 'Authorization': 'Bearer %s' % self._get_cookies(url).get(
- 'dplayit_token').value,
- 'Referer': url,
- })
- except ExtractorError as e:
- if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403):
- info = self._parse_json(e.cause.read().decode('utf-8'), display_id)
- error = info['errors'][0]
- if error.get('code') == 'access.denied.geoblocked':
- self.raise_geo_restricted(
- msg=error.get('detail'), countries=self._GEO_COUNTRIES)
- raise ExtractorError(info['errors'][0]['detail'], expected=True)
- raise
+ video_id = None
+
+ info = self._search_regex(
+ r'playback_json\s*:\s*JSON\.parse\s*\(\s*("(?:\\.|[^"\\])+?")',
+ webpage, 'playback JSON', default=None)
+ if info:
+ for _ in range(2):
+ info = self._parse_json(info, display_id, fatal=False)
+ if not info:
+ break
+ else:
+ video_id = try_get(info, lambda x: x['data']['id'])
+
+ if not info:
+ info_url = self._search_regex(
+ r'url\s*[:=]\s*["\']((?:https?:)?//[^/]+/playback/videoPlaybackInfo/\d+)',
+ webpage, 'info url')
+
+ video_id = info_url.rpartition('/')[-1]
+
+ try:
+ info = self._download_json(
+ info_url, display_id, headers={
+ 'Authorization': 'Bearer %s' % self._get_cookies(url).get(
+ 'dplayit_token').value,
+ 'Referer': url,
+ })
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code in (400, 403):
+ info = self._parse_json(e.cause.read().decode('utf-8'), display_id)
+ error = info['errors'][0]
+ if error.get('code') == 'access.denied.geoblocked':
+ self.raise_geo_restricted(
+ msg=error.get('detail'), countries=self._GEO_COUNTRIES)
+ raise ExtractorError(info['errors'][0]['detail'], expected=True)
+ raise
hls_url = info['data']['attributes']['streaming']['hls']['url']
@@ -230,7 +248,7 @@ class DPlayItIE(InfoExtractor):
season_number = episode_number = upload_date = None
return {
- 'id': info_url.rpartition('/')[-1],
+ 'id': compat_str(video_id or display_id),
'display_id': display_id,
'title': title,
'description': self._og_search_description(webpage),
diff --git a/youtube_dl/extractor/dramafever.py b/youtube_dl/extractor/dramafever.py
index e7abc88..9a498d7 100644
--- a/youtube_dl/extractor/dramafever.py
+++ b/youtube_dl/extractor/dramafever.py
@@ -12,6 +12,7 @@ from ..utils import (
ExtractorError,
clean_html,
int_or_none,
+ remove_end,
sanitized_Request,
urlencode_postdata
)
@@ -72,15 +73,15 @@ class DramaFeverIE(DramaFeverBaseIE):
'url': 'http://www.dramafever.com/drama/4512/1/Cooking_with_Shin/',
'info_dict': {
'id': '4512.1',
- 'ext': 'mp4',
- 'title': 'Cooking with Shin 4512.1',
+ 'ext': 'flv',
+ 'title': 'Cooking with Shin',
'description': 'md5:a8eec7942e1664a6896fcd5e1287bfd0',
'episode': 'Episode 1',
'episode_number': 1,
'thumbnail': r're:^https?://.*\.jpg',
'timestamp': 1404336058,
'upload_date': '20140702',
- 'duration': 343,
+ 'duration': 344,
},
'params': {
# m3u8 download
@@ -90,15 +91,15 @@ class DramaFeverIE(DramaFeverBaseIE):
'url': 'http://www.dramafever.com/drama/4826/4/Mnet_Asian_Music_Awards_2015/?ap=1',
'info_dict': {
'id': '4826.4',
- 'ext': 'mp4',
- 'title': 'Mnet Asian Music Awards 2015 4826.4',
+ 'ext': 'flv',
+ 'title': 'Mnet Asian Music Awards 2015',
'description': 'md5:3ff2ee8fedaef86e076791c909cf2e91',
'episode': 'Mnet Asian Music Awards 2015 - Part 3',
'episode_number': 4,
'thumbnail': r're:^https?://.*\.jpg',
'timestamp': 1450213200,
'upload_date': '20151215',
- 'duration': 5602,
+ 'duration': 5359,
},
'params': {
# m3u8 download
@@ -122,6 +123,10 @@ class DramaFeverIE(DramaFeverBaseIE):
countries=self._GEO_COUNTRIES)
raise
+ # title is postfixed with video id for some reason, removing
+ if info.get('title'):
+ info['title'] = remove_end(info['title'], video_id).strip()
+
series_id, episode_number = video_id.split('.')
episode_info = self._download_json(
# We only need a single episode info, so restricting page size to one episode
diff --git a/youtube_dl/extractor/drtv.py b/youtube_dl/extractor/drtv.py
index c84624f..69effba 100644
--- a/youtube_dl/extractor/drtv.py
+++ b/youtube_dl/extractor/drtv.py
@@ -118,7 +118,7 @@ class DRTVIE(InfoExtractor):
if target == 'HDS':
f4m_formats = self._extract_f4m_formats(
uri + '?hdcore=3.3.0&plugin=aasp-3.3.0.99.43',
- video_id, preference, f4m_id=format_id)
+ video_id, preference, f4m_id=format_id, fatal=False)
if kind == 'AudioResource':
for f in f4m_formats:
f['vcodec'] = 'none'
@@ -126,7 +126,8 @@ class DRTVIE(InfoExtractor):
elif target == 'HLS':
formats.extend(self._extract_m3u8_formats(
uri, video_id, 'mp4', entry_protocol='m3u8_native',
- preference=preference, m3u8_id=format_id))
+ preference=preference, m3u8_id=format_id,
+ fatal=False))
else:
bitrate = link.get('Bitrate')
if bitrate:
diff --git a/youtube_dl/extractor/eagleplatform.py b/youtube_dl/extractor/eagleplatform.py
index 76d39ad..4278927 100644
--- a/youtube_dl/extractor/eagleplatform.py
+++ b/youtube_dl/extractor/eagleplatform.py
@@ -11,6 +11,7 @@ from ..compat import (
from ..utils import (
ExtractorError,
int_or_none,
+ unsmuggle_url,
)
@@ -50,6 +51,10 @@ class EaglePlatformIE(InfoExtractor):
'view_count': int,
},
'skip': 'Georestricted',
+ }, {
+ # referrer protected video (https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/)
+ 'url': 'eagleplatform:tvrainru.media.eagleplatform.com:582306',
+ 'only_matching': True,
}]
@staticmethod
@@ -60,16 +65,40 @@ class EaglePlatformIE(InfoExtractor):
webpage)
if mobj is not None:
return mobj.group('url')
- # Basic usage embedding (see http://dultonmedia.github.io/eplayer/)
+ PLAYER_JS_RE = r'''
+ <script[^>]+
+ src=(?P<qjs>["\'])(?:https?:)?//(?P<host>(?:(?!(?P=qjs)).)+\.media\.eagleplatform\.com)/player/player\.js(?P=qjs)
+ .+?
+ '''
+ # "Basic usage" embedding (see http://dultonmedia.github.io/eplayer/)
mobj = re.search(
r'''(?xs)
- <script[^>]+
- src=(?P<q1>["\'])(?:https?:)?//(?P<host>.+?\.media\.eagleplatform\.com)/player/player\.js(?P=q1)
- .+?
+ %s
<div[^>]+
- class=(?P<q2>["\'])eagleplayer(?P=q2)[^>]+
+ class=(?P<qclass>["\'])eagleplayer(?P=qclass)[^>]+
data-id=["\'](?P<id>\d+)
- ''', webpage)
+ ''' % PLAYER_JS_RE, webpage)
+ if mobj is not None:
+ return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict()
+ # Generalization of "Javascript code usage", "Combined usage" and
+ # "Usage without attaching to DOM" embeddings (see
+ # http://dultonmedia.github.io/eplayer/)
+ mobj = re.search(
+ r'''(?xs)
+ %s
+ <script>
+ .+?
+ new\s+EaglePlayer\(
+ (?:[^,]+\s*,\s*)?
+ {
+ .+?
+ \bid\s*:\s*["\']?(?P<id>\d+)
+ .+?
+ }
+ \s*\)
+ .+?
+ </script>
+ ''' % PLAYER_JS_RE, webpage)
if mobj is not None:
return 'eagleplatform:%(host)s:%(id)s' % mobj.groupdict()
@@ -79,9 +108,10 @@ class EaglePlatformIE(InfoExtractor):
if status != 200:
raise ExtractorError(' '.join(response['errors']), expected=True)
- def _download_json(self, url_or_request, video_id, note='Downloading JSON metadata', *args, **kwargs):
+ def _download_json(self, url_or_request, video_id, *args, **kwargs):
try:
- response = super(EaglePlatformIE, self)._download_json(url_or_request, video_id, note)
+ response = super(EaglePlatformIE, self)._download_json(
+ url_or_request, video_id, *args, **kwargs)
except ExtractorError as ee:
if isinstance(ee.cause, compat_HTTPError):
response = self._parse_json(ee.cause.read().decode('utf-8'), video_id)
@@ -93,11 +123,24 @@ class EaglePlatformIE(InfoExtractor):
return self._download_json(url_or_request, video_id, note)['data'][0]
def _real_extract(self, url):
+ url, smuggled_data = unsmuggle_url(url, {})
+
mobj = re.match(self._VALID_URL, url)
host, video_id = mobj.group('custom_host') or mobj.group('host'), mobj.group('id')
+ headers = {}
+ query = {
+ 'id': video_id,
+ }
+
+ referrer = smuggled_data.get('referrer')
+ if referrer:
+ headers['Referer'] = referrer
+ query['referrer'] = referrer
+
player_data = self._download_json(
- 'http://%s/api/player_data?id=%s' % (host, video_id), video_id)
+ 'http://%s/api/player_data' % host, video_id,
+ headers=headers, query=query)
media = player_data['data']['playlist']['viewports'][0]['medialist'][0]
diff --git a/youtube_dl/extractor/egghead.py b/youtube_dl/extractor/egghead.py
index db92146..e4a3046 100644
--- a/youtube_dl/extractor/egghead.py
+++ b/youtube_dl/extractor/egghead.py
@@ -1,15 +1,18 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
-
from .common import InfoExtractor
+from ..utils import (
+ int_or_none,
+ try_get,
+ unified_timestamp,
+)
class EggheadCourseIE(InfoExtractor):
IE_DESC = 'egghead.io course'
IE_NAME = 'egghead:course'
- _VALID_URL = r'https://egghead\.io/courses/(?P<id>[a-zA-Z_0-9-]+)'
+ _VALID_URL = r'https://egghead\.io/courses/(?P<id>[^/?#&]+)'
_TEST = {
'url': 'https://egghead.io/courses/professor-frisby-introduces-composable-functional-javascript',
'playlist_count': 29,
@@ -22,18 +25,60 @@ class EggheadCourseIE(InfoExtractor):
def _real_extract(self, url):
playlist_id = self._match_id(url)
- webpage = self._download_webpage(url, playlist_id)
- title = self._html_search_regex(r'<h1 class="title">([^<]+)</h1>', webpage, 'title')
- ul = self._search_regex(r'(?s)<ul class="series-lessons-list">(.*?)</ul>', webpage, 'session list')
+ course = self._download_json(
+ 'https://egghead.io/api/v1/series/%s' % playlist_id, playlist_id)
+
+ entries = [
+ self.url_result(
+ 'wistia:%s' % lesson['wistia_id'], ie='Wistia',
+ video_id=lesson['wistia_id'], video_title=lesson.get('title'))
+ for lesson in course['lessons'] if lesson.get('wistia_id')]
+
+ return self.playlist_result(
+ entries, playlist_id, course.get('title'),
+ course.get('description'))
+
+
+class EggheadLessonIE(InfoExtractor):
+ IE_DESC = 'egghead.io lesson'
+ IE_NAME = 'egghead:lesson'
+ _VALID_URL = r'https://egghead\.io/lessons/(?P<id>[^/?#&]+)'
+ _TEST = {
+ 'url': 'https://egghead.io/lessons/javascript-linear-data-flow-with-container-style-types-box',
+ 'info_dict': {
+ 'id': 'fv5yotjxcg',
+ 'ext': 'mp4',
+ 'title': 'Create linear data flow with container style types (Box)',
+ 'description': 'md5:9aa2cdb6f9878ed4c39ec09e85a8150e',
+ 'thumbnail': r're:^https?:.*\.jpg$',
+ 'timestamp': 1481296768,
+ 'upload_date': '20161209',
+ 'duration': 304,
+ 'view_count': 0,
+ 'tags': ['javascript', 'free'],
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }
+
+ def _real_extract(self, url):
+ lesson_id = self._match_id(url)
- found = re.findall(r'(?s)<a class="[^"]*"\s*href="([^"]+)">\s*<li class="item', ul)
- entries = [self.url_result(m) for m in found]
+ lesson = self._download_json(
+ 'https://egghead.io/api/v1/lessons/%s' % lesson_id, lesson_id)
return {
- '_type': 'playlist',
- 'id': playlist_id,
- 'title': title,
- 'description': self._og_search_description(webpage),
- 'entries': entries,
+ '_type': 'url_transparent',
+ 'ie_key': 'Wistia',
+ 'url': 'wistia:%s' % lesson['wistia_id'],
+ 'id': lesson['wistia_id'],
+ 'title': lesson.get('title'),
+ 'description': lesson.get('summary'),
+ 'thumbnail': lesson.get('thumb_nail'),
+ 'timestamp': unified_timestamp(lesson.get('published_at')),
+ 'duration': int_or_none(lesson.get('duration')),
+ 'view_count': int_or_none(lesson.get('plays_count')),
+ 'tags': try_get(lesson, lambda x: x['tag_list'], list),
}
diff --git a/youtube_dl/extractor/espn.py b/youtube_dl/extractor/espn.py
index 8795e0d..7a74360 100644
--- a/youtube_dl/extractor/espn.py
+++ b/youtube_dl/extractor/espn.py
@@ -10,7 +10,25 @@ from ..utils import (
class ESPNIE(InfoExtractor):
- _VALID_URL = r'https?://(?:espn\.go|(?:www\.)?espn)\.com/video/clip(?:\?.*?\bid=|/_/id/)(?P<id>\d+)'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:
+ (?:(?:\w+\.)+)?espn\.go|
+ (?:www\.)?espn
+ )\.com/
+ (?:
+ (?:
+ video/clip|
+ watch/player
+ )
+ (?:
+ \?.*?\bid=|
+ /_/id/
+ )
+ )
+ (?P<id>\d+)
+ '''
+
_TESTS = [{
'url': 'http://espn.go.com/video/clip?id=10365079',
'info_dict': {
@@ -25,21 +43,35 @@ class ESPNIE(InfoExtractor):
'skip_download': True,
},
}, {
- # intl video, from http://www.espnfc.us/video/mls-highlights/150/video/2743663/must-see-moments-best-of-the-mls-season
- 'url': 'http://espn.go.com/video/clip?id=2743663',
+ 'url': 'https://broadband.espn.go.com/video/clip?id=18910086',
'info_dict': {
- 'id': '2743663',
+ 'id': '18910086',
'ext': 'mp4',
- 'title': 'Must-See Moments: Best of the MLS season',
- 'description': 'md5:4c2d7232beaea572632bec41004f0aeb',
- 'timestamp': 1449446454,
- 'upload_date': '20151207',
+ 'title': 'Kyrie spins around defender for two',
+ 'description': 'md5:2b0f5bae9616d26fba8808350f0d2b9b',
+ 'timestamp': 1489539155,
+ 'upload_date': '20170315',
},
'params': {
'skip_download': True,
},
'expected_warnings': ['Unable to download f4m manifest'],
}, {
+ 'url': 'http://nonredline.sports.espn.go.com/video/clip?id=19744672',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://cdn.espn.go.com/video/clip/_/id/19771774',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.espn.com/watch/player?id=19141491',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.espn.com/watch/player?bucketId=257&id=19505875',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.espn.com/watch/player/_/id/19141491',
+ 'only_matching': True,
+ }, {
'url': 'http://www.espn.com/video/clip?id=10365079',
'only_matching': True,
}, {
diff --git a/youtube_dl/extractor/extractors.py b/youtube_dl/extractor/extractors.py
index bbdb4a2..aefadc5 100644
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@@ -45,6 +45,7 @@ from .anvato import AnvatoIE
from .anysex import AnySexIE
from .aol import AolIE
from .allocine import AllocineIE
+from .aliexpress import AliExpressLiveIE
from .aparat import AparatIE
from .appleconnect import AppleConnectIE
from .appletrailers import (
@@ -185,8 +186,9 @@ from .chirbit import (
ChirbitProfileIE,
)
from .cinchcast import CinchcastIE
-from .clipfish import ClipfishIE
+from .cjsw import CJSWIE
from .cliphunter import CliphunterIE
+from .clippit import ClippitIE
from .cliprs import ClipRsIE
from .clipsyndicate import ClipsyndicateIE
from .closertotruth import CloserToTruthIE
@@ -297,7 +299,10 @@ from .dw import (
from .eagleplatform import EaglePlatformIE
from .ebaumsworld import EbaumsWorldIE
from .echomsk import EchoMskIE
-from .egghead import EggheadCourseIE
+from .egghead import (
+ EggheadCourseIE,
+ EggheadLessonIE,
+)
from .ehow import EHowIE
from .eighttracks import EightTracksIE
from .einthusan import EinthusanIE
@@ -347,7 +352,12 @@ from .flipagram import FlipagramIE
from .folketinget import FolketingetIE
from .footyroom import FootyRoomIE
from .formula1 import Formula1IE
-from .fourtube import FourTubeIE
+from .fourtube import (
+ FourTubeIE,
+ PornTubeIE,
+ PornerBrosIE,
+ FuxIE,
+)
from .fox import FOXIE
from .fox9 import FOX9IE
from .foxgay import FoxgayIE
@@ -469,6 +479,7 @@ from .jamendo import (
)
from .jeuxvideo import JeuxVideoIE
from .jove import JoveIE
+from .joj import JojIE
from .jwplatform import JWPlatformIE
from .jpopsukitv import JpopsukiIE
from .kaltura import KalturaIE
@@ -499,6 +510,7 @@ from .la7 import LA7IE
from .laola1tv import (
Laola1TvEmbedIE,
Laola1TvIE,
+ ITTFIE,
)
from .lci import LCIIE
from .lcp import (
@@ -526,7 +538,10 @@ from .limelight import (
LimelightChannelListIE,
)
from .litv import LiTVIE
-from .liveleak import LiveLeakIE
+from .liveleak import (
+ LiveLeakIE,
+ LiveLeakEmbedIE,
+)
from .livestream import (
LivestreamIE,
LivestreamOriginalIE,
@@ -549,10 +564,12 @@ from .mangomolo import (
MangomoloVideoIE,
MangomoloLiveIE,
)
+from .manyvids import ManyVidsIE
from .matchtv import MatchTVIE
from .mdr import MDRIE
from .mediaset import MediasetIE
from .medici import MediciIE
+from .megaphone import MegaphoneIE
from .meipai import MeipaiIE
from .melonvod import MelonVODIE
from .meta import METAIE
@@ -579,7 +596,6 @@ from .mixcloud import (
)
from .mlb import MLBIE
from .mnet import MnetIE
-from .mpora import MporaIE
from .moevideo import MoeVideoIE
from .mofosex import MofosexIE
from .mojvideo import MojvideoIE
@@ -651,6 +667,10 @@ from .nextmedia import (
AppleDailyIE,
NextTVIE,
)
+from .nexx import (
+ NexxIE,
+ NexxEmbedIE,
+)
from .nfb import NFBIE
from .nfl import NFLIE
from .nhk import NhkVodIE
@@ -664,6 +684,7 @@ from .nick import (
NickIE,
NickDeIE,
NickNightIE,
+ NickRuIE,
)
from .niconico import NiconicoIE, NiconicoPlaylistIE
from .ninecninemedia import (
@@ -759,6 +780,7 @@ from .pandoratv import PandoraTVIE
from .parliamentliveuk import ParliamentLiveUKIE
from .patreon import PatreonIE
from .pbs import PBSIE
+from .pearvideo import PearVideoIE
from .people import PeopleIE
from .periscope import (
PeriscopeIE,
@@ -830,6 +852,10 @@ from .rai import (
from .rbmaradio import RBMARadioIE
from .rds import RDSIE
from .redbulltv import RedBullTVIE
+from .reddit import (
+ RedditIE,
+ RedditRIE,
+)
from .redtube import RedTubeIE
from .regiotv import RegioTVIE
from .rentv import (
@@ -873,6 +899,7 @@ from .rutube import (
RutubeEmbedIE,
RutubeMovieIE,
RutubePersonIE,
+ RutubePlaylistIE,
)
from .rutv import RUTVIE
from .ruutu import RuutuIE
@@ -923,8 +950,9 @@ from .soundcloud import (
SoundcloudIE,
SoundcloudSetIE,
SoundcloudUserIE,
+ SoundcloudTrackStationIE,
SoundcloudPlaylistIE,
- SoundcloudSearchIE
+ SoundcloudSearchIE,
)
from .soundgasm import (
SoundgasmIE,
@@ -982,7 +1010,6 @@ from .teachertube import (
)
from .teachingchannel import TeachingChannelIE
from .teamcoco import TeamcocoIE
-from .teamfourstar import TeamFourStarIE
from .techtalks import TechTalksIE
from .ted import TEDIE
from .tele13 import Tele13IE
@@ -1204,12 +1231,14 @@ from .vk import (
)
from .vlive import (
VLiveIE,
- VLiveChannelIE
+ VLiveChannelIE,
+ VLivePlaylistIE
)
from .vodlocker import VodlockerIE
from .vodpl import VODPlIE
from .vodplatform import VODPlatformIE
from .voicerepublic import VoiceRepublicIE
+from .voot import VootIE
from .voxmedia import VoxMediaIE
from .vporn import VpornIE
from .vrt import VRTIE
@@ -1231,6 +1260,7 @@ from .washingtonpost import (
WashingtonPostArticleIE,
)
from .wat import WatIE
+from .watchbox import WatchBoxIE
from .watchindianporn import WatchIndianPornIE
from .wdr import (
WDRIE,
@@ -1280,12 +1310,12 @@ from .yahoo import (
YahooIE,
YahooSearchIE,
)
-from .yam import YamIE
from .yandexmusic import (
YandexMusicTrackIE,
YandexMusicAlbumIE,
YandexMusicPlaylistIE,
)
+from .yandexdisk import YandexDiskIE
from .yesjapan import YesJapanIE
from .yinyuetai import YinYueTaiIE
from .ynet import YnetIE
diff --git a/youtube_dl/extractor/fivetv.py b/youtube_dl/extractor/fivetv.py
index 15736c9..9f98637 100644
--- a/youtube_dl/extractor/fivetv.py
+++ b/youtube_dl/extractor/fivetv.py
@@ -43,7 +43,7 @@ class FiveTVIE(InfoExtractor):
'info_dict': {
'id': 'glavnoe',
'ext': 'mp4',
- 'title': 'Итоги недели с 8 по 14 июня 2015 года',
+ 'title': r're:^Итоги недели с \d+ по \d+ \w+ \d{4} года$',
'thumbnail': r're:^https?://.*\.jpg$',
},
}, {
@@ -70,7 +70,8 @@ class FiveTVIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
video_url = self._search_regex(
- r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"',
+ [r'<div[^>]+?class="flowplayer[^>]+?data-href="([^"]+)"',
+ r'<a[^>]+?href="([^"]+)"[^>]+?class="videoplayer"'],
webpage, 'video url')
title = self._og_search_title(webpage, default=None) or self._search_regex(
diff --git a/youtube_dl/extractor/fourtube.py b/youtube_dl/extractor/fourtube.py
index e3fd08b..ad273a0 100644
--- a/youtube_dl/extractor/fourtube.py
+++ b/youtube_dl/extractor/fourtube.py
@@ -3,39 +3,22 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_urlparse
from ..utils import (
parse_duration,
parse_iso8601,
- sanitized_Request,
str_to_int,
)
-class FourTubeIE(InfoExtractor):
- IE_NAME = '4tube'
- _VALID_URL = r'https?://(?:www\.)?4tube\.com/videos/(?P<id>\d+)'
+class FourTubeBaseIE(InfoExtractor):
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ kind, video_id, display_id = mobj.group('kind', 'id', 'display_id')
- _TEST = {
- 'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black',
- 'md5': '6516c8ac63b03de06bc8eac14362db4f',
- 'info_dict': {
- 'id': '209733',
- 'ext': 'mp4',
- 'title': 'Hot Babe Holly Michaels gets her ass stuffed by black',
- 'uploader': 'WCP Club',
- 'uploader_id': 'wcp-club',
- 'upload_date': '20131031',
- 'timestamp': 1383263892,
- 'duration': 583,
- 'view_count': int,
- 'like_count': int,
- 'categories': list,
- 'age_limit': 18,
- }
- }
+ if kind == 'm' or not display_id:
+ url = self._URL_TEMPLATE % video_id
- def _real_extract(self, url):
- video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
title = self._html_search_meta('name', webpage)
@@ -43,10 +26,10 @@ class FourTubeIE(InfoExtractor):
'uploadDate', webpage))
thumbnail = self._html_search_meta('thumbnailUrl', webpage)
uploader_id = self._html_search_regex(
- r'<a class="item-to-subscribe" href="[^"]+/channels/([^/"]+)" title="Go to [^"]+ page">',
+ r'<a class="item-to-subscribe" href="[^"]+/(?:channel|user)s?/([^/"]+)" title="Go to [^"]+ page">',
webpage, 'uploader id', fatal=False)
uploader = self._html_search_regex(
- r'<a class="item-to-subscribe" href="[^"]+/channels/[^/"]+" title="Go to ([^"]+) page">',
+ r'<a class="item-to-subscribe" href="[^"]+/(?:channel|user)s?/[^/"]+" title="Go to ([^"]+) page">',
webpage, 'uploader', fatal=False)
categories_html = self._search_regex(
@@ -60,10 +43,10 @@ class FourTubeIE(InfoExtractor):
view_count = str_to_int(self._search_regex(
r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserPlays:([0-9,]+)">',
- webpage, 'view count', fatal=False))
+ webpage, 'view count', default=None))
like_count = str_to_int(self._search_regex(
r'<meta[^>]+itemprop="interactionCount"[^>]+content="UserLikes:([0-9,]+)">',
- webpage, 'like count', fatal=False))
+ webpage, 'like count', default=None))
duration = parse_duration(self._html_search_meta('duration', webpage))
media_id = self._search_regex(
@@ -87,12 +70,12 @@ class FourTubeIE(InfoExtractor):
token_url = 'https://tkn.kodicdn.com/{0}/desktop/{1}'.format(
media_id, '+'.join(sources))
- headers = {
- b'Content-Type': b'application/x-www-form-urlencoded',
- b'Origin': b'https://www.4tube.com',
- }
- token_req = sanitized_Request(token_url, b'{}', headers)
- tokens = self._download_json(token_req, video_id)
+
+ parsed_url = compat_urlparse.urlparse(url)
+ tokens = self._download_json(token_url, video_id, data=b'', headers={
+ 'Origin': '%s://%s' % (parsed_url.scheme, parsed_url.hostname),
+ 'Referer': url,
+ })
formats = [{
'url': tokens[format]['token'],
'format_id': format + 'p',
@@ -115,3 +98,126 @@ class FourTubeIE(InfoExtractor):
'duration': duration,
'age_limit': 18,
}
+
+
+class FourTubeIE(FourTubeBaseIE):
+ IE_NAME = '4tube'
+ _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?4tube\.com/(?:videos|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?'
+ _URL_TEMPLATE = 'https://www.4tube.com/videos/%s/video'
+ _TESTS = [{
+ 'url': 'http://www.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black',
+ 'md5': '6516c8ac63b03de06bc8eac14362db4f',
+ 'info_dict': {
+ 'id': '209733',
+ 'ext': 'mp4',
+ 'title': 'Hot Babe Holly Michaels gets her ass stuffed by black',
+ 'uploader': 'WCP Club',
+ 'uploader_id': 'wcp-club',
+ 'upload_date': '20131031',
+ 'timestamp': 1383263892,
+ 'duration': 583,
+ 'view_count': int,
+ 'like_count': int,
+ 'categories': list,
+ 'age_limit': 18,
+ },
+ }, {
+ 'url': 'http://www.4tube.com/embed/209733',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://m.4tube.com/videos/209733/hot-babe-holly-michaels-gets-her-ass-stuffed-by-black',
+ 'only_matching': True,
+ }]
+
+
+class FuxIE(FourTubeBaseIE):
+ _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?fux\.com/(?:video|embed)/(?P<id>\d+)(?:/(?P<display_id>[^/?#&]+))?'
+ _URL_TEMPLATE = 'https://www.fux.com/video/%s/video'
+ _TESTS = [{
+ 'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow',
+ 'info_dict': {
+ 'id': '195359',
+ 'ext': 'mp4',
+ 'title': 'Awesome fucking in the kitchen ends with cum swallow',
+ 'uploader': 'alenci2342',
+ 'uploader_id': 'alenci2342',
+ 'upload_date': '20131230',
+ 'timestamp': 1388361660,
+ 'duration': 289,
+ 'view_count': int,
+ 'like_count': int,
+ 'categories': list,
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.fux.com/embed/195359',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.fux.com/video/195359/awesome-fucking-kitchen-ends-cum-swallow',
+ 'only_matching': True,
+ }]
+
+
+class PornTubeIE(FourTubeBaseIE):
+ _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?porntube\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)'
+ _URL_TEMPLATE = 'https://www.porntube.com/videos/video_%s'
+ _TESTS = [{
+ 'url': 'https://www.porntube.com/videos/teen-couple-doing-anal_7089759',
+ 'info_dict': {
+ 'id': '7089759',
+ 'ext': 'mp4',
+ 'title': 'Teen couple doing anal',
+ 'uploader': 'Alexy',
+ 'uploader_id': 'Alexy',
+ 'upload_date': '20150606',
+ 'timestamp': 1433595647,
+ 'duration': 5052,
+ 'view_count': int,
+ 'like_count': int,
+ 'categories': list,
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.porntube.com/embed/7089759',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://m.porntube.com/videos/teen-couple-doing-anal_7089759',
+ 'only_matching': True,
+ }]
+
+
+class PornerBrosIE(FourTubeBaseIE):
+ _VALID_URL = r'https?://(?:(?P<kind>www|m)\.)?pornerbros\.com/(?:videos/(?P<display_id>[^/]+)_|embed/)(?P<id>\d+)'
+ _URL_TEMPLATE = 'https://www.pornerbros.com/videos/video_%s'
+ _TESTS = [{
+ 'url': 'https://www.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369',
+ 'md5': '6516c8ac63b03de06bc8eac14362db4f',
+ 'info_dict': {
+ 'id': '181369',
+ 'ext': 'mp4',
+ 'title': 'Skinny brunette takes big cock down her anal hole',
+ 'uploader': 'PornerBros HD',
+ 'uploader_id': 'pornerbros-hd',
+ 'upload_date': '20130130',
+ 'timestamp': 1359527401,
+ 'duration': 1224,
+ 'view_count': int,
+ 'categories': list,
+ 'age_limit': 18,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.pornerbros.com/embed/181369',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://m.pornerbros.com/videos/skinny-brunette-takes-big-cock-down-her-anal-hole_181369',
+ 'only_matching': True,
+ }]
diff --git a/youtube_dl/extractor/fox.py b/youtube_dl/extractor/fox.py
index 159fdf9..facc665 100644
--- a/youtube_dl/extractor/fox.py
+++ b/youtube_dl/extractor/fox.py
@@ -3,56 +3,99 @@ from __future__ import unicode_literals
from .adobepass import AdobePassIE
from ..utils import (
- smuggle_url,
- update_url_query,
+ int_or_none,
+ parse_age_limit,
+ parse_duration,
+ try_get,
+ unified_timestamp,
)
class FOXIE(AdobePassIE):
- _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[0-9]+)'
- _TEST = {
- 'url': 'http://www.fox.com/watch/255180355939/7684182528',
+ _VALID_URL = r'https?://(?:www\.)?fox\.com/watch/(?P<id>[\da-fA-F]+)'
+ _TESTS = [{
+ # clip
+ 'url': 'https://www.fox.com/watch/4b765a60490325103ea69888fb2bd4e8/',
'md5': 'ebd296fcc41dd4b19f8115d8461a3165',
'info_dict': {
- 'id': '255180355939',
+ 'id': '4b765a60490325103ea69888fb2bd4e8',
'ext': 'mp4',
- 'title': 'Official Trailer: Gotham',
- 'description': 'Tracing the rise of the great DC Comics Super-Villains and vigilantes, Gotham reveals an entirely new chapter that has never been told.',
- 'duration': 129,
- 'timestamp': 1400020798,
- 'upload_date': '20140513',
- 'uploader': 'NEWA-FNG-FOXCOM',
+ 'title': 'Aftermath: Bruce Wayne Develops Into The Dark Knight',
+ 'description': 'md5:549cd9c70d413adb32ce2a779b53b486',
+ 'duration': 102,
+ 'timestamp': 1504291893,
+ 'upload_date': '20170901',
+ 'creator': 'FOX',
+ 'series': 'Gotham',
},
- 'add_ie': ['ThePlatform'],
- }
+ 'params': {
+ 'skip_download': True,
+ },
+ }, {
+ # episode, geo-restricted
+ 'url': 'https://www.fox.com/watch/087036ca7f33c8eb79b08152b4dd75c1/',
+ 'only_matching': True,
+ }, {
+ # episode, geo-restricted, tv provider required
+ 'url': 'https://www.fox.com/watch/30056b295fb57f7452aeeb4920bc3024/',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- settings = self._parse_json(self._search_regex(
- r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
- webpage, 'drupal settings'), video_id)
- fox_pdk_player = settings['fox_pdk_player']
- release_url = fox_pdk_player['release_url']
- query = {
- 'mbr': 'true',
- 'switch': 'http'
- }
- if fox_pdk_player.get('access') == 'locked':
- ap_p = settings['foxAdobePassProvider']
- rating = ap_p.get('videoRating')
- if rating == 'n/a':
- rating = None
- resource = self._get_mvpd_resource('fbc-fox', None, ap_p['videoGUID'], rating)
- query['auth'] = self._extract_mvpd_auth(url, video_id, 'fbc-fox', resource)
-
- info = self._search_json_ld(webpage, video_id, fatal=False)
- info.update({
- '_type': 'url_transparent',
- 'ie_key': 'ThePlatform',
- 'url': smuggle_url(update_url_query(release_url, query), {'force_smil_url': True}),
- 'id': video_id,
- })
- return info
+ video = self._download_json(
+ 'https://api.fox.com/fbc-content/v1_4/video/%s' % video_id,
+ video_id, headers={
+ 'apikey': 'abdcbed02c124d393b39e818a4312055',
+ 'Content-Type': 'application/json',
+ 'Referer': url,
+ })
+
+ title = video['name']
+
+ m3u8_url = self._download_json(
+ video['videoRelease']['url'], video_id)['playURL']
+
+ formats = self._extract_m3u8_formats(
+ m3u8_url, video_id, 'mp4',
+ entry_protocol='m3u8_native', m3u8_id='hls')
+ self._sort_formats(formats)
+
+ description = video.get('description')
+ duration = int_or_none(video.get('durationInSeconds')) or int_or_none(
+ video.get('duration')) or parse_duration(video.get('duration'))
+ timestamp = unified_timestamp(video.get('datePublished'))
+ age_limit = parse_age_limit(video.get('contentRating'))
+
+ data = try_get(
+ video, lambda x: x['trackingData']['properties'], dict) or {}
+
+ creator = data.get('brand') or data.get('network') or video.get('network')
+
+ series = video.get('seriesName') or data.get(
+ 'seriesName') or data.get('show')
+ season_number = int_or_none(video.get('seasonNumber'))
+ episode = video.get('name')
+ episode_number = int_or_none(video.get('episodeNumber'))
+ release_year = int_or_none(video.get('releaseYear'))
+
+ if data.get('authRequired'):
+ # TODO: AP (Adobe Pass authentication for auth-required videos)
+ pass
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'age_limit': age_limit,
+ 'creator': creator,
+ 'series': series,
+ 'season_number': season_number,
+ 'episode': episode,
+ 'episode_number': episode_number,
+ 'release_year': release_year,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/funnyordie.py b/youtube_dl/extractor/funnyordie.py
index 4940936..f85e7de 100644
--- a/youtube_dl/extractor/funnyordie.py
+++ b/youtube_dl/extractor/funnyordie.py
@@ -1,10 +1,14 @@
from __future__ import unicode_literals
-import json
import re
from .common import InfoExtractor
-from ..utils import ExtractorError
+from ..utils import (
+ ExtractorError,
+ float_or_none,
+ int_or_none,
+ unified_timestamp,
+)
class FunnyOrDieIE(InfoExtractor):
@@ -18,6 +22,10 @@ class FunnyOrDieIE(InfoExtractor):
'title': 'Heart-Shaped Box: Literal Video Version',
'description': 'md5:ea09a01bc9a1c46d9ab696c01747c338',
'thumbnail': r're:^http:.*\.jpg$',
+ 'uploader': 'DASjr',
+ 'timestamp': 1317904928,
+ 'upload_date': '20111006',
+ 'duration': 318.3,
},
}, {
'url': 'http://www.funnyordie.com/embed/e402820827',
@@ -27,6 +35,8 @@ class FunnyOrDieIE(InfoExtractor):
'title': 'Please Use This Song (Jon Lajoie)',
'description': 'Please use this to sell something. www.jonlajoie.com',
'thumbnail': r're:^http:.*\.jpg$',
+ 'timestamp': 1398988800,
+ 'upload_date': '20140502',
},
'params': {
'skip_download': True,
@@ -100,15 +110,53 @@ class FunnyOrDieIE(InfoExtractor):
'url': 'http://www.funnyordie.com%s' % src,
}]
- post_json = self._search_regex(
- r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details')
- post = json.loads(post_json)
+ timestamp = unified_timestamp(self._html_search_meta(
+ 'uploadDate', webpage, 'timestamp', default=None))
+
+ uploader = self._html_search_regex(
+ r'<h\d[^>]+\bclass=["\']channel-preview-name[^>]+>(.+?)</h',
+ webpage, 'uploader', default=None)
+
+ title, description, thumbnail, duration = [None] * 4
+
+ medium = self._parse_json(
+ self._search_regex(
+ r'jsonMedium\s*=\s*({.+?});', webpage, 'JSON medium',
+ default='{}'),
+ video_id, fatal=False)
+ if medium:
+ title = medium.get('title')
+ duration = float_or_none(medium.get('duration'))
+ if not timestamp:
+ timestamp = unified_timestamp(medium.get('publishDate'))
+
+ post = self._parse_json(
+ self._search_regex(
+ r'fb_post\s*=\s*(\{.*?\});', webpage, 'post details',
+ default='{}'),
+ video_id, fatal=False)
+ if post:
+ if not title:
+ title = post.get('name')
+ description = post.get('description')
+ thumbnail = post.get('picture')
+
+ if not title:
+ title = self._og_search_title(webpage)
+ if not description:
+ description = self._og_search_description(webpage)
+ if not duration:
+ duration = int_or_none(self._html_search_meta(
+ ('video:duration', 'duration'), webpage, 'duration', default=False))
return {
'id': video_id,
- 'title': post['name'],
- 'description': post.get('description'),
- 'thumbnail': post.get('picture'),
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'uploader': uploader,
+ 'timestamp': timestamp,
+ 'duration': duration,
'formats': formats,
'subtitles': subtitles,
}
diff --git a/youtube_dl/extractor/generic.py b/youtube_dl/extractor/generic.py
index f9bff43..b83c183 100644
--- a/youtube_dl/extractor/generic.py
+++ b/youtube_dl/extractor/generic.py
@@ -36,6 +36,10 @@ from .brightcove import (
BrightcoveLegacyIE,
BrightcoveNewIE,
)
+from .nexx import (
+ NexxIE,
+ NexxEmbedIE,
+)
from .nbc import NBCSportsVPlayerIE
from .ooyala import OoyalaIE
from .rutv import RUTVIE
@@ -57,6 +61,7 @@ from .dailymotion import (
DailymotionIE,
DailymotionCloudIE,
)
+from .dailymail import DailyMailIE
from .onionstudios import OnionStudiosIE
from .viewlift import ViewLiftEmbedIE
from .mtv import MTVServicesEmbeddedIE
@@ -91,6 +96,9 @@ from .anvato import AnvatoIE
from .washingtonpost import WashingtonPostIE
from .wistia import WistiaIE
from .mediaset import MediasetIE
+from .joj import JojIE
+from .megaphone import MegaphoneIE
+from .vzaar import VzaarIE
class GenericIE(InfoExtractor):
@@ -568,6 +576,19 @@ class GenericIE(InfoExtractor):
},
'skip': 'movie expired',
},
+ # ooyala video embedded with http://player.ooyala.com/static/v4/production/latest/core.min.js
+ {
+ 'url': 'http://wnep.com/2017/07/22/steampunk-fest-comes-to-honesdale/',
+ 'info_dict': {
+ 'id': 'lwYWYxYzE6V5uJMjNGyKtwwiw9ZJD7t2',
+ 'ext': 'mp4',
+ 'title': 'Steampunk Fest Comes to Honesdale',
+ 'duration': 43.276,
+ },
+ 'params': {
+ 'skip_download': True,
+ }
+ },
# embed.ly video
{
'url': 'http://www.tested.com/science/weird/460206-tested-grinding-coffee-2000-frames-second/',
@@ -759,6 +780,20 @@ class GenericIE(InfoExtractor):
},
'add_ie': ['Dailymotion'],
},
+ # DailyMail embed
+ {
+ 'url': 'http://www.bumm.sk/krimi/2017/07/05/biztonsagi-kamera-buktatta-le-az-agg-ferfit-utlegelo-apolot',
+ 'info_dict': {
+ 'id': '1495629',
+ 'ext': 'mp4',
+ 'title': 'Care worker punches elderly dementia patient in head 11 times',
+ 'description': 'md5:3a743dee84e57e48ec68bf67113199a5',
+ },
+ 'add_ie': ['DailyMail'],
+ 'params': {
+ 'skip_download': True,
+ },
+ },
# YouTube embed
{
'url': 'http://www.badzine.de/ansicht/datum/2014/06/09/so-funktioniert-die-neue-englische-badminton-liga.html',
@@ -1185,7 +1220,7 @@ class GenericIE(InfoExtractor):
},
'add_ie': ['Kaltura'],
},
- # Eagle.Platform embed (generic URL)
+ # EaglePlatform embed (generic URL)
{
'url': 'http://lenta.ru/news/2015/03/06/navalny/',
# Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
@@ -1199,8 +1234,26 @@ class GenericIE(InfoExtractor):
'view_count': int,
'age_limit': 0,
},
+ 'params': {
+ 'skip_download': True,
+ },
},
- # ClipYou (Eagle.Platform) embed (custom URL)
+ # referrer protected EaglePlatform embed
+ {
+ 'url': 'https://tvrain.ru/lite/teleshow/kak_vse_nachinalos/namin-418921/',
+ 'info_dict': {
+ 'id': '582306',
+ 'ext': 'mp4',
+ 'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 3382,
+ 'view_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ # ClipYou (EaglePlatform) embed (custom URL)
{
'url': 'http://muz-tv.ru/play/7129/',
# Not checking MD5 as sometimes the direct HTTP link results in 404 and HLS is used
@@ -1212,6 +1265,9 @@ class GenericIE(InfoExtractor):
'duration': 216,
'view_count': int,
},
+ 'params': {
+ 'skip_download': True,
+ },
},
# Pladform embed
{
@@ -1463,14 +1519,27 @@ class GenericIE(InfoExtractor):
# LiveLeak embed
{
'url': 'http://www.wykop.pl/link/3088787/',
- 'md5': 'ace83b9ed19b21f68e1b50e844fdf95d',
+ 'md5': '7619da8c820e835bef21a1efa2a0fc71',
'info_dict': {
'id': '874_1459135191',
'ext': 'mp4',
'title': 'Man shows poor quality of new apartment building',
'description': 'The wall is like a sand pile.',
'uploader': 'Lake8737',
- }
+ },
+ 'add_ie': [LiveLeakIE.ie_key()],
+ },
+ # Another LiveLeak embed pattern (#13336)
+ {
+ 'url': 'https://milo.yiannopoulos.net/2017/06/concealed-carry-robbery/',
+ 'info_dict': {
+ 'id': '2eb_1496309988',
+ 'ext': 'mp4',
+ 'title': 'Thief robs place where everyone was armed',
+ 'description': 'md5:694d73ee79e535953cf2488562288eee',
+ 'uploader': 'brazilwtf',
+ },
+ 'add_ie': [LiveLeakIE.ie_key()],
},
# Duplicated embedded video URLs
{
@@ -1512,6 +1581,22 @@ class GenericIE(InfoExtractor):
},
'add_ie': ['BrightcoveLegacy'],
},
+ # Nexx embed
+ {
+ 'url': 'https://www.funk.net/serien/5940e15073f6120001657956/items/593efbb173f6120001657503',
+ 'info_dict': {
+ 'id': '247746',
+ 'ext': 'mp4',
+ 'title': "Yesterday's Jam (OV)",
+ 'description': 'md5:09bc0984723fed34e2581624a84e05f0',
+ 'timestamp': 1492594816,
+ 'upload_date': '20170419',
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': True,
+ },
+ },
# Facebook <iframe> embed
{
'url': 'https://www.hostblogger.de/blog/archives/6181-Auto-jagt-Betonmischer.html',
@@ -1714,6 +1799,21 @@ class GenericIE(InfoExtractor):
'playlist_mincount': 5,
},
{
+ # Limelight embed (LimelightPlayerUtil.embed)
+ 'url': 'https://tv5.ca/videos?v=xuu8qowr291ri',
+ 'info_dict': {
+ 'id': '95d035dc5c8a401588e9c0e6bd1e9c92',
+ 'ext': 'mp4',
+ 'title': '07448641',
+ 'timestamp': 1499890639,
+ 'upload_date': '20170712',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'add_ie': ['LimelightMedia'],
+ },
+ {
'url': 'http://kron4.com/2017/04/28/standoff-with-walnut-creek-murder-suspect-ends-with-arrest/',
'info_dict': {
'id': 'standoff-with-walnut-creek-murder-suspect-ends-with-arrest',
@@ -1749,6 +1849,36 @@ class GenericIE(InfoExtractor):
},
'add_ie': [MediasetIE.ie_key()],
},
+ {
+ # JOJ.sk embeds
+ 'url': 'https://www.noviny.sk/slovensko/238543-slovenskom-sa-prehnala-vlna-silnych-burok',
+ 'info_dict': {
+ 'id': '238543-slovenskom-sa-prehnala-vlna-silnych-burok',
+ 'title': 'Slovenskom sa prehnala vlna silných búrok',
+ },
+ 'playlist_mincount': 5,
+ 'add_ie': [JojIE.ie_key()],
+ },
+ {
+ # AMP embed (see https://www.ampproject.org/docs/reference/components/amp-video)
+ 'url': 'https://tvrain.ru/amp/418921/',
+ 'md5': 'cc00413936695987e8de148b67d14f1d',
+ 'info_dict': {
+ 'id': '418921',
+ 'ext': 'mp4',
+ 'title': 'Стас Намин: «Мы нарушили девственность Кремля»',
+ },
+ },
+ {
+ # vzaar embed
+ 'url': 'http://help.vzaar.com/article/165-embedding-video',
+ 'md5': '7e3919d9d2620b89e3e00bec7fe8c9d4',
+ 'info_dict': {
+ 'id': '8707641',
+ 'ext': 'mp4',
+ 'title': 'Building A Business Online: Principal Chairs Q & A',
+ },
+ },
# {
# # TODO: find another test
# # http://schema.org/VideoObject
@@ -1898,7 +2028,7 @@ class GenericIE(InfoExtractor):
if head_response is not False:
# Check for redirect
- new_url = head_response.geturl()
+ new_url = compat_str(head_response.geturl())
if url != new_url:
self.report_following_redirect(new_url)
if force_videoid:
@@ -1999,7 +2129,7 @@ class GenericIE(InfoExtractor):
elif re.match(r'(?i)^(?:{[^}]+})?MPD$', doc.tag):
info_dict['formats'] = self._parse_mpd_formats(
doc, video_id,
- mpd_base_url=full_response.geturl().rpartition('/')[0],
+ mpd_base_url=compat_str(full_response.geturl()).rpartition('/')[0],
mpd_url=url)
self._sort_formats(info_dict['formats'])
return info_dict
@@ -2076,6 +2206,16 @@ class GenericIE(InfoExtractor):
if bc_urls:
return self.playlist_from_matches(bc_urls, video_id, video_title, ie='BrightcoveNew')
+ # Look for Nexx embeds
+ nexx_urls = NexxIE._extract_urls(webpage)
+ if nexx_urls:
+ return self.playlist_from_matches(nexx_urls, video_id, video_title, ie=NexxIE.ie_key())
+
+ # Look for Nexx iFrame embeds
+ nexx_embed_urls = NexxEmbedIE._extract_urls(webpage)
+ if nexx_embed_urls:
+ return self.playlist_from_matches(nexx_embed_urls, video_id, video_title, ie=NexxEmbedIE.ie_key())
+
# Look for ThePlatform embeds
tp_urls = ThePlatformIE._extract_urls(webpage)
if tp_urls:
@@ -2103,36 +2243,11 @@ class GenericIE(InfoExtractor):
if vid_me_embed_url is not None:
return self.url_result(vid_me_embed_url, 'Vidme')
- # Look for embedded YouTube player
- matches = re.findall(r'''(?x)
- (?:
- <iframe[^>]+?src=|
- data-video-url=|
- <embed[^>]+?src=|
- embedSWF\(?:\s*|
- <object[^>]+data=|
- new\s+SWFObject\(
- )
- (["\'])
- (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
- (?:embed|v|p)/.+?)
- \1''', webpage)
- if matches:
+ # Look for YouTube embeds
+ youtube_urls = YoutubeIE._extract_urls(webpage)
+ if youtube_urls:
return self.playlist_from_matches(
- matches, video_id, video_title, lambda m: unescapeHTML(m[1]))
-
- # Look for lazyYT YouTube embed
- matches = re.findall(
- r'class="lazyYT" data-youtube-id="([^"]+)"', webpage)
- if matches:
- return self.playlist_from_matches(matches, video_id, video_title, lambda m: unescapeHTML(m))
-
- # Look for Wordpress "YouTube Video Importer" plugin
- matches = re.findall(r'''(?x)<div[^>]+
- class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
- data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
- if matches:
- return self.playlist_from_matches(matches, video_id, video_title, lambda m: m[-1])
+ youtube_urls, video_id, video_title, ie=YoutubeIE.ie_key())
matches = DailymotionIE._extract_urls(webpage)
if matches:
@@ -2148,6 +2263,12 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
playlists, video_id, video_title, lambda p: '//dailymotion.com/playlist/%s' % p)
+ # Look for DailyMail embeds
+ dailymail_urls = DailyMailIE._extract_urls(webpage)
+ if dailymail_urls:
+ return self.playlist_from_matches(
+ dailymail_urls, video_id, video_title, ie=DailyMailIE.ie_key())
+
# Look for embedded Wistia player
wistia_url = WistiaIE._extract_url(webpage)
if wistia_url:
@@ -2199,6 +2320,7 @@ class GenericIE(InfoExtractor):
# Look for Ooyala videos
mobj = (re.search(r'player\.ooyala\.com/[^"?]+[?#][^"]*?(?:embedCode|ec)=(?P<ec>[^"&]+)', webpage) or
re.search(r'OO\.Player\.create\([\'"].*?[\'"],\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
+ re.search(r'OO\.Player\.create\.apply\(\s*OO\.Player\s*,\s*op\(\s*\[\s*[\'"][^\'"]*[\'"]\s*,\s*[\'"](?P<ec>.{32})[\'"]', webpage) or
re.search(r'SBN\.VideoLinkset\.ooyala\([\'"](?P<ec>.{32})[\'"]\)', webpage) or
re.search(r'data-ooyala-video-id\s*=\s*[\'"](?P<ec>.{32})[\'"]', webpage))
if mobj is not None:
@@ -2443,12 +2565,12 @@ class GenericIE(InfoExtractor):
if kaltura_url:
return self.url_result(smuggle_url(kaltura_url, {'source_url': url}), KalturaIE.ie_key())
- # Look for Eagle.Platform embeds
+ # Look for EaglePlatform embeds
eagleplatform_url = EaglePlatformIE._extract_url(webpage)
if eagleplatform_url:
- return self.url_result(eagleplatform_url, EaglePlatformIE.ie_key())
+ return self.url_result(smuggle_url(eagleplatform_url, {'referrer': url}), EaglePlatformIE.ie_key())
- # Look for ClipYou (uses Eagle.Platform) embeds
+ # Look for ClipYou (uses EaglePlatform) embeds
mobj = re.search(
r'<iframe[^>]+src="https?://(?P<host>media\.clipyou\.ru)/index/player\?.*\brecord_id=(?P<id>\d+).*"', webpage)
if mobj is not None:
@@ -2623,9 +2745,9 @@ class GenericIE(InfoExtractor):
self._proto_relative_url(instagram_embed_url), InstagramIE.ie_key())
# Look for LiveLeak embeds
- liveleak_url = LiveLeakIE._extract_url(webpage)
- if liveleak_url:
- return self.url_result(liveleak_url, 'LiveLeak')
+ liveleak_urls = LiveLeakIE._extract_urls(webpage)
+ if liveleak_urls:
+ return self.playlist_from_matches(liveleak_urls, video_id, video_title)
# Look for 3Q SDN embeds
threeqsdn_url = ThreeQSDNIE._extract_url(webpage)
@@ -2677,7 +2799,7 @@ class GenericIE(InfoExtractor):
rutube_urls = RutubeIE._extract_urls(webpage)
if rutube_urls:
return self.playlist_from_matches(
- rutube_urls, ie=RutubeIE.ie_key())
+ rutube_urls, video_id, video_title, ie=RutubeIE.ie_key())
# Look for WashingtonPost embeds
wapo_urls = WashingtonPostIE._extract_urls(webpage)
@@ -2691,6 +2813,24 @@ class GenericIE(InfoExtractor):
return self.playlist_from_matches(
mediaset_urls, video_id, video_title, ie=MediasetIE.ie_key())
+ # Look for JOJ.sk embeds
+ joj_urls = JojIE._extract_urls(webpage)
+ if joj_urls:
+ return self.playlist_from_matches(
+ joj_urls, video_id, video_title, ie=JojIE.ie_key())
+
+ # Look for megaphone.fm embeds
+ mpfn_urls = MegaphoneIE._extract_urls(webpage)
+ if mpfn_urls:
+ return self.playlist_from_matches(
+ mpfn_urls, video_id, video_title, ie=MegaphoneIE.ie_key())
+
+ # Look for vzaar embeds
+ vzaar_urls = VzaarIE._extract_urls(webpage)
+ if vzaar_urls:
+ return self.playlist_from_matches(
+ vzaar_urls, video_id, video_title, ie=VzaarIE.ie_key())
+
def merge_dicts(dict1, dict2):
merged = {}
for k, v in dict1.items():
@@ -2706,12 +2846,6 @@ class GenericIE(InfoExtractor):
merged[k] = v
return merged
- # Looking for http://schema.org/VideoObject
- json_ld = self._search_json_ld(
- webpage, video_id, default={}, expected_type='VideoObject')
- if json_ld.get('url'):
- return merge_dicts(json_ld, info_dict)
-
# Look for HTML5 media
entries = self._parse_html5_media_entries(url, webpage, video_id, m3u8_id='hls')
if entries:
@@ -2730,6 +2864,12 @@ class GenericIE(InfoExtractor):
jwplayer_data, video_id, require_title=False, base_url=url)
return merge_dicts(info, info_dict)
+ # Looking for http://schema.org/VideoObject
+ json_ld = self._search_json_ld(
+ webpage, video_id, default={}, expected_type='VideoObject')
+ if json_ld.get('url'):
+ return merge_dicts(json_ld, info_dict)
+
def check_video(vurl):
if YoutubeIE.suitable(vurl):
return True
diff --git a/youtube_dl/extractor/giantbomb.py b/youtube_dl/extractor/giantbomb.py
index 29b684d..6a1b1e9 100644
--- a/youtube_dl/extractor/giantbomb.py
+++ b/youtube_dl/extractor/giantbomb.py
@@ -5,9 +5,10 @@ import json
from .common import InfoExtractor
from ..utils import (
- unescapeHTML,
- qualities,
+ determine_ext,
int_or_none,
+ qualities,
+ unescapeHTML,
)
@@ -15,7 +16,7 @@ class GiantBombIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?giantbomb\.com/videos/(?P<display_id>[^/]+)/(?P<id>\d+-\d+)'
_TEST = {
'url': 'http://www.giantbomb.com/videos/quick-look-destiny-the-dark-below/2300-9782/',
- 'md5': '57badeface303ecf6b98b812de1b9018',
+ 'md5': 'c8ea694254a59246a42831155dec57ac',
'info_dict': {
'id': '2300-9782',
'display_id': 'quick-look-destiny-the-dark-below',
@@ -51,11 +52,16 @@ class GiantBombIE(InfoExtractor):
for format_id, video_url in video['videoStreams'].items():
if format_id == 'f4m_stream':
continue
- if video_url.endswith('.f4m'):
+ ext = determine_ext(video_url)
+ if ext == 'f4m':
f4m_formats = self._extract_f4m_formats(video_url + '?hdcore=3.3.1', display_id)
if f4m_formats:
f4m_formats[0]['quality'] = quality(format_id)
formats.extend(f4m_formats)
+ elif ext == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, display_id, ext='mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
else:
formats.append({
'url': video_url,
diff --git a/youtube_dl/extractor/googledrive.py b/youtube_dl/extractor/googledrive.py
index 9705cfa..3bf462d 100644
--- a/youtube_dl/extractor/googledrive.py
+++ b/youtube_dl/extractor/googledrive.py
@@ -4,17 +4,30 @@ import re
from .common import InfoExtractor
from ..utils import (
+ determine_ext,
ExtractorError,
int_or_none,
lowercase_escape,
+ update_url_query,
)
class GoogleDriveIE(InfoExtractor):
- _VALID_URL = r'https?://(?:(?:docs|drive)\.google\.com/(?:uc\?.*?id=|file/d/)|video\.google\.com/get_player\?.*?docid=)(?P<id>[a-zA-Z0-9_-]{28,})'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:
+ (?:docs|drive)\.google\.com/
+ (?:
+ (?:uc|open)\?.*?id=|
+ file/d/
+ )|
+ video\.google\.com/get_player\?.*?docid=
+ )
+ (?P<id>[a-zA-Z0-9_-]{28,})
+ '''
_TESTS = [{
'url': 'https://drive.google.com/file/d/0ByeS4oOUV-49Zzh4R1J6R09zazQ/edit?pli=1',
- 'md5': 'd109872761f7e7ecf353fa108c0dbe1e',
+ 'md5': '5c602afbbf2c1db91831f5d82f678554',
'info_dict': {
'id': '0ByeS4oOUV-49Zzh4R1J6R09zazQ',
'ext': 'mp4',
@@ -22,8 +35,30 @@ class GoogleDriveIE(InfoExtractor):
'duration': 45,
}
}, {
+ # video can't be watched anonymously due to view count limit reached,
+ # but can be downloaded (see https://github.com/rg3/youtube-dl/issues/14046)
+ 'url': 'https://drive.google.com/file/d/0B-vUyvmDLdWDcEt4WjBqcmI2XzQ/view',
+ 'md5': 'bfbd670d03a470bb1e6d4a257adec12e',
+ 'info_dict': {
+ 'id': '0B-vUyvmDLdWDcEt4WjBqcmI2XzQ',
+ 'ext': 'mp4',
+ 'title': 'Annabelle Creation (2017)- Z.V1 [TH].MP4',
+ }
+ }, {
# video id is longer than 28 characters
'url': 'https://drive.google.com/file/d/1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ/edit',
+ 'info_dict': {
+ 'id': '1ENcQ_jeCuj7y19s66_Ou9dRP4GKGsodiDQ',
+ 'ext': 'mp4',
+ 'title': 'Andreea Banica feat Smiley - Hooky Song (Official Video).mp4',
+ 'duration': 189,
+ },
+ 'only_matching': True,
+ }, {
+ 'url': 'https://drive.google.com/open?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://drive.google.com/uc?id=0B2fjwgkl1A_CX083Tkowdmt6d28',
'only_matching': True,
}]
_FORMATS_EXT = {
@@ -44,6 +79,13 @@ class GoogleDriveIE(InfoExtractor):
'46': 'webm',
'59': 'mp4',
}
+ _BASE_URL_CAPTIONS = 'https://drive.google.com/timedtext'
+ _CAPTIONS_ENTRY_TAG = {
+ 'subtitles': 'track',
+ 'automatic_captions': 'target',
+ }
+ _caption_formats_ext = []
+ _captions_xml = None
@staticmethod
def _extract_url(webpage):
@@ -53,54 +95,183 @@ class GoogleDriveIE(InfoExtractor):
if mobj:
return 'https://drive.google.com/file/d/%s' % mobj.group('id')
+ def _download_subtitles_xml(self, video_id, subtitles_id, hl):
+ if self._captions_xml:
+ return
+ self._captions_xml = self._download_xml(
+ self._BASE_URL_CAPTIONS, video_id, query={
+ 'id': video_id,
+ 'vid': subtitles_id,
+ 'hl': hl,
+ 'v': video_id,
+ 'type': 'list',
+ 'tlangs': '1',
+ 'fmts': '1',
+ 'vssids': '1',
+ }, note='Downloading subtitles XML',
+ errnote='Unable to download subtitles XML', fatal=False)
+ if self._captions_xml:
+ for f in self._captions_xml.findall('format'):
+ if f.attrib.get('fmt_code') and not f.attrib.get('default'):
+ self._caption_formats_ext.append(f.attrib['fmt_code'])
+
+ def _get_captions_by_type(self, video_id, subtitles_id, caption_type,
+ origin_lang_code=None):
+ if not subtitles_id or not caption_type:
+ return
+ captions = {}
+ for caption_entry in self._captions_xml.findall(
+ self._CAPTIONS_ENTRY_TAG[caption_type]):
+ caption_lang_code = caption_entry.attrib.get('lang_code')
+ if not caption_lang_code:
+ continue
+ caption_format_data = []
+ for caption_format in self._caption_formats_ext:
+ query = {
+ 'vid': subtitles_id,
+ 'v': video_id,
+ 'fmt': caption_format,
+ 'lang': (caption_lang_code if origin_lang_code is None
+ else origin_lang_code),
+ 'type': 'track',
+ 'name': '',
+ 'kind': '',
+ }
+ if origin_lang_code is not None:
+ query.update({'tlang': caption_lang_code})
+ caption_format_data.append({
+ 'url': update_url_query(self._BASE_URL_CAPTIONS, query),
+ 'ext': caption_format,
+ })
+ captions[caption_lang_code] = caption_format_data
+ return captions
+
+ def _get_subtitles(self, video_id, subtitles_id, hl):
+ if not subtitles_id or not hl:
+ return
+ self._download_subtitles_xml(video_id, subtitles_id, hl)
+ if not self._captions_xml:
+ return
+ return self._get_captions_by_type(video_id, subtitles_id, 'subtitles')
+
+ def _get_automatic_captions(self, video_id, subtitles_id, hl):
+ if not subtitles_id or not hl:
+ return
+ self._download_subtitles_xml(video_id, subtitles_id, hl)
+ if not self._captions_xml:
+ return
+ track = self._captions_xml.find('track')
+ if track is None:
+ return
+ origin_lang_code = track.attrib.get('lang_code')
+ if not origin_lang_code:
+ return
+ return self._get_captions_by_type(
+ video_id, subtitles_id, 'automatic_captions', origin_lang_code)
+
def _real_extract(self, url):
video_id = self._match_id(url)
webpage = self._download_webpage(
'http://docs.google.com/file/d/%s' % video_id, video_id)
- reason = self._search_regex(r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
- if reason:
- raise ExtractorError(reason)
-
- title = self._search_regex(r'"title"\s*,\s*"([^"]+)', webpage, 'title')
+ title = self._search_regex(
+ r'"title"\s*,\s*"([^"]+)', webpage, 'title',
+ default=None) or self._og_search_title(webpage)
duration = int_or_none(self._search_regex(
- r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds', default=None))
+ r'"length_seconds"\s*,\s*"([^"]+)', webpage, 'length seconds',
+ default=None))
+
+ formats = []
fmt_stream_map = self._search_regex(
- r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage, 'fmt stream map').split(',')
- fmt_list = self._search_regex(r'"fmt_list"\s*,\s*"([^"]+)', webpage, 'fmt_list').split(',')
+ r'"fmt_stream_map"\s*,\s*"([^"]+)', webpage,
+ 'fmt stream map', default='').split(',')
+ fmt_list = self._search_regex(
+ r'"fmt_list"\s*,\s*"([^"]+)', webpage,
+ 'fmt_list', default='').split(',')
+ if fmt_stream_map and fmt_list:
+ resolutions = {}
+ for fmt in fmt_list:
+ mobj = re.search(
+ r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
+ if mobj:
+ resolutions[mobj.group('format_id')] = (
+ int(mobj.group('width')), int(mobj.group('height')))
- resolutions = {}
- for fmt in fmt_list:
- mobj = re.search(
- r'^(?P<format_id>\d+)/(?P<width>\d+)[xX](?P<height>\d+)', fmt)
- if mobj:
- resolutions[mobj.group('format_id')] = (
- int(mobj.group('width')), int(mobj.group('height')))
+ for fmt_stream in fmt_stream_map:
+ fmt_stream_split = fmt_stream.split('|')
+ if len(fmt_stream_split) < 2:
+ continue
+ format_id, format_url = fmt_stream_split[:2]
+ f = {
+ 'url': lowercase_escape(format_url),
+ 'format_id': format_id,
+ 'ext': self._FORMATS_EXT[format_id],
+ }
+ resolution = resolutions.get(format_id)
+ if resolution:
+ f.update({
+ 'width': resolution[0],
+ 'height': resolution[1],
+ })
+ formats.append(f)
- formats = []
- for fmt_stream in fmt_stream_map:
- fmt_stream_split = fmt_stream.split('|')
- if len(fmt_stream_split) < 2:
- continue
- format_id, format_url = fmt_stream_split[:2]
- f = {
- 'url': lowercase_escape(format_url),
- 'format_id': format_id,
- 'ext': self._FORMATS_EXT[format_id],
- }
- resolution = resolutions.get(format_id)
- if resolution:
- f.update({
- 'width': resolution[0],
- 'height': resolution[0],
+ source_url = update_url_query(
+ 'https://drive.google.com/uc', {
+ 'id': video_id,
+ 'export': 'download',
+ })
+ urlh = self._request_webpage(
+ source_url, video_id, note='Requesting source file',
+ errnote='Unable to request source file', fatal=False)
+ if urlh:
+ def add_source_format(src_url):
+ formats.append({
+ 'url': src_url,
+ 'ext': determine_ext(title, 'mp4').lower(),
+ 'format_id': 'source',
+ 'quality': 1,
})
- formats.append(f)
+ if urlh.headers.get('Content-Disposition'):
+ add_source_format(source_url)
+ else:
+ confirmation_webpage = self._webpage_read_content(
+ urlh, url, video_id, note='Downloading confirmation page',
+ errnote='Unable to confirm download', fatal=False)
+ if confirmation_webpage:
+ confirm = self._search_regex(
+ r'confirm=([^&"\']+)', confirmation_webpage,
+ 'confirmation code', fatal=False)
+ if confirm:
+ add_source_format(update_url_query(source_url, {
+ 'confirm': confirm,
+ }))
+
+ if not formats:
+ reason = self._search_regex(
+ r'"reason"\s*,\s*"([^"]+)', webpage, 'reason', default=None)
+ if reason:
+ raise ExtractorError(reason, expected=True)
+
self._sort_formats(formats)
+ hl = self._search_regex(
+ r'"hl"\s*,\s*"([^"]+)', webpage, 'hl', default=None)
+ subtitles_id = None
+ ttsurl = self._search_regex(
+ r'"ttsurl"\s*,\s*"([^"]+)', webpage, 'ttsurl', default=None)
+ if ttsurl:
+ # the video Id for subtitles will be the last value in the ttsurl
+ # query string
+ subtitles_id = ttsurl.encode('utf-8').decode(
+ 'unicode_escape').split('=')[-1]
+
return {
'id': video_id,
'title': title,
'thumbnail': self._og_search_thumbnail(webpage, default=None),
'duration': duration,
'formats': formats,
+ 'subtitles': self.extract_subtitles(video_id, subtitles_id, hl),
+ 'automatic_captions': self.extract_automatic_captions(
+ video_id, subtitles_id, hl),
}
diff --git a/youtube_dl/extractor/itv.py b/youtube_dl/extractor/itv.py
index f315680..26c48e4 100644
--- a/youtube_dl/extractor/itv.py
+++ b/youtube_dl/extractor/itv.py
@@ -59,12 +59,18 @@ class ITVIE(InfoExtractor):
def _add_sub_element(element, name):
return etree.SubElement(element, _add_ns(name))
+ production_id = (
+ params.get('data-video-autoplay-id') or
+ '%s#001' % (
+ params.get('data-video-episode-id') or
+ video_id.replace('a', '/')))
+
req_env = etree.Element(_add_ns('soapenv:Envelope'))
_add_sub_element(req_env, 'soapenv:Header')
body = _add_sub_element(req_env, 'soapenv:Body')
get_playlist = _add_sub_element(body, ('tem:GetPlaylist'))
request = _add_sub_element(get_playlist, 'tem:request')
- _add_sub_element(request, 'itv:ProductionId').text = params['data-video-id']
+ _add_sub_element(request, 'itv:ProductionId').text = production_id
_add_sub_element(request, 'itv:RequestGuid').text = compat_str(uuid.uuid4()).upper()
vodcrid = _add_sub_element(request, 'itv:Vodcrid')
_add_sub_element(vodcrid, 'com:Id')
diff --git a/youtube_dl/extractor/joj.py b/youtube_dl/extractor/joj.py
new file mode 100755
index 0000000..a764023
--- /dev/null
+++ b/youtube_dl/extractor/joj.py
@@ -0,0 +1,109 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    int_or_none,
+    js_to_json,
+    try_get,
+)
+
+
+class JojIE(InfoExtractor):
+    # Handles media.joj.sk embeds, addressed either by the embed page URL or
+    # by the internal "joj:<uuid>" scheme.
+    _VALID_URL = r'''(?x)
+                    (?:
+                        joj:|
+                        https?://media\.joj\.sk/embed/
+                    )
+                    (?P<id>[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})
+                '''
+    _TESTS = [{
+        'url': 'https://media.joj.sk/embed/a388ec4c-6019-4a4a-9312-b1bee194e932',
+        'info_dict': {
+            'id': 'a388ec4c-6019-4a4a-9312-b1bee194e932',
+            'ext': 'mp4',
+            'title': 'NOVÉ BÝVANIE',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 3118,
+        }
+    }, {
+        'url': 'joj:a388ec4c-6019-4a4a-9312-b1bee194e932',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        # Return the URL of every media.joj.sk embed iframe found in webpage.
+        return re.findall(
+            r'<iframe\b[^>]+\bsrc=["\'](?P<url>(?:https?:)?//media\.joj\.sk/embed/[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12})',
+            webpage)
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(
+            'https://media.joj.sk/embed/%s' % video_id, video_id)
+
+        # Title lookup order: player videoTitle, <title> tag, then og:title.
+        title = self._search_regex(
+            (r'videoTitle\s*:\s*(["\'])(?P<title>(?:(?!\1).)+)\1',
+             r'<title>(?P<title>[^<]+)'), webpage, 'title',
+            default=None, group='title') or self._og_search_title(webpage)
+
+        # The bitrates object may be absent or malformed, hence default='{}'
+        # and fatal=False.
+        bitrates = self._parse_json(
+            self._search_regex(
+                r'(?s)bitrates\s*=\s*({.+?});', webpage, 'bitrates',
+                default='{}'),
+            video_id, transform_source=js_to_json, fatal=False)
+
+        formats = []
+        for format_url in try_get(bitrates, lambda x: x['mp4'], list) or []:
+            if isinstance(format_url, compat_str):
+                height = self._search_regex(
+                    r'(\d+)[pP]\.', format_url, 'height', default=None)
+                formats.append({
+                    'url': format_url,
+                    'format_id': '%sp' % height if height else None,
+                    # height is None when the URL has no "<N>p." marker;
+                    # int(None) would raise TypeError, int_or_none does not.
+                    'height': int_or_none(height),
+                })
+        if not formats:
+            # Nothing usable on the page: fall back to the XML playlist service.
+            playlist = self._download_xml(
+                'https://media.joj.sk/services/Video.php?clip=%s' % video_id,
+                video_id)
+            for file_el in playlist.findall('./files/file'):
+                path = file_el.get('path')
+                if not path:
+                    continue
+                format_id = file_el.get('id') or file_el.get('label')
+                formats.append({
+                    'url': 'http://n16.joj.sk/storage/%s' % path.replace(
+                        'dat/', '', 1),
+                    'format_id': format_id,
+                    'height': int_or_none(self._search_regex(
+                        r'(\d+)[pP]', format_id or path, 'height',
+                        default=None)),
+                })
+        self._sort_formats(formats)
+
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        duration = int_or_none(self._search_regex(
+            r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'thumbnail': thumbnail,
+            'duration': duration,
+            'formats': formats,
+        }
diff --git a/youtube_dl/extractor/kaltura.py b/youtube_dl/extractor/kaltura.py
index 41c1f3d..138d484 100644
--- a/youtube_dl/extractor/kaltura.py
+++ b/youtube_dl/extractor/kaltura.py
@@ -324,7 +324,7 @@ class KalturaIE(InfoExtractor):
if captions:
for caption in captions.get('objects', []):
# Continue if caption is not ready
- if f.get('status') != 2:
+ if caption.get('status') != 2:
continue
if not caption.get('id'):
continue
diff --git a/youtube_dl/extractor/karrierevideos.py b/youtube_dl/extractor/karrierevideos.py
index 4e9eb67..f236a2f 100644
--- a/youtube_dl/extractor/karrierevideos.py
+++ b/youtube_dl/extractor/karrierevideos.py
@@ -48,7 +48,7 @@ class KarriereVideosIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
title = (self._html_search_meta('title', webpage, default=None) or
- self._search_regex(r'<h1 class="title">([^<]+)</h1>'))
+ self._search_regex(r'<h1 class="title">([^<]+)</h1>', webpage, 'video title'))
video_id = self._search_regex(
r'/config/video/(.+?)\.xml', webpage, 'video id')
diff --git a/youtube_dl/extractor/laola1tv.py b/youtube_dl/extractor/laola1tv.py
index 1f91ba0..c7f8133 100644
--- a/youtube_dl/extractor/laola1tv.py
+++ b/youtube_dl/extractor/laola1tv.py
@@ -215,3 +215,21 @@ class Laola1TvIE(Laola1TvEmbedIE):
'formats': formats,
'is_live': is_live,
}
+
+
+class ITTFIE(InfoExtractor):
+ _VALID_URL = r'https?://tv\.ittf\.com/video/[^/]+/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'https://tv.ittf.com/video/peng-wang-wei-matsudaira-kenta/951802',
+ 'only_matching': True,
+ }
+
+ def _real_extract(self, url):
+ return self.url_result(
+ update_url_query('https://www.laola1.tv/titanplayer.php', {
+ 'videoid': self._match_id(url),
+ 'type': 'V',
+ 'lang': 'en',
+ 'portal': 'int',
+ 'customer': 1024,
+ }), Laola1TvEmbedIE.ie_key())
diff --git a/youtube_dl/extractor/limelight.py b/youtube_dl/extractor/limelight.py
index 0a5a395..ad65b27 100644
--- a/youtube_dl/extractor/limelight.py
+++ b/youtube_dl/extractor/limelight.py
@@ -26,14 +26,16 @@ class LimelightBaseIE(InfoExtractor):
'Channel': 'channel',
'ChannelList': 'channel_list',
}
+
+ def smuggle(url):
+ return smuggle_url(url, {'source_url': source_url})
+
entries = []
for kind, video_id in re.findall(
r'LimelightPlayer\.doLoad(Media|Channel|ChannelList)\(["\'](?P<id>[a-z0-9]{32})',
webpage):
entries.append(cls.url_result(
- smuggle_url(
- 'limelight:%s:%s' % (lm[kind], video_id),
- {'source_url': source_url}),
+ smuggle('limelight:%s:%s' % (lm[kind], video_id)),
'Limelight%s' % kind, video_id))
for mobj in re.finditer(
# As per [1] class attribute should be exactly equal to
@@ -49,10 +51,15 @@ class LimelightBaseIE(InfoExtractor):
''', webpage):
kind, video_id = mobj.group('kind'), mobj.group('id')
entries.append(cls.url_result(
- smuggle_url(
- 'limelight:%s:%s' % (kind, video_id),
- {'source_url': source_url}),
+ smuggle('limelight:%s:%s' % (kind, video_id)),
'Limelight%s' % kind.capitalize(), video_id))
+ # http://support.3playmedia.com/hc/en-us/articles/115009517327-Limelight-Embedding-the-Audio-Description-Plugin-with-the-Limelight-Player-on-Your-Web-Page)
+ for video_id in re.findall(
+ r'(?s)LimelightPlayerUtil\.embed\s*\(\s*{.*?\bmediaId["\']\s*:\s*["\'](?P<id>[a-z0-9]{32})',
+ webpage):
+ entries.append(cls.url_result(
+ smuggle('limelight:media:%s' % video_id),
+ LimelightMediaIE.ie_key(), video_id))
return entries
def _call_playlist_service(self, item_id, method, fatal=True, referer=None):
diff --git a/youtube_dl/extractor/liveleak.py b/youtube_dl/extractor/liveleak.py
index b2247a8..246aac5 100644
--- a/youtube_dl/extractor/liveleak.py
+++ b/youtube_dl/extractor/liveleak.py
@@ -72,15 +72,20 @@ class LiveLeakIE(InfoExtractor):
'params': {
'skip_download': True,
},
+ }, {
+ 'url': 'https://www.liveleak.com/view?i=677_1439397581',
+ 'info_dict': {
+ 'id': '677_1439397581',
+ 'title': 'Fuel Depot in China Explosion caught on video',
+ },
+ 'playlist_count': 3,
}]
@staticmethod
- def _extract_url(webpage):
- mobj = re.search(
- r'<iframe[^>]+src="https?://(?:\w+\.)?liveleak\.com/ll_embed\?(?:.*?)i=(?P<id>[\w_]+)(?:.*)',
+ def _extract_urls(webpage):
+ return re.findall(
+ r'<iframe[^>]+src="(https?://(?:\w+\.)?liveleak\.com/ll_embed\?[^"]*[if]=[\w_]+[^"]+)"',
webpage)
- if mobj:
- return 'http://www.liveleak.com/view?i=%s' % mobj.group('id')
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -111,23 +116,54 @@ class LiveLeakIE(InfoExtractor):
'age_limit': age_limit,
}
- info_dict = entries[0]
+ for idx, info_dict in enumerate(entries):
+ for a_format in info_dict['formats']:
+ if not a_format.get('height'):
+ a_format['height'] = int_or_none(self._search_regex(
+ r'([0-9]+)p\.mp4', a_format['url'], 'height label',
+ default=None))
+
+ self._sort_formats(info_dict['formats'])
+
+ # Don't append entry ID for one-video pages to keep backward compatibility
+ if len(entries) > 1:
+ info_dict['id'] = '%s_%s' % (video_id, idx + 1)
+ else:
+ info_dict['id'] = video_id
- for a_format in info_dict['formats']:
- if not a_format.get('height'):
- a_format['height'] = int_or_none(self._search_regex(
- r'([0-9]+)p\.mp4', a_format['url'], 'height label',
- default=None))
+ info_dict.update({
+ 'title': video_title,
+ 'description': video_description,
+ 'uploader': video_uploader,
+ 'age_limit': age_limit,
+ 'thumbnail': video_thumbnail,
+ })
- self._sort_formats(info_dict['formats'])
+ return self.playlist_result(entries, video_id, video_title)
+
+
+class LiveLeakEmbedIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?liveleak\.com/ll_embed\?.*?\b(?P<kind>[if])=(?P<id>[\w_]+)'
+
+ # See generic.py for actual test cases
+ _TESTS = [{
+ 'url': 'https://www.liveleak.com/ll_embed?i=874_1459135191',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.liveleak.com/ll_embed?f=ab065df993c1',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ kind, video_id = mobj.group('kind', 'id')
- info_dict.update({
- 'id': video_id,
- 'title': video_title,
- 'description': video_description,
- 'uploader': video_uploader,
- 'age_limit': age_limit,
- 'thumbnail': video_thumbnail,
- })
+ if kind == 'f':
+ webpage = self._download_webpage(url, video_id)
+ liveleak_url = self._search_regex(
+ r'logourl\s*:\s*(?P<q1>[\'"])(?P<url>%s)(?P=q1)' % LiveLeakIE._VALID_URL,
+ webpage, 'LiveLeak URL', group='url')
+ elif kind == 'i':
+ liveleak_url = 'http://www.liveleak.com/view?i=%s' % video_id
- return info_dict
+ return self.url_result(liveleak_url, ie=LiveLeakIE.ie_key())
diff --git a/youtube_dl/extractor/manyvids.py b/youtube_dl/extractor/manyvids.py
new file mode 100644
index 0000000..b94b3c2
--- /dev/null
+++ b/youtube_dl/extractor/manyvids.py
@@ -0,0 +1,52 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import int_or_none
+
+
+class ManyVidsIE(InfoExtractor):
+    _VALID_URL = r'(?i)https?://(?:www\.)?manyvids\.com/video/(?P<id>\d+)'
+    _TEST = {
+        'url': 'https://www.manyvids.com/Video/133957/everthing-about-me/',
+        'md5': '03f11bb21c52dd12a05be21a5c7dcc97',
+        'info_dict': {
+            'id': '133957',
+            'ext': 'mp4',
+            'title': 'everthing about me (Preview)',
+            'view_count': int,
+            'like_count': int,
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, video_id)
+
+        # Optional whitespace is allowed on both sides of "="; the opening
+        # quote is captured so the value stops at the matching closing quote.
+        video_url = self._search_regex(
+            r'data-(?:video-filepath|meta-video)\s*=\s*(["\'])(?P<url>(?:(?!\1).)+)\1',
+            webpage, 'video URL', group='url')
+
+        # The page heading is published with a " (Preview)" suffix.
+        title = '%s (Preview)' % self._html_search_regex(
+            r'<h2[^>]+class="m-a-0"[^>]*>([^<]+)', webpage, 'title')
+
+        # Both counters are optional; a counter missing from the page yields None.
+        like_count = int_or_none(self._search_regex(
+            r'data-likes=["\'](\d+)', webpage, 'like count', default=None))
+        view_count = int_or_none(self._html_search_regex(
+            r'(?s)<span[^>]+class="views-wrapper"[^>]*>(.+?)</span', webpage,
+            'view count', default=None))
+
+        return {
+            'id': video_id,
+            'title': title,
+            'view_count': view_count,
+            'like_count': like_count,
+            'formats': [{
+                'url': video_url,
+            }],
+        }
diff --git a/youtube_dl/extractor/megaphone.py b/youtube_dl/extractor/megaphone.py
new file mode 100644
index 0000000..60e3caf
--- /dev/null
+++ b/youtube_dl/extractor/megaphone.py
@@ -0,0 +1,62 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import js_to_json
+
+
+class MegaphoneIE(InfoExtractor):
+    IE_NAME = 'megaphone.fm'
+    IE_DESC = 'megaphone.fm embedded players'
+    _VALID_URL = r'https://player\.megaphone\.fm/(?P<id>[A-Z0-9]+)'
+    _TEST = {
+        'url': 'https://player.megaphone.fm/GLT9749789991?"',
+        'md5': '4816a0de523eb3e972dc0dda2c191f96',
+        'info_dict': {
+            'id': 'GLT9749789991',
+            'ext': 'mp3',
+            'title': '#97 What Kind Of Idiot Gets Phished?',
+            # raw string: "\." is an invalid escape sequence in a normal literal
+            'thumbnail': r're:^https://.*\.png.*$',
+            'duration': 1776.26375,
+            'author': 'Reply All',
+        },
+    }
+
+    def _real_extract(self, url):
+        video_id = self._match_id(url)
+        webpage = self._download_webpage(url, video_id)
+
+        title = self._og_search_property('audio:title', webpage)
+        author = self._og_search_property('audio:artist', webpage)
+        thumbnail = self._og_search_thumbnail(webpage)
+
+        # The player page assigns the episode metadata to a JS variable named
+        # "episode"; js_to_json converts that object literal to parseable JSON.
+        episode_json = self._search_regex(r'(?s)var\s+episode\s*=\s*(\{.+?\});', webpage, 'episode JSON')
+        episode_data = self._parse_json(episode_json, video_id, js_to_json)
+        # mediaUrl may be protocol-relative ("//host/..."); default to https.
+        video_url = self._proto_relative_url(episode_data['mediaUrl'], 'https:')
+
+        formats = [{
+            'url': video_url,
+        }]
+
+        return {
+            'id': video_id,
+            'thumbnail': thumbnail,
+            'title': title,
+            'author': author,
+            'duration': episode_data['duration'],
+            'formats': formats,
+        }
+
+    @classmethod
+    def _extract_urls(cls, webpage):
+        # Collect the src of every iframe pointing at a megaphone.fm player.
+        # _VALID_URL has a group of its own, so findall yields tuples and
+        # element 0 is the complete player URL.
+        return [m[0] for m in re.findall(
+            r'<iframe[^>]*?\ssrc=["\'](%s)' % cls._VALID_URL, webpage)]
diff --git a/youtube_dl/extractor/mixcloud.py b/youtube_dl/extractor/mixcloud.py
index 0efbe66..f6360cc 100644
--- a/youtube_dl/extractor/mixcloud.py
+++ b/youtube_dl/extractor/mixcloud.py
@@ -9,6 +9,7 @@ from .common import InfoExtractor
from ..compat import (
compat_chr,
compat_ord,
+ compat_str,
compat_urllib_parse_unquote,
compat_urlparse,
)
@@ -53,16 +54,27 @@ class MixcloudIE(InfoExtractor):
'only_matching': True,
}]
- # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js
- @staticmethod
- def _decrypt_play_info(play_info):
- KEY = 'pleasedontdownloadourmusictheartistswontgetpaid'
+ _keys = [
+ 'return { requestAnimationFrame: function(callback) { callback(); }, innerHeight: 500 };',
+ 'pleasedontdownloadourmusictheartistswontgetpaid',
+ 'window.addEventListener = window.addEventListener || function() {};',
+ '(function() { return new Date().toLocaleDateString(); })()'
+ ]
+ _current_key = None
+ # See https://www.mixcloud.com/media/js2/www_js_2.9e23256562c080482435196ca3975ab5.js
+ def _decrypt_play_info(self, play_info, video_id):
play_info = base64.b64decode(play_info.encode('ascii'))
-
- return ''.join([
- compat_chr(compat_ord(ch) ^ compat_ord(KEY[idx % len(KEY)]))
- for idx, ch in enumerate(play_info)])
+ for num, key in enumerate(self._keys, start=1):
+ try:
+ return self._parse_json(
+ ''.join([
+ compat_chr(compat_ord(ch) ^ compat_ord(key[idx % len(key)]))
+ for idx, ch in enumerate(play_info)]),
+ video_id)
+ except ExtractorError:
+ if num == len(self._keys):
+ raise
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
@@ -72,14 +84,30 @@ class MixcloudIE(InfoExtractor):
webpage = self._download_webpage(url, track_id)
+ if not self._current_key:
+ js_url = self._search_regex(
+ r'<script[^>]+\bsrc=["\"](https://(?:www\.)?mixcloud\.com/media/js2/www_js_4\.[^>]+\.js)',
+ webpage, 'js url', default=None)
+ if js_url:
+ js = self._download_webpage(js_url, track_id, fatal=False)
+ if js:
+ KEY_RE_TEMPLATE = r'player\s*:\s*{.*?\b%s\s*:\s*(["\'])(?P<key>(?:(?!\1).)+)\1'
+ for key_name in ('value', 'key_value', 'key_value.*?', '.*?value.*?'):
+ key = self._search_regex(
+ KEY_RE_TEMPLATE % key_name, js, 'key',
+ default=None, group='key')
+ if key and isinstance(key, compat_str):
+ self._keys.insert(0, key)
+ self._current_key = key
+
message = self._html_search_regex(
r'(?s)<div[^>]+class="global-message cloudcast-disabled-notice-light"[^>]*>(.+?)<(?:a|/div)',
webpage, 'error message', default=None)
encrypted_play_info = self._search_regex(
r'm-play-info="([^"]+)"', webpage, 'play info')
- play_info = self._parse_json(
- self._decrypt_play_info(encrypted_play_info), track_id)
+
+ play_info = self._decrypt_play_info(encrypted_play_info, track_id)
if message and 'stream_url' not in play_info:
raise ExtractorError('%s said: %s' % (self.IE_NAME, message), expected=True)
diff --git a/youtube_dl/extractor/mlb.py b/youtube_dl/extractor/mlb.py
index 59cd4b8..675ff68 100644
--- a/youtube_dl/extractor/mlb.py
+++ b/youtube_dl/extractor/mlb.py
@@ -15,7 +15,7 @@ class MLBIE(InfoExtractor):
(?:[\da-z_-]+\.)*mlb\.com/
(?:
(?:
- (?:.*?/)?video/(?:topic/[\da-z_-]+/)?v|
+ (?:.*?/)?video/(?:topic/[\da-z_-]+/)?(?:v|.*?/c-)|
(?:
shared/video/embed/(?:embed|m-internal-embed)\.html|
(?:[^/]+/)+(?:play|index)\.jsp|
@@ -84,7 +84,7 @@ class MLBIE(InfoExtractor):
},
{
'url': 'http://m.mlb.com/news/article/118550098/blue-jays-kevin-pillar-goes-spidey-up-the-wall-to-rob-tim-beckham-of-a-homer',
- 'md5': 'b190e70141fb9a1552a85426b4da1b5d',
+ 'md5': 'aafaf5b0186fee8f32f20508092f8111',
'info_dict': {
'id': '75609783',
'ext': 'mp4',
@@ -95,6 +95,10 @@ class MLBIE(InfoExtractor):
}
},
{
+ 'url': 'https://www.mlb.com/video/hargrove-homers-off-caldwell/c-1352023483?tid=67793694',
+ 'only_matching': True,
+ },
+ {
'url': 'http://m.mlb.com/shared/video/embed/embed.html?content_id=35692085&topic_id=6479266&width=400&height=224&property=mlb',
'only_matching': True,
},
diff --git a/youtube_dl/extractor/mpora.py b/youtube_dl/extractor/mpora.py
deleted file mode 100644
index 5a1bee5..0000000
--- a/youtube_dl/extractor/mpora.py
+++ /dev/null
@@ -1,62 +0,0 @@
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from ..utils import int_or_none
-
-
-class MporaIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?mpora\.(?:com|de)/videos/(?P<id>[^?#/]+)'
- IE_NAME = 'MPORA'
-
- _TEST = {
- 'url': 'http://mpora.de/videos/AAdo8okx4wiz/embed?locale=de',
- 'md5': 'a7a228473eedd3be741397cf452932eb',
- 'info_dict': {
- 'id': 'AAdo8okx4wiz',
- 'ext': 'mp4',
- 'title': 'Katy Curd - Winter in the Forest',
- 'duration': 416,
- 'uploader': 'Peter Newman Media',
- },
- }
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
-
- data_json = self._search_regex(
- [r"new FM\.Player\('[^']+',\s*(\{.*?)\).player;",
- r"new\s+FM\.Kaltura\.Player\('[^']+'\s*,\s*({.+?})\);"],
- webpage, 'json')
- data = self._parse_json(data_json, video_id)
-
- uploader = data['info_overlay'].get('username')
- duration = data['video']['duration'] // 1000
- thumbnail = data['video']['encodings']['sd']['poster']
- title = data['info_overlay']['title']
-
- formats = []
- for encoding_id, edata in data['video']['encodings'].items():
- for src in edata['sources']:
- width_str = self._search_regex(
- r'_([0-9]+)\.[a-zA-Z0-9]+$', src['src'],
- False, default=None)
- vcodec = src['type'].partition('/')[2]
-
- formats.append({
- 'format_id': encoding_id + '-' + vcodec,
- 'url': src['src'],
- 'vcodec': vcodec,
- 'width': int_or_none(width_str),
- })
-
- self._sort_formats(formats)
-
- return {
- 'id': video_id,
- 'title': title,
- 'formats': formats,
- 'uploader': uploader,
- 'duration': duration,
- 'thumbnail': thumbnail,
- }
diff --git a/youtube_dl/extractor/mtv.py b/youtube_dl/extractor/mtv.py
index 8acea14..25af5dd 100644
--- a/youtube_dl/extractor/mtv.py
+++ b/youtube_dl/extractor/mtv.py
@@ -50,8 +50,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
thumb_node = itemdoc.find(search_path)
if thumb_node is None:
return None
- else:
- return thumb_node.attrib['url']
+ return thumb_node.get('url') or thumb_node.text or None
def _extract_mobile_video_formats(self, mtvn_id):
webpage_url = self._MOBILE_TEMPLATE % mtvn_id
@@ -83,7 +82,7 @@ class MTVServicesInfoExtractor(InfoExtractor):
hls_url = rendition.find('./src').text
formats.extend(self._extract_m3u8_formats(
hls_url, video_id, ext='mp4', entry_protocol='m3u8_native',
- m3u8_id='hls'))
+ m3u8_id='hls', fatal=False))
else:
# fms
try:
@@ -106,7 +105,8 @@ class MTVServicesInfoExtractor(InfoExtractor):
}])
except (KeyError, TypeError):
raise ExtractorError('Invalid rendition field.')
- self._sort_formats(formats)
+ if formats:
+ self._sort_formats(formats)
return formats
def _extract_subtitles(self, mdoc, mtvn_id):
@@ -133,8 +133,11 @@ class MTVServicesInfoExtractor(InfoExtractor):
mediagen_url += 'acceptMethods='
mediagen_url += 'hls' if use_hls else 'fms'
- mediagen_doc = self._download_xml(mediagen_url, video_id,
- 'Downloading video urls')
+ mediagen_doc = self._download_xml(
+ mediagen_url, video_id, 'Downloading video urls', fatal=False)
+
+ if mediagen_doc is False:
+ return None
item = mediagen_doc.find('./video/item')
if item is not None and item.get('type') == 'text':
@@ -174,6 +177,13 @@ class MTVServicesInfoExtractor(InfoExtractor):
formats = self._extract_video_formats(mediagen_doc, mtvn_id, video_id)
+ # Some parts of complete video may be missing (e.g. missing Act 3 in
+ # http://www.southpark.de/alle-episoden/s14e01-sexual-healing)
+ if not formats:
+ return None
+
+ self._sort_formats(formats)
+
return {
'title': title,
'formats': formats,
@@ -205,9 +215,14 @@ class MTVServicesInfoExtractor(InfoExtractor):
title = xpath_text(idoc, './channel/title')
description = xpath_text(idoc, './channel/description')
+ entries = []
+ for item in idoc.findall('.//item'):
+ info = self._get_video_info(item, use_hls)
+ if info:
+ entries.append(info)
+
return self.playlist_result(
- [self._get_video_info(item, use_hls) for item in idoc.findall('.//item')],
- playlist_title=title, playlist_description=description)
+ entries, playlist_title=title, playlist_description=description)
def _extract_triforce_mgid(self, webpage, data_zone=None, video_id=None):
triforce_feed = self._parse_json(self._search_regex(
diff --git a/youtube_dl/extractor/nexx.py b/youtube_dl/extractor/nexx.py
new file mode 100644
index 0000000..d0235fd
--- /dev/null
+++ b/youtube_dl/extractor/nexx.py
@@ -0,0 +1,292 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import hashlib
+import random
+import re
+import time
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+    ExtractorError,
+    int_or_none,
+    parse_duration,
+    try_get,
+    urlencode_postdata,
+)
+
+
+class NexxIE(InfoExtractor):
+    _VALID_URL = r'https?://api\.nexx(?:\.cloud|cdn\.com)/v3/(?P<domain_id>\d+)/videos/byid/(?P<id>\d+)'
+    _TESTS = [{
+        # movie
+        'url': 'https://api.nexx.cloud/v3/748/videos/byid/128907',
+        'md5': '16746bfc28c42049492385c989b26c4a',
+        'info_dict': {
+            'id': '128907',
+            'ext': 'mp4',
+            'title': 'Stiftung Warentest',
+            'alt_title': 'Wie ein Test abläuft',
+            'description': 'md5:d1ddb1ef63de721132abd38639cc2fd2',
+            'release_year': 2013,
+            'creator': 'SPIEGEL TV',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 2509,
+            'timestamp': 1384264416,
+            'upload_date': '20131112',
+        },
+        'params': {
+            'format': 'bestvideo',
+        },
+    }, {
+        # episode
+        'url': 'https://api.nexx.cloud/v3/741/videos/byid/247858',
+        'info_dict': {
+            'id': '247858',
+            'ext': 'mp4',
+            'title': 'Return of the Golden Child (OV)',
+            'description': 'md5:5d969537509a92b733de21bae249dc63',
+            'release_year': 2017,
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 1397,
+            'timestamp': 1495033267,
+            'upload_date': '20170517',
+            'episode_number': 2,
+            'season_number': 2,
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
+    }, {
+        'url': 'https://api.nexxcdn.com/v3/748/videos/byid/128907',
+        'only_matching': True,
+    }]
+
+    @staticmethod
+    def _extract_urls(webpage):
+        # Reference:
+        # 1. https://nx-s.akamaized.net/files/201510/44.pdf
+
+        entries = []
+
+        # JavaScript Integration
+        mobj = re.search(
+            r'<script\b[^>]+\bsrc=["\']https?://require\.nexx(?:\.cloud|cdn\.com)/(?P<id>\d+)',
+            webpage)
+        if mobj:
+            domain_id = mobj.group('id')
+            for video_id in re.findall(
+                    r'(?is)onPLAYReady.+?_play\.init\s*\(.+?\s*,\s*["\']?(\d+)',
+                    webpage):
+                entries.append(
+                    'https://api.nexx.cloud/v3/%s/videos/byid/%s'
+                    % (domain_id, video_id))
+
+        # TODO: support more embed formats
+
+        return entries
+
+    @staticmethod
+    def _extract_url(webpage):
+        # Return the first embedded video URL, or None when the page has no
+        # recognised embed (indexing the empty list would raise IndexError).
+        urls = NexxIE._extract_urls(webpage)
+        return urls[0] if urls else None
+
+    def _handle_error(self, response):
+        # A missing status is treated as 200; any non-2xx status is reported
+        # to the user with the API's own errorhint.
+        status = int_or_none(try_get(
+            response, lambda x: x['metadata']['status']) or 200)
+        if 200 <= status < 300:
+            return
+        raise ExtractorError(
+            '%s said: %s' % (self.IE_NAME, response['metadata']['errorhint']),
+            expected=True)
+
+    def _call_api(self, domain_id, path, video_id, data=None, headers=None):
+        # Send form-encoded data to the v3 API and return the 'result' part of
+        # the JSON response. headers is copied so that neither the caller's
+        # dict nor a shared default object is ever mutated.
+        headers = dict(headers or {})
+        headers['Content-Type'] = 'application/x-www-form-urlencoded; charset=UTF-8'
+        result = self._download_json(
+            'https://api.nexx.cloud/v3/%s/%s' % (domain_id, path), video_id,
+            'Downloading %s JSON' % path, data=urlencode_postdata(data),
+            headers=headers)
+        self._handle_error(result)
+        return result['result']
+
+    def _real_extract(self, url):
+        mobj = re.match(self._VALID_URL, url)
+        domain_id, video_id = mobj.group('domain_id', 'id')
+
+        # Reverse engineered from JS code (see getDeviceID function)
+        # randint needs integer bounds: float arguments such as 1e4 are
+        # deprecated since Python 3.10 and rejected by newer versions
+        device_id = '%d:%d:%d%d' % (
+            random.randint(1, 4), int(time.time()),
+            random.randint(10000, 99999), random.randint(1, 9))
+
+        result = self._call_api(domain_id, 'session/init', video_id, data={
+            'nxp_devh': device_id,
+            'nxp_userh': '',
+            'precid': '0',
+            'playlicense': '0',
+            'screenx': '1920',
+            'screeny': '1080',
+            'playerversion': '6.0.00',
+            'gateway': 'html5',
+            'adGateway': '',
+            'explicitlanguage': 'en-US',
+            'addTextTemplates': '1',
+            'addDomainData': '1',
+            'addAdModel': '1',
+        }, headers={
+            'X-Request-Enable-Auth-Fallback': '1',
+        })
+
+        cid = result['general']['cid']
+
+        # As described in [1] X-Request-Token generation algorithm is
+        # as follows:
+        #   md5( operation + domain_id + domain_secret )
+        # where domain_secret is a static value that will be given by nexx.tv
+        # as per [1]. Here is how this "secret" is generated (reversed
+        # from _play.api.init function, search for clienttoken). So it's
+        # actually not static and not that much of a secret.
+        # 1. https://nexxtvstorage.blob.core.windows.net/files/201610/27.pdf
+        secret = result['device']['clienttoken'][int(device_id[0]):]
+        secret = secret[0:len(secret) - int(device_id[-1])]
+
+        op = 'byid'
+
+        # Reversed from JS code for _play.api.call function (search for
+        # X-Request-Token)
+        request_token = hashlib.md5(
+            ''.join((op, domain_id, secret)).encode('utf-8')).hexdigest()
+
+        video = self._call_api(
+            domain_id, 'videos/%s/%s' % (op, video_id), video_id, data={
+                'additionalfields': 'language,channel,actors,studio,licenseby,slug,subtitle,teaser,description',
+                'addInteractionOptions': '1',
+                'addStatusDetails': '1',
+                'addStreamDetails': '1',
+                'addCaptions': '1',
+                'addScenes': '1',
+                'addHotSpots': '1',
+                'addBumpers': '1',
+                'captionFormat': 'data',
+            }, headers={
+                'X-Request-CID': cid,
+                'X-Request-Token': request_token,
+            })
+
+        general = video['general']
+        title = general['title']
+
+        stream_data = video['streamdata']
+        language = general.get('language_raw') or ''
+
+        # TODO: reverse more cdns and formats
+
+        cdn = stream_data['cdnType']
+        # Only the azure CDN is handled below; fail explicitly instead of
+        # asserting, since asserts are stripped when running under python -O
+        if cdn != 'azure':
+            raise ExtractorError('Unsupported cdn type: %s' % cdn)
+
+        azure_locator = stream_data['azureLocator']
+
+        AZURE_URL = 'http://nx-p%02d.akamaized.net/'
+
+        for secure in ('s', ''):
+            cdn_shield = stream_data.get('cdnShieldHTTP%s' % secure.upper())
+            if cdn_shield:
+                azure_base = 'http%s://%s' % (secure, cdn_shield)
+                break
+        else:
+            # no CDN shield host present: build the host from azureAccount
+            azure_base = AZURE_URL % int(stream_data['azureAccount'].replace('nexxplayplus', ''))
+
+        is_ml = ',' in language
+        azure_m3u8_url = '%s%s/%s_src%s.ism/Manifest(format=m3u8-aapl)' % (
+            azure_base, azure_locator, video_id, ('_manifest' if is_ml else ''))
+
+        protection_token = try_get(
+            video, lambda x: x['protectiondata']['token'], compat_str)
+        if protection_token:
+            azure_m3u8_url += '?hdnts=%s' % protection_token
+
+        formats = self._extract_m3u8_formats(
+            azure_m3u8_url, video_id, 'mp4', entry_protocol='m3u8_native',
+            m3u8_id='%s-hls' % cdn)
+        self._sort_formats(formats)
+
+        return {
+            'id': video_id,
+            'title': title,
+            'alt_title': general.get('subtitle'),
+            'description': general.get('description'),
+            'release_year': int_or_none(general.get('year')),
+            'creator': general.get('studio') or general.get('studio_adref'),
+            'thumbnail': try_get(
+                video, lambda x: x['imagedata']['thumb'], compat_str),
+            'duration': parse_duration(general.get('runtime')),
+            'timestamp': int_or_none(general.get('uploaded')),
+            'episode_number': int_or_none(try_get(
+                video, lambda x: x['episodedata']['episode'])),
+            'season_number': int_or_none(try_get(
+                video, lambda x: x['episodedata']['season'])),
+            'formats': formats,
+        }
+
+
+class NexxEmbedIE(InfoExtractor):
+    _VALID_URL = r'https?://embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?P<id>[^/?#&]+)'
+    _TEST = {
+        'url': 'http://embed.nexx.cloud/748/KC1614647Z27Y7T?autoplay=1',
+        'md5': '16746bfc28c42049492385c989b26c4a',
+        'info_dict': {
+            'id': '161464',
+            'ext': 'mp4',
+            'title': 'Nervenkitzel Achterbahn',
+            'alt_title': 'Karussellbauer in Deutschland',
+            'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
+            'release_year': 2005,
+            'creator': 'SPIEGEL TV',
+            'thumbnail': r're:^https?://.*\.jpg$',
+            'duration': 2761,
+            'timestamp': 1394021479,
+            'upload_date': '20140305',
+        },
+        'params': {
+            'format': 'bestvideo',
+            'skip_download': True,
+        },
+    }
+
+    @staticmethod
+    def _extract_urls(webpage):
+        # Reference:
+        # 1. https://nx-s.akamaized.net/files/201510/44.pdf
+
+        # iFrame Embed Integration
+        return [mobj.group('url') for mobj in re.finditer(
+            r'<iframe[^>]+\bsrc=(["\'])(?P<url>(?:https?:)?//embed\.nexx(?:\.cloud|cdn\.com)/\d+/(?:(?!\1).)+)\1',
+            webpage)]
+
+    def _real_extract(self, url):
+        embed_id = self._match_id(url)
+
+        webpage = self._download_webpage(url, embed_id)
+
+        # Fail with a clear extractor error when the embed page carries no
+        # recognisable player, rather than passing a missing URL onwards
+        nexx_url = NexxIE._extract_url(webpage)
+        if not nexx_url:
+            raise ExtractorError('Unable to find nexx video in embed page')
+
+        return self.url_result(nexx_url, ie=NexxIE.ie_key())
diff --git a/youtube_dl/extractor/nick.py b/youtube_dl/extractor/nick.py
index 08a7592..510b1c4 100644
--- a/youtube_dl/extractor/nick.py
+++ b/youtube_dl/extractor/nick.py
@@ -12,6 +12,7 @@ class NickIE(MTVServicesInfoExtractor):
IE_NAME = 'nick.com'
_VALID_URL = r'https?://(?:(?:www|beta)\.)?nick(?:jr)?\.com/(?:[^/]+/)?(?:videos/clip|[^/]+/videos)/(?P<id>[^/?#.]+)'
_FEED_URL = 'http://udat.mtvnservices.com/service1/dispatch.htm'
+ _GEO_COUNTRIES = ['US']
_TESTS = [{
'url': 'http://www.nick.com/videos/clip/alvinnn-and-the-chipmunks-112-full-episode.html',
'playlist': [
@@ -74,7 +75,7 @@ class NickIE(MTVServicesInfoExtractor):
class NickDeIE(MTVServicesInfoExtractor):
IE_NAME = 'nick.de'
- _VALID_URL = r'https?://(?:www\.)?(?P<host>nick\.de|nickelodeon\.(?:nl|at))/(?:playlist|shows)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _VALID_URL = r'https?://(?:www\.)?(?P<host>nick\.(?:de|com\.pl)|nickelodeon\.(?:nl|at))/[^/]+/(?:[^/]+/)*(?P<id>[^/?#&]+)'
_TESTS = [{
'url': 'http://www.nick.de/playlist/3773-top-videos/videos/episode/17306-zu-wasser-und-zu-land-rauchende-erdnusse',
'only_matching': True,
@@ -87,6 +88,9 @@ class NickDeIE(MTVServicesInfoExtractor):
}, {
'url': 'http://www.nickelodeon.at/playlist/3773-top-videos/videos/episode/77993-das-letzte-gefecht',
'only_matching': True,
+ }, {
+ 'url': 'http://www.nick.com.pl/seriale/474-spongebob-kanciastoporty/wideo/17412-teatr-to-jest-to-rodeo-oszolom',
+ 'only_matching': True,
}]
def _extract_mrss_url(self, webpage, host):
@@ -124,3 +128,21 @@ class NickNightIE(NickDeIE):
return self._search_regex(
r'mrss\s*:\s*(["\'])(?P<url>http.+?)\1', webpage,
'mrss url', group='url')
+
+
+class NickRuIE(MTVServicesInfoExtractor):
+ IE_NAME = 'nickelodeonru'
+ _VALID_URL = r'https?://(?:www\.)nickelodeon\.ru/(?:playlist|shows|videos)/(?:[^/]+/)*(?P<id>[^/?#&]+)'
+ _TESTS = [{
+ 'url': 'http://www.nickelodeon.ru/shows/henrydanger/videos/episodes/3-sezon-15-seriya-licenziya-na-polyot/pmomfb#playlist/7airc6',
+ 'only_matching': True,
+ }, {
+ 'url': 'http://www.nickelodeon.ru/videos/smotri-na-nickelodeon-v-iyule/g9hvh7',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+ webpage = self._download_webpage(url, video_id)
+ mgid = self._extract_mgid(webpage)
+ return self.url_result('http://media.mtvnservices.com/embed/%s' % mgid)
diff --git a/youtube_dl/extractor/niconico.py b/youtube_dl/extractor/niconico.py
index f268a72..026329d 100644
--- a/youtube_dl/extractor/niconico.py
+++ b/youtube_dl/extractor/niconico.py
@@ -1,23 +1,27 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
import json
import datetime
from .common import InfoExtractor
from ..compat import (
+ compat_parse_qs,
compat_urlparse,
)
from ..utils import (
+ determine_ext,
+ dict_get,
ExtractorError,
int_or_none,
+ float_or_none,
parse_duration,
parse_iso8601,
- sanitized_Request,
- xpath_text,
- determine_ext,
+ remove_start,
+ try_get,
+ unified_timestamp,
urlencode_postdata,
+ xpath_text,
)
@@ -32,12 +36,15 @@ class NiconicoIE(InfoExtractor):
'id': 'sm22312215',
'ext': 'mp4',
'title': 'Big Buck Bunny',
+ 'thumbnail': r're:https?://.*',
'uploader': 'takuya0301',
'uploader_id': '2698420',
'upload_date': '20131123',
'timestamp': 1385182762,
'description': '(c) copyright 2008, Blender Foundation / www.bigbuckbunny.org',
'duration': 33,
+ 'view_count': int,
+ 'comment_count': int,
},
'skip': 'Requires an account',
}, {
@@ -49,6 +56,7 @@ class NiconicoIE(InfoExtractor):
'ext': 'swf',
'title': '【鏡音リン】Dance on media【オリジナル】take2!',
'description': 'md5:689f066d74610b3b22e0f1739add0f58',
+ 'thumbnail': r're:https?://.*',
'uploader': 'りょうた',
'uploader_id': '18822557',
'upload_date': '20110429',
@@ -65,9 +73,11 @@ class NiconicoIE(InfoExtractor):
'ext': 'unknown_video',
'description': 'deleted',
'title': 'ドラえもんエターナル第3話「決戦第3新東京市」<前編>',
+ 'thumbnail': r're:https?://.*',
'upload_date': '20071224',
'timestamp': int, # timestamp field has different value if logged in
'duration': 304,
+ 'view_count': int,
},
'skip': 'Requires an account',
}, {
@@ -77,6 +87,7 @@ class NiconicoIE(InfoExtractor):
'ext': 'mp4',
'title': '【第1回】RADIOアニメロミックス ラブライブ!~のぞえりRadio Garden~',
'description': 'md5:b27d224bb0ff53d3c8269e9f8b561cf1',
+ 'thumbnail': r're:https?://.*',
'timestamp': 1388851200,
'upload_date': '20140104',
'uploader': 'アニメロチャンネル',
@@ -84,6 +95,44 @@ class NiconicoIE(InfoExtractor):
},
'skip': 'The viewing period of the video you were searching for has expired.',
}, {
+ # video not available via `getflv`; "old" HTML5 video
+ 'url': 'http://www.nicovideo.jp/watch/sm1151009',
+ 'md5': '8fa81c364eb619d4085354eab075598a',
+ 'info_dict': {
+ 'id': 'sm1151009',
+ 'ext': 'mp4',
+ 'title': 'マスターシステム本体内蔵のスペハリのメインテーマ(PSG版)',
+ 'description': 'md5:6ee077e0581ff5019773e2e714cdd0b7',
+ 'thumbnail': r're:https?://.*',
+ 'duration': 184,
+ 'timestamp': 1190868283,
+ 'upload_date': '20070927',
+ 'uploader': 'denden2',
+ 'uploader_id': '1392194',
+ 'view_count': int,
+ 'comment_count': int,
+ },
+ 'skip': 'Requires an account',
+ }, {
+ # "New" HTML5 video
+ 'url': 'http://www.nicovideo.jp/watch/sm31464864',
+ 'md5': '351647b4917660986dc0fa8864085135',
+ 'info_dict': {
+ 'id': 'sm31464864',
+ 'ext': 'mp4',
+ 'title': '新作TVアニメ「戦姫絶唱シンフォギアAXZ」PV 最高画質',
+ 'description': 'md5:e52974af9a96e739196b2c1ca72b5feb',
+ 'timestamp': 1498514060,
+ 'upload_date': '20170626',
+ 'uploader': 'ゲス',
+ 'uploader_id': '40826363',
+ 'thumbnail': r're:https?://.*',
+ 'duration': 198,
+ 'view_count': int,
+ 'comment_count': int,
+ },
+ 'skip': 'Requires an account',
+ }, {
'url': 'http://sp.nicovideo.jp/watch/sm28964488?ss_pos=1&cp_in=wt_tg',
'only_matching': True,
}]
@@ -101,19 +150,102 @@ class NiconicoIE(InfoExtractor):
return True
# Log in
+ login_ok = True
login_form_strs = {
- 'mail': username,
+ 'mail_tel': username,
'password': password,
}
- login_data = urlencode_postdata(login_form_strs)
- request = sanitized_Request(
- 'https://secure.nicovideo.jp/secure/login', login_data)
- login_results = self._download_webpage(
- request, None, note='Logging in', errnote='Unable to log in')
- if re.search(r'(?i)<h1 class="mb8p4">Log in error</h1>', login_results) is not None:
+ urlh = self._request_webpage(
+ 'https://account.nicovideo.jp/api/v1/login', None,
+ note='Logging in', errnote='Unable to log in',
+ data=urlencode_postdata(login_form_strs))
+ if urlh is False:
+ login_ok = False
+ else:
+ parts = compat_urlparse.urlparse(urlh.geturl())
+ if compat_parse_qs(parts.query).get('message', [None])[0] == 'cant_login':
+ login_ok = False
+ if not login_ok:
self._downloader.report_warning('unable to log in: bad username or password')
- return False
- return True
+ return login_ok
+
+ def _extract_format_for_quality(self, api_data, video_id, audio_quality, video_quality):
+ def yesno(boolean):
+ return 'yes' if boolean else 'no'
+
+ session_api_data = api_data['video']['dmcInfo']['session_api']
+ session_api_endpoint = session_api_data['urls'][0]
+
+ format_id = '-'.join(map(lambda s: remove_start(s['id'], 'archive_'), [video_quality, audio_quality]))
+
+ session_response = self._download_json(
+ session_api_endpoint['url'], video_id,
+ query={'_format': 'json'},
+ headers={'Content-Type': 'application/json'},
+ note='Downloading JSON metadata for %s' % format_id,
+ data=json.dumps({
+ 'session': {
+ 'client_info': {
+ 'player_id': session_api_data['player_id'],
+ },
+ 'content_auth': {
+ 'auth_type': session_api_data['auth_types'][session_api_data['protocols'][0]],
+ 'content_key_timeout': session_api_data['content_key_timeout'],
+ 'service_id': 'nicovideo',
+ 'service_user_id': session_api_data['service_user_id']
+ },
+ 'content_id': session_api_data['content_id'],
+ 'content_src_id_sets': [{
+ 'content_src_ids': [{
+ 'src_id_to_mux': {
+ 'audio_src_ids': [audio_quality['id']],
+ 'video_src_ids': [video_quality['id']],
+ }
+ }]
+ }],
+ 'content_type': 'movie',
+ 'content_uri': '',
+ 'keep_method': {
+ 'heartbeat': {
+ 'lifetime': session_api_data['heartbeat_lifetime']
+ }
+ },
+ 'priority': session_api_data['priority'],
+ 'protocol': {
+ 'name': 'http',
+ 'parameters': {
+ 'http_parameters': {
+ 'parameters': {
+ 'http_output_download_parameters': {
+ 'use_ssl': yesno(session_api_endpoint['is_ssl']),
+ 'use_well_known_port': yesno(session_api_endpoint['is_well_known_port']),
+ }
+ }
+ }
+ }
+ },
+ 'recipe_id': session_api_data['recipe_id'],
+ 'session_operation_auth': {
+ 'session_operation_auth_by_signature': {
+ 'signature': session_api_data['signature'],
+ 'token': session_api_data['token'],
+ }
+ },
+ 'timing_constraint': 'unlimited'
+ }
+ }))
+
+ resolution = video_quality.get('resolution', {})
+
+ return {
+ 'url': session_response['data']['session']['content_uri'],
+ 'format_id': format_id,
+ 'ext': 'mp4', # Session API are used in HTML5, which always serves mp4
+ 'abr': float_or_none(audio_quality.get('bitrate'), 1000),
+ 'vbr': float_or_none(video_quality.get('bitrate'), 1000),
+ 'height': resolution.get('height'),
+ 'width': resolution.get('width'),
+ }
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -126,30 +258,84 @@ class NiconicoIE(InfoExtractor):
if video_id.startswith('so'):
video_id = self._match_id(handle.geturl())
- video_info = self._download_xml(
- 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id, video_id,
- note='Downloading video info page')
-
- # Get flv info
- flv_info_webpage = self._download_webpage(
- 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',
- video_id, 'Downloading flv info')
-
- flv_info = compat_urlparse.parse_qs(flv_info_webpage)
- if 'url' not in flv_info:
- if 'deleted' in flv_info:
- raise ExtractorError('The video has been deleted.',
- expected=True)
- elif 'closed' in flv_info:
- raise ExtractorError('Niconico videos now require logging in',
- expected=True)
- else:
- raise ExtractorError('Unable to find video URL')
-
- video_real_url = flv_info['url'][0]
+ api_data = self._parse_json(self._html_search_regex(
+ 'data-api-data="([^"]+)"', webpage,
+ 'API data', default='{}'), video_id)
+
+ def _format_id_from_url(video_url):
+ return 'economy' if video_real_url.endswith('low') else 'normal'
+
+ try:
+ video_real_url = api_data['video']['smileInfo']['url']
+ except KeyError: # Flash videos
+ # Get flv info
+ flv_info_webpage = self._download_webpage(
+ 'http://flapi.nicovideo.jp/api/getflv/' + video_id + '?as3=1',
+ video_id, 'Downloading flv info')
+
+ flv_info = compat_urlparse.parse_qs(flv_info_webpage)
+ if 'url' not in flv_info:
+ if 'deleted' in flv_info:
+ raise ExtractorError('The video has been deleted.',
+ expected=True)
+ elif 'closed' in flv_info:
+ raise ExtractorError('Niconico videos now require logging in',
+ expected=True)
+ elif 'error' in flv_info:
+ raise ExtractorError('%s reports error: %s' % (
+ self.IE_NAME, flv_info['error'][0]), expected=True)
+ else:
+ raise ExtractorError('Unable to find video URL')
+
+ video_info_xml = self._download_xml(
+ 'http://ext.nicovideo.jp/api/getthumbinfo/' + video_id,
+ video_id, note='Downloading video info page')
+
+ def get_video_info(items):
+ if not isinstance(items, list):
+ items = [items]
+ for item in items:
+ ret = xpath_text(video_info_xml, './/' + item)
+ if ret:
+ return ret
+
+ video_real_url = flv_info['url'][0]
+
+ extension = get_video_info('movie_type')
+ if not extension:
+ extension = determine_ext(video_real_url)
+
+ formats = [{
+ 'url': video_real_url,
+ 'ext': extension,
+ 'format_id': _format_id_from_url(video_real_url),
+ }]
+ else:
+ formats = []
+
+ dmc_info = api_data['video'].get('dmcInfo')
+ if dmc_info: # "New" HTML5 videos
+ quality_info = dmc_info['quality']
+ for audio_quality in quality_info['audios']:
+ for video_quality in quality_info['videos']:
+ if not audio_quality['available'] or not video_quality['available']:
+ continue
+ formats.append(self._extract_format_for_quality(
+ api_data, video_id, audio_quality, video_quality))
+
+ self._sort_formats(formats)
+ else: # "Old" HTML5 videos
+ formats = [{
+ 'url': video_real_url,
+ 'ext': 'mp4',
+ 'format_id': _format_id_from_url(video_real_url),
+ }]
+
+ def get_video_info(items):
+ return dict_get(api_data['video'], items)
# Start extracting information
- title = xpath_text(video_info, './/title')
+ title = get_video_info('title')
if not title:
title = self._og_search_title(webpage, default=None)
if not title:
@@ -163,18 +349,15 @@ class NiconicoIE(InfoExtractor):
watch_api_data = self._parse_json(watch_api_data_string, video_id) if watch_api_data_string else {}
video_detail = watch_api_data.get('videoDetail', {})
- extension = xpath_text(video_info, './/movie_type')
- if not extension:
- extension = determine_ext(video_real_url)
-
thumbnail = (
- xpath_text(video_info, './/thumbnail_url') or
+ get_video_info(['thumbnail_url', 'thumbnailURL']) or
self._html_search_meta('image', webpage, 'thumbnail', default=None) or
video_detail.get('thumbnail'))
- description = xpath_text(video_info, './/description')
+ description = get_video_info('description')
- timestamp = parse_iso8601(xpath_text(video_info, './/first_retrieve'))
+ timestamp = (parse_iso8601(get_video_info('first_retrieve')) or
+ unified_timestamp(get_video_info('postedDateTime')))
if not timestamp:
match = self._html_search_meta('datePublished', webpage, 'date published', default=None)
if match:
@@ -184,7 +367,7 @@ class NiconicoIE(InfoExtractor):
video_detail['postedAt'].replace('/', '-'),
delimiter=' ', timezone=datetime.timedelta(hours=9))
- view_count = int_or_none(xpath_text(video_info, './/view_counter'))
+ view_count = int_or_none(get_video_info(['view_counter', 'viewCount']))
if not view_count:
match = self._html_search_regex(
r'>Views: <strong[^>]*>([^<]+)</strong>',
@@ -193,38 +376,33 @@ class NiconicoIE(InfoExtractor):
view_count = int_or_none(match.replace(',', ''))
view_count = view_count or video_detail.get('viewCount')
- comment_count = int_or_none(xpath_text(video_info, './/comment_num'))
+ comment_count = (int_or_none(get_video_info('comment_num')) or
+ video_detail.get('commentCount') or
+ try_get(api_data, lambda x: x['thread']['commentCount']))
if not comment_count:
match = self._html_search_regex(
r'>Comments: <strong[^>]*>([^<]+)</strong>',
webpage, 'comment count', default=None)
if match:
comment_count = int_or_none(match.replace(',', ''))
- comment_count = comment_count or video_detail.get('commentCount')
duration = (parse_duration(
- xpath_text(video_info, './/length') or
+ get_video_info('length') or
self._html_search_meta(
'video:duration', webpage, 'video duration', default=None)) or
- video_detail.get('length'))
+ video_detail.get('length') or
+ get_video_info('duration'))
- webpage_url = xpath_text(video_info, './/watch_url') or url
+ webpage_url = get_video_info('watch_url') or url
- if video_info.find('.//ch_id') is not None:
- uploader_id = video_info.find('.//ch_id').text
- uploader = video_info.find('.//ch_name').text
- elif video_info.find('.//user_id') is not None:
- uploader_id = video_info.find('.//user_id').text
- uploader = video_info.find('.//user_nickname').text
- else:
- uploader_id = uploader = None
+ owner = api_data.get('owner', {})
+ uploader_id = get_video_info(['ch_id', 'user_id']) or owner.get('id')
+ uploader = get_video_info(['ch_name', 'user_nickname']) or owner.get('nickname')
return {
'id': video_id,
- 'url': video_real_url,
'title': title,
- 'ext': extension,
- 'format_id': 'economy' if video_real_url.endswith('low') else 'normal',
+ 'formats': formats,
'thumbnail': thumbnail,
'description': description,
'uploader': uploader,
diff --git a/youtube_dl/extractor/npo.py b/youtube_dl/extractor/npo.py
index 5f8b6de..fa4ef20 100644
--- a/youtube_dl/extractor/npo.py
+++ b/youtube_dl/extractor/npo.py
@@ -28,7 +28,7 @@ class NPOBaseIE(InfoExtractor):
class NPOIE(NPOBaseIE):
IE_NAME = 'npo'
- IE_DESC = 'npo.nl and ntr.nl'
+ IE_DESC = 'npo.nl, ntr.nl, omroepwnl.nl, zapp.nl and npo3.nl'
_VALID_URL = r'''(?x)
(?:
npo:|
@@ -38,7 +38,7 @@ class NPOIE(NPOBaseIE):
npo\.nl/(?!(?:live|radio)/)(?:[^/]+/){2}|
ntr\.nl/(?:[^/]+/){2,}|
omroepwnl\.nl/video/fragment/[^/]+__|
- zapp\.nl/[^/]+/[^/]+/
+ (?:zapp|npo3)\.nl/(?:[^/]+/){2}
)
)
(?P<id>[^/?#]+)
@@ -147,6 +147,9 @@ class NPOIE(NPOBaseIE):
'url': 'http://www.zapp.nl/beste-vrienden-quiz/extra-video-s/WO_NTR_1067990',
'only_matching': True,
}, {
+ 'url': 'https://www.npo3.nl/3onderzoekt/16-09-2015/VPWON_1239870',
+ 'only_matching': True,
+ }, {
# live stream
'url': 'npo:LI_NL1_4188102',
'only_matching': True,
@@ -341,7 +344,7 @@ class NPOLiveIE(NPOBaseIE):
webpage = self._download_webpage(url, display_id)
live_id = self._search_regex(
- r'data-prid="([^"]+)"', webpage, 'live id')
+ [r'media-id="([^"]+)"', r'data-prid="([^"]+)"'], webpage, 'live id')
return {
'_type': 'url_transparent',
diff --git a/youtube_dl/extractor/nrk.py b/youtube_dl/extractor/nrk.py
index 3b4f51f..18ead94 100644
--- a/youtube_dl/extractor/nrk.py
+++ b/youtube_dl/extractor/nrk.py
@@ -237,7 +237,7 @@ class NRKTVIE(NRKBaseIE):
(?:/\d{2}-\d{2}-\d{4})?
(?:\#del=(?P<part_id>\d+))?
''' % _EPISODE_RE
- _API_HOST = 'psapi-we.nrk.no'
+ _API_HOST = 'psapi-ne.nrk.no'
_TESTS = [{
'url': 'https://tv.nrk.no/serie/20-spoersmaal-tv/MUHH48000314/23-05-2014',
diff --git a/youtube_dl/extractor/pbs.py b/youtube_dl/extractor/pbs.py
index 16cc667..8889e4a 100644
--- a/youtube_dl/extractor/pbs.py
+++ b/youtube_dl/extractor/pbs.py
@@ -189,7 +189,7 @@ class PBSIE(InfoExtractor):
# Direct video URL
(?:%s)/(?:viralplayer|video)/(?P<id>[0-9]+)/? |
# Article with embedded player (or direct video)
- (?:www\.)?pbs\.org/(?:[^/]+/){2,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
+ (?:www\.)?pbs\.org/(?:[^/]+/){1,5}(?P<presumptive_id>[^/]+?)(?:\.html)?/?(?:$|[?\#]) |
# Player
(?:video|player)\.pbs\.org/(?:widget/)?partnerplayer/(?P<player_id>[^/]+)/
)
@@ -346,6 +346,21 @@ class PBSIE(InfoExtractor):
},
},
{
+ # https://github.com/rg3/youtube-dl/issues/13801
+ 'url': 'https://www.pbs.org/video/pbs-newshour-full-episode-july-31-2017-1501539057/',
+ 'info_dict': {
+ 'id': '3003333873',
+ 'ext': 'mp4',
+ 'title': 'PBS NewsHour - full episode July 31, 2017',
+ 'description': 'md5:d41d8cd98f00b204e9800998ecf8427e',
+ 'duration': 3265,
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
'url': 'http://player.pbs.org/widget/partnerplayer/2365297708/?start=0&end=0&chapterbar=false&endscreen=false&topbar=true',
'only_matching': True,
},
@@ -433,6 +448,9 @@ class PBSIE(InfoExtractor):
if url:
break
+ if not url:
+ url = self._og_search_url(webpage)
+
mobj = re.match(self._VALID_URL, url)
player_id = mobj.group('player_id')
diff --git a/youtube_dl/extractor/pearvideo.py b/youtube_dl/extractor/pearvideo.py
new file mode 100644
index 0000000..1d77722
--- /dev/null
+++ b/youtube_dl/extractor/pearvideo.py
@@ -0,0 +1,63 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..utils import (
+ qualities,
+ unified_timestamp,
+)
+
+
+class PearVideoIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?pearvideo\.com/video_(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.pearvideo.com/video_1076290',
+ 'info_dict': {
+ 'id': '1076290',
+ 'ext': 'mp4',
+ 'title': '小浣熊在主人家玻璃上滚石头:没砸',
+ 'description': 'md5:01d576b747de71be0ee85eb7cac25f9d',
+ 'timestamp': 1494275280,
+ 'upload_date': '20170508',
+ }
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ webpage = self._download_webpage(url, video_id)
+
+ quality = qualities(
+ ('ldflv', 'ld', 'sdflv', 'sd', 'hdflv', 'hd', 'src'))
+
+ formats = [{
+ 'url': mobj.group('url'),
+ 'format_id': mobj.group('id'),
+ 'quality': quality(mobj.group('id')),
+ } for mobj in re.finditer(
+ r'(?P<id>[a-zA-Z]+)Url\s*=\s*(["\'])(?P<url>(?:https?:)?//.+?)\2',
+ webpage)]
+ self._sort_formats(formats)
+
+ title = self._search_regex(
+ (r'<h1[^>]+\bclass=(["\'])video-tt\1[^>]*>(?P<value>[^<]+)',
+ r'<[^>]+\bdata-title=(["\'])(?P<value>(?:(?!\1).)+)\1'),
+ webpage, 'title', group='value')
+ description = self._search_regex(
+ (r'<div[^>]+\bclass=(["\'])summary\1[^>]*>(?P<value>[^<]+)',
+ r'<[^>]+\bdata-summary=(["\'])(?P<value>(?:(?!\1).)+)\1'),
+ webpage, 'description', default=None,
+ group='value') or self._html_search_meta('Description', webpage)
+ timestamp = unified_timestamp(self._search_regex(
+ r'<div[^>]+\bclass=["\']date["\'][^>]*>([^<]+)',
+ webpage, 'timestamp', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'timestamp': timestamp,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/periscope.py b/youtube_dl/extractor/periscope.py
index 1add6b8..e5e0853 100644
--- a/youtube_dl/extractor/periscope.py
+++ b/youtube_dl/extractor/periscope.py
@@ -49,7 +49,7 @@ class PeriscopeIE(PeriscopeBaseIE):
@staticmethod
def _extract_url(webpage):
mobj = re.search(
- r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?periscope\.tv/(?:(?!\1).)+)\1', webpage)
+ r'<iframe[^>]+src=([\'"])(?P<url>(?:https?:)?//(?:www\.)?(?:periscope|pscp)\.tv/(?:(?!\1).)+)\1', webpage)
if mobj:
return mobj.group('url')
@@ -80,18 +80,24 @@ class PeriscopeIE(PeriscopeBaseIE):
stream = self._call_api(
'getAccessPublic', {'broadcast_id': token}, token)
+ video_urls = set()
formats = []
- for format_id in ('replay', 'rtmp', 'hls', 'https_hls'):
+ for format_id in ('replay', 'rtmp', 'hls', 'https_hls', 'lhls', 'lhlsweb'):
video_url = stream.get(format_id + '_url')
- if not video_url:
+ if not video_url or video_url in video_urls:
continue
- f = {
+ video_urls.add(video_url)
+ if format_id != 'rtmp':
+ formats.extend(self._extract_m3u8_formats(
+ video_url, token, 'mp4',
+ entry_protocol='m3u8_native'
+ if state in ('ended', 'timed_out') else 'm3u8',
+ m3u8_id=format_id, fatal=False))
+ continue
+ formats.append({
'url': video_url,
'ext': 'flv' if format_id == 'rtmp' else 'mp4',
- }
- if format_id != 'rtmp':
- f['protocol'] = 'm3u8_native' if state in ('ended', 'timed_out') else 'm3u8'
- formats.append(f)
+ })
self._sort_formats(formats)
return {
diff --git a/youtube_dl/extractor/pluralsight.py b/youtube_dl/extractor/pluralsight.py
index e45d9fe..f6a9131 100644
--- a/youtube_dl/extractor/pluralsight.py
+++ b/youtube_dl/extractor/pluralsight.py
@@ -18,6 +18,7 @@ from ..utils import (
parse_duration,
qualities,
srt_subtitles_timecode,
+ try_get,
update_url_query,
urlencode_postdata,
)
@@ -26,6 +27,39 @@ from ..utils import (
class PluralsightBaseIE(InfoExtractor):
_API_BASE = 'https://app.pluralsight.com'
+ def _download_course(self, course_id, url, display_id):
+ try:
+ return self._download_course_rpc(course_id, url, display_id)
+ except ExtractorError:
+ # Old API fallback
+ return self._download_json(
+ 'https://app.pluralsight.com/player/user/api/v1/player/payload',
+ display_id, data=urlencode_postdata({'courseId': course_id}),
+ headers={'Referer': url})
+
+ def _download_course_rpc(self, course_id, url, display_id):
+ response = self._download_json(
+ '%s/player/functions/rpc' % self._API_BASE, display_id,
+ 'Downloading course JSON',
+ data=json.dumps({
+ 'fn': 'bootstrapPlayer',
+ 'payload': {
+ 'courseId': course_id,
+ },
+ }).encode('utf-8'),
+ headers={
+ 'Content-Type': 'application/json;charset=utf-8',
+ 'Referer': url,
+ })
+
+ course = try_get(response, lambda x: x['payload']['course'], dict)
+ if course:
+ return course
+
+ raise ExtractorError(
+ '%s said: %s' % (self.IE_NAME, response['error']['message']),
+ expected=True)
+
class PluralsightIE(PluralsightBaseIE):
IE_NAME = 'pluralsight'
@@ -162,10 +196,7 @@ class PluralsightIE(PluralsightBaseIE):
display_id = '%s-%s' % (name, clip_id)
- course = self._download_json(
- 'https://app.pluralsight.com/player/user/api/v1/player/payload',
- display_id, data=urlencode_postdata({'courseId': course_name}),
- headers={'Referer': url})
+ course = self._download_course(course_name, url, display_id)
collection = course['modules']
@@ -224,6 +255,7 @@ class PluralsightIE(PluralsightBaseIE):
req_format_split = req_format.split('-', 1)
if len(req_format_split) > 1:
req_ext, req_quality = req_format_split
+ req_quality = '-'.join(req_quality.split('-')[:2])
for allowed_quality in ALLOWED_QUALITIES:
if req_ext == allowed_quality.ext and req_quality in allowed_quality.qualities:
return (AllowedQuality(req_ext, (req_quality, )), )
@@ -330,18 +362,7 @@ class PluralsightCourseIE(PluralsightBaseIE):
# TODO: PSM cookie
- course = self._download_json(
- '%s/player/functions/rpc' % self._API_BASE, course_id,
- 'Downloading course JSON',
- data=json.dumps({
- 'fn': 'bootstrapPlayer',
- 'payload': {
- 'courseId': course_id,
- }
- }).encode('utf-8'),
- headers={
- 'Content-Type': 'application/json;charset=utf-8'
- })['payload']['course']
+ course = self._download_course(course_id, url, course_id)
title = course['title']
course_name = course['name']
diff --git a/youtube_dl/extractor/podomatic.py b/youtube_dl/extractor/podomatic.py
index f20946a..25fcebf 100644
--- a/youtube_dl/extractor/podomatic.py
+++ b/youtube_dl/extractor/podomatic.py
@@ -9,39 +9,46 @@ from ..utils import int_or_none
class PodomaticIE(InfoExtractor):
IE_NAME = 'podomatic'
- _VALID_URL = r'^(?P<proto>https?)://(?P<channel>[^.]+)\.podomatic\.com/entry/(?P<id>[^?]+)'
+ _VALID_URL = r'''(?x)
+ (?P<proto>https?)://
+ (?:
+ (?P<channel>[^.]+)\.podomatic\.com/entry|
+ (?:www\.)?podomatic\.com/podcasts/(?P<channel_2>[^/]+)/episodes
+ )/
+ (?P<id>[^/?#&]+)
+ '''
- _TESTS = [
- {
- 'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00',
- 'md5': '84bb855fcf3429e6bf72460e1eed782d',
- 'info_dict': {
- 'id': '2009-01-02T16_03_35-08_00',
- 'ext': 'mp3',
- 'uploader': 'Science Teaching Tips',
- 'uploader_id': 'scienceteachingtips',
- 'title': '64. When the Moon Hits Your Eye',
- 'duration': 446,
- }
- },
- {
- 'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00',
- 'md5': 'd2cf443931b6148e27638650e2638297',
- 'info_dict': {
- 'id': '2013-11-15T16_31_21-08_00',
- 'ext': 'mp3',
- 'uploader': 'Ostbahnhof / Techno Mix',
- 'uploader_id': 'ostbahnhof',
- 'title': 'Einunddreizig',
- 'duration': 3799,
- }
- },
- ]
+ _TESTS = [{
+ 'url': 'http://scienceteachingtips.podomatic.com/entry/2009-01-02T16_03_35-08_00',
+ 'md5': '84bb855fcf3429e6bf72460e1eed782d',
+ 'info_dict': {
+ 'id': '2009-01-02T16_03_35-08_00',
+ 'ext': 'mp3',
+ 'uploader': 'Science Teaching Tips',
+ 'uploader_id': 'scienceteachingtips',
+ 'title': '64. When the Moon Hits Your Eye',
+ 'duration': 446,
+ }
+ }, {
+ 'url': 'http://ostbahnhof.podomatic.com/entry/2013-11-15T16_31_21-08_00',
+ 'md5': 'd2cf443931b6148e27638650e2638297',
+ 'info_dict': {
+ 'id': '2013-11-15T16_31_21-08_00',
+ 'ext': 'mp3',
+ 'uploader': 'Ostbahnhof / Techno Mix',
+ 'uploader_id': 'ostbahnhof',
+ 'title': 'Einunddreizig',
+ 'duration': 3799,
+ }
+ }, {
+ 'url': 'https://www.podomatic.com/podcasts/scienceteachingtips/episodes/2009-01-02T16_03_35-08_00',
+ 'only_matching': True,
+ }]
def _real_extract(self, url):
mobj = re.match(self._VALID_URL, url)
video_id = mobj.group('id')
- channel = mobj.group('channel')
+ channel = mobj.group('channel') or mobj.group('channel_2')
json_url = (('%s://%s.podomatic.com/entry/embed_params/%s' +
'?permalink=true&rtmp=0') %
diff --git a/youtube_dl/extractor/pornhd.py b/youtube_dl/extractor/pornhd.py
index 842317e..b52879c 100644
--- a/youtube_dl/extractor/pornhd.py
+++ b/youtube_dl/extractor/pornhd.py
@@ -54,7 +54,7 @@ class PornHdIE(InfoExtractor):
r'<title>(.+?) - .*?[Pp]ornHD.*?</title>'], webpage, 'title')
sources = self._parse_json(js_to_json(self._search_regex(
- r"(?s)'sources'\s*:\s*(\{.+?\})\s*\}[;,)]",
+ r"(?s)sources'?\s*[:=]\s*(\{.+?\})",
webpage, 'sources', default='{}')), video_id)
if not sources:
@@ -82,7 +82,8 @@ class PornHdIE(InfoExtractor):
view_count = int_or_none(self._html_search_regex(
r'(\d+) views\s*<', webpage, 'view count', fatal=False))
thumbnail = self._search_regex(
- r"'poster'\s*:\s*'([^']+)'", webpage, 'thumbnail', fatal=False)
+ r"poster'?\s*:\s*([\"'])(?P<url>(?:(?!\1).)+)\1", webpage,
+ 'thumbnail', fatal=False, group='url')
return {
'id': video_id,
diff --git a/youtube_dl/extractor/pornhub.py b/youtube_dl/extractor/pornhub.py
index e032817..3428458 100644
--- a/youtube_dl/extractor/pornhub.py
+++ b/youtube_dl/extractor/pornhub.py
@@ -186,7 +186,7 @@ class PornHubIE(InfoExtractor):
title, thumbnail, duration = [None] * 3
video_uploader = self._html_search_regex(
- r'(?s)From:&nbsp;.+?<(?:a href="/users/|a href="/channels/|span class="username)[^>]+>(.+?)<',
+ r'(?s)From:&nbsp;.+?<(?:a\b[^>]+\bhref=["\']/(?:user|channel)s/|span\b[^>]+\bclass=["\']username)[^>]+>(.+?)<',
webpage, 'uploader', fatal=False)
view_count = self._extract_count(
@@ -227,13 +227,20 @@ class PornHubIE(InfoExtractor):
class PornHubPlaylistBaseIE(InfoExtractor):
def _extract_entries(self, webpage):
+ # Only process container div with main playlist content skipping
+ # drop-down menu that uses similar pattern for videos (see
+ # https://github.com/rg3/youtube-dl/issues/11594).
+ container = self._search_regex(
+ r'(?s)(<div[^>]+class=["\']container.+)', webpage,
+ 'container', default=webpage)
+
return [
self.url_result(
'http://www.pornhub.com/%s' % video_url,
PornHubIE.ie_key(), video_title=title)
for video_url, title in orderedSet(re.findall(
r'href="/?(view_video\.php\?.*\bviewkey=[\da-z]+[^"]*)"[^>]*\s+title="([^"]+)"',
- webpage))
+ container))
]
def _real_extract(self, url):
@@ -241,14 +248,7 @@ class PornHubPlaylistBaseIE(InfoExtractor):
webpage = self._download_webpage(url, playlist_id)
- # Only process container div with main playlist content skipping
- # drop-down menu that uses similar pattern for videos (see
- # https://github.com/rg3/youtube-dl/issues/11594).
- container = self._search_regex(
- r'(?s)(<div[^>]+class=["\']container.+)', webpage,
- 'container', default=webpage)
-
- entries = self._extract_entries(container)
+ entries = self._extract_entries(webpage)
playlist = self._parse_json(
self._search_regex(
diff --git a/youtube_dl/extractor/qqmusic.py b/youtube_dl/extractor/qqmusic.py
index 17c27da..084308a 100644
--- a/youtube_dl/extractor/qqmusic.py
+++ b/youtube_dl/extractor/qqmusic.py
@@ -2,38 +2,37 @@
from __future__ import unicode_literals
import random
-import time
import re
+import time
from .common import InfoExtractor
from ..utils import (
- sanitized_Request,
- strip_jsonp,
- unescapeHTML,
clean_html,
ExtractorError,
+ strip_jsonp,
+ unescapeHTML,
)
class QQMusicIE(InfoExtractor):
IE_NAME = 'qqmusic'
IE_DESC = 'QQ音乐'
- _VALID_URL = r'https?://y\.qq\.com/#type=song&mid=(?P<id>[0-9A-Za-z]+)'
+ _VALID_URL = r'https?://y\.qq\.com/n/yqq/song/(?P<id>[0-9A-Za-z]+)\.html'
_TESTS = [{
- 'url': 'http://y.qq.com/#type=song&mid=004295Et37taLD',
- 'md5': '9ce1c1c8445f561506d2e3cfb0255705',
+ 'url': 'https://y.qq.com/n/yqq/song/004295Et37taLD.html',
+ 'md5': '5f1e6cea39e182857da7ffc5ef5e6bb8',
'info_dict': {
'id': '004295Et37taLD',
'ext': 'mp3',
'title': '可惜没如果',
'release_date': '20141227',
'creator': '林俊杰',
- 'description': 'md5:d327722d0361576fde558f1ac68a7065',
+ 'description': 'md5:d85afb3051952ecc50a1ee8a286d1eac',
'thumbnail': r're:^https?://.*\.jpg$',
}
}, {
'note': 'There is no mp3-320 version of this song.',
- 'url': 'http://y.qq.com/#type=song&mid=004MsGEo3DdNxV',
+ 'url': 'https://y.qq.com/n/yqq/song/004MsGEo3DdNxV.html',
'md5': 'fa3926f0c585cda0af8fa4f796482e3e',
'info_dict': {
'id': '004MsGEo3DdNxV',
@@ -46,14 +45,14 @@ class QQMusicIE(InfoExtractor):
}
}, {
'note': 'lyrics not in .lrc format',
- 'url': 'http://y.qq.com/#type=song&mid=001JyApY11tIp6',
+ 'url': 'https://y.qq.com/n/yqq/song/001JyApY11tIp6.html',
'info_dict': {
'id': '001JyApY11tIp6',
'ext': 'mp3',
'title': 'Shadows Over Transylvania',
'release_date': '19970225',
'creator': 'Dark Funeral',
- 'description': 'md5:ed14d5bd7ecec19609108052c25b2c11',
+ 'description': 'md5:c9b20210587cbcd6836a1c597bab4525',
'thumbnail': r're:^https?://.*\.jpg$',
},
'params': {
@@ -105,7 +104,7 @@ class QQMusicIE(InfoExtractor):
[r'albummid:\'([0-9a-zA-Z]+)\'', r'"albummid":"([0-9a-zA-Z]+)"'],
detail_info_page, 'album mid', default=None)
if albummid:
- thumbnail_url = "http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg" \
+ thumbnail_url = 'http://i.gtimg.cn/music/photo/mid_album_500/%s/%s/%s.jpg' \
% (albummid[-2:-1], albummid[-1], albummid)
guid = self.m_r_get_ruin()
@@ -156,15 +155,39 @@ class QQPlaylistBaseIE(InfoExtractor):
def qq_static_url(category, mid):
return 'http://y.qq.com/y/static/%s/%s/%s/%s.html' % (category, mid[-2], mid[-1], mid)
- @classmethod
- def get_entries_from_page(cls, page):
+ def get_singer_all_songs(self, singmid, num):
+ return self._download_webpage(
+ r'https://c.y.qq.com/v8/fcg-bin/fcg_v8_singer_track_cp.fcg', singmid,
+ query={
+ 'format': 'json',
+ 'inCharset': 'utf8',
+ 'outCharset': 'utf-8',
+ 'platform': 'yqq',
+ 'needNewCode': 0,
+ 'singermid': singmid,
+ 'order': 'listen',
+ 'begin': 0,
+ 'num': num,
+ 'songstatus': 1,
+ })
+
+ def get_entries_from_page(self, singmid):
entries = []
- for item in re.findall(r'class="data"[^<>]*>([^<>]+)</', page):
- song_mid = unescapeHTML(item).split('|')[-5]
- entries.append(cls.url_result(
- 'http://y.qq.com/#type=song&mid=' + song_mid, 'QQMusic',
- song_mid))
+ default_num = 1
+ json_text = self.get_singer_all_songs(singmid, default_num)
+ json_obj_all_songs = self._parse_json(json_text, singmid)
+
+ if json_obj_all_songs['code'] == 0:
+ total = json_obj_all_songs['data']['total']
+ json_text = self.get_singer_all_songs(singmid, total)
+ json_obj_all_songs = self._parse_json(json_text, singmid)
+
+ for item in json_obj_all_songs['data']['list']:
+ if item['musicData'].get('songmid') is not None:
+ songmid = item['musicData']['songmid']
+ entries.append(self.url_result(
+ r'https://y.qq.com/n/yqq/song/%s.html' % songmid, 'QQMusic', songmid))
return entries
@@ -172,42 +195,32 @@ class QQPlaylistBaseIE(InfoExtractor):
class QQMusicSingerIE(QQPlaylistBaseIE):
IE_NAME = 'qqmusic:singer'
IE_DESC = 'QQ音乐 - 歌手'
- _VALID_URL = r'https?://y\.qq\.com/#type=singer&mid=(?P<id>[0-9A-Za-z]+)'
+ _VALID_URL = r'https?://y\.qq\.com/n/yqq/singer/(?P<id>[0-9A-Za-z]+)\.html'
_TEST = {
- 'url': 'http://y.qq.com/#type=singer&mid=001BLpXF2DyJe2',
+ 'url': 'https://y.qq.com/n/yqq/singer/001BLpXF2DyJe2.html',
'info_dict': {
'id': '001BLpXF2DyJe2',
'title': '林俊杰',
'description': 'md5:870ec08f7d8547c29c93010899103751',
},
- 'playlist_count': 12,
+ 'playlist_mincount': 12,
}
def _real_extract(self, url):
mid = self._match_id(url)
- singer_page = self._download_webpage(
- self.qq_static_url('singer', mid), mid, 'Download singer page')
-
- entries = self.get_entries_from_page(singer_page)
-
+ entries = self.get_entries_from_page(mid)
+ singer_page = self._download_webpage(url, mid, 'Download singer page')
singer_name = self._html_search_regex(
- r"singername\s*:\s*'([^']+)'", singer_page, 'singer name',
- default=None)
-
- singer_id = self._html_search_regex(
- r"singerid\s*:\s*'([0-9]+)'", singer_page, 'singer id',
- default=None)
-
+ r"singername\s*:\s*'(.*?)'", singer_page, 'singer name', default=None)
singer_desc = None
- if singer_id:
- req = sanitized_Request(
- 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg?utf8=1&outCharset=utf-8&format=xml&singerid=%s' % singer_id)
- req.add_header(
- 'Referer', 'http://s.plcloud.music.qq.com/xhr_proxy_utf8.html')
+ if mid:
singer_desc_page = self._download_xml(
- req, mid, 'Donwload singer description XML')
+ 'http://s.plcloud.music.qq.com/fcgi-bin/fcg_get_singer_desc.fcg', mid,
+ 'Donwload singer description XML',
+ query={'utf8': 1, 'outCharset': 'utf-8', 'format': 'xml', 'singermid': mid},
+ headers={'Referer': 'https://y.qq.com/n/yqq/singer/'})
singer_desc = singer_desc_page.find('./data/info/desc').text
@@ -217,10 +230,10 @@ class QQMusicSingerIE(QQPlaylistBaseIE):
class QQMusicAlbumIE(QQPlaylistBaseIE):
IE_NAME = 'qqmusic:album'
IE_DESC = 'QQ音乐 - 专辑'
- _VALID_URL = r'https?://y\.qq\.com/#type=album&mid=(?P<id>[0-9A-Za-z]+)'
+ _VALID_URL = r'https?://y\.qq\.com/n/yqq/album/(?P<id>[0-9A-Za-z]+)\.html'
_TESTS = [{
- 'url': 'http://y.qq.com/#type=album&mid=000gXCTb2AhRR1',
+ 'url': 'https://y.qq.com/n/yqq/album/000gXCTb2AhRR1.html',
'info_dict': {
'id': '000gXCTb2AhRR1',
'title': '我们都是这样长大的',
@@ -228,7 +241,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE):
},
'playlist_count': 4,
}, {
- 'url': 'http://y.qq.com/#type=album&mid=002Y5a3b3AlCu3',
+ 'url': 'https://y.qq.com/n/yqq/album/002Y5a3b3AlCu3.html',
'info_dict': {
'id': '002Y5a3b3AlCu3',
'title': '그리고...',
@@ -246,7 +259,7 @@ class QQMusicAlbumIE(QQPlaylistBaseIE):
entries = [
self.url_result(
- 'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid']
+ 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid']
) for song in album['list']
]
album_name = album.get('name')
@@ -260,31 +273,30 @@ class QQMusicAlbumIE(QQPlaylistBaseIE):
class QQMusicToplistIE(QQPlaylistBaseIE):
IE_NAME = 'qqmusic:toplist'
IE_DESC = 'QQ音乐 - 排行榜'
- _VALID_URL = r'https?://y\.qq\.com/#type=toplist&p=(?P<id>(top|global)_[0-9]+)'
+ _VALID_URL = r'https?://y\.qq\.com/n/yqq/toplist/(?P<id>[0-9]+)\.html'
_TESTS = [{
- 'url': 'http://y.qq.com/#type=toplist&p=global_123',
+ 'url': 'https://y.qq.com/n/yqq/toplist/123.html',
'info_dict': {
- 'id': 'global_123',
+ 'id': '123',
'title': '美国iTunes榜',
+ 'description': 'md5:89db2335fdbb10678dee2d43fe9aba08',
},
- 'playlist_count': 10,
+ 'playlist_count': 100,
}, {
- 'url': 'http://y.qq.com/#type=toplist&p=top_3',
+ 'url': 'https://y.qq.com/n/yqq/toplist/3.html',
'info_dict': {
- 'id': 'top_3',
+ 'id': '3',
'title': '巅峰榜·欧美',
- 'description': 'QQ音乐巅峰榜·欧美根据用户收听行为自动生成,集结当下最流行的欧美新歌!:更新时间:每周四22点|统'
- '计周期:一周(上周四至本周三)|统计对象:三个月内发行的欧美歌曲|统计数量:100首|统计算法:根据'
- '歌曲在一周内的有效播放次数,由高到低取前100名(同一歌手最多允许5首歌曲同时上榜)|有效播放次数:'
- '登录用户完整播放一首歌曲,记为一次有效播放;同一用户收听同一首歌曲,每天记录为1次有效播放'
+ 'description': 'md5:5a600d42c01696b26b71f8c4d43407da',
},
'playlist_count': 100,
}, {
- 'url': 'http://y.qq.com/#type=toplist&p=global_106',
+ 'url': 'https://y.qq.com/n/yqq/toplist/106.html',
'info_dict': {
- 'id': 'global_106',
+ 'id': '106',
'title': '韩国Mnet榜',
+ 'description': 'md5:cb84b325215e1d21708c615cac82a6e7',
},
'playlist_count': 50,
}]
@@ -292,18 +304,15 @@ class QQMusicToplistIE(QQPlaylistBaseIE):
def _real_extract(self, url):
list_id = self._match_id(url)
- list_type, num_id = list_id.split("_")
-
toplist_json = self._download_json(
- 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg?type=%s&topid=%s&format=json'
- % (list_type, num_id),
- list_id, 'Download toplist page')
+ 'http://i.y.qq.com/v8/fcg-bin/fcg_v8_toplist_cp.fcg', list_id,
+ note='Download toplist page',
+ query={'type': 'toplist', 'topid': list_id, 'format': 'json'})
- entries = [
- self.url_result(
- 'http://y.qq.com/#type=song&mid=' + song['data']['songmid'], 'QQMusic', song['data']['songmid']
- ) for song in toplist_json['songlist']
- ]
+ entries = [self.url_result(
+ 'https://y.qq.com/n/yqq/song/' + song['data']['songmid'] + '.html', 'QQMusic',
+ song['data']['songmid'])
+ for song in toplist_json['songlist']]
topinfo = toplist_json.get('topinfo', {})
list_name = topinfo.get('ListName')
@@ -314,10 +323,10 @@ class QQMusicToplistIE(QQPlaylistBaseIE):
class QQMusicPlaylistIE(QQPlaylistBaseIE):
IE_NAME = 'qqmusic:playlist'
IE_DESC = 'QQ音乐 - 歌单'
- _VALID_URL = r'https?://y\.qq\.com/#type=taoge&id=(?P<id>[0-9]+)'
+ _VALID_URL = r'https?://y\.qq\.com/n/yqq/playlist/(?P<id>[0-9]+)\.html'
_TESTS = [{
- 'url': 'http://y.qq.com/#type=taoge&id=3462654915',
+ 'url': 'http://y.qq.com/n/yqq/playlist/3462654915.html',
'info_dict': {
'id': '3462654915',
'title': '韩国5月新歌精选下旬',
@@ -326,7 +335,7 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE):
'playlist_count': 40,
'skip': 'playlist gone',
}, {
- 'url': 'http://y.qq.com/#type=taoge&id=1374105607',
+ 'url': 'https://y.qq.com/n/yqq/playlist/1374105607.html',
'info_dict': {
'id': '1374105607',
'title': '易入人心的华语民谣',
@@ -339,8 +348,9 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE):
list_id = self._match_id(url)
list_json = self._download_json(
- 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg?type=1&json=1&utf8=1&onlysong=0&disstid=%s'
- % list_id, list_id, 'Download list page',
+ 'http://i.y.qq.com/qzone-music/fcg-bin/fcg_ucc_getcdinfo_byids_cp.fcg',
+ list_id, 'Download list page',
+ query={'type': 1, 'json': 1, 'utf8': 1, 'onlysong': 0, 'disstid': list_id},
transform_source=strip_jsonp)
if not len(list_json.get('cdlist', [])):
if list_json.get('code'):
@@ -350,11 +360,9 @@ class QQMusicPlaylistIE(QQPlaylistBaseIE):
raise ExtractorError('Unable to get playlist info')
cdlist = list_json['cdlist'][0]
- entries = [
- self.url_result(
- 'http://y.qq.com/#type=song&mid=' + song['songmid'], 'QQMusic', song['songmid']
- ) for song in cdlist['songlist']
- ]
+ entries = [self.url_result(
+ 'https://y.qq.com/n/yqq/song/' + song['songmid'] + '.html', 'QQMusic', song['songmid'])
+ for song in cdlist['songlist']]
list_name = cdlist.get('dissname')
list_description = clean_html(unescapeHTML(cdlist.get('desc')))
diff --git a/youtube_dl/extractor/radiocanada.py b/youtube_dl/extractor/radiocanada.py
index 3b40002..b952e59 100644
--- a/youtube_dl/extractor/radiocanada.py
+++ b/youtube_dl/extractor/radiocanada.py
@@ -20,20 +20,37 @@ from ..utils import (
class RadioCanadaIE(InfoExtractor):
IE_NAME = 'radiocanada'
_VALID_URL = r'(?:radiocanada:|https?://ici\.radio-canada\.ca/widgets/mediaconsole/)(?P<app_code>[^:/]+)[:/](?P<id>[0-9]+)'
- _TEST = {
- 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272',
- 'info_dict': {
- 'id': '7184272',
- 'ext': 'mp4',
- 'title': 'Le parcours du tireur capté sur vidéo',
- 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa',
- 'upload_date': '20141023',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
+ _TESTS = [
+ {
+ 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7184272',
+ 'info_dict': {
+ 'id': '7184272',
+ 'ext': 'mp4',
+ 'title': 'Le parcours du tireur capté sur vidéo',
+ 'description': 'Images des caméras de surveillance fournies par la GRC montrant le parcours du tireur d\'Ottawa',
+ 'upload_date': '20141023',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ }
},
- }
+ {
+ # empty Title
+ 'url': 'http://ici.radio-canada.ca/widgets/mediaconsole/medianet/7754998/',
+ 'info_dict': {
+ 'id': '7754998',
+ 'ext': 'mp4',
+ 'title': 'letelejournal22h',
+ 'description': 'INTEGRALE WEB 22H-TJ',
+ 'upload_date': '20170720',
+ },
+ 'params': {
+ # m3u8 download
+ 'skip_download': True,
+ },
+ }
+ ]
def _real_extract(self, url):
url, smuggled_data = unsmuggle_url(url, {})
@@ -59,6 +76,7 @@ class RadioCanadaIE(InfoExtractor):
device_types.append('android')
formats = []
+ error = None
# TODO: extract f4m formats
# f4m formats can be extracted using flashhd device_type but they produce unplayable file
for device_type in device_types:
@@ -84,8 +102,8 @@ class RadioCanadaIE(InfoExtractor):
if not v_url:
continue
if v_url == 'null':
- raise ExtractorError('%s said: %s' % (
- self.IE_NAME, xpath_text(v_data, 'message')), expected=True)
+ error = xpath_text(v_data, 'message')
+ continue
ext = determine_ext(v_url)
if ext == 'm3u8':
formats.extend(self._extract_m3u8_formats(
@@ -129,6 +147,9 @@ class RadioCanadaIE(InfoExtractor):
formats.extend(self._extract_f4m_formats(
base_url + '/manifest.f4m', video_id,
f4m_id='hds', fatal=False))
+ if not formats and error:
+ raise ExtractorError(
+ '%s said: %s' % (self.IE_NAME, error), expected=True)
self._sort_formats(formats)
subtitles = {}
@@ -141,7 +162,7 @@ class RadioCanadaIE(InfoExtractor):
return {
'id': video_id,
- 'title': get_meta('Title'),
+ 'title': get_meta('Title') or get_meta('AV-nomEmission'),
'description': get_meta('Description') or get_meta('ShortDescription'),
'thumbnail': get_meta('imageHR') or get_meta('imageMR') or get_meta('imageBR'),
'duration': int_or_none(get_meta('length')),
diff --git a/youtube_dl/extractor/rai.py b/youtube_dl/extractor/rai.py
index e11bf8f..5bf64a5 100644
--- a/youtube_dl/extractor/rai.py
+++ b/youtube_dl/extractor/rai.py
@@ -345,11 +345,11 @@ class RaiIE(RaiBaseIE):
media_type = media['type']
if 'Audio' in media_type:
relinker_info = {
- 'formats': {
+ 'formats': [{
'format_id': media.get('formatoAudio'),
'url': media['audioUrl'],
'ext': media.get('formatoAudio'),
- }
+ }]
}
elif 'Video' in media_type:
relinker_info = self._extract_relinker_info(media['mediaUri'], content_id)
diff --git a/youtube_dl/extractor/reddit.py b/youtube_dl/extractor/reddit.py
new file mode 100644
index 0000000..01c85ee
--- /dev/null
+++ b/youtube_dl/extractor/reddit.py
@@ -0,0 +1,114 @@
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ float_or_none,
+)
+
+
+class RedditIE(InfoExtractor):
+ _VALID_URL = r'https?://v\.redd\.it/(?P<id>[^/?#&]+)'
+ _TEST = {
+ # from https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/
+ 'url': 'https://v.redd.it/zv89llsvexdz',
+ 'md5': '655d06ace653ea3b87bccfb1b27ec99d',
+ 'info_dict': {
+ 'id': 'zv89llsvexdz',
+ 'ext': 'mp4',
+ 'title': 'zv89llsvexdz',
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ },
+ }
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ formats = self._extract_m3u8_formats(
+ 'https://v.redd.it/%s/HLSPlaylist.m3u8' % video_id, video_id,
+ 'mp4', entry_protocol='m3u8_native', m3u8_id='hls', fatal=False)
+
+ formats.extend(self._extract_mpd_formats(
+ 'https://v.redd.it/%s/DASHPlaylist.mpd' % video_id, video_id,
+ mpd_id='dash', fatal=False))
+
+ return {
+ 'id': video_id,
+ 'title': video_id,
+ 'formats': formats,
+ }
+
+
+class RedditRIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?reddit\.com/r/[^/]+/comments/(?P<id>[^/]+)'
+ _TESTS = [{
+ 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj/that_small_heart_attack/',
+ 'info_dict': {
+ 'id': 'zv89llsvexdz',
+ 'ext': 'mp4',
+ 'title': 'That small heart attack.',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'timestamp': 1501941939,
+ 'upload_date': '20170805',
+ 'uploader': 'Antw87',
+ 'like_count': int,
+ 'dislike_count': int,
+ 'comment_count': int,
+ 'age_limit': 0,
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': True,
+ },
+ }, {
+ 'url': 'https://www.reddit.com/r/videos/comments/6rrwyj',
+ 'only_matching': True,
+ }, {
+ # imgur
+ 'url': 'https://www.reddit.com/r/MadeMeSmile/comments/6t7wi5/wait_for_it/',
+ 'only_matching': True,
+ }, {
+ # streamable
+ 'url': 'https://www.reddit.com/r/videos/comments/6t7sg9/comedians_hilarious_joke_about_the_guam_flag/',
+ 'only_matching': True,
+ }, {
+ # youtube
+ 'url': 'https://www.reddit.com/r/videos/comments/6t75wq/southern_man_tries_to_speak_without_an_accent/',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ data = self._download_json(
+ url + '.json', video_id)[0]['data']['children'][0]['data']
+
+ video_url = data['url']
+
+ # Avoid recursing into the same reddit URL
+ if 'reddit.com/' in video_url and '/%s/' % video_id in video_url:
+ raise ExtractorError('No media found', expected=True)
+
+ over_18 = data.get('over_18')
+ if over_18 is True:
+ age_limit = 18
+ elif over_18 is False:
+ age_limit = 0
+ else:
+ age_limit = None
+
+ return {
+ '_type': 'url_transparent',
+ 'url': video_url,
+ 'title': data.get('title'),
+ 'thumbnail': data.get('thumbnail'),
+ 'timestamp': float_or_none(data.get('created_utc')),
+ 'uploader': data.get('author'),
+ 'like_count': int_or_none(data.get('ups')),
+ 'dislike_count': int_or_none(data.get('downs')),
+ 'comment_count': int_or_none(data.get('num_comments')),
+ 'age_limit': age_limit,
+ }
diff --git a/youtube_dl/extractor/redtube.py b/youtube_dl/extractor/redtube.py
index c367a6a..f70a752 100644
--- a/youtube_dl/extractor/redtube.py
+++ b/youtube_dl/extractor/redtube.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
ExtractorError,
int_or_none,
@@ -62,7 +63,23 @@ class RedTubeIE(InfoExtractor):
'format_id': format_id,
'height': int_or_none(format_id),
})
- else:
+ medias = self._parse_json(
+ self._search_regex(
+ r'mediaDefinition\s*:\s*(\[.+?\])', webpage,
+ 'media definitions', default='{}'),
+ video_id, fatal=False)
+ if medias and isinstance(medias, list):
+ for media in medias:
+ format_url = media.get('videoUrl')
+ if not format_url or not isinstance(format_url, compat_str):
+ continue
+ format_id = media.get('quality')
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ 'height': int_or_none(format_id),
+ })
+ if not formats:
video_url = self._html_search_regex(
r'<source src="(.+?)" type="video/mp4">', webpage, 'video URL')
formats.append({'url': video_url})
@@ -73,7 +90,7 @@ class RedTubeIE(InfoExtractor):
r'<span[^>]+class="added-time"[^>]*>ADDED ([^<]+)<',
webpage, 'upload date', fatal=False))
duration = int_or_none(self._search_regex(
- r'videoDuration\s*:\s*(\d+)', webpage, 'duration', fatal=False))
+ r'videoDuration\s*:\s*(\d+)', webpage, 'duration', default=None))
view_count = str_to_int(self._search_regex(
r'<span[^>]*>VIEWS</span></td>\s*<td>([\d,.]+)',
webpage, 'view count', fatal=False))
diff --git a/youtube_dl/extractor/rutube.py b/youtube_dl/extractor/rutube.py
index 889fa76..89d89b6 100644
--- a/youtube_dl/extractor/rutube.py
+++ b/youtube_dl/extractor/rutube.py
@@ -7,43 +7,84 @@ import itertools
from .common import InfoExtractor
from ..compat import (
compat_str,
+ compat_parse_qs,
+ compat_urllib_parse_urlparse,
)
from ..utils import (
determine_ext,
- unified_strdate,
+ bool_or_none,
+ int_or_none,
+ try_get,
+ unified_timestamp,
)
-class RutubeIE(InfoExtractor):
+class RutubeBaseIE(InfoExtractor):
+ def _extract_video(self, video, video_id=None, require_title=True):
+ title = video['title'] if require_title else video.get('title')
+
+ age_limit = video.get('is_adult')
+ if age_limit is not None:
+ age_limit = 18 if age_limit is True else 0
+
+ uploader_id = try_get(video, lambda x: x['author']['id'])
+ category = try_get(video, lambda x: x['category']['name'])
+
+ return {
+ 'id': video.get('id') or video_id,
+ 'title': title,
+ 'description': video.get('description'),
+ 'thumbnail': video.get('thumbnail_url'),
+ 'duration': int_or_none(video.get('duration')),
+ 'uploader': try_get(video, lambda x: x['author']['name']),
+ 'uploader_id': compat_str(uploader_id) if uploader_id else None,
+ 'timestamp': unified_timestamp(video.get('created_ts')),
+ 'category': [category] if category else None,
+ 'age_limit': age_limit,
+ 'view_count': int_or_none(video.get('hits')),
+ 'comment_count': int_or_none(video.get('comments_count')),
+ 'is_live': bool_or_none(video.get('is_livestream')),
+ }
+
+
+class RutubeIE(RutubeBaseIE):
IE_NAME = 'rutube'
IE_DESC = 'Rutube videos'
_VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/(?P<id>[\da-z]{32})'
_TESTS = [{
'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/',
+ 'md5': '79938ade01294ef7e27574890d0d3769',
'info_dict': {
'id': '3eac3b4561676c17df9132a9a1e62e3e',
- 'ext': 'mp4',
+ 'ext': 'flv',
'title': 'Раненный кенгуру забежал в аптеку',
'description': 'http://www.ntdtv.ru ',
'duration': 80,
'uploader': 'NTDRussian',
'uploader_id': '29790',
+ 'timestamp': 1381943602,
'upload_date': '20131016',
'age_limit': 0,
},
- 'params': {
- # It requires ffmpeg (m3u8 download)
- 'skip_download': True,
- },
}, {
'url': 'http://rutube.ru/play/embed/a10e53b86e8f349080f718582ce4c661',
'only_matching': True,
}, {
'url': 'http://rutube.ru/embed/a10e53b86e8f349080f718582ce4c661',
'only_matching': True,
+ }, {
+ 'url': 'http://rutube.ru/video/3eac3b4561676c17df9132a9a1e62e3e/?pl_id=4252',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_type=source',
+ 'only_matching': True,
}]
+ @classmethod
+ def suitable(cls, url):
+ return False if RutubePlaylistIE.suitable(url) else super(RutubeIE, cls).suitable(url)
+
@staticmethod
def _extract_urls(webpage):
return [mobj.group('url') for mobj in re.finditer(
@@ -52,12 +93,12 @@ class RutubeIE(InfoExtractor):
def _real_extract(self, url):
video_id = self._match_id(url)
+
video = self._download_json(
'http://rutube.ru/api/video/%s/?format=json' % video_id,
video_id, 'Downloading video JSON')
- # Some videos don't have the author field
- author = video.get('author') or {}
+ info = self._extract_video(video, video_id)
options = self._download_json(
'http://rutube.ru/api/play/options/%s/?format=json' % video_id,
@@ -79,19 +120,8 @@ class RutubeIE(InfoExtractor):
})
self._sort_formats(formats)
- return {
- 'id': video['id'],
- 'title': video['title'],
- 'description': video['description'],
- 'duration': video['duration'],
- 'view_count': video['hits'],
- 'formats': formats,
- 'thumbnail': video['thumbnail_url'],
- 'uploader': author.get('name'),
- 'uploader_id': compat_str(author['id']) if author else None,
- 'upload_date': unified_strdate(video['created_ts']),
- 'age_limit': 18 if video['is_adult'] else 0,
- }
+ info['formats'] = formats
+ return info
class RutubeEmbedIE(InfoExtractor):
@@ -103,7 +133,8 @@ class RutubeEmbedIE(InfoExtractor):
'url': 'http://rutube.ru/video/embed/6722881?vk_puid37=&vk_puid38=',
'info_dict': {
'id': 'a10e53b86e8f349080f718582ce4c661',
- 'ext': 'mp4',
+ 'ext': 'flv',
+ 'timestamp': 1387830582,
'upload_date': '20131223',
'uploader_id': '297833',
'description': 'Видео группы ★http://vk.com/foxkidsreset★ музей Fox Kids и Jetix<br/><br/> восстановлено и сделано в шикоформате subziro89 http://vk.com/subziro89',
@@ -111,7 +142,7 @@ class RutubeEmbedIE(InfoExtractor):
'title': 'Мистический городок Эйри в Индиан 5 серия озвучка subziro89',
},
'params': {
- 'skip_download': 'Requires ffmpeg',
+ 'skip_download': True,
},
}, {
'url': 'http://rutube.ru/play/embed/8083783',
@@ -125,10 +156,51 @@ class RutubeEmbedIE(InfoExtractor):
canonical_url = self._html_search_regex(
r'<link\s+rel="canonical"\s+href="([^"]+?)"', webpage,
'Canonical URL')
- return self.url_result(canonical_url, 'Rutube')
+ return self.url_result(canonical_url, RutubeIE.ie_key())
+
+
+class RutubePlaylistBaseIE(RutubeBaseIE):
+ def _next_page_url(self, page_num, playlist_id, *args, **kwargs):
+ return self._PAGE_TEMPLATE % (playlist_id, page_num)
+ def _entries(self, playlist_id, *args, **kwargs):
+ next_page_url = None
+ for pagenum in itertools.count(1):
+ page = self._download_json(
+ next_page_url or self._next_page_url(
+ pagenum, playlist_id, *args, **kwargs),
+ playlist_id, 'Downloading page %s' % pagenum)
+
+ results = page.get('results')
+ if not results or not isinstance(results, list):
+ break
+
+ for result in results:
+ video_url = result.get('video_url')
+ if not video_url or not isinstance(video_url, compat_str):
+ continue
+ entry = self._extract_video(result, require_title=False)
+ entry.update({
+ '_type': 'url',
+ 'url': video_url,
+ 'ie_key': RutubeIE.ie_key(),
+ })
+ yield entry
-class RutubeChannelIE(InfoExtractor):
+ next_page_url = page.get('next')
+ if not next_page_url or not page.get('has_next'):
+ break
+
+ def _extract_playlist(self, playlist_id, *args, **kwargs):
+ return self.playlist_result(
+ self._entries(playlist_id, *args, **kwargs),
+ playlist_id, kwargs.get('playlist_name'))
+
+ def _real_extract(self, url):
+ return self._extract_playlist(self._match_id(url))
+
+
+class RutubeChannelIE(RutubePlaylistBaseIE):
IE_NAME = 'rutube:channel'
IE_DESC = 'Rutube channels'
_VALID_URL = r'https?://rutube\.ru/tags/video/(?P<id>\d+)'
@@ -142,27 +214,8 @@ class RutubeChannelIE(InfoExtractor):
_PAGE_TEMPLATE = 'http://rutube.ru/api/tags/video/%s/?page=%s&format=json'
- def _extract_videos(self, channel_id, channel_title=None):
- entries = []
- for pagenum in itertools.count(1):
- page = self._download_json(
- self._PAGE_TEMPLATE % (channel_id, pagenum),
- channel_id, 'Downloading page %s' % pagenum)
- results = page['results']
- if not results:
- break
- entries.extend(self.url_result(result['video_url'], 'Rutube') for result in results)
- if not page['has_next']:
- break
- return self.playlist_result(entries, channel_id, channel_title)
-
- def _real_extract(self, url):
- mobj = re.match(self._VALID_URL, url)
- channel_id = mobj.group('id')
- return self._extract_videos(channel_id)
-
-class RutubeMovieIE(RutubeChannelIE):
+class RutubeMovieIE(RutubePlaylistBaseIE):
IE_NAME = 'rutube:movie'
IE_DESC = 'Rutube movies'
_VALID_URL = r'https?://rutube\.ru/metainfo/tv/(?P<id>\d+)'
@@ -176,11 +229,11 @@ class RutubeMovieIE(RutubeChannelIE):
movie = self._download_json(
self._MOVIE_TEMPLATE % movie_id, movie_id,
'Downloading movie JSON')
- movie_name = movie['name']
- return self._extract_videos(movie_id, movie_name)
+ return self._extract_playlist(
+ movie_id, playlist_name=movie.get('name'))
-class RutubePersonIE(RutubeChannelIE):
+class RutubePersonIE(RutubePlaylistBaseIE):
IE_NAME = 'rutube:person'
IE_DESC = 'Rutube person videos'
_VALID_URL = r'https?://rutube\.ru/video/person/(?P<id>\d+)'
@@ -193,3 +246,37 @@ class RutubePersonIE(RutubeChannelIE):
}]
_PAGE_TEMPLATE = 'http://rutube.ru/api/video/person/%s/?page=%s&format=json'
+
+
+class RutubePlaylistIE(RutubePlaylistBaseIE):
+ IE_NAME = 'rutube:playlist'
+ IE_DESC = 'Rutube playlists'
+ _VALID_URL = r'https?://rutube\.ru/(?:video|(?:play/)?embed)/[\da-z]{32}/\?.*?\bpl_id=(?P<id>\d+)'
+ _TESTS = [{
+ 'url': 'https://rutube.ru/video/cecd58ed7d531fc0f3d795d51cee9026/?pl_id=3097&pl_type=tag',
+ 'info_dict': {
+ 'id': '3097',
+ },
+ 'playlist_count': 27,
+ }, {
+ 'url': 'https://rutube.ru/video/10b3a03fc01d5bbcc632a2f3514e8aab/?pl_id=4252&pl_type=source',
+ 'only_matching': True,
+ }]
+
+ _PAGE_TEMPLATE = 'http://rutube.ru/api/playlist/%s/%s/?page=%s&format=json'
+
+ @classmethod
+ def suitable(cls, url):
+ if not super(RutubePlaylistIE, cls).suitable(url):
+ return False
+ params = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ return params.get('pl_type', [None])[0] and int_or_none(params.get('pl_id', [None])[0])
+
+ def _next_page_url(self, page_num, playlist_id, item_kind):
+ return self._PAGE_TEMPLATE % (item_kind, playlist_id, page_num)
+
+ def _real_extract(self, url):
+ qs = compat_parse_qs(compat_urllib_parse_urlparse(url).query)
+ playlist_kind = qs['pl_type'][0]
+ playlist_id = qs['pl_id'][0]
+ return self._extract_playlist(playlist_id, item_kind=playlist_kind)
diff --git a/youtube_dl/extractor/slideshare.py b/youtube_dl/extractor/slideshare.py
index 74a1dc6..e89ebeb 100644
--- a/youtube_dl/extractor/slideshare.py
+++ b/youtube_dl/extractor/slideshare.py
@@ -31,7 +31,7 @@ class SlideshareIE(InfoExtractor):
page_title = mobj.group('title')
webpage = self._download_webpage(url, page_title)
slideshare_obj = self._search_regex(
- r'\$\.extend\(slideshare_object,\s*(\{.*?\})\);',
+ r'\$\.extend\(.*?slideshare_object,\s*(\{.*?\})\);',
webpage, 'slideshare object')
info = json.loads(slideshare_obj)
if info['slideshow']['type'] != 'video':
diff --git a/youtube_dl/extractor/soundcloud.py b/youtube_dl/extractor/soundcloud.py
index 3f1a46b..1c6799d 100644
--- a/youtube_dl/extractor/soundcloud.py
+++ b/youtube_dl/extractor/soundcloud.py
@@ -1,8 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
-import re
import itertools
+import re
from .common import (
InfoExtractor,
@@ -17,6 +17,7 @@ from ..utils import (
ExtractorError,
int_or_none,
unified_strdate,
+ update_url_query,
)
@@ -31,6 +32,7 @@ class SoundcloudIE(InfoExtractor):
_VALID_URL = r'''(?x)^(?:https?://)?
(?:(?:(?:www\.|m\.)?soundcloud\.com/
+ (?!stations/track)
(?P<uploader>[\w\d-]+)/
(?!(?:tracks|sets(?:/.+?)?|reposts|likes|spotlight)/?(?:$|[?#]))
(?P<title>[\w\d-]+)/?
@@ -119,9 +121,24 @@ class SoundcloudIE(InfoExtractor):
'license': 'cc-by-sa',
},
},
+ # private link, downloadable format
+ {
+ 'url': 'https://soundcloud.com/oriuplift/uponly-238-no-talking-wav/s-AyZUd',
+ 'md5': '64a60b16e617d41d0bef032b7f55441e',
+ 'info_dict': {
+ 'id': '340344461',
+ 'ext': 'wav',
+ 'title': 'Uplifting Only 238 [No Talking] (incl. Alex Feed Guestmix) (Aug 31, 2017) [wav]',
+ 'description': 'md5:fa20ee0fca76a3d6df8c7e57f3715366',
+ 'uploader': 'Ori Uplift Music',
+ 'upload_date': '20170831',
+ 'duration': 7449,
+ 'license': 'all-rights-reserved',
+ },
+ },
]
- _CLIENT_ID = '2t9loNQH90kzJcsFCODdigxfp325aq4z'
+ _CLIENT_ID = 'JlZIsxg2hY5WnBgtn3jfS0UYCl0K8DOg'
_IPHONE_CLIENT_ID = '376f225bf427445fc4bfb6b99b72e0bf'
@staticmethod
@@ -159,11 +176,13 @@ class SoundcloudIE(InfoExtractor):
'license': info.get('license'),
}
formats = []
+ query = {'client_id': self._CLIENT_ID}
+ if secret_token is not None:
+ query['secret_token'] = secret_token
if info.get('downloadable', False):
# We can build a direct link to the song
- format_url = (
- 'https://api.soundcloud.com/tracks/{0}/download?client_id={1}'.format(
- track_id, self._CLIENT_ID))
+ format_url = update_url_query(
+ 'https://api.soundcloud.com/tracks/%s/download' % track_id, query)
formats.append({
'format_id': 'download',
'ext': info.get('original_format', 'mp3'),
@@ -175,10 +194,7 @@ class SoundcloudIE(InfoExtractor):
# We have to retrieve the url
format_dict = self._download_json(
'https://api.soundcloud.com/i1/tracks/%s/streams' % track_id,
- track_id, 'Downloading track url', query={
- 'client_id': self._CLIENT_ID,
- 'secret_token': secret_token,
- })
+ track_id, 'Downloading track url', query=query)
for key, stream_url in format_dict.items():
abr = int_or_none(self._search_regex(
@@ -215,7 +231,7 @@ class SoundcloudIE(InfoExtractor):
# cannot be always used, sometimes it can give an HTTP 404 error
formats.append({
'format_id': 'fallback',
- 'url': info['stream_url'] + '?client_id=' + self._CLIENT_ID,
+ 'url': update_url_query(info['stream_url'], query),
'ext': ext,
})
@@ -330,7 +346,63 @@ class SoundcloudSetIE(SoundcloudPlaylistBaseIE):
}
-class SoundcloudUserIE(SoundcloudPlaylistBaseIE):
+class SoundcloudPagedPlaylistBaseIE(SoundcloudPlaylistBaseIE):
+ _API_BASE = 'https://api.soundcloud.com'
+ _API_V2_BASE = 'https://api-v2.soundcloud.com'
+
+ def _extract_playlist(self, base_url, playlist_id, playlist_title):
+ COMMON_QUERY = {
+ 'limit': 50,
+ 'client_id': self._CLIENT_ID,
+ 'linked_partitioning': '1',
+ }
+
+ query = COMMON_QUERY.copy()
+ query['offset'] = 0
+
+ next_href = base_url + '?' + compat_urllib_parse_urlencode(query)
+
+ entries = []
+ for i in itertools.count():
+ response = self._download_json(
+ next_href, playlist_id, 'Downloading track page %s' % (i + 1))
+
+ collection = response['collection']
+ if not collection:
+ break
+
+ def resolve_permalink_url(candidates):
+ for cand in candidates:
+ if isinstance(cand, dict):
+ permalink_url = cand.get('permalink_url')
+ entry_id = self._extract_id(cand)
+ if permalink_url and permalink_url.startswith('http'):
+ return permalink_url, entry_id
+
+ for e in collection:
+ permalink_url, entry_id = resolve_permalink_url((e, e.get('track'), e.get('playlist')))
+ if permalink_url:
+ entries.append(self.url_result(permalink_url, video_id=entry_id))
+
+ next_href = response.get('next_href')
+ if not next_href:
+ break
+
+ parsed_next_href = compat_urlparse.urlparse(response['next_href'])
+ qs = compat_urlparse.parse_qs(parsed_next_href.query)
+ qs.update(COMMON_QUERY)
+ next_href = compat_urlparse.urlunparse(
+ parsed_next_href._replace(query=compat_urllib_parse_urlencode(qs, True)))
+
+ return {
+ '_type': 'playlist',
+ 'id': playlist_id,
+ 'title': playlist_title,
+ 'entries': entries,
+ }
+
+
+class SoundcloudUserIE(SoundcloudPagedPlaylistBaseIE):
_VALID_URL = r'''(?x)
https?://
(?:(?:www|m)\.)?soundcloud\.com/
@@ -385,16 +457,13 @@ class SoundcloudUserIE(SoundcloudPlaylistBaseIE):
'playlist_mincount': 1,
}]
- _API_BASE = 'https://api.soundcloud.com'
- _API_V2_BASE = 'https://api-v2.soundcloud.com'
-
_BASE_URL_MAP = {
- 'all': '%s/profile/soundcloud:users:%%s' % _API_V2_BASE,
- 'tracks': '%s/users/%%s/tracks' % _API_BASE,
- 'sets': '%s/users/%%s/playlists' % _API_V2_BASE,
- 'reposts': '%s/profile/soundcloud:users:%%s/reposts' % _API_V2_BASE,
- 'likes': '%s/users/%%s/likes' % _API_V2_BASE,
- 'spotlight': '%s/users/%%s/spotlight' % _API_V2_BASE,
+ 'all': '%s/profile/soundcloud:users:%%s' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
+ 'tracks': '%s/users/%%s/tracks' % SoundcloudPagedPlaylistBaseIE._API_BASE,
+ 'sets': '%s/users/%%s/playlists' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
+ 'reposts': '%s/profile/soundcloud:users:%%s/reposts' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
+ 'likes': '%s/users/%%s/likes' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
+ 'spotlight': '%s/users/%%s/spotlight' % SoundcloudPagedPlaylistBaseIE._API_V2_BASE,
}
_TITLE_MAP = {
@@ -416,57 +485,36 @@ class SoundcloudUserIE(SoundcloudPlaylistBaseIE):
resolv_url, uploader, 'Downloading user info')
resource = mobj.group('rsrc') or 'all'
- base_url = self._BASE_URL_MAP[resource] % user['id']
- COMMON_QUERY = {
- 'limit': 50,
- 'client_id': self._CLIENT_ID,
- 'linked_partitioning': '1',
- }
+ return self._extract_playlist(
+ self._BASE_URL_MAP[resource] % user['id'], compat_str(user['id']),
+ '%s (%s)' % (user['username'], self._TITLE_MAP[resource]))
- query = COMMON_QUERY.copy()
- query['offset'] = 0
-
- next_href = base_url + '?' + compat_urllib_parse_urlencode(query)
- entries = []
- for i in itertools.count():
- response = self._download_json(
- next_href, uploader, 'Downloading track page %s' % (i + 1))
-
- collection = response['collection']
- if not collection:
- break
-
- def resolve_permalink_url(candidates):
- for cand in candidates:
- if isinstance(cand, dict):
- permalink_url = cand.get('permalink_url')
- entry_id = self._extract_id(cand)
- if permalink_url and permalink_url.startswith('http'):
- return permalink_url, entry_id
+class SoundcloudTrackStationIE(SoundcloudPagedPlaylistBaseIE):
+ _VALID_URL = r'https?://(?:(?:www|m)\.)?soundcloud\.com/stations/track/[^/]+/(?P<id>[^/?#&]+)'
+ IE_NAME = 'soundcloud:trackstation'
+ _TESTS = [{
+ 'url': 'https://soundcloud.com/stations/track/officialsundial/your-text',
+ 'info_dict': {
+ 'id': '286017854',
+ 'title': 'Track station: your-text',
+ },
+ 'playlist_mincount': 47,
+ }]
- for e in collection:
- permalink_url, entry_id = resolve_permalink_url((e, e.get('track'), e.get('playlist')))
- if permalink_url:
- entries.append(self.url_result(permalink_url, video_id=entry_id))
+ def _real_extract(self, url):
+ track_name = self._match_id(url)
- next_href = response.get('next_href')
- if not next_href:
- break
+ webpage = self._download_webpage(url, track_name)
- parsed_next_href = compat_urlparse.urlparse(response['next_href'])
- qs = compat_urlparse.parse_qs(parsed_next_href.query)
- qs.update(COMMON_QUERY)
- next_href = compat_urlparse.urlunparse(
- parsed_next_href._replace(query=compat_urllib_parse_urlencode(qs, True)))
+ track_id = self._search_regex(
+ r'soundcloud:track-stations:(\d+)', webpage, 'track id')
- return {
- '_type': 'playlist',
- 'id': compat_str(user['id']),
- 'title': '%s (%s)' % (user['username'], self._TITLE_MAP[resource]),
- 'entries': entries,
- }
+ return self._extract_playlist(
+ '%s/stations/soundcloud:track-stations:%s/tracks'
+ % (self._API_V2_BASE, track_id),
+ track_id, 'Track station: %s' % track_name)
class SoundcloudPlaylistIE(SoundcloudPlaylistBaseIE):
diff --git a/youtube_dl/extractor/spiegel.py b/youtube_dl/extractor/spiegel.py
index ec1b603..84298fe 100644
--- a/youtube_dl/extractor/spiegel.py
+++ b/youtube_dl/extractor/spiegel.py
@@ -4,6 +4,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from .nexx import NexxEmbedIE
from .spiegeltv import SpiegeltvIE
from ..compat import compat_urlparse
from ..utils import (
@@ -121,6 +122,26 @@ class SpiegelArticleIE(InfoExtractor):
},
'playlist_count': 6,
+ }, {
+ # Nexx iFrame embed
+ 'url': 'http://www.spiegel.de/sptv/spiegeltv/spiegel-tv-ueber-schnellste-katapult-achterbahn-der-welt-taron-a-1137884.html',
+ 'info_dict': {
+ 'id': '161464',
+ 'ext': 'mp4',
+ 'title': 'Nervenkitzel Achterbahn',
+ 'alt_title': 'Karussellbauer in Deutschland',
+ 'description': 'md5:ffe7b1cc59a01f585e0569949aef73cc',
+ 'release_year': 2005,
+ 'creator': 'SPIEGEL TV',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 2761,
+ 'timestamp': 1394021479,
+ 'upload_date': '20140305',
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': True,
+ },
}]
def _real_extract(self, url):
@@ -143,6 +164,9 @@ class SpiegelArticleIE(InfoExtractor):
entries = [
self.url_result(compat_urlparse.urljoin(
self.http_scheme() + '//spiegel.de/', embed_path))
- for embed_path in embeds
- ]
- return self.playlist_result(entries)
+ for embed_path in embeds]
+ if embeds:
+ return self.playlist_result(entries)
+
+ return self.playlist_from_matches(
+ NexxEmbedIE._extract_urls(webpage), ie=NexxEmbedIE.ie_key())
diff --git a/youtube_dl/extractor/spiegeltv.py b/youtube_dl/extractor/spiegeltv.py
index e1cfb86..6ccf4c3 100644
--- a/youtube_dl/extractor/spiegeltv.py
+++ b/youtube_dl/extractor/spiegeltv.py
@@ -1,114 +1,17 @@
-# coding: utf-8
from __future__ import unicode_literals
from .common import InfoExtractor
-from ..compat import compat_urllib_parse_urlparse
-from ..utils import (
- determine_ext,
- float_or_none,
-)
+from .nexx import NexxIE
class SpiegeltvIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/(?:#/)?filme/(?P<id>[\-a-z0-9]+)'
- _TESTS = [{
- 'url': 'http://www.spiegel.tv/filme/flug-mh370/',
- 'info_dict': {
- 'id': 'flug-mh370',
- 'ext': 'm4v',
- 'title': 'Flug MH370',
- 'description': 'Das Rätsel um die Boeing 777 der Malaysia-Airlines',
- 'thumbnail': r're:http://.*\.jpg$',
- },
- 'params': {
- # m3u8 download
- 'skip_download': True,
- }
- }, {
- 'url': 'http://www.spiegel.tv/#/filme/alleskino-die-wahrheit-ueber-maenner/',
+ _VALID_URL = r'https?://(?:www\.)?spiegel\.tv/videos/(?P<id>\d+)'
+ _TEST = {
+ 'url': 'http://www.spiegel.tv/videos/161681-flug-mh370/',
'only_matching': True,
- }]
+ }
def _real_extract(self, url):
- if '/#/' in url:
- url = url.replace('/#/', '/')
- video_id = self._match_id(url)
- webpage = self._download_webpage(url, video_id)
- title = self._html_search_regex(r'<h1.*?>(.*?)</h1>', webpage, 'title')
-
- apihost = 'http://spiegeltv-ivms2-restapi.s3.amazonaws.com'
- version_json = self._download_json(
- '%s/version.json' % apihost, video_id,
- note='Downloading version information')
- version_name = version_json['version_name']
-
- slug_json = self._download_json(
- '%s/%s/restapi/slugs/%s.json' % (apihost, version_name, video_id),
- video_id,
- note='Downloading object information')
- oid = slug_json['object_id']
-
- media_json = self._download_json(
- '%s/%s/restapi/media/%s.json' % (apihost, version_name, oid),
- video_id, note='Downloading media information')
- uuid = media_json['uuid']
- is_wide = media_json['is_wide']
-
- server_json = self._download_json(
- 'http://spiegeltv-prod-static.s3.amazonaws.com/projectConfigs/projectConfig.json',
- video_id, note='Downloading server information')
-
- format = '16x9' if is_wide else '4x3'
-
- formats = []
- for streamingserver in server_json['streamingserver']:
- endpoint = streamingserver.get('endpoint')
- if not endpoint:
- continue
- play_path = 'mp4:%s_spiegeltv_0500_%s.m4v' % (uuid, format)
- if endpoint.startswith('rtmp'):
- formats.append({
- 'url': endpoint,
- 'format_id': 'rtmp',
- 'app': compat_urllib_parse_urlparse(endpoint).path[1:],
- 'play_path': play_path,
- 'player_path': 'http://prod-static.spiegel.tv/frontend-076.swf',
- 'ext': 'flv',
- 'rtmp_live': True,
- })
- elif determine_ext(endpoint) == 'm3u8':
- formats.append({
- 'url': endpoint.replace('[video]', play_path),
- 'ext': 'm4v',
- 'format_id': 'hls', # Prefer hls since it allows to workaround georestriction
- 'protocol': 'm3u8',
- 'preference': 1,
- 'http_headers': {
- 'Accept-Encoding': 'deflate', # gzip causes trouble on the server side
- },
- })
- else:
- formats.append({
- 'url': endpoint,
- })
- self._check_formats(formats, video_id)
-
- thumbnails = []
- for image in media_json['images']:
- thumbnails.append({
- 'url': image['url'],
- 'width': image['width'],
- 'height': image['height'],
- })
-
- description = media_json['subtitle']
- duration = float_or_none(media_json.get('duration_in_ms'), scale=1000)
-
- return {
- 'id': video_id,
- 'title': title,
- 'description': description,
- 'duration': duration,
- 'thumbnails': thumbnails,
- 'formats': formats,
- }
+ return self.url_result(
+ 'https://api.nexx.cloud/v3/748/videos/byid/%s'
+ % self._match_id(url), ie=NexxIE.ie_key())
diff --git a/youtube_dl/extractor/sportbox.py b/youtube_dl/extractor/sportbox.py
index e7bd5bf..54497c8 100644
--- a/youtube_dl/extractor/sportbox.py
+++ b/youtube_dl/extractor/sportbox.py
@@ -4,7 +4,11 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
-from ..utils import js_to_json
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ js_to_json,
+)
class SportBoxEmbedIE(InfoExtractor):
@@ -14,8 +18,10 @@ class SportBoxEmbedIE(InfoExtractor):
'info_dict': {
'id': '211355',
'ext': 'mp4',
- 'title': 'В Новороссийске прошел детский турнир «Поле славы боевой»',
+ 'title': '211355',
'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 292,
+ 'view_count': int,
},
'params': {
# m3u8 download
@@ -24,6 +30,9 @@ class SportBoxEmbedIE(InfoExtractor):
}, {
'url': 'http://news.sportbox.ru/vdl/player?nid=370908&only_player=1&autostart=false&playeri=2&height=340&width=580',
'only_matching': True,
+ }, {
+ 'url': 'https://news.sportbox.ru/vdl/player/media/193095',
+ 'only_matching': True,
}]
@staticmethod
@@ -37,36 +46,34 @@ class SportBoxEmbedIE(InfoExtractor):
webpage = self._download_webpage(url, video_id)
- formats = []
-
- def cleanup_js(code):
- # desktop_advert_config contains complex Javascripts and we don't need it
- return js_to_json(re.sub(r'desktop_advert_config.*', '', code))
-
- jwplayer_data = self._parse_json(self._search_regex(
- r'(?s)player\.setup\(({.+?})\);', webpage, 'jwplayer settings'), video_id,
- transform_source=cleanup_js)
-
- hls_url = jwplayer_data.get('hls_url')
- if hls_url:
- formats.extend(self._extract_m3u8_formats(
- hls_url, video_id, ext='mp4', m3u8_id='hls'))
-
- rtsp_url = jwplayer_data.get('rtsp_url')
- if rtsp_url:
- formats.append({
- 'url': rtsp_url,
- 'format_id': 'rtsp',
- })
+ wjplayer_data = self._parse_json(
+ self._search_regex(
+ r'(?s)wjplayer\(({.+?})\);', webpage, 'wjplayer settings'),
+ video_id, transform_source=js_to_json)
+ formats = []
+ for source in wjplayer_data['sources']:
+ src = source.get('src')
+ if not src:
+ continue
+ if determine_ext(src) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ src, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ else:
+ formats.append({
+ 'url': src,
+ })
self._sort_formats(formats)
- title = jwplayer_data['node_title']
- thumbnail = jwplayer_data.get('image_url')
+ view_count = int_or_none(self._search_regex(
+ r'Просмотров\s*:\s*(\d+)', webpage, 'view count', default=None))
return {
'id': video_id,
- 'title': title,
- 'thumbnail': thumbnail,
+ 'title': video_id,
+ 'thumbnail': wjplayer_data.get('poster'),
+ 'duration': int_or_none(wjplayer_data.get('duration')),
+ 'view_count': view_count,
'formats': formats,
}
diff --git a/youtube_dl/extractor/svt.py b/youtube_dl/extractor/svt.py
index 1b5afb7..48bc452 100644
--- a/youtube_dl/extractor/svt.py
+++ b/youtube_dl/extractor/svt.py
@@ -181,7 +181,8 @@ class SVTPlayIE(SVTBaseIE):
if video_id:
data = self._download_json(
- 'http://www.svt.se/videoplayer-api/video/%s' % video_id, video_id)
+ 'https://api.svt.se/videoplayer-api/video/%s' % video_id,
+ video_id, headers=self.geo_verification_headers())
info_dict = self._extract_video(data, video_id)
if not info_dict.get('title'):
info_dict['title'] = re.sub(
diff --git a/youtube_dl/extractor/tbs.py b/youtube_dl/extractor/tbs.py
index bf93eb8..e947453 100644
--- a/youtube_dl/extractor/tbs.py
+++ b/youtube_dl/extractor/tbs.py
@@ -8,6 +8,9 @@ from ..utils import extract_attributes
class TBSIE(TurnerBaseIE):
+ # https://github.com/rg3/youtube-dl/issues/13658
+ _WORKING = False
+
_VALID_URL = r'https?://(?:www\.)?(?P<site>tbs|tntdrama)\.com/videos/(?:[^/]+/)+(?P<id>[^/?#]+)\.html'
_TESTS = [{
'url': 'http://www.tbs.com/videos/people-of-earth/season-1/extras/2007318/theatrical-trailer.html',
@@ -17,7 +20,8 @@ class TBSIE(TurnerBaseIE):
'ext': 'mp4',
'title': 'Theatrical Trailer',
'description': 'Catch the latest comedy from TBS, People of Earth, premiering Halloween night--Monday, October 31, at 9/8c.',
- }
+ },
+ 'skip': 'TBS videos are deleted after a while',
}, {
'url': 'http://www.tntdrama.com/videos/good-behavior/season-1/extras/1538823/you-better-run.html',
'md5': 'ce53c6ead5e9f3280b4ad2031a6fab56',
@@ -26,7 +30,8 @@ class TBSIE(TurnerBaseIE):
'ext': 'mp4',
'title': 'You Better Run',
'description': 'Letty Raines must figure out what she\'s running toward while running away from her past. Good Behavior premieres November 15 at 9/8c.',
- }
+ },
+ 'skip': 'TBS videos are deleted after a while',
}]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/teamfourstar.py b/youtube_dl/extractor/teamfourstar.py
deleted file mode 100644
index a8c6ed7..0000000
--- a/youtube_dl/extractor/teamfourstar.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-from .common import InfoExtractor
-from .jwplatform import JWPlatformIE
-from ..utils import unified_strdate
-
-
-class TeamFourStarIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?teamfourstar\.com/(?P<id>[a-z0-9\-]+)'
- _TEST = {
- 'url': 'http://teamfourstar.com/tfs-abridged-parody-episode-1-2/',
- 'info_dict': {
- 'id': '0WdZO31W',
- 'title': 'TFS Abridged Parody Episode 1',
- 'description': 'md5:d60bc389588ebab2ee7ad432bda953ae',
- 'ext': 'mp4',
- 'timestamp': 1394168400,
- 'upload_date': '20080508',
- },
- }
-
- def _real_extract(self, url):
- display_id = self._match_id(url)
- webpage = self._download_webpage(url, display_id)
-
- jwplatform_url = JWPlatformIE._extract_url(webpage)
-
- video_title = self._html_search_regex(
- r'<h1[^>]+class="entry-title"[^>]*>(?P<title>.+?)</h1>',
- webpage, 'title')
- video_date = unified_strdate(self._html_search_regex(
- r'<span[^>]+class="meta-date date updated"[^>]*>(?P<date>.+?)</span>',
- webpage, 'date', fatal=False))
- video_description = self._html_search_regex(
- r'(?s)<div[^>]+class="content-inner"[^>]*>.*?(?P<description><p>.+?)</div>',
- webpage, 'description', fatal=False)
- video_thumbnail = self._og_search_thumbnail(webpage)
-
- return {
- '_type': 'url_transparent',
- 'display_id': display_id,
- 'title': video_title,
- 'description': video_description,
- 'upload_date': video_date,
- 'thumbnail': video_thumbnail,
- 'url': jwplatform_url,
- }
diff --git a/youtube_dl/extractor/ted.py b/youtube_dl/extractor/ted.py
index f27d0e3..06a27fd 100644
--- a/youtube_dl/extractor/ted.py
+++ b/youtube_dl/extractor/ted.py
@@ -271,20 +271,22 @@ class TEDIE(InfoExtractor):
}
def _get_subtitles(self, video_id, talk_info):
- languages = [lang['languageCode'] for lang in talk_info.get('languages', [])]
- if languages:
- sub_lang_list = {}
- for l in languages:
- sub_lang_list[l] = [
- {
- 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, l, ext),
- 'ext': ext,
- }
- for ext in ['ted', 'srt']
- ]
- return sub_lang_list
- else:
- return {}
+ sub_lang_list = {}
+ for language in try_get(
+ talk_info,
+ (lambda x: x['downloads']['languages'],
+ lambda x: x['languages']), list):
+ lang_code = language.get('languageCode') or language.get('ianaCode')
+ if not lang_code:
+ continue
+ sub_lang_list[lang_code] = [
+ {
+ 'url': 'http://www.ted.com/talks/subtitles/id/%s/lang/%s/format/%s' % (video_id, lang_code, ext),
+ 'ext': ext,
+ }
+ for ext in ['ted', 'srt']
+ ]
+ return sub_lang_list
def _watch_info(self, url, name):
webpage = self._download_webpage(url, name)
diff --git a/youtube_dl/extractor/thisoldhouse.py b/youtube_dl/extractor/thisoldhouse.py
index 197258d..6ab147a 100644
--- a/youtube_dl/extractor/thisoldhouse.py
+++ b/youtube_dl/extractor/thisoldhouse.py
@@ -2,13 +2,15 @@
from __future__ import unicode_literals
from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import try_get
class ThisOldHouseIE(InfoExtractor):
_VALID_URL = r'https?://(?:www\.)?thisoldhouse\.com/(?:watch|how-to|tv-episode)/(?P<id>[^/?#]+)'
_TESTS = [{
'url': 'https://www.thisoldhouse.com/how-to/how-to-build-storage-bench',
- 'md5': '946f05bbaa12a33f9ae35580d2dfcfe3',
+ 'md5': '568acf9ca25a639f0c4ff905826b662f',
'info_dict': {
'id': '2REGtUDQ',
'ext': 'mp4',
@@ -28,8 +30,15 @@ class ThisOldHouseIE(InfoExtractor):
def _real_extract(self, url):
display_id = self._match_id(url)
webpage = self._download_webpage(url, display_id)
- drupal_settings = self._parse_json(self._search_regex(
- r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
- webpage, 'drupal settings'), display_id)
- video_id = drupal_settings['jwplatform']['video_id']
+ video_id = self._search_regex(
+ (r'data-mid=(["\'])(?P<id>(?:(?!\1).)+)\1',
+ r'id=(["\'])inline-video-player-(?P<id>(?:(?!\1).)+)\1'),
+ webpage, 'video id', default=None, group='id')
+ if not video_id:
+ drupal_settings = self._parse_json(self._search_regex(
+ r'jQuery\.extend\(Drupal\.settings\s*,\s*({.+?})\);',
+ webpage, 'drupal settings'), display_id)
+ video_id = try_get(
+ drupal_settings, lambda x: x['jwplatform']['video_id'],
+ compat_str) or list(drupal_settings['comScore'])[0]
return self.url_result('jwplatform:' + video_id, 'JWPlatform', video_id)
diff --git a/youtube_dl/extractor/toutv.py b/youtube_dl/extractor/toutv.py
index 26d7709..e59ed26 100644
--- a/youtube_dl/extractor/toutv.py
+++ b/youtube_dl/extractor/toutv.py
@@ -5,7 +5,6 @@ from .common import InfoExtractor
from ..utils import (
int_or_none,
js_to_json,
- ExtractorError,
urlencode_postdata,
extract_attributes,
smuggle_url,
@@ -78,8 +77,10 @@ class TouTvIE(InfoExtractor):
def _real_extract(self, url):
path = self._match_id(url)
metadata = self._download_json('http://ici.tou.tv/presentation/%s' % path, path)
+ # IsDrm does not necessarily mean the video is DRM protected (see
+ # https://github.com/rg3/youtube-dl/issues/13994).
if metadata.get('IsDrm'):
- raise ExtractorError('This video is DRM protected.', expected=True)
+ self.report_warning('This video is probably DRM protected.', path)
video_id = metadata['IdMedia']
details = metadata['Details']
title = details['OriginalTitle']
diff --git a/youtube_dl/extractor/twentymin.py b/youtube_dl/extractor/twentymin.py
index 4fd1aa4..a42977f 100644
--- a/youtube_dl/extractor/twentymin.py
+++ b/youtube_dl/extractor/twentymin.py
@@ -50,7 +50,7 @@ class TwentyMinutenIE(InfoExtractor):
@staticmethod
def _extract_urls(webpage):
return [m.group('url') for m in re.finditer(
- r'<iframe[^>]+src=(["\'])(?P<url>(?:https?://)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1',
+ r'<iframe[^>]+src=(["\'])(?P<url>(?:(?:https?:)?//)?(?:www\.)?20min\.ch/videoplayer/videoplayer.html\?.*?\bvideoId@\d+.*?)\1',
webpage)]
def _real_extract(self, url):
diff --git a/youtube_dl/extractor/twitter.py b/youtube_dl/extractor/twitter.py
index 37e3bc4..6eaf360 100644
--- a/youtube_dl/extractor/twitter.py
+++ b/youtube_dl/extractor/twitter.py
@@ -7,20 +7,38 @@ from .common import InfoExtractor
from ..compat import compat_urlparse
from ..utils import (
determine_ext,
+ dict_get,
+ ExtractorError,
float_or_none,
- xpath_text,
- remove_end,
int_or_none,
- ExtractorError,
+ remove_end,
+ try_get,
+ xpath_text,
)
from .periscope import PeriscopeIE
class TwitterBaseIE(InfoExtractor):
- def _get_vmap_video_url(self, vmap_url, video_id):
+ def _extract_formats_from_vmap_url(self, vmap_url, video_id):
vmap_data = self._download_xml(vmap_url, video_id)
- return xpath_text(vmap_data, './/MediaFile').strip()
+ video_url = xpath_text(vmap_data, './/MediaFile').strip()
+ if determine_ext(video_url) == 'm3u8':
+ return self._extract_m3u8_formats(
+ video_url, video_id, ext='mp4', m3u8_id='hls',
+ entry_protocol='m3u8_native')
+ return [{
+ 'url': video_url,
+ }]
+
+ @staticmethod
+ def _search_dimensions_in_video_url(a_format, video_url):
+ m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url)
+ if m:
+ a_format.update({
+ 'width': int(m.group('width')),
+ 'height': int(m.group('height')),
+ })
class TwitterCardIE(TwitterBaseIE):
@@ -36,7 +54,8 @@ class TwitterCardIE(TwitterBaseIE):
'title': 'Twitter Card',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 30.033,
- }
+ },
+ 'skip': 'Video gone',
},
{
'url': 'https://twitter.com/i/cards/tfw/v1/623160978427936768',
@@ -48,6 +67,7 @@ class TwitterCardIE(TwitterBaseIE):
'thumbnail': r're:^https?://.*\.jpg',
'duration': 80.155,
},
+ 'skip': 'Video gone',
},
{
'url': 'https://twitter.com/i/cards/tfw/v1/654001591733886977',
@@ -65,7 +85,7 @@ class TwitterCardIE(TwitterBaseIE):
},
{
'url': 'https://twitter.com/i/cards/tfw/v1/665289828897005568',
- 'md5': 'ab2745d0b0ce53319a534fccaa986439',
+ 'md5': '6dabeaca9e68cbb71c99c322a4b42a11',
'info_dict': {
'id': 'iBb2x00UVlv',
'ext': 'mp4',
@@ -73,16 +93,17 @@ class TwitterCardIE(TwitterBaseIE):
'uploader_id': '1189339351084113920',
'uploader': 'ArsenalTerje',
'title': 'Vine by ArsenalTerje',
+ 'timestamp': 1447451307,
},
'add_ie': ['Vine'],
}, {
'url': 'https://twitter.com/i/videos/tweet/705235433198714880',
- 'md5': '3846d0a07109b5ab622425449b59049d',
+ 'md5': '884812a2adc8aaf6fe52b15ccbfa3b88',
'info_dict': {
'id': '705235433198714880',
'ext': 'mp4',
'title': 'Twitter web player',
- 'thumbnail': r're:^https?://.*\.jpg',
+ 'thumbnail': r're:^https?://.*',
},
}, {
'url': 'https://twitter.com/i/videos/752274308186120192',
@@ -90,6 +111,59 @@ class TwitterCardIE(TwitterBaseIE):
},
]
+ def _parse_media_info(self, media_info, video_id):
+ formats = []
+ for media_variant in media_info.get('variants', []):
+ media_url = media_variant['url']
+ if media_url.endswith('.m3u8'):
+ formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls'))
+ elif media_url.endswith('.mpd'):
+ formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash'))
+ else:
+ vbr = int_or_none(dict_get(media_variant, ('bitRate', 'bitrate')), scale=1000)
+ a_format = {
+ 'url': media_url,
+ 'format_id': 'http-%d' % vbr if vbr else 'http',
+ 'vbr': vbr,
+ }
+ # Reported bitRate may be zero
+ if not a_format['vbr']:
+ del a_format['vbr']
+
+ self._search_dimensions_in_video_url(a_format, media_url)
+
+ formats.append(a_format)
+ return formats
+
+ def _extract_mobile_formats(self, username, video_id):
+ webpage = self._download_webpage(
+ 'https://mobile.twitter.com/%s/status/%s' % (username, video_id),
+ video_id, 'Downloading mobile webpage',
+ headers={
+ # A recent mobile UA is necessary for `gt` cookie
+ 'User-Agent': 'Mozilla/5.0 (Android 6.0.1; Mobile; rv:54.0) Gecko/54.0 Firefox/54.0',
+ })
+ main_script_url = self._html_search_regex(
+ r'<script[^>]+src="([^"]+main\.[^"]+)"', webpage, 'main script URL')
+ main_script = self._download_webpage(
+ main_script_url, video_id, 'Downloading main script')
+ bearer_token = self._search_regex(
+ r'BEARER_TOKEN\s*:\s*"([^"]+)"',
+ main_script, 'bearer token')
+ guest_token = self._search_regex(
+ r'document\.cookie\s*=\s*decodeURIComponent\("gt=(\d+)',
+ webpage, 'guest token')
+ api_data = self._download_json(
+ 'https://api.twitter.com/2/timeline/conversation/%s.json' % video_id,
+ video_id, 'Downloading mobile API data',
+ headers={
+ 'Authorization': 'Bearer ' + bearer_token,
+ 'x-guest-token': guest_token,
+ })
+ media_info = try_get(api_data, lambda o: o['globalObjects']['tweets'][video_id]
+ ['extended_entities']['media'][0]['video_info']) or {}
+ return self._parse_media_info(media_info, video_id)
+
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -117,14 +191,6 @@ class TwitterCardIE(TwitterBaseIE):
if periscope_url:
return self.url_result(periscope_url, PeriscopeIE.ie_key())
- def _search_dimensions_in_video_url(a_format, video_url):
- m = re.search(r'/(?P<width>\d+)x(?P<height>\d+)/', video_url)
- if m:
- a_format.update({
- 'width': int(m.group('width')),
- 'height': int(m.group('height')),
- })
-
video_url = config.get('video_url') or config.get('playlist', [{}])[0].get('source')
if video_url:
@@ -135,15 +201,14 @@ class TwitterCardIE(TwitterBaseIE):
'url': video_url,
}
- _search_dimensions_in_video_url(f, video_url)
+ self._search_dimensions_in_video_url(f, video_url)
formats.append(f)
vmap_url = config.get('vmapUrl') or config.get('vmap_url')
if vmap_url:
- formats.append({
- 'url': self._get_vmap_video_url(vmap_url, video_id),
- })
+ formats.extend(
+ self._extract_formats_from_vmap_url(vmap_url, video_id))
media_info = None
@@ -152,29 +217,14 @@ class TwitterCardIE(TwitterBaseIE):
media_info = entity['mediaInfo']
if media_info:
- for media_variant in media_info['variants']:
- media_url = media_variant['url']
- if media_url.endswith('.m3u8'):
- formats.extend(self._extract_m3u8_formats(media_url, video_id, ext='mp4', m3u8_id='hls'))
- elif media_url.endswith('.mpd'):
- formats.extend(self._extract_mpd_formats(media_url, video_id, mpd_id='dash'))
- else:
- vbr = int_or_none(media_variant.get('bitRate'), scale=1000)
- a_format = {
- 'url': media_url,
- 'format_id': 'http-%d' % vbr if vbr else 'http',
- 'vbr': vbr,
- }
- # Reported bitRate may be zero
- if not a_format['vbr']:
- del a_format['vbr']
-
- _search_dimensions_in_video_url(a_format, media_url)
-
- formats.append(a_format)
-
+ formats.extend(self._parse_media_info(media_info, video_id))
duration = float_or_none(media_info.get('duration', {}).get('nanos'), scale=1e9)
+ username = config.get('user', {}).get('screen_name')
+ if username:
+ formats.extend(self._extract_mobile_formats(username, video_id))
+
+ self._remove_duplicate_formats(formats)
self._sort_formats(formats)
title = self._search_regex(r'<title>([^<]+)</title>', webpage, 'title')
@@ -255,10 +305,10 @@ class TwitterIE(InfoExtractor):
'info_dict': {
'id': '700207533655363584',
'ext': 'mp4',
- 'title': 'JG - BEAT PROD: @suhmeduh #Damndaniel',
- 'description': 'JG on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"',
+ 'title': 'Donte - BEAT PROD: @suhmeduh #Damndaniel',
+ 'description': 'Donte on Twitter: "BEAT PROD: @suhmeduh https://t.co/HBrQ4AfpvZ #Damndaniel https://t.co/byBooq2ejZ"',
'thumbnail': r're:^https?://.*\.jpg',
- 'uploader': 'JG',
+ 'uploader': 'Donte',
'uploader_id': 'jaydingeer',
},
'params': {
@@ -270,9 +320,11 @@ class TwitterIE(InfoExtractor):
'info_dict': {
'id': 'MIOxnrUteUd',
'ext': 'mp4',
- 'title': 'Dr.Pepperの飲み方 #japanese #バカ #ドクペ #電動ガン',
- 'uploader': 'TAKUMA',
- 'uploader_id': '1004126642786242560',
+ 'title': 'FilmDrunk - Vine of the day',
+ 'description': 'FilmDrunk on Twitter: "Vine of the day https://t.co/xmTvRdqxWf"',
+ 'uploader': 'FilmDrunk',
+ 'uploader_id': 'Filmdrunk',
+ 'timestamp': 1402826626,
'upload_date': '20140615',
},
'add_ie': ['Vine'],
@@ -294,13 +346,28 @@ class TwitterIE(InfoExtractor):
'info_dict': {
'id': '1zqKVVlkqLaKB',
'ext': 'mp4',
- 'title': 'Sgt Kerry Schmidt - Ontario Provincial Police - Road rage, mischief, assault, rollover and fire in one occurrence',
+ 'title': 'Sgt Kerry Schmidt - LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence',
+ 'description': 'Sgt Kerry Schmidt on Twitter: "LIVE on #Periscope: Road rage, mischief, assault, rollover and fire in one occurrence https://t.co/EKrVgIXF3s"',
'upload_date': '20160923',
'uploader_id': 'OPP_HSD',
- 'uploader': 'Sgt Kerry Schmidt - Ontario Provincial Police',
+ 'uploader': 'Sgt Kerry Schmidt',
'timestamp': 1474613214,
},
'add_ie': ['Periscope'],
+ }, {
+ # has mp4 formats via mobile API
+ 'url': 'https://twitter.com/news_al3alm/status/852138619213144067',
+ 'info_dict': {
+ 'id': '852138619213144067',
+ 'ext': 'mp4',
+ 'title': 'عالم الأخبار - كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة',
+ 'description': 'عالم الأخبار on Twitter: "كلمة تاريخية بجلسة الجناسي التاريخية.. النائب خالد مؤنس العتيبي للمعارضين : اتقوا الله .. الظلم ظلمات يوم القيامة https://t.co/xg6OhpyKfN"',
+ 'uploader': 'عالم الأخبار',
+ 'uploader_id': 'news_al3alm',
+ },
+ 'params': {
+ 'format': 'best[format_id^=http-]',
+ },
}]
def _real_extract(self, url):
@@ -393,7 +460,7 @@ class TwitterAmplifyIE(TwitterBaseIE):
vmap_url = self._html_search_meta(
'twitter:amplify:vmap', webpage, 'vmap url')
- video_url = self._get_vmap_video_url(vmap_url, video_id)
+ formats = self._extract_formats_from_vmap_url(vmap_url, video_id)
thumbnails = []
thumbnail = self._html_search_meta(
@@ -415,11 +482,10 @@ class TwitterAmplifyIE(TwitterBaseIE):
})
video_w, video_h = _find_dimension('player')
- formats = [{
- 'url': video_url,
+ formats[0].update({
'width': video_w,
'height': video_h,
- }]
+ })
return {
'id': video_id,
diff --git a/youtube_dl/extractor/udemy.py b/youtube_dl/extractor/udemy.py
index 160be1b..207c4a6 100644
--- a/youtube_dl/extractor/udemy.py
+++ b/youtube_dl/extractor/udemy.py
@@ -15,6 +15,7 @@ from ..utils import (
ExtractorError,
float_or_none,
int_or_none,
+ js_to_json,
sanitized_Request,
unescapeHTML,
urlencode_postdata,
@@ -73,7 +74,7 @@ class UdemyIE(InfoExtractor):
return compat_urlparse.urljoin(base_url, url) if not url.startswith('http') else url
checkout_url = unescapeHTML(self._search_regex(
- r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/payment/checkout/.+?)\1',
+ r'href=(["\'])(?P<url>(?:https?://(?:www\.)?udemy\.com)?/(?:payment|cart)/checkout/.+?)\1',
webpage, 'checkout url', group='url', default=None))
if checkout_url:
raise ExtractorError(
@@ -268,6 +269,25 @@ class UdemyIE(InfoExtractor):
f = add_output_format_meta(f, format_id)
formats.append(f)
+ def extract_subtitles(track_list):
+ if not isinstance(track_list, list):
+ return
+ for track in track_list:
+ if not isinstance(track, dict):
+ continue
+ if track.get('kind') != 'captions':
+ continue
+ src = track.get('src')
+ if not src or not isinstance(src, compat_str):
+ continue
+ lang = track.get('language') or track.get(
+ 'srclang') or track.get('label')
+ sub_dict = automatic_captions if track.get(
+ 'autogenerated') is True else subtitles
+ sub_dict.setdefault(lang, []).append({
+ 'url': src,
+ })
+
download_urls = asset.get('download_urls')
if isinstance(download_urls, dict):
extract_formats(download_urls.get('Video'))
@@ -315,23 +335,16 @@ class UdemyIE(InfoExtractor):
extract_formats(data.get('sources'))
if not duration:
duration = int_or_none(data.get('duration'))
- tracks = data.get('tracks')
- if isinstance(tracks, list):
- for track in tracks:
- if not isinstance(track, dict):
- continue
- if track.get('kind') != 'captions':
- continue
- src = track.get('src')
- if not src or not isinstance(src, compat_str):
- continue
- lang = track.get('language') or track.get(
- 'srclang') or track.get('label')
- sub_dict = automatic_captions if track.get(
- 'autogenerated') is True else subtitles
- sub_dict.setdefault(lang, []).append({
- 'url': src,
- })
+ extract_subtitles(data.get('tracks'))
+
+ if not subtitles and not automatic_captions:
+ text_tracks = self._parse_json(
+ self._search_regex(
+ r'text-tracks=(["\'])(?P<data>\[.+?\])\1', view_html,
+ 'text tracks', default='{}', group='data'), video_id,
+ transform_source=lambda s: js_to_json(unescapeHTML(s)),
+ fatal=False)
+ extract_subtitles(text_tracks)
self._sort_formats(formats, field_preference=('height', 'width', 'tbr', 'format_id'))
diff --git a/youtube_dl/extractor/veoh.py b/youtube_dl/extractor/veoh.py
index 0f5d687..b20dddc 100644
--- a/youtube_dl/extractor/veoh.py
+++ b/youtube_dl/extractor/veoh.py
@@ -12,47 +12,46 @@ from ..utils import (
class VeohIE(InfoExtractor):
- _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|yapi-)[\da-zA-Z]+)'
+ _VALID_URL = r'https?://(?:www\.)?veoh\.com/(?:watch|iphone/#_Watch)/(?P<id>(?:v|e|yapi-)[\da-zA-Z]+)'
- _TESTS = [
- {
- 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3',
- 'md5': '620e68e6a3cff80086df3348426c9ca3',
- 'info_dict': {
- 'id': '56314296',
- 'ext': 'mp4',
- 'title': 'Straight Backs Are Stronger',
- 'uploader': 'LUMOback',
- 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ',
- },
+ _TESTS = [{
+ 'url': 'http://www.veoh.com/watch/v56314296nk7Zdmz3',
+ 'md5': '620e68e6a3cff80086df3348426c9ca3',
+ 'info_dict': {
+ 'id': '56314296',
+ 'ext': 'mp4',
+ 'title': 'Straight Backs Are Stronger',
+ 'uploader': 'LUMOback',
+ 'description': 'At LUMOback, we believe straight backs are stronger. The LUMOback Posture & Movement Sensor: It gently vibrates when you slouch, inspiring improved posture and mobility. Use the app to track your data and improve your posture over time. ',
},
- {
- 'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage',
- 'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa',
- 'info_dict': {
- 'id': '27701988',
- 'ext': 'mp4',
- 'title': 'Chile workers cover up to avoid skin damage',
- 'description': 'md5:2bd151625a60a32822873efc246ba20d',
- 'uploader': 'afp-news',
- 'duration': 123,
- },
- 'skip': 'This video has been deleted.',
+ }, {
+ 'url': 'http://www.veoh.com/watch/v27701988pbTc4wzN?h1=Chile+workers+cover+up+to+avoid+skin+damage',
+ 'md5': '4a6ff84b87d536a6a71e6aa6c0ad07fa',
+ 'info_dict': {
+ 'id': '27701988',
+ 'ext': 'mp4',
+ 'title': 'Chile workers cover up to avoid skin damage',
+ 'description': 'md5:2bd151625a60a32822873efc246ba20d',
+ 'uploader': 'afp-news',
+ 'duration': 123,
},
- {
- 'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX',
- 'md5': '4fde7b9e33577bab2f2f8f260e30e979',
- 'note': 'Embedded ooyala video',
- 'info_dict': {
- 'id': '69525809',
- 'ext': 'mp4',
- 'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery',
- 'description': 'md5:f5a11c51f8fb51d2315bca0937526891',
- 'uploader': 'newsy-videos',
- },
- 'skip': 'This video has been deleted.',
+ 'skip': 'This video has been deleted.',
+ }, {
+ 'url': 'http://www.veoh.com/watch/v69525809F6Nc4frX',
+ 'md5': '4fde7b9e33577bab2f2f8f260e30e979',
+ 'note': 'Embedded ooyala video',
+ 'info_dict': {
+ 'id': '69525809',
+ 'ext': 'mp4',
+ 'title': 'Doctors Alter Plan For Preteen\'s Weight Loss Surgery',
+ 'description': 'md5:f5a11c51f8fb51d2315bca0937526891',
+ 'uploader': 'newsy-videos',
},
- ]
+ 'skip': 'This video has been deleted.',
+ }, {
+ 'url': 'http://www.veoh.com/watch/e152215AJxZktGS',
+ 'only_matching': True,
+ }]
def _extract_formats(self, source):
formats = []
diff --git a/youtube_dl/extractor/vh1.py b/youtube_dl/extractor/vh1.py
index 6be3774..570fa45 100644
--- a/youtube_dl/extractor/vh1.py
+++ b/youtube_dl/extractor/vh1.py
@@ -121,7 +121,11 @@ class VH1IE(MTVIE):
idoc = self._download_xml(
doc_url, video_id,
'Downloading info', transform_source=fix_xml_ampersands)
- return self.playlist_result(
- [self._get_video_info(item) for item in idoc.findall('.//item')],
- playlist_id=video_id,
- )
+
+ entries = []
+ for item in idoc.findall('.//item'):
+ info = self._get_video_info(item)
+ if info:
+ entries.append(info)
+
+ return self.playlist_result(entries, playlist_id=video_id)
diff --git a/youtube_dl/extractor/vice.py b/youtube_dl/extractor/vice.py
index 54e207b..b8b8bf9 100644
--- a/youtube_dl/extractor/vice.py
+++ b/youtube_dl/extractor/vice.py
@@ -7,6 +7,7 @@ import hashlib
import json
from .adobepass import AdobePassIE
+from .youtube import YoutubeIE
from .common import InfoExtractor
from ..compat import compat_HTTPError
from ..utils import (
@@ -261,11 +262,9 @@ class ViceArticleIE(InfoExtractor):
if embed_code:
return _url_res('ooyala:%s' % embed_code, 'Ooyala')
- youtube_url = self._html_search_regex(
- r'<iframe[^>]+src="(.*youtube\.com/.*)"',
- body, 'YouTube URL', default=None)
+ youtube_url = YoutubeIE._extract_url(body)
if youtube_url:
- return _url_res(youtube_url, 'Youtube')
+ return _url_res(youtube_url, YoutubeIE.ie_key())
video_url = self._html_search_regex(
r'data-video-url="([^"]+)"',
diff --git a/youtube_dl/extractor/vidio.py b/youtube_dl/extractor/vidio.py
index 701bb1d..01da32f 100644
--- a/youtube_dl/extractor/vidio.py
+++ b/youtube_dl/extractor/vidio.py
@@ -56,7 +56,8 @@ class VidioIE(InfoExtractor):
self._sort_formats(formats)
duration = int_or_none(duration or self._search_regex(
- r'data-video-duration=(["\'])(?P<duartion>\d+)\1', webpage, 'duration'))
+ r'data-video-duration=(["\'])(?P<duration>\d+)\1', webpage,
+ 'duration', fatal=False, group='duration'))
thumbnail = thumbnail or self._og_search_thumbnail(webpage)
like_count = int_or_none(self._search_regex(
diff --git a/youtube_dl/extractor/vidme.py b/youtube_dl/extractor/vidme.py
index e9ff336..59adb23 100644
--- a/youtube_dl/extractor/vidme.py
+++ b/youtube_dl/extractor/vidme.py
@@ -3,7 +3,10 @@ from __future__ import unicode_literals
import itertools
from .common import InfoExtractor
-from ..compat import compat_HTTPError
+from ..compat import (
+ compat_HTTPError,
+ compat_str,
+)
from ..utils import (
ExtractorError,
int_or_none,
@@ -161,13 +164,28 @@ class VidmeIE(InfoExtractor):
'or for violating the terms of use.',
expected=True)
- formats = [{
- 'format_id': f.get('type'),
- 'url': f['uri'],
- 'width': int_or_none(f.get('width')),
- 'height': int_or_none(f.get('height')),
- 'preference': 0 if f.get('type', '').endswith('clip') else 1,
- } for f in video.get('formats', []) if f.get('uri')]
+ formats = []
+ for f in video.get('formats', []):
+ format_url = f.get('uri')
+ if not format_url or not isinstance(format_url, compat_str):
+ continue
+ format_type = f.get('type')
+ if format_type == 'dash':
+ formats.extend(self._extract_mpd_formats(
+ format_url, video_id, mpd_id='dash', fatal=False))
+ elif format_type == 'hls':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ else:
+ formats.append({
+ 'format_id': f.get('type'),
+ 'url': format_url,
+ 'width': int_or_none(f.get('width')),
+ 'height': int_or_none(f.get('height')),
+ 'preference': 0 if f.get('type', '').endswith(
+ 'clip') else 1,
+ })
if not formats and video.get('complete_url'):
formats.append({
@@ -245,29 +263,35 @@ class VidmeListBaseIE(InfoExtractor):
class VidmeUserIE(VidmeListBaseIE):
IE_NAME = 'vidme:user'
- _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{6,})(?!/likes)(?:[^\da-zA-Z]|$)'
+ _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z_-]{6,})(?!/likes)(?:[^\da-zA-Z_-]|$)'
_API_ITEM = 'list'
_TITLE = 'Videos'
- _TEST = {
- 'url': 'https://vid.me/EFARCHIVE',
+ _TESTS = [{
+ 'url': 'https://vid.me/MasakoX',
'info_dict': {
- 'id': '3834632',
- 'title': 'EFARCHIVE - %s' % _TITLE,
+ 'id': '16112341',
+ 'title': 'MasakoX - %s' % _TITLE,
},
- 'playlist_mincount': 238,
- }
+ 'playlist_mincount': 191,
+ }, {
+ 'url': 'https://vid.me/unsQuare_netWork',
+ 'only_matching': True,
+ }]
class VidmeUserLikesIE(VidmeListBaseIE):
IE_NAME = 'vidme:user:likes'
- _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z]{6,})/likes'
+ _VALID_URL = r'https?://vid\.me/(?:e/)?(?P<id>[\da-zA-Z_-]{6,})/likes'
_API_ITEM = 'likes'
_TITLE = 'Likes'
- _TEST = {
+ _TESTS = [{
'url': 'https://vid.me/ErinAlexis/likes',
'info_dict': {
'id': '6483530',
'title': 'ErinAlexis - %s' % _TITLE,
},
'playlist_mincount': 415,
- }
+ }, {
+ 'url': 'https://vid.me/Kaleidoscope-Ish/likes',
+ 'only_matching': True,
+ }]
diff --git a/youtube_dl/extractor/vier.py b/youtube_dl/extractor/vier.py
index 3e67eb8..dbd5ba9 100644
--- a/youtube_dl/extractor/vier.py
+++ b/youtube_dl/extractor/vier.py
@@ -15,7 +15,21 @@ from ..utils import (
class VierIE(InfoExtractor):
IE_NAME = 'vier'
IE_DESC = 'vier.be and vijf.be'
- _VALID_URL = r'https?://(?:www\.)?(?P<site>vier|vijf)\.be/(?:[^/]+/videos/(?P<display_id>[^/]+)(?:/(?P<id>\d+))?|video/v3/embed/(?P<embed_id>\d+))'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:www\.)?(?P<site>vier|vijf)\.be/
+ (?:
+ (?:
+ [^/]+/videos|
+ video(?:/[^/]+)*
+ )/
+ (?P<display_id>[^/]+)(?:/(?P<id>\d+))?|
+ (?:
+ video/v3/embed|
+ embed/video/public
+ )/(?P<embed_id>\d+)
+ )
+ '''
_NETRC_MACHINE = 'vier'
_TESTS = [{
'url': 'http://www.vier.be/planb/videos/het-wordt-warm-de-moestuin/16129',
@@ -83,6 +97,15 @@ class VierIE(InfoExtractor):
}, {
'url': 'http://www.vier.be/video/v3/embed/16129',
'only_matching': True,
+ }, {
+ 'url': 'https://www.vijf.be/embed/video/public/4093',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.vier.be/video/blockbusters/in-juli-en-augustus-summer-classics',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.vier.be/video/achter-de-rug/2017/achter-de-rug-seizoen-1-aflevering-6',
+ 'only_matching': True,
}]
def _real_initialize(self):
@@ -133,14 +156,20 @@ class VierIE(InfoExtractor):
video_id = self._search_regex(
[r'data-nid="(\d+)"', r'"nid"\s*:\s*"(\d+)"'],
webpage, 'video id', default=video_id or display_id)
- application = self._search_regex(
- [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'],
- webpage, 'application', default=site + '_vod')
- filename = self._search_regex(
- [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'],
- webpage, 'filename')
-
- playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename)
+
+ playlist_url = self._search_regex(
+ r'data-file=(["\'])(?P<url>(?:https?:)?//[^/]+/.+?\.m3u8.*?)\1',
+ webpage, 'm3u8 url', default=None, group='url')
+
+ if not playlist_url:
+ application = self._search_regex(
+ [r'data-application="([^"]+)"', r'"application"\s*:\s*"([^"]+)"'],
+ webpage, 'application', default=site + '_vod')
+ filename = self._search_regex(
+ [r'data-filename="([^"]+)"', r'"filename"\s*:\s*"([^"]+)"'],
+ webpage, 'filename')
+ playlist_url = 'http://vod.streamcloud.be/%s/_definst_/mp4:%s.mp4/playlist.m3u8' % (application, filename)
+
formats = self._extract_wowza_formats(
playlist_url, display_id, skip_protocols=['dash'])
self._sort_formats(formats)
diff --git a/youtube_dl/extractor/viidea.py b/youtube_dl/extractor/viidea.py
index 4adcd18..a0abbae 100644
--- a/youtube_dl/extractor/viidea.py
+++ b/youtube_dl/extractor/viidea.py
@@ -4,12 +4,14 @@ import re
from .common import InfoExtractor
from ..compat import (
- compat_urlparse,
+ compat_HTTPError,
compat_str,
+ compat_urlparse,
)
from ..utils import (
- parse_duration,
+ ExtractorError,
js_to_json,
+ parse_duration,
parse_iso8601,
)
@@ -128,9 +130,16 @@ class ViideaIE(InfoExtractor):
base_url = self._proto_relative_url(cfg['livepipe'], 'http:')
- lecture_data = self._download_json(
- '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id),
- lecture_id)['lecture'][0]
+ try:
+ lecture_data = self._download_json(
+ '%s/site/api/lecture/%s?format=json' % (base_url, lecture_id),
+ lecture_id)['lecture'][0]
+ except ExtractorError as e:
+ if isinstance(e.cause, compat_HTTPError) and e.cause.code == 403:
+ msg = self._parse_json(
+ e.cause.read().decode('utf-8'), lecture_id)
+ raise ExtractorError(msg['detail'], expected=True)
+ raise
lecture_info = {
'id': lecture_id,
diff --git a/youtube_dl/extractor/vine.py b/youtube_dl/extractor/vine.py
index 4957a07..46950d3 100644
--- a/youtube_dl/extractor/vine.py
+++ b/youtube_dl/extractor/vine.py
@@ -92,10 +92,12 @@ class VineIE(InfoExtractor):
username = data.get('username')
+ alt_title = 'Vine by %s' % username if username else None
+
return {
'id': video_id,
- 'title': data.get('description'),
- 'alt_title': 'Vine by %s' % username if username else None,
+ 'title': data.get('description') or alt_title or 'Vine video',
+ 'alt_title': alt_title,
'thumbnail': data.get('thumbnailUrl'),
'timestamp': unified_timestamp(data.get('created')),
'uploader': username,
diff --git a/youtube_dl/extractor/vk.py b/youtube_dl/extractor/vk.py
index dc2719c..105e172 100644
--- a/youtube_dl/extractor/vk.py
+++ b/youtube_dl/extractor/vk.py
@@ -25,6 +25,7 @@ from ..utils import (
from .dailymotion import DailymotionIE
from .pladform import PladformIE
from .vimeo import VimeoIE
+from .youtube import YoutubeIE
class VKBaseIE(InfoExtractor):
@@ -345,11 +346,9 @@ class VKIE(VKBaseIE):
if re.search(error_re, info_page):
raise ExtractorError(error_msg % video_id, expected=True)
- youtube_url = self._search_regex(
- r'<iframe[^>]+src="((?:https?:)?//www.youtube.com/embed/[^"]+)"',
- info_page, 'youtube iframe', default=None)
+ youtube_url = YoutubeIE._extract_url(info_page)
if youtube_url:
- return self.url_result(youtube_url, 'Youtube')
+ return self.url_result(youtube_url, ie=YoutubeIE.ie_key())
vimeo_url = VimeoIE._extract_url(url, info_page)
if vimeo_url is not None:
diff --git a/youtube_dl/extractor/vlive.py b/youtube_dl/extractor/vlive.py
index e589406..64d0224 100644
--- a/youtube_dl/extractor/vlive.py
+++ b/youtube_dl/extractor/vlive.py
@@ -49,6 +49,10 @@ class VLiveIE(InfoExtractor):
},
}]
+ @classmethod
+ def suitable(cls, url):
+ return False if VLivePlaylistIE.suitable(url) else super(VLiveIE, cls).suitable(url)
+
def _real_extract(self, url):
video_id = self._match_id(url)
@@ -232,7 +236,12 @@ class VLiveChannelIE(InfoExtractor):
query={
'app_id': app_id,
'channelSeq': channel_seq,
- 'maxNumOfRows': 1000,
+ # Large values of maxNumOfRows (~300 or above) may cause
+ # empty responses (see [1]), e.g. this happens for [2] that
+ # has more than 300 videos.
+ # 1. https://github.com/rg3/youtube-dl/issues/13830
+ # 2. http://channels.vlive.tv/EDBF.
+ 'maxNumOfRows': 100,
'_': int(time.time()),
'pageNo': page_num
}
@@ -261,3 +270,54 @@ class VLiveChannelIE(InfoExtractor):
return self.playlist_result(
entries, channel_code, channel_name)
+
+
+class VLivePlaylistIE(InfoExtractor):
+ IE_NAME = 'vlive:playlist'
+ _VALID_URL = r'https?://(?:(?:www|m)\.)?vlive\.tv/video/(?P<video_id>[0-9]+)/playlist/(?P<id>[0-9]+)'
+ _TEST = {
+ 'url': 'http://www.vlive.tv/video/22867/playlist/22912',
+ 'info_dict': {
+ 'id': '22912',
+ 'title': 'Valentine Day Message from TWICE'
+ },
+ 'playlist_mincount': 9
+ }
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ video_id, playlist_id = mobj.group('video_id', 'id')
+
+ VIDEO_URL_TEMPLATE = 'http://www.vlive.tv/video/%s'
+ if self._downloader.params.get('noplaylist'):
+ self.to_screen(
+ 'Downloading just video %s because of --no-playlist' % video_id)
+ return self.url_result(
+ VIDEO_URL_TEMPLATE % video_id,
+ ie=VLiveIE.ie_key(), video_id=video_id)
+
+ self.to_screen(
+ 'Downloading playlist %s - add --no-playlist to just download video'
+ % playlist_id)
+
+ webpage = self._download_webpage(
+ 'http://www.vlive.tv/video/%s/playlist/%s'
+ % (video_id, playlist_id), playlist_id)
+
+ item_ids = self._parse_json(
+ self._search_regex(
+ r'playlistVideoSeqs\s*=\s*(\[[^]]+\])', webpage,
+ 'playlist video seqs'),
+ playlist_id)
+
+ entries = [
+ self.url_result(
+ VIDEO_URL_TEMPLATE % item_id, ie=VLiveIE.ie_key(),
+ video_id=compat_str(item_id))
+ for item_id in item_ids]
+
+ playlist_name = self._html_search_regex(
+ r'<div[^>]+class="[^"]*multicam_playlist[^>]*>\s*<h3[^>]+>([^<]+)',
+ webpage, 'playlist title', fatal=False)
+
+ return self.playlist_result(entries, playlist_id, playlist_name)
diff --git a/youtube_dl/extractor/voot.py b/youtube_dl/extractor/voot.py
new file mode 100644
index 0000000..5de3deb
--- /dev/null
+++ b/youtube_dl/extractor/voot.py
@@ -0,0 +1,98 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from .kaltura import KalturaIE
+from ..utils import (
+ ExtractorError,
+ int_or_none,
+ try_get,
+ unified_timestamp,
+)
+
+
+class VootIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?voot\.com/(?:[^/]+/)+(?P<id>\d+)'
+ _GEO_COUNTRIES = ['IN']
+ _TESTS = [{
+ 'url': 'https://www.voot.com/shows/ishq-ka-rang-safed/1/360558/is-this-the-end-of-kamini-/441353',
+ 'info_dict': {
+ 'id': '0_8ledb18o',
+ 'ext': 'mp4',
+ 'title': 'Ishq Ka Rang Safed - Season 01 - Episode 340',
+ 'description': 'md5:06291fbbbc4dcbe21235c40c262507c1',
+ 'uploader_id': 'batchUser',
+ 'timestamp': 1472162937,
+ 'upload_date': '20160825',
+ 'duration': 1146,
+ 'series': 'Ishq Ka Rang Safed',
+ 'season_number': 1,
+ 'episode': 'Is this the end of Kamini?',
+ 'episode_number': 340,
+ 'view_count': int,
+ 'like_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['Failed to download m3u8 information'],
+ }, {
+ 'url': 'https://www.voot.com/kids/characters/mighty-cat-masked-niyander-e-/400478/school-bag-disappears/440925',
+ 'only_matching': True,
+ }, {
+ 'url': 'https://www.voot.com/movies/pandavas-5/424627',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ media_info = self._download_json(
+ 'https://wapi.voot.com/ws/ott/getMediaInfo.json', video_id,
+ query={
+ 'platform': 'Web',
+ 'pId': 2,
+ 'mediaId': video_id,
+ })
+
+ status_code = try_get(media_info, lambda x: x['status']['code'], int)
+ if status_code != 0:
+ raise ExtractorError(media_info['status']['message'], expected=True)
+
+ media = media_info['assets']
+
+ entry_id = media['EntryId']
+ title = media['MediaName']
+
+ description, series, season_number, episode, episode_number = [None] * 5
+
+ for meta in try_get(media, lambda x: x['Metas'], list) or []:
+ key, value = meta.get('Key'), meta.get('Value')
+ if not key or not value:
+ continue
+ if key == 'ContentSynopsis':
+ description = value
+ elif key == 'RefSeriesTitle':
+ series = value
+ elif key == 'RefSeriesSeason':
+ season_number = int_or_none(value)
+ elif key == 'EpisodeMainTitle':
+ episode = value
+ elif key == 'EpisodeNo':
+ episode_number = int_or_none(value)
+
+ return {
+ '_type': 'url_transparent',
+ 'url': 'kaltura:1982551:%s' % entry_id,
+ 'ie_key': KalturaIE.ie_key(),
+ 'title': title,
+ 'description': description,
+ 'series': series,
+ 'season_number': season_number,
+ 'episode': episode,
+ 'episode_number': episode_number,
+ 'timestamp': unified_timestamp(media.get('CreationDate')),
+ 'duration': int_or_none(media.get('Duration')),
+ 'view_count': int_or_none(media.get('ViewCounter')),
+ 'like_count': int_or_none(media.get('like_counter')),
+ }
diff --git a/youtube_dl/extractor/vzaar.py b/youtube_dl/extractor/vzaar.py
index b270f08..02fcd52 100644
--- a/youtube_dl/extractor/vzaar.py
+++ b/youtube_dl/extractor/vzaar.py
@@ -1,6 +1,8 @@
# coding: utf-8
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
from ..utils import (
int_or_none,
@@ -28,6 +30,12 @@ class VzaarIE(InfoExtractor):
},
}]
+ @staticmethod
+ def _extract_urls(webpage):
+ return re.findall(
+ r'<iframe[^>]+src=["\']((?:https?:)?//(?:view\.vzaar\.com)/[0-9]+)',
+ webpage)
+
def _real_extract(self, url):
video_id = self._match_id(url)
video_data = self._download_json(
diff --git a/youtube_dl/extractor/watchbox.py b/youtube_dl/extractor/watchbox.py
new file mode 100644
index 0000000..b382338
--- /dev/null
+++ b/youtube_dl/extractor/watchbox.py
@@ -0,0 +1,151 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ int_or_none,
+ js_to_json,
+ strip_or_none,
+ try_get,
+ unified_timestamp,
+)
+
+
+class WatchBoxIE(InfoExtractor):
+ _VALID_URL = r'https?://(?:www\.)?watchbox\.de/(?P<kind>serien|filme)/(?:[^/]+/)*[^/]+-(?P<id>\d+)'
+ _TESTS = [{
+ # film
+ 'url': 'https://www.watchbox.de/filme/free-jimmy-12325.html',
+ 'info_dict': {
+ 'id': '341368',
+ 'ext': 'mp4',
+ 'title': 'Free Jimmy',
+ 'description': 'md5:bcd8bafbbf9dc0ef98063d344d7cc5f6',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 4890,
+ 'age_limit': 16,
+ 'release_year': 2009,
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['Failed to download m3u8 information'],
+ }, {
+ # episode
+ 'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-1/date-in-der-hoelle-328286.html',
+ 'info_dict': {
+ 'id': '328286',
+ 'ext': 'mp4',
+ 'title': 'S01 E01 - Date in der Hölle',
+ 'description': 'md5:2f31c74a8186899f33cb5114491dae2b',
+ 'thumbnail': r're:^https?://.*\.jpg$',
+ 'duration': 1291,
+ 'age_limit': 12,
+ 'release_year': 2010,
+ 'series': 'Ugly Americans',
+ 'season_number': 1,
+ 'episode': 'Date in der Hölle',
+ 'episode_number': 1,
+ },
+ 'params': {
+ 'format': 'bestvideo',
+ 'skip_download': True,
+ },
+ 'expected_warnings': ['Failed to download m3u8 information'],
+ }, {
+ 'url': 'https://www.watchbox.de/serien/ugly-americans-12231/staffel-2/der-ring-des-powers-328270',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ mobj = re.match(self._VALID_URL, url)
+ kind, video_id = mobj.group('kind', 'id')
+
+ webpage = self._download_webpage(url, video_id)
+
+ source = self._parse_json(
+ self._search_regex(
+ r'(?s)source\s*:\s*({.+?})\s*,\s*\n', webpage, 'source',
+ default='{}'),
+ video_id, transform_source=js_to_json, fatal=False) or {}
+
+ video_id = compat_str(source.get('videoId') or video_id)
+
+ devapi = self._download_json(
+ 'http://api.watchbox.de/devapi/id/%s' % video_id, video_id, query={
+ 'format': 'json',
+ 'apikey': 'hbbtv',
+ }, fatal=False)
+
+ item = try_get(devapi, lambda x: x['items'][0], dict) or {}
+
+ title = item.get('title') or try_get(
+ item, lambda x: x['movie']['headline_movie'],
+ compat_str) or source['title']
+
+ formats = []
+ hls_url = item.get('media_videourl_hls') or source.get('hls')
+ if hls_url:
+ formats.extend(self._extract_m3u8_formats(
+ hls_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ dash_url = item.get('media_videourl_wv') or source.get('dash')
+ if dash_url:
+ formats.extend(self._extract_mpd_formats(
+ dash_url, video_id, mpd_id='dash', fatal=False))
+ mp4_url = item.get('media_videourl')
+ if mp4_url:
+ formats.append({
+ 'url': mp4_url,
+ 'format_id': 'mp4',
+ 'width': int_or_none(item.get('width')),
+ 'height': int_or_none(item.get('height')),
+ 'tbr': int_or_none(item.get('bitrate')),
+ })
+ self._sort_formats(formats)
+
+ description = strip_or_none(item.get('descr'))
+ thumbnail = item.get('media_content_thumbnail_large') or source.get('poster') or item.get('media_thumbnail')
+ duration = int_or_none(item.get('media_length') or source.get('length'))
+ timestamp = unified_timestamp(item.get('pubDate'))
+ view_count = int_or_none(item.get('media_views'))
+ age_limit = int_or_none(try_get(item, lambda x: x['movie']['fsk']))
+ release_year = int_or_none(try_get(item, lambda x: x['movie']['rel_year']))
+
+ info = {
+ 'id': video_id,
+ 'title': title,
+ 'description': description,
+ 'thumbnail': thumbnail,
+ 'duration': duration,
+ 'timestamp': timestamp,
+ 'view_count': view_count,
+ 'age_limit': age_limit,
+ 'release_year': release_year,
+ 'formats': formats,
+ }
+
+ if kind.lower() == 'serien':
+ series = try_get(
+ item, lambda x: x['special']['title'],
+ compat_str) or source.get('format')
+ season_number = int_or_none(self._search_regex(
+ r'^S(\d{1,2})\s*E\d{1,2}', title, 'season number',
+ default=None) or self._search_regex(
+ r'/staffel-(\d+)/', url, 'season number', default=None))
+ episode = source.get('title')
+ episode_number = int_or_none(self._search_regex(
+ r'^S\d{1,2}\s*E(\d{1,2})', title, 'episode number',
+ default=None))
+ info.update({
+ 'series': series,
+ 'season_number': season_number,
+ 'episode': episode,
+ 'episode_number': episode_number,
+ })
+
+ return info
diff --git a/youtube_dl/extractor/xhamster.py b/youtube_dl/extractor/xhamster.py
index 6987b2e..c42b59e 100644
--- a/youtube_dl/extractor/xhamster.py
+++ b/youtube_dl/extractor/xhamster.py
@@ -3,6 +3,7 @@ from __future__ import unicode_literals
import re
from .common import InfoExtractor
+from ..compat import compat_str
from ..utils import (
clean_html,
dict_get,
@@ -14,12 +15,21 @@ from ..utils import (
class XHamsterIE(InfoExtractor):
- _VALID_URL = r'(?P<proto>https?)://(?:.+?\.)?xhamster\.com/movies/(?P<id>[0-9]+)/(?P<seo>.*?)\.html(?:\?.*)?'
+ _VALID_URL = r'''(?x)
+ https?://
+ (?:.+?\.)?xhamster\.com/
+ (?:
+ movies/(?P<id>\d+)/(?P<display_id>[^/]*)\.html|
+ videos/(?P<display_id_2>[^/]*)-(?P<id_2>\d+)
+ )
+ '''
+
_TESTS = [{
'url': 'http://xhamster.com/movies/1509445/femaleagent_shy_beauty_takes_the_bait.html',
'md5': '8281348b8d3c53d39fffb377d24eac4e',
'info_dict': {
'id': '1509445',
+ 'display_id': 'femaleagent_shy_beauty_takes_the_bait',
'ext': 'mp4',
'title': 'FemaleAgent Shy beauty takes the bait',
'upload_date': '20121014',
@@ -32,6 +42,7 @@ class XHamsterIE(InfoExtractor):
'url': 'http://xhamster.com/movies/2221348/britney_spears_sexy_booty.html?hd',
'info_dict': {
'id': '2221348',
+ 'display_id': 'britney_spears_sexy_booty',
'ext': 'mp4',
'title': 'Britney Spears Sexy Booty',
'upload_date': '20130914',
@@ -66,26 +77,18 @@ class XHamsterIE(InfoExtractor):
# This video is visible for marcoalfa123456's friends only
'url': 'https://it.xhamster.com/movies/7263980/la_mia_vicina.html',
'only_matching': True,
+ }, {
+ # new URL schema
+ 'url': 'https://pt.xhamster.com/videos/euro-pedal-pumping-7937821',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- def extract_video_url(webpage, name):
- return self._search_regex(
- [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''',
- r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''',
- r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''],
- webpage, name, group='mp4')
-
- def is_hd(webpage):
- return '<div class=\'icon iconHD\'' in webpage
-
mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id') or mobj.group('id_2')
+ display_id = mobj.group('display_id') or mobj.group('display_id_2')
- video_id = mobj.group('id')
- seo = mobj.group('seo')
- proto = mobj.group('proto')
- mrss_url = '%s://xhamster.com/movies/%s/%s.html' % (proto, video_id, seo)
- webpage = self._download_webpage(mrss_url, video_id)
+ webpage = self._download_webpage(url, video_id)
error = self._html_search_regex(
r'<div[^>]+id=["\']videoClosed["\'][^>]*>(.+?)</div>',
@@ -99,6 +102,39 @@ class XHamsterIE(InfoExtractor):
r'<title[^>]*>(.+?)(?:,\s*[^,]*?\s*Porn\s*[^,]*?:\s*xHamster[^<]*| - xHamster\.com)</title>'],
webpage, 'title')
+ formats = []
+ format_urls = set()
+
+ sources = self._parse_json(
+ self._search_regex(
+ r'sources\s*:\s*({.+?})\s*,?\s*\n', webpage, 'sources',
+ default='{}'),
+ video_id, fatal=False)
+ for format_id, format_url in sources.items():
+ if not isinstance(format_url, compat_str):
+ continue
+ if format_url in format_urls:
+ continue
+ format_urls.add(format_url)
+ formats.append({
+ 'format_id': format_id,
+ 'url': format_url,
+ 'height': int_or_none(self._search_regex(
+ r'^(\d+)[pP]', format_id, 'height', default=None))
+ })
+
+ video_url = self._search_regex(
+ [r'''file\s*:\s*(?P<q>["'])(?P<mp4>.+?)(?P=q)''',
+ r'''<a\s+href=(?P<q>["'])(?P<mp4>.+?)(?P=q)\s+class=["']mp4Thumb''',
+ r'''<video[^>]+file=(?P<q>["'])(?P<mp4>.+?)(?P=q)[^>]*>'''],
+ webpage, 'video url', group='mp4', default=None)
+ if video_url and video_url not in format_urls:
+ formats.append({
+ 'url': video_url,
+ })
+
+ self._sort_formats(formats)
+
# Only a few videos have an description
mobj = re.search(r'<span>Description: </span>([^<]+)', webpage)
description = mobj.group(1) if mobj else None
@@ -117,7 +153,8 @@ class XHamsterIE(InfoExtractor):
webpage, 'thumbnail', fatal=False, group='thumbnail')
duration = parse_duration(self._search_regex(
- r'Runtime:\s*</span>\s*([\d:]+)', webpage,
+ [r'<[^<]+\bitemprop=["\']duration["\'][^<]+\bcontent=["\'](.+?)["\']',
+ r'Runtime:\s*</span>\s*([\d:]+)'], webpage,
'duration', fatal=False))
view_count = int_or_none(self._search_regex(
@@ -132,30 +169,6 @@ class XHamsterIE(InfoExtractor):
age_limit = self._rta_search(webpage)
- hd = is_hd(webpage)
-
- format_id = 'hd' if hd else 'sd'
-
- video_url = extract_video_url(webpage, format_id)
- formats = [{
- 'url': video_url,
- 'format_id': 'hd' if hd else 'sd',
- 'preference': 1,
- }]
-
- if not hd:
- mrss_url = self._search_regex(r'<link rel="canonical" href="([^"]+)', webpage, 'mrss_url')
- webpage = self._download_webpage(mrss_url + '?hd', video_id, note='Downloading HD webpage')
- if is_hd(webpage):
- video_url = extract_video_url(webpage, 'hd')
- formats.append({
- 'url': video_url,
- 'format_id': 'hd',
- 'preference': 2,
- })
-
- self._sort_formats(formats)
-
categories_html = self._search_regex(
r'(?s)<table.+?(<span>Categories:.+?)</table>', webpage,
'categories', default=None)
@@ -164,6 +177,7 @@ class XHamsterIE(InfoExtractor):
return {
'id': video_id,
+ 'display_id': display_id,
'title': title,
'description': description,
'upload_date': upload_date,
diff --git a/youtube_dl/extractor/xuite.py b/youtube_dl/extractor/xuite.py
index e081820..0276c0d 100644
--- a/youtube_dl/extractor/xuite.py
+++ b/youtube_dl/extractor/xuite.py
@@ -1,14 +1,13 @@
# coding: utf-8
from __future__ import unicode_literals
-import base64
-
from .common import InfoExtractor
-from ..compat import compat_urllib_parse_unquote
from ..utils import (
ExtractorError,
+ float_or_none,
+ get_element_by_attribute,
parse_iso8601,
- parse_duration,
+ remove_end,
)
@@ -24,6 +23,7 @@ class XuiteIE(InfoExtractor):
'id': '3860914',
'ext': 'mp3',
'title': '孤單南半球-歐德陽',
+ 'description': '孤單南半球-歐德陽',
'thumbnail': r're:^https?://.*\.jpg$',
'duration': 247.246,
'timestamp': 1314932940,
@@ -44,7 +44,7 @@ class XuiteIE(InfoExtractor):
'duration': 596.458,
'timestamp': 1454242500,
'upload_date': '20160131',
- 'uploader': 'yan12125',
+ 'uploader': '屁姥',
'uploader_id': '12158353',
'categories': ['個人短片'],
'description': 'http://download.blender.org/peach/bigbuckbunny_movies/BigBuckBunny_320x180.mp4',
@@ -72,10 +72,10 @@ class XuiteIE(InfoExtractor):
# from http://forgetfulbc.blogspot.com/2016/06/date.html
'url': 'http://vlog.xuite.net/embed/cE1xbENoLTI3NDQ3MzM2LmZsdg==?ar=0&as=0',
'info_dict': {
- 'id': 'cE1xbENoLTI3NDQ3MzM2LmZsdg==',
+ 'id': '27447336',
'ext': 'mp4',
'title': '男女平權只是口號?專家解釋約會時男生是否該幫女生付錢 (中字)',
- 'description': 'md5:f0abdcb69df300f522a5442ef3146f2a',
+ 'description': 'md5:1223810fa123b179083a3aed53574706',
'timestamp': 1466160960,
'upload_date': '20160617',
'uploader': 'B.C. & Lowy',
@@ -86,29 +86,9 @@ class XuiteIE(InfoExtractor):
'only_matching': True,
}]
- @staticmethod
- def base64_decode_utf8(data):
- return base64.b64decode(data.encode('utf-8')).decode('utf-8')
-
- @staticmethod
- def base64_encode_utf8(data):
- return base64.b64encode(data.encode('utf-8')).decode('utf-8')
-
- def _extract_flv_config(self, encoded_media_id):
- flv_config = self._download_xml(
- 'http://vlog.xuite.net/flash/player?media=%s' % encoded_media_id,
- 'flv config')
- prop_dict = {}
- for prop in flv_config.findall('./property'):
- prop_id = self.base64_decode_utf8(prop.attrib['id'])
- # CDATA may be empty in flv config
- if not prop.text:
- continue
- encoded_content = self.base64_decode_utf8(prop.text)
- prop_dict[prop_id] = compat_urllib_parse_unquote(encoded_content)
- return prop_dict
-
def _real_extract(self, url):
+ # /play/ URLs provide embedded video URL and more metadata
+ url = url.replace('/embed/', '/play/')
video_id = self._match_id(url)
webpage = self._download_webpage(url, video_id)
@@ -121,51 +101,53 @@ class XuiteIE(InfoExtractor):
'%s returned error: %s' % (self.IE_NAME, error_msg),
expected=True)
- encoded_media_id = self._search_regex(
- r'attributes\.name\s*=\s*"([^"]+)"', webpage,
- 'encoded media id', default=None)
- if encoded_media_id is None:
- video_id = self._html_search_regex(
- r'data-mediaid="(\d+)"', webpage, 'media id')
- encoded_media_id = self.base64_encode_utf8(video_id)
- flv_config = self._extract_flv_config(encoded_media_id)
+ media_info = self._parse_json(self._search_regex(
+ r'var\s+mediaInfo\s*=\s*({.*});', webpage, 'media info'), video_id)
- FORMATS = {
- 'audio': 'mp3',
- 'video': 'mp4',
- }
+ video_id = media_info['MEDIA_ID']
formats = []
- for format_tag in ('src', 'hq_src'):
- video_url = flv_config.get(format_tag)
+ for key in ('html5Url', 'html5HQUrl'):
+ video_url = media_info.get(key)
if not video_url:
continue
format_id = self._search_regex(
- r'\bq=(.+?)\b', video_url, 'format id', default=format_tag)
+ r'\bq=(.+?)\b', video_url, 'format id', default=None)
formats.append({
'url': video_url,
- 'ext': FORMATS.get(flv_config['type'], 'mp4'),
+ 'ext': 'mp4' if format_id.isnumeric() else format_id,
'format_id': format_id,
'height': int(format_id) if format_id.isnumeric() else None,
})
self._sort_formats(formats)
- timestamp = flv_config.get('publish_datetime')
+ timestamp = media_info.get('PUBLISH_DATETIME')
if timestamp:
timestamp = parse_iso8601(timestamp + ' +0800', ' ')
- category = flv_config.get('category')
+ category = media_info.get('catName')
categories = [category] if category else []
+ uploader = media_info.get('NICKNAME')
+ uploader_url = None
+
+ author_div = get_element_by_attribute('itemprop', 'author', webpage)
+ if author_div:
+ uploader = uploader or self._html_search_meta('name', author_div)
+ uploader_url = self._html_search_regex(
+ r'<link[^>]+itemprop="url"[^>]+href="([^"]+)"', author_div,
+ 'uploader URL', fatal=False)
+
return {
'id': video_id,
- 'title': flv_config['title'],
- 'description': flv_config.get('description'),
- 'thumbnail': flv_config.get('thumb'),
+ 'title': media_info['TITLE'],
+ 'description': remove_end(media_info.get('metaDesc'), ' (Xuite 影音)'),
+ 'thumbnail': media_info.get('ogImageUrl'),
'timestamp': timestamp,
- 'uploader': flv_config.get('author_name'),
- 'uploader_id': flv_config.get('author_id'),
- 'duration': parse_duration(flv_config.get('duration')),
+ 'uploader': uploader,
+ 'uploader_id': media_info.get('MEMBER_ID'),
+ 'uploader_url': uploader_url,
+ 'duration': float_or_none(media_info.get('MEDIA_DURATION'), 1000000),
'categories': categories,
'formats': formats,
}
diff --git a/youtube_dl/extractor/xxxymovies.py b/youtube_dl/extractor/xxxymovies.py
index 5c8f17e..e34ebe3 100644
--- a/youtube_dl/extractor/xxxymovies.py
+++ b/youtube_dl/extractor/xxxymovies.py
@@ -39,8 +39,8 @@ class XXXYMoviesIE(InfoExtractor):
r"video_url\s*:\s*'([^']+)'", webpage, 'video URL')
title = self._html_search_regex(
- [r'<div class="block_header">\s*<h1>([^<]+)</h1>',
- r'<title>(.*?)\s*-\s*XXXYMovies\.com</title>'],
+ [r'<div[^>]+\bclass="block_header"[^>]*>\s*<h1>([^<]+)<',
+ r'<title>(.*?)\s*-\s*(?:XXXYMovies\.com|XXX\s+Movies)</title>'],
webpage, 'title')
thumbnail = self._search_regex(
diff --git a/youtube_dl/extractor/yam.py b/youtube_dl/extractor/yam.py
deleted file mode 100644
index ef55355..0000000
--- a/youtube_dl/extractor/yam.py
+++ /dev/null
@@ -1,123 +0,0 @@
-# coding: utf-8
-from __future__ import unicode_literals
-
-import re
-
-from .common import InfoExtractor
-from ..compat import compat_urlparse
-from ..utils import (
- float_or_none,
- month_by_abbreviation,
- ExtractorError,
- get_element_by_attribute,
-)
-
-
-class YamIE(InfoExtractor):
- IE_DESC = '蕃薯藤yam天空部落'
- _VALID_URL = r'https?://mymedia\.yam\.com/m/(?P<id>\d+)'
-
- _TESTS = [{
- # An audio hosted on Yam
- 'url': 'http://mymedia.yam.com/m/2283921',
- 'md5': 'c011b8e262a52d5473d9c2e3c9963b9c',
- 'info_dict': {
- 'id': '2283921',
- 'ext': 'mp3',
- 'title': '發現 - 趙薇 京華煙雲主題曲',
- 'description': '發現 - 趙薇 京華煙雲主題曲',
- 'uploader_id': 'princekt',
- 'upload_date': '20080807',
- 'duration': 313.0,
- }
- }, {
- # An external video hosted on YouTube
- 'url': 'http://mymedia.yam.com/m/3599430',
- 'md5': '03127cf10d8f35d120a9e8e52e3b17c6',
- 'info_dict': {
- 'id': 'CNpEoQlrIgA',
- 'ext': 'mp4',
- 'upload_date': '20150306',
- 'uploader': '新莊社大瑜伽社',
- 'description': 'md5:11e2e405311633ace874f2e6226c8b17',
- 'uploader_id': '2323agoy',
- 'title': '20090412陽明山二子坪-1',
- },
- 'skip': 'Video does not exist',
- }, {
- 'url': 'http://mymedia.yam.com/m/3598173',
- 'info_dict': {
- 'id': '3598173',
- 'ext': 'mp4',
- },
- 'skip': 'cause Yam system error',
- }, {
- 'url': 'http://mymedia.yam.com/m/3599437',
- 'info_dict': {
- 'id': '3599437',
- 'ext': 'mp4',
- },
- 'skip': 'invalid YouTube URL',
- }, {
- 'url': 'http://mymedia.yam.com/m/2373534',
- 'md5': '7ff74b91b7a817269d83796f8c5890b1',
- 'info_dict': {
- 'id': '2373534',
- 'ext': 'mp3',
- 'title': '林俊傑&蔡卓妍-小酒窩',
- 'description': 'md5:904003395a0fcce6cfb25028ff468420',
- 'upload_date': '20080928',
- 'uploader_id': 'onliner2',
- }
- }]
-
- def _real_extract(self, url):
- video_id = self._match_id(url)
- page = self._download_webpage(url, video_id)
-
- # Check for errors
- system_msg = self._html_search_regex(
- r'系統訊息(?:<br>|\n|\r)*([^<>]+)<br>', page, 'system message',
- default=None)
- if system_msg:
- raise ExtractorError(system_msg, expected=True)
-
- # Is it hosted externally on YouTube?
- youtube_url = self._html_search_regex(
- r'<embed src="(http://www.youtube.com/[^"]+)"',
- page, 'YouTube url', default=None)
- if youtube_url:
- return self.url_result(youtube_url, 'Youtube')
-
- title = self._html_search_regex(
- r'<h1[^>]+class="heading"[^>]*>\s*(.+)\s*</h1>', page, 'title')
-
- api_page = self._download_webpage(
- 'http://mymedia.yam.com/api/a/?pID=' + video_id, video_id,
- note='Downloading API page')
- api_result_obj = compat_urlparse.parse_qs(api_page)
-
- info_table = get_element_by_attribute('class', 'info', page)
- uploader_id = self._html_search_regex(
- r'<!-- 發表作者 -->:[\n ]+<a href="/([a-z0-9]+)"',
- info_table, 'uploader id', fatal=False)
- mobj = re.search(r'<!-- 發表於 -->(?P<mon>[A-Z][a-z]{2})\s+' +
- r'(?P<day>\d{1,2}), (?P<year>\d{4})', page)
- if mobj:
- upload_date = '%s%02d%02d' % (
- mobj.group('year'),
- month_by_abbreviation(mobj.group('mon')),
- int(mobj.group('day')))
- else:
- upload_date = None
- duration = float_or_none(api_result_obj['totaltime'][0], scale=1000)
-
- return {
- 'id': video_id,
- 'url': api_result_obj['mp3file'][0],
- 'title': title,
- 'description': self._html_search_meta('description', page),
- 'duration': duration,
- 'uploader_id': uploader_id,
- 'upload_date': upload_date,
- }
diff --git a/youtube_dl/extractor/yandexdisk.py b/youtube_dl/extractor/yandexdisk.py
new file mode 100644
index 0000000..e8f6ae1
--- /dev/null
+++ b/youtube_dl/extractor/yandexdisk.py
@@ -0,0 +1,118 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ determine_ext,
+ float_or_none,
+ int_or_none,
+ try_get,
+ urlencode_postdata,
+)
+
+
+class YandexDiskIE(InfoExtractor):
+ _VALID_URL = r'https?://yadi\.sk/[di]/(?P<id>[^/?#&]+)'
+
+ _TESTS = [{
+ 'url': 'https://yadi.sk/i/VdOeDou8eZs6Y',
+ 'md5': '33955d7ae052f15853dc41f35f17581c',
+ 'info_dict': {
+ 'id': 'VdOeDou8eZs6Y',
+ 'ext': 'mp4',
+ 'title': '4.mp4',
+ 'duration': 168.6,
+ 'uploader': 'y.botova',
+ 'uploader_id': '300043621',
+ 'view_count': int,
+ },
+ }, {
+ 'url': 'https://yadi.sk/d/h3WAXvDS3Li3Ce',
+ 'only_matching': True,
+ }]
+
+ def _real_extract(self, url):
+ video_id = self._match_id(url)
+
+ status = self._download_webpage(
+ 'https://disk.yandex.com/auth/status', video_id, query={
+ 'urlOrigin': url,
+ 'source': 'public',
+ 'md5': 'false',
+ })
+
+ sk = self._search_regex(
+ r'(["\'])sk(?:External)?\1\s*:\s*(["\'])(?P<value>(?:(?!\2).)+)\2',
+ status, 'sk', group='value')
+
+ webpage = self._download_webpage(url, video_id)
+
+ models = self._parse_json(
+ self._search_regex(
+ r'<script[^>]+id=["\']models-client[^>]+>\s*(\[.+?\])\s*</script',
+ webpage, 'video JSON'),
+ video_id)
+
+ data = next(
+ model['data'] for model in models
+ if model.get('model') == 'resource')
+
+ video_hash = data['id']
+ title = data['name']
+
+ models = self._download_json(
+ 'https://disk.yandex.com/models/', video_id,
+ data=urlencode_postdata({
+ '_model.0': 'videoInfo',
+ 'id.0': video_hash,
+ '_model.1': 'do-get-resource-url',
+ 'id.1': video_hash,
+ 'version': '13.6',
+ 'sk': sk,
+ }), query={'_m': 'videoInfo'})['models']
+
+ videos = try_get(models, lambda x: x[0]['data']['videos'], list) or []
+ source_url = try_get(
+ models, lambda x: x[1]['data']['file'], compat_str)
+
+ formats = []
+ if source_url:
+ formats.append({
+ 'url': source_url,
+ 'format_id': 'source',
+ 'ext': determine_ext(title, 'mp4'),
+ 'quality': 1,
+ })
+ for video in videos:
+ format_url = video.get('url')
+ if not format_url:
+ continue
+ if determine_ext(format_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ else:
+ formats.append({
+ 'url': format_url,
+ })
+ self._sort_formats(formats)
+
+ duration = float_or_none(try_get(
+ models, lambda x: x[0]['data']['duration']), 1000)
+ uploader = try_get(
+ data, lambda x: x['user']['display_name'], compat_str)
+ uploader_id = try_get(
+ data, lambda x: x['user']['uid'], compat_str)
+ view_count = int_or_none(try_get(
+ data, lambda x: x['meta']['views_counter']))
+
+ return {
+ 'id': video_id,
+ 'title': title,
+ 'duration': duration,
+ 'uploader': uploader,
+ 'uploader_id': uploader_id,
+ 'view_count': view_count,
+ 'formats': formats,
+ }
diff --git a/youtube_dl/extractor/youjizz.py b/youtube_dl/extractor/youjizz.py
index b50f34e..f33fabe 100644
--- a/youtube_dl/extractor/youjizz.py
+++ b/youtube_dl/extractor/youjizz.py
@@ -1,39 +1,95 @@
from __future__ import unicode_literals
+import re
+
from .common import InfoExtractor
+from ..compat import compat_str
+from ..utils import (
+ determine_ext,
+ int_or_none,
+ parse_duration,
+)
class YouJizzIE(InfoExtractor):
- _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]+)?-(?P<id>[0-9]+)\.html(?:$|[?#])'
+ _VALID_URL = r'https?://(?:\w+\.)?youjizz\.com/videos/(?:[^/#?]*-(?P<id>\d+)\.html|embed/(?P<embed_id>\d+))'
_TESTS = [{
'url': 'http://www.youjizz.com/videos/zeichentrick-1-2189178.html',
- 'md5': '78fc1901148284c69af12640e01c6310',
+ 'md5': 'b1e1dfaa8bb9537d8b84eeda9cf4acf4',
'info_dict': {
'id': '2189178',
'ext': 'mp4',
'title': 'Zeichentrick 1',
'age_limit': 18,
+ 'duration': 2874,
}
}, {
'url': 'http://www.youjizz.com/videos/-2189178.html',
'only_matching': True,
+ }, {
+ 'url': 'https://www.youjizz.com/videos/embed/31991001',
+ 'only_matching': True,
}]
def _real_extract(self, url):
- video_id = self._match_id(url)
+ mobj = re.match(self._VALID_URL, url)
+ video_id = mobj.group('id') or mobj.group('embed_id')
+
webpage = self._download_webpage(url, video_id)
- # YouJizz's HTML5 player has invalid HTML
- webpage = webpage.replace('"controls', '" controls')
- age_limit = self._rta_search(webpage)
- video_title = self._html_search_regex(
- r'<title>\s*(.*)\s*</title>', webpage, 'title')
- info_dict = self._parse_html5_media_entries(url, webpage, video_id)[0]
+ title = self._html_search_regex(
+ r'<title>(.+?)</title>', webpage, 'title')
+
+ formats = []
+
+ encodings = self._parse_json(
+ self._search_regex(
+ r'encodings\s*=\s*(\[.+?\]);\n', webpage, 'encodings',
+ default='[]'),
+ video_id, fatal=False)
+ for encoding in encodings:
+ if not isinstance(encoding, dict):
+ continue
+ format_url = encoding.get('filename')
+ if not isinstance(format_url, compat_str):
+ continue
+ if determine_ext(format_url) == 'm3u8':
+ formats.extend(self._extract_m3u8_formats(
+ format_url, video_id, 'mp4', entry_protocol='m3u8_native',
+ m3u8_id='hls', fatal=False))
+ else:
+ format_id = encoding.get('name') or encoding.get('quality')
+ height = int_or_none(self._search_regex(
+ r'^(\d+)[pP]', format_id, 'height', default=None))
+ formats.append({
+ 'url': format_url,
+ 'format_id': format_id,
+ 'height': height,
+ })
+
+ if formats:
+ info_dict = {
+ 'formats': formats,
+ }
+ else:
+ # YouJizz's HTML5 player has invalid HTML
+ webpage = webpage.replace('"controls', '" controls')
+ info_dict = self._parse_html5_media_entries(
+ url, webpage, video_id)[0]
+
+ duration = parse_duration(self._search_regex(
+ r'<strong>Runtime:</strong>([^<]+)', webpage, 'duration',
+ default=None))
+ uploader = self._search_regex(
+ r'<strong>Uploaded By:.*?<a[^>]*>([^<]+)', webpage, 'uploader',
+ default=None)
info_dict.update({
'id': video_id,
- 'title': video_title,
- 'age_limit': age_limit,
+ 'title': title,
+ 'age_limit': self._rta_search(webpage),
+ 'duration': duration,
+ 'uploader': uploader,
})
return info_dict
diff --git a/youtube_dl/extractor/youku.py b/youtube_dl/extractor/youku.py
index dcce15d..0c4bc2e 100644
--- a/youtube_dl/extractor/youku.py
+++ b/youtube_dl/extractor/youku.py
@@ -1,7 +1,6 @@
# coding: utf-8
from __future__ import unicode_literals
-import itertools
import random
import re
import string
@@ -14,7 +13,6 @@ from ..utils import (
js_to_json,
str_or_none,
strip_jsonp,
- urljoin,
)
@@ -222,17 +220,42 @@ class YoukuShowIE(InfoExtractor):
_VALID_URL = r'https?://list\.youku\.com/show/id_(?P<id>[0-9a-z]+)\.html'
IE_NAME = 'youku:show'
- _TEST = {
+ _TESTS = [{
'url': 'http://list.youku.com/show/id_zc7c670be07ff11e48b3f.html',
'info_dict': {
'id': 'zc7c670be07ff11e48b3f',
- 'title': '花千骨 未删减版',
+ 'title': '花千骨 DVD版',
'description': 'md5:a1ae6f5618571bbeb5c9821f9c81b558',
},
'playlist_count': 50,
- }
+ }, {
+ # Episode number not starting from 1
+ 'url': 'http://list.youku.com/show/id_zefbfbd70efbfbd780bef.html',
+ 'info_dict': {
+ 'id': 'zefbfbd70efbfbd780bef',
+ 'title': '超级飞侠3',
+ 'description': 'md5:275715156abebe5ccc2a1992e9d56b98',
+ },
+ 'playlist_count': 24,
+ }, {
+ # Ongoing playlist. The initial page is the last one
+ 'url': 'http://list.youku.com/show/id_za7c275ecd7b411e1a19e.html',
+ 'only_matching': True,
+ }]
- _PAGE_SIZE = 40
+ def _extract_entries(self, playlist_data_url, show_id, note, query):
+ query['callback'] = 'cb'
+ playlist_data = self._download_json(
+ playlist_data_url, show_id, query=query, note=note,
+ transform_source=lambda s: js_to_json(strip_jsonp(s)))['html']
+ drama_list = (get_element_by_class('p-drama-grid', playlist_data) or
+ get_element_by_class('p-drama-half-row', playlist_data))
+ if drama_list is None:
+ raise ExtractorError('No episodes found')
+ video_urls = re.findall(r'<a[^>]+href="([^"]+)"', drama_list)
+ return playlist_data, [
+ self.url_result(self._proto_relative_url(video_url, 'http:'), YoukuIE.ie_key())
+ for video_url in video_urls]
def _real_extract(self, url):
show_id = self._match_id(url)
@@ -242,30 +265,29 @@ class YoukuShowIE(InfoExtractor):
page_config = self._parse_json(self._search_regex(
r'var\s+PageConfig\s*=\s*({.+});', webpage, 'page config'),
show_id, transform_source=js_to_json)
- for idx in itertools.count(0):
- if idx == 0:
- playlist_data_url = 'http://list.youku.com/show/module'
- query = {'id': page_config['showid'], 'tab': 'point'}
- else:
- playlist_data_url = 'http://list.youku.com/show/point'
- query = {
- 'id': page_config['showid'],
- 'stage': 'reload_%d' % (self._PAGE_SIZE * idx + 1),
- }
- query['callback'] = 'cb'
- playlist_data = self._download_json(
- playlist_data_url, show_id, query=query,
+ first_page, initial_entries = self._extract_entries(
+ 'http://list.youku.com/show/module', show_id,
+ note='Downloading initial playlist data page',
+ query={
+ 'id': page_config['showid'],
+ 'tab': 'showInfo',
+ })
+ first_page_reload_id = self._html_search_regex(
+ r'<div[^>]+id="(reload_\d+)', first_page, 'first page reload id')
+ # The first reload_id has the same items as first_page
+ reload_ids = re.findall('<li[^>]+data-id="([^"]+)">', first_page)
+ for idx, reload_id in enumerate(reload_ids):
+ if reload_id == first_page_reload_id:
+ entries.extend(initial_entries)
+ continue
+ _, new_entries = self._extract_entries(
+ 'http://list.youku.com/show/episode', show_id,
note='Downloading playlist data page %d' % (idx + 1),
- transform_source=lambda s: js_to_json(strip_jsonp(s)))['html']
- video_urls = re.findall(
- r'<div[^>]+class="p-thumb"[^<]+<a[^>]+href="([^"]+)"',
- playlist_data)
- new_entries = [
- self.url_result(urljoin(url, video_url), YoukuIE.ie_key())
- for video_url in video_urls]
+ query={
+ 'id': page_config['showid'],
+ 'stage': reload_id,
+ })
entries.extend(new_entries)
- if len(new_entries) < self._PAGE_SIZE:
- break
desc = self._html_search_meta('description', webpage, fatal=False)
playlist_title = desc.split(',')[0] if desc else None
diff --git a/youtube_dl/extractor/youtube.py b/youtube_dl/extractor/youtube.py
index 77cd271..ad2e933 100644
--- a/youtube_dl/extractor/youtube.py
+++ b/youtube_dl/extractor/youtube.py
@@ -16,6 +16,7 @@ from ..jsinterp import JSInterpreter
from ..swfinterp import SWFInterpreter
from ..compat import (
compat_chr,
+ compat_kwargs,
compat_parse_qs,
compat_urllib_parse_unquote,
compat_urllib_parse_unquote_plus,
@@ -245,6 +246,11 @@ class YoutubeBaseInfoExtractor(InfoExtractor):
return True
+ def _download_webpage(self, *args, **kwargs):
+ kwargs.setdefault('query', {})['disable_polymer'] = 'true'
+ return super(YoutubeBaseInfoExtractor, self)._download_webpage(
+ *args, **compat_kwargs(kwargs))
+
def _real_initialize(self):
if self._downloader is None:
return
@@ -673,6 +679,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
},
},
# video_info is None (https://github.com/rg3/youtube-dl/issues/4421)
+ # YouTube Red ad is not captured for creator
{
'url': '__2ABJjxzNo',
'info_dict': {
@@ -1003,6 +1010,27 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
],
},
{
+ # The following content has been identified by the YouTube community
+ # as inappropriate or offensive to some audiences.
+ 'url': 'https://www.youtube.com/watch?v=6SJNVb0GnPI',
+ 'info_dict': {
+ 'id': '6SJNVb0GnPI',
+ 'ext': 'mp4',
+ 'title': 'Race Differences in Intelligence',
+ 'description': 'md5:5d161533167390427a1f8ee89a1fc6f1',
+ 'duration': 965,
+ 'upload_date': '20140124',
+ 'uploader': 'New Century Foundation',
+ 'uploader_id': 'UCEJYpZGqgUob0zVVEaLhvVg',
+ 'uploader_url': r're:https?://(?:www\.)?youtube\.com/channel/UCEJYpZGqgUob0zVVEaLhvVg',
+ 'license': 'Standard YouTube License',
+ 'view_count': int,
+ },
+ 'params': {
+ 'skip_download': True,
+ },
+ },
+ {
# itag 212
'url': '1t24XAntNCY',
'only_matching': True,
@@ -1346,6 +1374,43 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
playback_url, video_id, 'Marking watched',
'Unable to mark watched', fatal=False)
+ @staticmethod
+ def _extract_urls(webpage):
+ # Embedded YouTube player
+ entries = [
+ unescapeHTML(mobj.group('url'))
+ for mobj in re.finditer(r'''(?x)
+ (?:
+ <iframe[^>]+?src=|
+ data-video-url=|
+ <embed[^>]+?src=|
+ embedSWF\(?:\s*|
+ <object[^>]+data=|
+ new\s+SWFObject\(
+ )
+ (["\'])
+ (?P<url>(?:https?:)?//(?:www\.)?youtube(?:-nocookie)?\.com/
+ (?:embed|v|p)/.+?)
+ \1''', webpage)]
+
+ # lazyYT YouTube embed
+ entries.extend(list(map(
+ unescapeHTML,
+ re.findall(r'class="lazyYT" data-youtube-id="([^"]+)"', webpage))))
+
+ # Wordpress "YouTube Video Importer" plugin
+ matches = re.findall(r'''(?x)<div[^>]+
+ class=(?P<q1>[\'"])[^\'"]*\byvii_single_video_player\b[^\'"]*(?P=q1)[^>]+
+ data-video_id=(?P<q2>[\'"])([^\'"]+)(?P=q2)''', webpage)
+ entries.extend(m[-1] for m in matches)
+
+ return entries
+
+ @staticmethod
+ def _extract_url(webpage):
+ urls = YoutubeIE._extract_urls(webpage)
+ return urls[0] if urls else None
+
@classmethod
def extract_id(cls, url):
mobj = re.match(cls._VALID_URL, url, re.VERBOSE)
@@ -1436,9 +1501,14 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if dash_mpd and dash_mpd[0] not in dash_mpds:
dash_mpds.append(dash_mpd[0])
+ is_live = None
+ view_count = None
+
+ def extract_view_count(v_info):
+ return int_or_none(try_get(v_info, lambda x: x['view_count'][0]))
+
# Get video info
embed_webpage = None
- is_live = None
if re.search(r'player-age-gate-content">', video_webpage) is not None:
age_gate = True
# We simulate the access to the video from www.youtube.com/v/{video_id}
@@ -1508,6 +1578,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
continue
get_video_info = compat_parse_qs(video_info_webpage)
add_dash_mpd(get_video_info)
+ if view_count is None:
+ view_count = extract_view_count(get_video_info)
if not video_info:
video_info = get_video_info
if 'token' in get_video_info:
@@ -1591,10 +1663,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
return self.playlist_result(entries, video_id, video_title, video_description)
self.to_screen('Downloading just video %s because of --no-playlist' % video_id)
- if 'view_count' in video_info:
- view_count = int(video_info['view_count'][0])
- else:
- view_count = None
+ if view_count is None:
+ view_count = extract_view_count(video_info)
# Check for "rental" videos
if 'ypc_video_rental_bar_text' in video_info and 'author' not in video_info:
@@ -1638,10 +1708,8 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
if not upload_date:
upload_date = self._search_regex(
[r'(?s)id="eow-date.*?>(.*?)</span>',
- r'id="watch-uploader-info".*?>.*?(?:Published|Uploaded|Streamed live|Started) on (.+?)</strong>'],
+ r'(?:id="watch-uploader-info".*?>.*?|["\']simpleText["\']\s*:\s*["\'])(?:Published|Uploaded|Streamed live|Started) on (.+?)[<"\']'],
video_webpage, 'upload date', default=None)
- if upload_date:
- upload_date = ' '.join(re.sub(r'[/,-]', r' ', mobj.group(1)).split())
upload_date = unified_strdate(upload_date)
video_license = self._html_search_regex(
@@ -1649,7 +1717,21 @@ class YoutubeIE(YoutubeBaseInfoExtractor):
video_webpage, 'license', default=None)
m_music = re.search(
- r'<h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*<ul[^>]*>\s*<li>(?P<title>.+?) by (?P<creator>.+?)(?:\(.+?\))?</li',
+ r'''(?x)
+ <h4[^>]+class="title"[^>]*>\s*Music\s*</h4>\s*
+ <ul[^>]*>\s*
+ <li>(?P<title>.+?)
+ by (?P<creator>.+?)
+ (?:
+ \(.+?\)|
+ <a[^>]*
+ (?:
+ \bhref=["\']/red[^>]*>| # drop possible
+ >\s*Listen ad-free with YouTube Red # YouTube Red ad
+ )
+ .*?
+ )?</li
+ ''',
video_webpage)
if m_music:
video_alt_title = remove_quotes(unescapeHTML(m_music.group('title')))
@@ -2013,7 +2095,7 @@ class YoutubePlaylistIE(YoutubePlaylistBaseInfoExtractor):
|
(%(playlist_id)s)
)""" % {'playlist_id': YoutubeBaseInfoExtractor._PLAYLIST_ID_RE}
- _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s&disable_polymer=true'
+ _TEMPLATE_URL = 'https://www.youtube.com/playlist?list=%s'
_VIDEO_RE = r'href="\s*/watch\?v=(?P<id>[0-9A-Za-z_-]{11})&amp;[^"]*?index=(?P<index>\d+)(?:[^>]+>(?P<title>[^<]+))?'
IE_NAME = 'youtube:playlist'
_TESTS = [{
diff --git a/youtube_dl/options.py b/youtube_dl/options.py
index 79e9fd1..38439c9 100644
--- a/youtube_dl/options.py
+++ b/youtube_dl/options.py
@@ -20,6 +20,24 @@ from .utils import (
from .version import __version__
+def _hide_login_info(opts):
+ PRIVATE_OPTS = set(['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username'])
+ eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
+
+ def _scrub_eq(o):
+ m = eqre.match(o)
+ if m:
+ return m.group('key') + '=PRIVATE'
+ else:
+ return o
+
+ opts = list(map(_scrub_eq, opts))
+ for idx, opt in enumerate(opts):
+ if opt in PRIVATE_OPTS and idx + 1 < len(opts):
+ opts[idx + 1] = 'PRIVATE'
+ return opts
+
+
def parseOpts(overrideArguments=None):
def _readOptions(filename_bytes, default=[]):
try:
@@ -93,26 +111,6 @@ def parseOpts(overrideArguments=None):
def _comma_separated_values_options_callback(option, opt_str, value, parser):
setattr(parser.values, option.dest, value.split(','))
- def _hide_login_info(opts):
- PRIVATE_OPTS = ['-p', '--password', '-u', '--username', '--video-password', '--ap-password', '--ap-username']
- eqre = re.compile('^(?P<key>' + ('|'.join(re.escape(po) for po in PRIVATE_OPTS)) + ')=.+$')
-
- def _scrub_eq(o):
- m = eqre.match(o)
- if m:
- return m.group('key') + '=PRIVATE'
- else:
- return o
-
- opts = list(map(_scrub_eq, opts))
- for private_opt in PRIVATE_OPTS:
- try:
- i = opts.index(private_opt)
- opts[i + 1] = 'PRIVATE'
- except ValueError:
- pass
- return opts
-
# No need to wrap help messages if we're on a wide console
columns = compat_get_terminal_size().columns
max_width = columns if columns else 80
diff --git a/youtube_dl/postprocessor/ffmpeg.py b/youtube_dl/postprocessor/ffmpeg.py
index f021ea8..51256a3 100644
--- a/youtube_dl/postprocessor/ffmpeg.py
+++ b/youtube_dl/postprocessor/ffmpeg.py
@@ -542,7 +542,7 @@ class FFmpegFixupM3u8PP(FFmpegPostProcessor):
temp_filename = prepend_extension(filename, 'temp')
options = ['-c', 'copy', '-f', 'mp4', '-bsf:a', 'aac_adtstoasc']
- self._downloader.to_screen('[ffmpeg] Fixing malformated aac bitstream in "%s"' % filename)
+ self._downloader.to_screen('[ffmpeg] Fixing malformed AAC bitstream in "%s"' % filename)
self.run_ffmpeg(filename, temp_filename, options)
os.remove(encodeFilename(filename))
diff --git a/youtube_dl/utils.py b/youtube_dl/utils.py
index 39860e9..c42dd4c 100644
--- a/youtube_dl/utils.py
+++ b/youtube_dl/utils.py
@@ -365,9 +365,9 @@ def get_elements_by_attribute(attribute, value, html, escape_value=True):
retlist = []
for m in re.finditer(r'''(?xs)
<([a-zA-Z0-9:._-]+)
- (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
\s+%s=['"]?%s['"]?
- (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'))*?
+ (?:\s+[a-zA-Z0-9:._-]+(?:=[a-zA-Z0-9:._-]*|="[^"]*"|='[^']*'|))*?
\s*>
(?P<content>.*?)
</\1>
@@ -596,7 +596,7 @@ def unescapeHTML(s):
assert type(s) == compat_str
return re.sub(
- r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
+ r'&([^&;]+;)', lambda m: _htmlentity_transform(m.group(1)), s)
def get_subprocess_encoding():
@@ -1815,6 +1815,10 @@ def float_or_none(v, scale=1, invscale=1, default=None):
return default
+def bool_or_none(v, default=None):
+ return v if isinstance(v, bool) else default
+
+
def strip_or_none(v):
return None if v is None else v.strip()
@@ -2733,6 +2737,8 @@ def cli_option(params, command_option, param):
def cli_bool_option(params, command_option, param, true_value='true', false_value='false', separator=None):
param = params.get(param)
+ if param is None:
+ return []
assert isinstance(param, bool)
if separator:
return [command_option + separator + (true_value if param else false_value)]
diff --git a/youtube_dl/version.py b/youtube_dl/version.py
index b6d3788..cdcb32e 100644
--- a/youtube_dl/version.py
+++ b/youtube_dl/version.py
@@ -1,3 +1,3 @@
from __future__ import unicode_literals
-__version__ = '2017.06.25'
+__version__ = '2017.09.11'